Cluster Analysis of Insurance Customers

5 minute read

Imports

import pandas as pd
import numpy as np

#Plot styling
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
%matplotlib inline

plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

Here is the Link to Data

df = pd.read_excel('Actual.Data.xlsx')

Exploratory Data Analysis - Takaful Dataset

df.head()

	Policy.ID	Assets	Gold.Customer	Status	Installment.Payment.Date	Next.Installment.Due.Date	Policy.Started.On	Mode	BasicPlan	Gender	...	Customer.Age.Policy.Start	Policy.Since (Year)	Customer.Age.Current	Agent	Installment.Amount	Premium.Amount	Total.Paid	City	Single/Joint	PaymentType
0	P60266	3	Active	Active	2019-05-28 00:00:00	2021-05-01	2012-05-01	Annual	Super.Savings	M	...	37.0	9	45.0	Individual	1463119.0	329954.0	659908.0	Karachi	Single	CHEQUE
1	P27953	1	Active	Active	2017-12-28 11:45:25	2021-01-01	2018-01-01	Annual	Super.Savings	M	...	34.0	3	36.0	Financial.Company	1200000.0	300000.0	900000.0	Naushera	Single	Online.Debit
2	P41583	1	Active	Active	2018-09-13 09:47:48	2019-10-01	2018-10-01	Annual	Super.Savings	NaN	...	45.0	1	46.0	Financial.Company	750000.0	750000.0	750000.0	Sialkot	Single	Online.Debit
3	P65843	1	Active	Active	2019-08-23 00:00:00	2021-10-01	2019-10-01	Annual	Family.Future.Security	NaN	...	39.0	2	39.0	Financial.Company	600000.0	200000.0	400000.0	Karachi	Single	Online.Debit
4	P40045	2	Inactive	Lapsed	2018-08-17 16:00:10	2019-09-01	2018-09-01	Annual	Savings	F	...	59.0	1	60.0	Individual	525000.0	500000.0	500000.0	Hyderabad	Single	CHEQUE

5 rows × 21 columns

Data Types of columns

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57019 entries, 0 to 57018
Data columns (total 21 columns):
Policy.ID                    57019 non-null object
Assets                       57019 non-null int64
Gold.Customer                57019 non-null object
Status                       57019 non-null object
Installment.Payment.Date     57019 non-null datetime64[ns]
Next.Installment.Due.Date    57019 non-null datetime64[ns]
Policy.Started.On            57019 non-null datetime64[ns]
Mode                         57019 non-null object
BasicPlan                    57019 non-null object
Gender                       18981 non-null object
Weight (KG)                  56968 non-null float64
Customer.Age.Policy.Start    57019 non-null float64
Policy.Since (Year)          57019 non-null int64
Customer.Age.Current         57019 non-null float64
Agent                        57019 non-null object
Installment.Amount           57019 non-null float64
Premium.Amount               57019 non-null float64
Total.Paid                   57017 non-null float64
City                         57019 non-null object
Single/Joint                 57019 non-null object
PaymentType                  57019 non-null object
dtypes: datetime64[ns](3), float64(6), int64(2), object(10)
memory usage: 9.1+ MB

Check Null Values

df.apply(lambda x: sum(x.isnull()),axis=0)

Policy.ID                        0
Assets                           0
Gold.Customer                    0
Status                           0
Installment.Payment.Date         0
Next.Installment.Due.Date        0
Policy.Started.On                0
Mode                             0
BasicPlan                        0
Gender                       38038
Weight (KG)                     51
Customer.Age.Policy.Start        0
Policy.Since (Year)              0
Customer.Age.Current             0
Agent                            0
Installment.Amount               0
Premium.Amount                   0
Total.Paid                       2
City                             0
Single/Joint                     0
PaymentType                      0
dtype: int64

df['Gender'].value_counts()

M    18344
F      637
Name: Gender, dtype: int64

df.rename(columns={'Weight (KG)': 'Weight'}, inplace=True)

Tackling Missing Values in Gender and Weight

sns.boxplot(x=df['Weight'])

#Drop 51 null values in weight as they are very less and will not have a major impact

#df.dropna(subset=['Weight'],inplace=True)
df.apply(lambda x: sum(x.isnull()),axis=0)

Policy.ID                        0
Assets                           0
Gold.Customer                    0
Status                           0
Installment.Payment.Date         0
Next.Installment.Due.Date        0
Policy.Started.On                0
Mode                             0
BasicPlan                        0
Gender                       38038
Weight                          51
Customer.Age.Policy.Start        0
Policy.Since (Year)              0
Customer.Age.Current             0
Agent                            0
Installment.Amount               0
Premium.Amount                   0
Total.Paid                       2
City                             0
Single/Joint                     0
PaymentType                      0
dtype: int64

df.groupby('Gender')['Weight'].mean()

Gender
F    63.683533
M    70.839145
Name: Weight, dtype: float64

#Making A copy of the dataframe so that we dont lose our data while transformation

new_df = df.copy()

new_df['Gender'] = new_df['Gender'].replace(np.nan, 'X')

Replace Missing Values wrt Male and Female Averages

for i in range(len(new_df.index)):
    if new_df['Gender'].iloc[i] =='X':
        if new_df['Weight'].iloc[i] < 70.839145:
            new_df.iloc[i, new_df.columns.get_loc('Gender')] = 'F'
        else:
            new_df.iloc[i, new_df.columns.get_loc('Gender')] = 'M'
            

new_df.dropna(subset=['Total.Paid'],inplace=True)
#There were two Missing values in Total.Paid Column that are dropped too

#Cleaned Data
new_df.to_excel(r'E:\Semester_8\Business_Intelligence\TAKAFUL_ASSIGNMENT\Cleaned_Takaful_Dataset_Final.xlsx', index = False, header = True)

Data Visualizations

sns.barplot(x='Gold.Customer',y='Total.Paid',data=new_df)

Customer Segmentation - Installment and Premium

data = pd.read_excel('E:\Semester_8\Business_Intelligence\TAKAFUL_ASSIGNMENT\Cleaned_Takaful_Dataset_Final.xlsx')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57017 entries, 0 to 57016
Data columns (total 23 columns):
Policy.ID                    56567 non-null object
Assets                       56567 non-null float64
Gold.Customer                56567 non-null object
Status                       56567 non-null object
Installment.Payment.Date     56567 non-null datetime64[ns]
Next.Installment.Due.Date    56567 non-null datetime64[ns]
Policy.Started.On            56567 non-null datetime64[ns]
Mode                         56567 non-null object
BasicPlan                    56567 non-null object
Gender                       56567 non-null object
Weight                       56521 non-null float64
Customer.Age.Policy.Start    56567 non-null float64
Policy.Since (Year)          56567 non-null float64
Customer.Age.Current         56567 non-null float64
Agent                        56567 non-null object
Installment.Amount           56567 non-null float64
Installment_Amount           57017 non-null int64
Premium.Amount               56567 non-null float64
Premium_Amount               57017 non-null int64
Total.Paid                   56567 non-null float64
City                         56567 non-null object
Single/Joint                 56567 non-null object
PaymentType                  56567 non-null object
dtypes: datetime64[ns](3), float64(8), int64(2), object(10)
memory usage: 10.0+ MB

#I have created two new Columns in Excel = Installment_Amount and Premium_Amount of INT TYPE

Installment = data['Installment_Amount'].values
Premium = data['Premium_Amount'].values
X = np.array(list(zip(Installment, Premium)))
plt.scatter(Installment, Premium, c='black', s=50)

##plot in 3D space
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(X[:, 0], X[:, 1])

dataset = data.iloc[:,[16,18]]

X=dataset.iloc[:,[0,1]].values

array([[1463119,  329954],
       [1200000,  300000],
       [ 750000,  750000],
       ...,
       [      0,   50000],
       [      0,   30900],
       [      0,   30675]], dtype=int64)

#Using the elbow method to find the ideal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1,11):
    km=KMeans(n_clusters=i,init='k-means++', max_iter=300, n_init=10, random_state=0)
    km.fit(X)
    wcss.append(km.inertia_)
plt.plot(range(1,11),wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()

Ideal K size = 4,5,6

#Calculating the silhoutte coefficient
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans



for n_cluster in range(2, 11):
    kmeans = KMeans(n_clusters=n_cluster).fit(X)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

For n_clusters=2, The Silhouette Coefficient is 0.9934664851024786
For n_clusters=3, The Silhouette Coefficient is 0.9482019450001424
For n_clusters=4, The Silhouette Coefficient is 0.9403171140499658
For n_clusters=5, The Silhouette Coefficient is 0.903696294345598
For n_clusters=6, The Silhouette Coefficient is 0.8209100056819879
For n_clusters=7, The Silhouette Coefficient is 0.8209194931504735
For n_clusters=8, The Silhouette Coefficient is 0.8205152286640742
For n_clusters=9, The Silhouette Coefficient is 0.740046045775413
For n_clusters=10, The Silhouette Coefficient is 0.7469443387160237

import pylab as pl
from sklearn.decomposition import PCA

Nc = range(1, 20)
kmeans = [KMeans(n_clusters=i) for i in Nc]
kmeans
score = [kmeans[i].fit(X).score(X) for i in range(len(kmeans))]
score
pl.plot(Nc,score)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()

print(score)

[-4342885204296660.0, -1965246538483911.8, -1190849308786024.8, -735865989905271.1, -464847169949274.5, -288260660074600.3, -221593993407933.6, -157027108634278.66, -106136302511251.6, -69345722080836.19, -57571566408134.14, -49745133662816.21, -44210314228213.82, -37851157457083.914, -32932468783416.984, -26461974880429.766, -21260906227471.258, -18700418224852.87, -17319322252699.422]

##Fitting kmeans to the dataset
km4=KMeans(n_clusters=4,init='k-means++', max_iter=300, n_init=10, random_state=0)
y_means = km4.fit_predict(X)

#Visualising the clusters for k=4
plt.scatter(X[y_means==0,0],X[y_means==0,1],s=50, c='purple',label='Cluster1')
plt.scatter(X[y_means==1,0],X[y_means==1,1],s=50, c='blue',label='Cluster2')
plt.scatter(X[y_means==2,0],X[y_means==2,1],s=50, c='green',label='Cluster3')
plt.scatter(X[y_means==3,0],X[y_means==3,1],s=50, c='cyan',label='Cluster4')

plt.scatter(km4.cluster_centers_[:,0], km4.cluster_centers_[:,1],s=200,marker='s', c='red', alpha=0.7, label='Centroids')
plt.title('Customer segments')
plt.xlabel('Installments')
plt.ylabel('Premium')
plt.legend()
plt.show()

K- Means clustering didn’t turn out to be effective due to problems with the data collection procedure and poor data Quality

Identifying Patterns And Relationships

TP = data['Total.Paid'].values
Premium = data['Premium_Amount'].values
M = np.array(list(zip(TP, Premium)))
plt.scatter(TP, Premium, c='blue', s=50)

TP = data['Premium_Amount'].values
Premium = data['Total.Paid'].values
M = np.array(list(zip(TP, Premium)))
plt.scatter(TP, Premium, c='blue', s=50)

Identifying Important Correlations

sns.heatmap(data.corr(),annot=True)

sns.countplot(x='Gender', data=data);
plt.title('Distribution of Gender');

data.hist('Customer.Age.Policy.Start', bins=35,);
plt.title('Distribution of Age');
plt.xlabel('Age');

data['Customer.Age.Policy.Start'].mean()

38.890749023282126

The Average age tend to be around 39, and there are more Males in this Dataset than Females

plt.hist('Installment_Amount', data=data[data['Gender'] == 'M'], alpha=0.5, label='Male');
plt.hist('Installment_Amount', data=data[data['Gender'] == 'F'], alpha=0.5, label='Female');
plt.title('Distribution of Installments by Gender');
plt.xlabel('Installment Amounts');
plt.legend();

sns.scatterplot('Customer.Age.Current', 'Total.Paid', hue='Gender', data=data);
plt.title('Age to Total_Paid, Colored by Gender');

Share on

Twitter Facebook Google+ LinkedIn

Mohammad Affan Sheikh