Investigation into Vehicle Insurance Claims Fraud¶
Authors : Killian Hughes, Lauren Gorst, Peter Scaife, Taslima Hossain 08/01/2024
Overview
This report seeks to investigate vehicle insurance claims in the United States of America, specifically the tendency towards fraud across different policyholder profiles. The report proceeds as follows: the data is extracted in its raw form from the csv file 'insurance_claims.csv', taken from the Mendeley Data website; the data is then cleaned and processed for data visualisations and for further machine learning models to aid early predictive fraud detection.
Data Source : insurance_claims.csv
Referenced Dashboard :
The purpose of this report is essentially to aid insurance companies in the early detection of vehicle insurance fraud based on historical data. Data visualisations will ensure an accurate overview of current trends in this domain.
Report Contents¶
- Section 1 : Data Overview
- Section 2 : Data Cleaning
- Section 3 : Data Visualisations
- Section 4 : Evaluations and Further Insights
- Section 5 : Conclusion
Section 1 : Data Overview¶
The data is first imported using Pandas to perform an initial investigation. Below are the imports necessary for the report.
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
df = pd.read_csv("insurance_claims.csv")
display(df)
| months_as_customer | age | policy_number | policy_bind_date | policy_state | policy_csl | policy_deductable | policy_annual_premium | umbrella_limit | insured_zip | ... | police_report_available | total_claim_amount | injury_claim | property_claim | vehicle_claim | auto_make | auto_model | auto_year | fraud_reported | _c39 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 328 | 48 | 521585 | 2014-10-17 | OH | 250/500 | 1000 | 1406.91 | 0 | 466132 | ... | YES | 71610 | 6510 | 13020 | 52080 | Saab | 92x | 2004 | Y | NaN |
| 1 | 228 | 42 | 342868 | 2006-06-27 | IN | 250/500 | 2000 | 1197.22 | 5000000 | 468176 | ... | ? | 5070 | 780 | 780 | 3510 | Mercedes | E400 | 2007 | Y | NaN |
| 2 | 134 | 29 | 687698 | 2000-09-06 | OH | 100/300 | 2000 | 1413.14 | 5000000 | 430632 | ... | NO | 34650 | 7700 | 3850 | 23100 | Dodge | RAM | 2007 | N | NaN |
| 3 | 256 | 41 | 227811 | 1990-05-25 | IL | 250/500 | 2000 | 1415.74 | 6000000 | 608117 | ... | NO | 63400 | 6340 | 6340 | 50720 | Chevrolet | Tahoe | 2014 | Y | NaN |
| 4 | 228 | 44 | 367455 | 2014-06-06 | IL | 500/1000 | 1000 | 1583.91 | 6000000 | 610706 | ... | NO | 6500 | 1300 | 650 | 4550 | Accura | RSX | 2009 | N | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | 3 | 38 | 941851 | 1991-07-16 | OH | 500/1000 | 1000 | 1310.80 | 0 | 431289 | ... | ? | 87200 | 17440 | 8720 | 61040 | Honda | Accord | 2006 | N | NaN |
| 996 | 285 | 41 | 186934 | 2014-01-05 | IL | 100/300 | 1000 | 1436.79 | 0 | 608177 | ... | ? | 108480 | 18080 | 18080 | 72320 | Volkswagen | Passat | 2015 | N | NaN |
| 997 | 130 | 34 | 918516 | 2003-02-17 | OH | 250/500 | 500 | 1383.49 | 3000000 | 442797 | ... | YES | 67500 | 7500 | 7500 | 52500 | Suburu | Impreza | 1996 | N | NaN |
| 998 | 458 | 62 | 533940 | 2011-11-18 | IL | 500/1000 | 2000 | 1356.92 | 5000000 | 441714 | ... | YES | 46980 | 5220 | 5220 | 36540 | Audi | A5 | 1998 | N | NaN |
| 999 | 456 | 60 | 556080 | 1996-11-11 | OH | 250/500 | 1000 | 766.19 | 0 | 612260 | ... | ? | 5060 | 460 | 920 | 3680 | Mercedes | E400 | 2007 | N | NaN |
1000 rows × 40 columns
The data includes 1000 rows and 40 columns. For the purpose of the investigation we have decided to limit the number of columns, choosing to omit: umbrella_limit, insured_relationship, property_damage and _c39. In particular, _c39 is a completely empty column with no description.
We can also note that our data relates to policies issued in three states in particular: Illinois, Ohio and Indiana.
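As a quick check of this claim, the distinct policy states can be listed directly (a one-line sketch against the data frame loaded above):

# List the distinct states in which policies were issued
print(df['policy_state'].unique())  # expected: ['OH' 'IN' 'IL']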
missing_values = df.isnull().sum()
missing_values
months_as_customer                0
age                               0
policy_number                     0
policy_bind_date                  0
policy_state                      0
policy_csl                        0
policy_deductable                 0
policy_annual_premium             0
umbrella_limit                    0
insured_zip                       0
insured_sex                       0
insured_education_level           0
insured_occupation                0
insured_hobbies                   0
insured_relationship              0
capital-gains                     0
capital-loss                      0
incident_date                     0
incident_type                     0
collision_type                    0
incident_severity                 0
authorities_contacted            91
incident_state                    0
incident_city                     0
incident_location                 0
incident_hour_of_the_day          0
number_of_vehicles_involved       0
property_damage                   0
bodily_injuries                   0
witnesses                         0
police_report_available           0
total_claim_amount                0
injury_claim                      0
property_claim                    0
vehicle_claim                     0
auto_make                         0
auto_model                        0
auto_year                         0
fraud_reported                    0
_c39                           1000
dtype: int64
This technique of detecting null values relies on the incorrect assumption that all missing values are already encoded as nulls in the data frame. Note also that in the column 'authorities_contacted' the value 'None' is treated as a null value, despite importantly signifying that no authority was contacted.
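This behaviour stems from pandas' default NA handling on import: the string 'None' is among read_csv's default NA markers, so it is converted to a null. The sketch below, assuming the 91 nulls in 'authorities_contacted' arise this way, re-reads the file with an explicit NA list so that 'None' survives as a category.

# Re-read with pandas' default NA strings disabled so 'None' is kept;
# treating only empty cells as missing is our own assumption here
df_checked = pd.read_csv('insurance_claims.csv',
                         keep_default_na=False,
                         na_values=[''])
print(df_checked['authorities_contacted'].value_counts(dropna=False))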
Upon closer inspection of the data frame we can observe that the character '?' is used as a placeholder for a null value in all affected columns and carries no further meaning within the data set.
# Count occurrences of the '?' placeholder in each column
place_holder = df[df == '?'].count()
place_holder
months_as_customer                0
age                               0
policy_number                     0
policy_bind_date                  0
policy_state                      0
policy_csl                        0
policy_deductable                 0
policy_annual_premium             0
umbrella_limit                    0
insured_zip                       0
insured_sex                       0
insured_education_level           0
insured_occupation                0
insured_hobbies                   0
insured_relationship              0
capital-gains                     0
capital-loss                      0
incident_date                     0
incident_type                     0
collision_type                  178
incident_severity                 0
authorities_contacted             0
incident_state                    0
incident_city                     0
incident_location                 0
incident_hour_of_the_day          0
number_of_vehicles_involved       0
property_damage                 360
bodily_injuries                   0
witnesses                         0
police_report_available         343
total_claim_amount                0
injury_claim                      0
property_claim                    0
vehicle_claim                     0
auto_make                         0
auto_model                        0
auto_year                         0
fraud_reported                    0
_c39                              0
dtype: int64
To visually describe the scale of our missing values, we will first replace all placeholder characters with nulls and use a heatmap to observe our findings for each column of the data frame.
replaced_var = df.replace('?', np.nan)
def is_null_data_heat(df):
    # Plot a heatmap of null locations across the data frame
    plt.figure(figsize=(10, 6))
    sns.heatmap(df.isnull(), cmap='viridis', cbar=False)
    plt.title('Heatmap of Missing Data')
    plt.show()
is_null_data_heat(replaced_var)
Section 2: Data Cleaning¶
Initial aims of data cleaning:
- Removal of duplicates
- Identifying missing values
- Ensuring uniform formatting
- Identifying outliers
The following removes the columns judged not to be of interest to the report.
columns_to_remove = ['umbrella_limit', 'insured_relationship', 'property_damage', '_c39']
df = df.drop(columns=columns_to_remove)
The following gives an overview of the values within each column of the data frame.
for column in df.columns:
    unique_strings_overview = df[column].value_counts()
    print(f"\nOverview for {column}:")
    print(unique_strings_overview)
Overview for months_as_customer:
months_as_customer
194 8
128 7
254 7
140 7
210 7
..
390 1
411 1
453 1
448 1
17 1
Name: count, Length: 391, dtype: int64
Overview for age:
age
43 49
39 48
41 45
34 44
38 42
30 42
31 42
37 41
33 39
40 38
32 38
29 35
46 33
42 32
35 32
36 32
44 32
28 30
26 26
45 26
48 25
47 24
27 24
57 16
25 14
55 14
49 14
53 13
50 13
24 10
54 10
61 10
51 9
60 9
58 8
56 8
23 7
21 6
59 5
62 4
52 4
64 2
63 2
19 1
20 1
22 1
Name: count, dtype: int64
Overview for policy_number:
policy_number
521585 1
687755 1
674485 1
223404 1
991480 1
..
563878 1
620855 1
583169 1
337677 1
556080 1
Name: count, Length: 1000, dtype: int64
Overview for policy_bind_date:
policy_bind_date
2006-01-01 3
1992-04-28 3
1992-08-05 3
1991-12-14 2
2004-08-09 2
..
2014-06-03 1
1998-12-12 1
1999-02-18 1
1997-10-30 1
1996-11-11 1
Name: count, Length: 951, dtype: int64
Overview for policy_state:
policy_state
OH 352
IL 338
IN 310
Name: count, dtype: int64
Overview for policy_csl:
policy_csl
250/500 351
100/300 349
500/1000 300
Name: count, dtype: int64
Overview for policy_deductable:
policy_deductable
1000 351
500 342
2000 307
Name: count, dtype: int64
Overview for policy_annual_premium:
policy_annual_premium
1558.29 2
1215.36 2
1362.87 2
1073.83 2
1389.13 2
..
1085.03 1
1437.33 1
988.29 1
1238.89 1
766.19 1
Name: count, Length: 991, dtype: int64
Overview for insured_zip:
insured_zip
477695 2
469429 2
446895 2
431202 2
456602 2
..
476303 1
450339 1
476502 1
600561 1
612260 1
Name: count, Length: 995, dtype: int64
Overview for insured_sex:
insured_sex
FEMALE 537
MALE 463
Name: count, dtype: int64
Overview for insured_education_level:
insured_education_level
JD 161
High School 160
Associate 145
MD 144
Masters 143
PhD 125
College 122
Name: count, dtype: int64
Overview for insured_occupation:
insured_occupation
machine-op-inspct 93
prof-specialty 85
tech-support 78
sales 76
exec-managerial 76
craft-repair 74
transport-moving 72
other-service 71
priv-house-serv 71
armed-forces 69
adm-clerical 65
protective-serv 63
handlers-cleaners 54
farming-fishing 53
Name: count, dtype: int64
Overview for insured_hobbies:
insured_hobbies
reading 64
exercise 57
paintball 57
bungie-jumping 56
movies 55
golf 55
camping 55
kayaking 54
yachting 53
hiking 52
video-games 50
skydiving 49
base-jumping 49
board-games 48
polo 47
chess 46
dancing 43
sleeping 41
cross-fit 35
basketball 34
Name: count, dtype: int64
Overview for capital-gains:
capital-gains
0 508
46300 5
51500 4
68500 4
55600 3
...
36700 1
54900 1
69200 1
48800 1
50300 1
Name: count, Length: 338, dtype: int64
Overview for capital-loss:
capital-loss
0 475
-31700 5
-53700 5
-50300 5
-45300 4
...
-12100 1
-17000 1
-72900 1
-19700 1
-82100 1
Name: count, Length: 354, dtype: int64
Overview for incident_date:
incident_date
2015-02-02 28
2015-02-17 26
2015-01-07 25
2015-01-10 24
2015-02-04 24
2015-01-24 24
2015-01-19 23
2015-01-08 22
2015-01-13 21
2015-01-30 21
2015-02-12 20
2015-02-22 20
2015-01-31 20
2015-02-06 20
2015-02-21 19
2015-01-01 19
2015-02-23 19
2015-01-12 19
2015-01-14 19
2015-01-21 19
2015-01-03 18
2015-02-14 18
2015-02-01 18
2015-02-28 18
2015-01-20 18
2015-01-18 18
2015-02-25 18
2015-01-06 17
2015-01-09 17
2015-02-08 17
2015-02-24 17
2015-02-26 17
2015-02-13 16
2015-02-15 16
2015-02-16 16
2015-02-05 16
2015-01-16 16
2015-01-17 15
2015-02-18 15
2015-01-28 15
2015-01-15 15
2015-01-22 14
2015-02-20 14
2015-02-27 14
2015-01-23 13
2015-02-03 13
2015-01-27 13
2015-02-09 13
2015-01-04 12
2015-03-01 12
2015-01-26 11
2015-01-29 11
2015-01-02 11
2015-02-19 10
2015-02-11 10
2015-02-10 10
2015-02-07 10
2015-01-25 10
2015-01-11 9
2015-01-05 7
Name: count, dtype: int64
Overview for incident_type:
incident_type
Multi-vehicle Collision 419
Single Vehicle Collision 403
Vehicle Theft 94
Parked Car 84
Name: count, dtype: int64
Overview for collision_type:
collision_type
Rear Collision 292
Side Collision 276
Front Collision 254
? 178
Name: count, dtype: int64
Overview for incident_severity:
incident_severity
Minor Damage 354
Total Loss 280
Major Damage 276
Trivial Damage 90
Name: count, dtype: int64
Overview for authorities_contacted:
authorities_contacted
Police 292
Fire 223
Other 198
Ambulance 196
Name: count, dtype: int64
Overview for incident_state:
incident_state
NY 262
SC 248
WV 217
VA 110
NC 110
PA 30
OH 23
Name: count, dtype: int64
Overview for incident_city:
incident_city
Springfield 157
Arlington 152
Columbus 149
Northbend 145
Hillsdale 141
Riverwood 134
Northbrook 122
Name: count, dtype: int64
Overview for incident_location:
incident_location
9935 4th Drive 1
4214 MLK Ridge 1
8548 Cherokee Ridge 1
2352 MLK Drive 1
9734 2nd Ridge 1
..
6770 1st St 1
4119 Texas St 1
4347 2nd Ridge 1
1091 1st Drive 1
1416 Cherokee Ridge 1
Name: count, Length: 1000, dtype: int64
Overview for incident_hour_of_the_day:
incident_hour_of_the_day
17 54
3 53
0 52
23 51
16 49
13 46
10 46
4 46
6 44
9 43
14 43
21 42
18 41
12 40
19 40
7 40
15 39
22 38
8 36
20 34
5 33
2 31
11 30
1 29
Name: count, dtype: int64
Overview for number_of_vehicles_involved:
number_of_vehicles_involved
1 581
3 358
4 31
2 30
Name: count, dtype: int64
Overview for bodily_injuries:
bodily_injuries
0 340
2 332
1 328
Name: count, dtype: int64
Overview for witnesses:
witnesses
1 258
2 250
0 249
3 243
Name: count, dtype: int64
Overview for police_report_available:
police_report_available
? 343
NO 343
YES 314
Name: count, dtype: int64
Overview for total_claim_amount:
total_claim_amount
59400 5
2640 4
70400 4
4320 4
44200 4
..
65250 1
87100 1
6240 1
66600 1
67500 1
Name: count, Length: 763, dtype: int64
Overview for injury_claim:
injury_claim
0 25
640 7
480 7
660 5
580 5
..
14840 1
6580 1
11820 1
16650 1
7500 1
Name: count, Length: 638, dtype: int64
Overview for property_claim:
property_claim
0 19
860 6
480 5
660 5
10000 5
..
3590 1
6480 1
4580 1
4920 1
7500 1
Name: count, Length: 626, dtype: int64
Overview for vehicle_claim:
vehicle_claim
5040 7
3360 6
52080 5
4720 5
3600 5
..
43360 1
25130 1
38940 1
47430 1
52500 1
Name: count, Length: 726, dtype: int64
Overview for auto_make:
auto_make
Saab 80
Dodge 80
Suburu 80
Nissan 78
Chevrolet 76
Ford 72
BMW 72
Toyota 70
Audi 69
Accura 68
Volkswagen 68
Jeep 67
Mercedes 65
Honda 55
Name: count, dtype: int64
Overview for auto_model:
auto_model
RAM 43
Wrangler 42
A3 37
Neon 37
MDX 36
Jetta 35
Passat 33
A5 32
Legacy 32
Pathfinder 31
Malibu 30
92x 28
Camry 28
Forrestor 28
F150 27
95 27
E400 27
93 25
Grand Cherokee 25
Escape 24
Tahoe 24
Maxima 24
Ultima 23
X5 23
Highlander 22
Civic 22
Silverado 22
Fusion 21
ML350 20
Impreza 20
Corolla 20
TL 20
CRV 20
C300 18
3 Series 18
X6 16
M5 15
Accord 13
RSX 12
Name: count, dtype: int64
Overview for auto_year:
auto_year
1995 56
1999 55
2005 54
2006 53
2011 53
2007 52
2003 51
2009 50
2010 50
2013 49
2002 49
2015 47
1997 46
2012 46
2008 45
2014 44
2001 42
2000 42
1998 40
2004 39
1996 37
Name: count, dtype: int64
Overview for fraud_reported:
fraud_reported
N 753
Y 247
Name: count, dtype: int64
Once identified, the character '?' is replaced throughout the cleaned data frame with the pandas null value representation pd.NA.
df.replace('?', pd.NA, inplace=True)
The following is the subsequent heat map.
plt.figure(figsize=(8, 6))
sns.heatmap(df.isnull(), cmap='viridis', cbar=False, yticklabels=False)
plt.title('Missing Values Heatmap')
plt.show()
To check for duplicates the following code is used to sum all duplicated rows in the data frame.
df.duplicated().sum()
0
The next code excludes the primary key column (policy_number, whose values should all be unique) when checking for further duplicates.
df.duplicated(subset=df.columns.difference(['policy_number']), keep=False).sum()
0
In order to identify outliers we first found the interquartile range (IQR) and the corresponding upper and lower bounds. Data points falling outside this bounded range were treated as outliers.
def detect_outliers_iqr(column):
    # Flag values lying more than 1.5 * IQR beyond the quartiles
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = (column < lower_bound) | (column > upper_bound)
    return outliers
outliers_column1 = detect_outliers_iqr(df['months_as_customer'])
outliers_column1.sum()
0
outliers_column2 = detect_outliers_iqr(df['age'])
outliers_column2.sum()
df[outliers_column2]
| months_as_customer | age | policy_number | policy_bind_date | policy_state | policy_csl | policy_deductable | policy_annual_premium | insured_zip | insured_sex | ... | witnesses | police_report_available | total_claim_amount | injury_claim | property_claim | vehicle_claim | auto_make | auto_model | auto_year | fraud_reported | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 67 | 478 | 64 | 346002 | 1990-08-20 | OH | 250/500 | 500 | 835.02 | 602433 | FEMALE | ... | 1 | NO | 33930 | 0 | 3770 | 30160 | BMW | X6 | 1998 | N |
| 538 | 478 | 63 | 852002 | 2009-06-29 | IL | 250/500 | 1000 | 1292.30 | 479724 | MALE | ... | 3 | NO | 4700 | 940 | 470 | 3290 | Dodge | Neon | 2007 | Y |
| 588 | 472 | 64 | 911429 | 2012-08-25 | IN | 250/500 | 500 | 989.24 | 615821 | MALE | ... | 3 | NO | 70900 | 14180 | 7090 | 49630 | Mercedes | ML350 | 2002 | N |
| 947 | 465 | 63 | 531160 | 2012-01-12 | IL | 250/500 | 500 | 1006.99 | 458936 | FEMALE | ... | 3 | <NA> | 43560 | 4840 | 4840 | 33880 | Suburu | Legacy | 2015 | N |
4 rows × 36 columns
outliers_column3 = detect_outliers_iqr(df['policy_deductable'])
outliers_column3.sum()
0
outliers_column4 = detect_outliers_iqr(df['policy_annual_premium'])
outliers_column4.sum()
df[outliers_column4]
| months_as_customer | age | policy_number | policy_bind_date | policy_state | policy_csl | policy_deductable | policy_annual_premium | insured_zip | insured_sex | ... | witnesses | police_report_available | total_claim_amount | injury_claim | property_claim | vehicle_claim | auto_make | auto_model | auto_year | fraud_reported | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 43 | 106 | 30 | 790442 | 2003-04-13 | OH | 250/500 | 500 | 538.17 | 433782 | FEMALE | ... | 2 | NO | 51120 | 5680 | 5680 | 39760 | Mercedes | E400 | 2005 | N |
| 133 | 24 | 33 | 649082 | 1996-01-19 | IL | 500/1000 | 1000 | 1922.84 | 431277 | FEMALE | ... | 1 | NO | 46800 | 4680 | 9360 | 32760 | Jeep | Wrangler | 2002 | N |
| 229 | 154 | 37 | 241562 | 2010-01-28 | IL | 250/500 | 1000 | 2047.59 | 439269 | FEMALE | ... | 3 | NO | 79530 | 14460 | 7230 | 57840 | Accura | MDX | 2000 | N |
| 248 | 101 | 27 | 326322 | 2007-02-10 | IL | 250/500 | 1000 | 433.33 | 603269 | MALE | ... | 3 | NO | 5900 | 1180 | 590 | 4130 | Mercedes | E400 | 2009 | N |
| 544 | 254 | 40 | 349658 | 1994-06-07 | IN | 100/300 | 500 | 1927.87 | 609317 | MALE | ... | 1 | <NA> | 43280 | 0 | 5410 | 37870 | Honda | Civic | 1996 | Y |
| 681 | 32 | 38 | 719989 | 1994-04-07 | IL | 250/500 | 2000 | 566.11 | 453164 | MALE | ... | 0 | NO | 2640 | 440 | 440 | 1760 | Honda | CRV | 2015 | N |
| 747 | 37 | 31 | 230223 | 2008-09-06 | IL | 500/1000 | 500 | 1969.63 | 615218 | FEMALE | ... | 1 | YES | 76450 | 6950 | 13900 | 55600 | Dodge | RAM | 1995 | N |
| 763 | 66 | 30 | 984456 | 2003-06-24 | IN | 500/1000 | 500 | 484.67 | 608309 | FEMALE | ... | 2 | YES | 65560 | 11920 | 11920 | 41720 | Volkswagen | Passat | 2015 | Y |
| 796 | 421 | 56 | 728025 | 1990-02-15 | IN | 100/300 | 500 | 1935.85 | 470826 | MALE | ... | 3 | <NA> | 92730 | 16860 | 8430 | 67440 | Mercedes | E400 | 2004 | Y |
9 rows × 36 columns
As demonstrated above, we identified four outliers in the age column and nine in the policy annual premium column. Upon further inspection we elected to leave these rows in the data frame, as they do not contradict the purpose of this investigation and do not drastically affect the data trends.
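For an at-a-glance view, the same detector can be applied to every numeric column at once; a compact sketch reusing detect_outliers_iqr from above:

# Apply the IQR detector across all numeric columns and tally the flags
numeric_cols = df.select_dtypes(include='number').columns
outlier_counts = {col: int(detect_outliers_iqr(df[col]).sum()) for col in numeric_cols}
print(outlier_counts)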
Section 3 : Data Visualisations¶
- Profile age by gender (Kernel density estimation)
- Total count of incidents by state (Bar chart)
- Average capital gains and losses by auto make (Inverse bar chart)
- Mean number of witnesses by hour of the day (Line graph)
- Frequency of incidents by hour of the day (Line graph)
- Ratio of fraud reported (Pie chart)
- Total claim amount for fraudulent claims (Histogram)
plt.figure(figsize=(10, 6))
sns.kdeplot(data=df[df['insured_sex'] == 'MALE']['age'], label='Male', fill=True)
sns.kdeplot(data=df[df['insured_sex'] == 'FEMALE']['age'], label='Female', fill=True)
plt.xlabel('Age Range')
plt.ylabel('Age Density Estimate')
plt.title('Average age of insured female and male clients (KDE)')
plt.legend()
plt.show()
Figure 1 : A KDE showing the age density of insured clients by gender.
There is a slight trend in the insured client pool implying that insured females are on average younger than insured males. Most insured clients fall in the age range of 25 to 45 years.
state_counts = df['incident_state'].value_counts()
plt.figure(figsize=(10, 6))
state_counts.plot(kind='bar', color='blue')
plt.title('Total Count of incidents per State')
plt.xlabel('State')
plt.ylabel('Count of Incidents')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()
Figure 2: A bar chart showing the counts of incidents per state.
The trend seen here shows a general decrease from left to right, with New York having the highest count of incidents at 262 and Ohio the lowest at 23.
grouped_data = df.groupby('auto_make')[['capital-gains', 'capital-loss']].mean()
fig, ax = plt.subplots(figsize=(10, 5))
grouped_data.plot(kind='barh', ax=ax)
ax.set_xlabel('Total')
ax.set_ylabel('Auto Make')
ax.set_title('Average Capital Gains and Losses by Auto Make')
plt.show()
Figure 3: Inverse bar chart showing the average capital gains and losses by auto make.
Figure 3 shows how profitable each car brand is to the insurance company. Volkswagen has a higher average capital loss than gain, marking it as a poorly profitable auto make, whereas Nissan has a greater capital gain than loss.
mean_witnesses = df.groupby('incident_hour_of_the_day')['witnesses'].mean()
fig = plt.figure(figsize=(10, 7))
plt.plot(mean_witnesses.index, mean_witnesses.values, marker='o', linestyle='-')
plt.xlabel('Incident Hour of the Day')
plt.ylabel('Mean Number of Witnesses')
plt.title('Mean Number of Witnesses by Incident Hour of the Day')
plt.xticks(np.arange(0, 24, step=1), labels=np.arange(0, 24, step=1)) # Set x-axis ticks for each hour
plt.grid(True)
plt.show()
Figure 4: Line graph showing the mean number of witnesses per hour of the day.
There is a general trend of more witnesses being present at the scene of an incident during the day than at night, as shown by the three lowest witness counts falling at 1am, 3am and 9pm.
mean_incidents = df['incident_hour_of_the_day'].value_counts().sort_index()
fig = plt.figure(figsize=(10, 7))
plt.plot(mean_incidents, marker='o', linestyle='-')
plt.xlabel('Hour of the Day')
plt.ylabel('Frequency of incidents')
plt.title('Frequency of Incidents by Hour of the Day')
plt.xticks(np.arange(0, 24, step=1), labels=np.arange(0, 24, step=1))
plt.grid(True)
plt.show()
Figure 5: Line graph showing the likelihood of an incident occurring per hour of the day.
There is a peak at 5pm despite a generally low number of incidents during daylight hours.
grouped_claims = df['fraud_reported'].value_counts()
plt.pie(grouped_claims, labels=grouped_claims.index, autopct='%1.1f%%')
plt.title('Fraud Reported')
plt.show()
Figure 6: A pie chart showing the ratio of fraudulent to non-fraudulent claims.
Relative to predicted trends, there is a high number of fraudulent cases.
fraudulent_claims = df[df['fraud_reported'] == 'Y']['total_claim_amount']
plt.hist(fraudulent_claims, bins=20, color='skyblue', edgecolor='black')
plt.xlabel('Total Claim Amount')
plt.ylabel('Frequency of claims')
plt.title('Total Claim Amount for Fraudulent Claims')
plt.grid(True)
plt.show()
Figure 7: Histogram showing the total claim amount where fraud is reported.
The general trend observed is that most fraudulent claims fall at about the 60,000 dollar mark, with a secondary notable peak below 10,000 dollars.
Section 4a: Report Based Evaluations and Insights¶
Age and Gender
The age range and gender of insured clients could be of importance when considering the most profitable profile group to market a premium or insurance product towards. As seen in figure 1, the most populous age range currently insured is between 25 and 45. For this reason, marketing strategies would be most successful if targeted towards individuals within this age bracket; however, to expand the current client base, separate marketing strategies could be implemented to attract both younger and older clients.
Geographical
Further consideration of the commuting habits of insured clients should be made prior to issuing a premium policy. As seen in figure 2, the highest number of incidents occurs in New York, South Carolina and West Virginia; this should therefore increase the premium pricing for clients frequenting these states.
Auto Make
Figure 3 aims to show the most profitable auto makes for the insurance company, and an assumption can be made that these are safer cars to insure, representing low risk to the profits of the company. Whilst it is hard to compare bars of capital gains against capital losses, Nissan is demonstrated to be more profitable, while Volkswagen has greater losses than gains, losing the company money. In regards to issuing premiums we judge this to be of great importance, so we have further explored and improved the visual in the referenced dashboard (see Section 4b). One way to make the comparison easier in code is sketched below.
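The following is a hedged sketch of our own (not the dashboard visual): because 'capital-loss' values are stored as negatives, summing the two column means collapses the paired bars into a single net figure per auto make.

# Net average capital position per auto make;
# 'capital-loss' is negative, so the sum equals gains minus losses
net_capital = (df.groupby('auto_make')[['capital-gains', 'capital-loss']]
                 .mean()
                 .sum(axis=1)
                 .sort_values())
print(net_capital)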
Witnesses and time of day
Regarding figure 4, one might assume that an incident would present fewer witnesses during night and early-morning hours. There is, however, no marked trend in the data supporting this assumption. Although there is a visually noticeable trough at 9pm, there is no supporting evidence to suggest that this is due to the late hour. Given that the data set is relatively small and therefore presents many limitations when considering trends, an expanded set of incident records may improve predicted trends and aid the evaluation of insurance fraud claims. Figure 5 supports a very similar conclusion: the hour of the day does not predict the likelihood of an incident occurring. Similarly, expanded records could help determine premium pricing.
Fraud Reported
Figure 6 demonstrates the incidence of recorded fraud within the data set. The limitation here is that our data may represent a select group of records where fraud was initially suspected and sent for further investigation. When fraud is confirmed, the total claim amount tends to fall at about 60,000 dollars. However, many fraudulent claims also sit at the lower end, around 10,000 dollars, meaning that smaller claims should be highly monitored. This banding idea is sketched below.
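The monitoring suggestion made concrete (the bin edges below are our own illustrative choice):

# Fraud rate within illustrative claim-amount bands
bands = pd.cut(df['total_claim_amount'], bins=[0, 10000, 40000, 70000, 120000])
fraud_rate = df.groupby(bands, observed=True)['fraud_reported'].apply(lambda s: (s == 'Y').mean())
print(fraud_rate)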
Section 4b: Dashboard Based Evaluations and Insights¶
Vehicle Insurance Claims Fraud
Figure 3 is further improved in this visual, with capital gains and losses being more easily compared. Additionally we consider the total annual premium for both fraud and non-fraud cases. Interestingly, fraud cases account for about a quarter of total incident claims, representing 308,000 dollars of the 1.26 million dollars of annual premium collected. The interactive nature of the dashboard allows for further personal exploration of the total annual premium per auto make, as well as the number of clients per auto make. Furthermore, the fraud slider present on each page of the dashboard allows for a drill-down approach, investigating data accounted for only by fraud cases. The car make Accura, despite having only 13 insured clients (the second lowest insured vehicle), accounts for the highest capital loss towards the company. These figures can be cross-checked in pandas, as shown below.
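A cross-checking sketch (not the dashboard itself), splitting the total annual premium by the fraud flag:

# Total annual premium by fraud flag; the dashboard quotes roughly
# 308k for 'Y' out of about 1.26M overall
premium_split = df.groupby('fraud_reported')['policy_annual_premium'].sum()
print(premium_split)
print(premium_split.sum())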
Insured Client Profiles
Upon inspection it seems that there are more incidents for insured clients in the age range of 30-40; however, in conjunction with figure 1 we cannot make the assumption that 40 year olds have more incidents on average than any other age group. This is because the data set we used only contains insured clients who have had incidents, and as shown in figure 1 this age range accounts for the largest number of clients. Looking more closely at individual profiles, we can consider the likelihood that a certain occupation or education level may impact a fraud claim or a higher incident rate. Namely, our primary key influencer of a major damage collision is the profile indicating the preferred hobby 'sleeping', closely followed by 'movies'. For non-fraud cases also exhibiting major damage, the primary hobbies are 'chess' and 'cross-fit'. The key influencers segment on the bottom left of the dashboard also allows for inspection of cases constituting 'minor damage', 'total loss' and 'trivial damage' by hobby. A pandas approximation of this key-influencer view is sketched below.
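An approximation of the dashboard's key-influencers visual (our own sketch): hobbies ranked by fraud rate within major-damage incidents.

# Fraud rate by insured hobby among major-damage incidents
major = df[df['incident_severity'] == 'Major Damage']
hobby_fraud = (major.groupby('insured_hobbies')['fraud_reported']
                    .apply(lambda s: (s == 'Y').mean())
                    .sort_values(ascending=False))
print(hobby_fraud.head())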
Geographical mapping of Incidents
The dashboard allows for further geographical exploration of the data, namely a comparative investigation of the state where the policy was issued against the state of the incident. This is of particular importance when noting that the incidence of a collision occurring in the state where the policy was taken out is extremely low. In fact, of the three states where a policy was issued (Ohio, Illinois and Indiana), the only state also appearing as a state of incident was Ohio, and of the 1000 incidents recorded only 23 were in Ohio. The dashboard also allows the user to track a client by their policy number, such that the previously described profiles can be further monitored in regards to their driving activities. Type of collision and witnesses at the scene are also data points that can be explored. The state comparison reduces to a simple crosstab, as shown below.
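A sketch of the comparison the dashboard maps:

# Rows: state where the policy was issued; columns: state of the incident
print(pd.crosstab(df['policy_state'], df['incident_state']))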
Section 5: Conclusion¶
Since we have previously discussed the limitations of the data in regards to sample size and insured client profiles, we reiterate this point when considering the absence of definite trends and the fact that, combined, the claims represent a 100% incidence rate. Financially, the important point to consider is that the company has paid out 53M dollars in insurance claims. Of this figure, 15M dollars is the total amount paid out on fraudulent claims, while the remaining 38M dollars represents non-fraudulent pay-outs. Fraud pay-outs therefore account for about 28.3% of total insurance pay-outs. Two auto makes were considered significantly important in this regard: Accura (all registered cars, accounting for the highest capital loss) and Ford (accounting for the largest total claim amount in fraudulent instances). The quoted totals can be verified directly from the data frame, as sketched below.
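A cross-checking sketch of the pay-out figures:

# Total claim amount split by fraud flag, and the fraud share of pay-outs
totals = df.groupby('fraud_reported')['total_claim_amount'].sum()
print(totals)
print(f"Fraud share: {totals['Y'] / totals.sum():.1%}")  # report quotes ~28.3%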
Given that the data set only contains accidents, this explains why the total claim amount is significantly larger than the total annual premium. If these dashboards were used in conjunction with real insurance data, it would be possible to predict the profit for the insurance company by considering total annual premium against total claim amount.
Our system aims to help with the identification of fraudulent cases; so far there are many fraudulent cases that appear unjustified. In the example of policy number 594783, the total months as a customer is recorded as 119, which raises the question of why a client would pay for insurance for almost 10 years before filing a fraudulent claim. In addition, one witness is provided in the entered record. A lookup of this kind is sketched below.
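A minimal single-policy lookup sketch for reviewing cases like the one above:

# Retrieve the record for a single policy number
case = df[df['policy_number'] == 594783]
print(case[['months_as_customer', 'witnesses', 'fraud_reported']])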
In conclusion while no definite trends are observed, a typical fraudulent profile can be outlined:
- Gender : male or female
- Occupation : Executive managerial
- Education Level : JD
- Hobby : Sleeping (major damage cases), cross-fit and chess (total and minor loss cases), golf (trivial damage cases)
- Auto Make : Accura (all fraud), Mercedes/Ford (highest client base in regards to fraudulent cases)