Commit ae79675

task-3 done from ayanasamuel8/task-3
update: task-3 added

2 parents 632b8cf + b734184

15 files changed: 992 additions & 4 deletions

notebooks/task_3/05_hypothesis_testing.ipynb

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

src/data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,7 +7,7 @@ def load_data(path: str) -> pd.DataFrame:
         skiprows=0
     )
 def load_raw_data(path: str) -> pd.DataFrame:
-    return pd.read_csv(path)
+    return pd.read_csv(path, low_memory=False)
 
 def check_structure(df: pd.DataFrame):
     return df.info(), df.dtypes
```
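
For context, a minimal sketch of what this one-line change buys (the file path is hypothetical, not from the commit): with the default `low_memory=True`, pandas parses large CSVs in internal chunks and can infer mixed dtypes for a single column, emitting a `DtypeWarning`; `low_memory=False` reads the whole file before inferring one dtype per column.

```python
import pandas as pd

# Hypothetical raw file. A column that mixes numeric strings like "1,000"
# with plain numbers can come back with mixed dtypes (and a DtypeWarning)
# under low_memory=True; low_memory=False avoids the chunked inference.
df = pd.read_csv('data/raw/insurance_data.csv', low_memory=False)
print(df.dtypes.head())
```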

src/preprocessing.py

Lines changed: 78 additions & 3 deletions
```python
import pandas as pd
import numpy as np
from scipy import stats
from src.data_loader import extract_numeric_cols, extract_categorical_cols, extract_date_time_cols


def clean_numeric_strings(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Convert string-formatted numbers with commas to proper floats.
    """
    for col in cols:
        if df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '', regex=False)  # remove thousands separator
            df[col] = pd.to_numeric(df[col], errors='coerce')    # convert to float
    return df


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the dataset by performing the following steps:
    - Drop duplicate records.
    - Fill missing numerical values with the median.
    - Standardize and fill missing categorical values with the mode.
    - Clean numeric strings (e.g., "1,000").
    - Cap numerical outliers at the 1st and 99th percentiles.
    - Convert datetime columns.

    Returns the cleaned DataFrame.
    """
    # 1. Remove duplicate rows
    df_clean = df.drop_duplicates().copy()
    print(df_clean['TotalClaims'].value_counts())  # debug: inspect claim distribution

    # 2. Handle numeric columns: fill missing values with the median
    num_cols = extract_numeric_cols(df_clean)
    print('TotalClaims' in num_cols)  # debug: confirm the target column is numeric
    for col in num_cols:
        median_val = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_val)

    # 3. Handle categorical columns
    cat_cols = extract_categorical_cols(df_clean)
    for col in cat_cols:
        # Ensure consistent string formatting and remove invalid 'nan' strings
        df_clean[col] = df_clean[col].astype(str).str.strip().str.lower()
        df_clean[col] = df_clean[col].replace('nan', np.nan)

    for col in cat_cols:
        if df_clean[col].isnull().all():
            continue  # skip if the entire column is NaN
        mode_vals = df_clean[col].mode()
        if not mode_vals.empty:
            mode_val = mode_vals[0]
            df_clean[col] = df_clean[col].fillna(mode_val)

    # 4. Clean numeric strings (object columns containing thousands separators)
    numeric_string_cols = [
        col for col in df_clean.columns
        if df_clean[col].dtype == 'object' and df_clean[col].str.contains(',', na=False).any()
    ]
    df_clean = clean_numeric_strings(df_clean, numeric_string_cols)
    print('after string clean:')
    print(df_clean['TotalClaims'].value_counts())  # debug: re-inspect after conversion

    # 5. Cap outliers at the 1st and 99th percentiles (TotalClaims is left uncapped)
    for col in num_cols:
        if col != 'TotalClaims':
            lower, upper = df_clean[col].quantile([0.01, 0.99])
            df_clean[col] = np.clip(df_clean[col], lower, upper)

    # 6. Convert date/time columns
    date_time_cols = extract_date_time_cols(df_clean)
    for col in date_time_cols:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

    # Drop columns with fewer than 20% non-null values
    df_clean = df_clean.dropna(thresh=int(len(df_clean) * 0.2), axis=1)

    return df_clean


def save_cleaned_data(df: pd.DataFrame, file_path: str = '../../data/cleaned/cleaned_data.csv') -> None:
    """
    Save the cleaned DataFrame to a CSV file.
    """
    df.to_csv(file_path, index=False)
    print(f"Cleaned data saved to {file_path}")
```

src/task_3/__init__.py

Whitespace-only changes.

src/task_3/business_analysis.py

Lines changed: 10 additions & 0 deletions
```python
def interpret_test_result(test_name, p_value, additional_info=""):
    """
    Returns a clear business interpretation of the test result.
    If p_value < 0.05, we reject the null hypothesis, and we can add further explanation.
    """
    if p_value < 0.05:
        interpretation = f"We reject the null hypothesis for {test_name} (p = {p_value:.3f}). {additional_info}"
    else:
        interpretation = f"We fail to reject the null hypothesis for {test_name} (p = {p_value:.3f}). No significant difference observed."
    return interpretation
```
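
A quick usage sketch (the p-value and wording below are illustrative, not real results from the notebook):

```python
from src.task_3.business_analysis import interpret_test_result

msg = interpret_test_result(
    test_name="province risk (ANOVA)",
    p_value=0.012,  # illustrative value only
    additional_info="Consider region-specific premium adjustments.",
)
print(msg)
# We reject the null hypothesis for province risk (ANOVA) (p = 0.012).
# Consider region-specific premium adjustments.
```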

src/task_3/data_segmentation.py

Lines changed: 13 additions & 0 deletions
```python
import pandas as pd


def segment_data(data, feature, test_value):
    """
    Segments the data into two groups:
    - Group A (Control): plans without the test feature.
    - Group B (Test): plans with the test feature (i.e., where feature == test_value).

    Returns: control, test as two DataFrame objects.
    """
    control = data[data[feature] != test_value].copy()
    test = data[data[feature] == test_value].copy()
    return control, test
```
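
A minimal sketch of how this A/B split behaves (the `CoverType` column and its values are made up for illustration):

```python
import pandas as pd
from src.task_3.data_segmentation import segment_data

df = pd.DataFrame({
    'CoverType': ['comprehensive', 'third-party', 'comprehensive'],
    'TotalClaims': [0.0, 1200.0, 300.0],
})
control, test = segment_data(df, feature='CoverType', test_value='comprehensive')
print(len(control), len(test))  # -> 1 2
```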

src/task_3/hypothesis_tests.py

Lines changed: 108 additions & 0 deletions
```python
import numpy as np
import pandas as pd
from src.task_3.stats_helpers import anova_test, t_test_independent, chi_square_test


def province_risk_test(data, risk_metric='claim_frequency'):
    """
    Compare risk across provinces using the specified risk metric.

    For claim frequency: calculate the proportion of policies with at least one claim.
    For claim severity: consider only policies with at least one claim and compute their average.
    Uses ANOVA to compare differences across provinces.

    Returns a dictionary with test statistics and p-values.
    """
    results = {}
    provinces = data['Province'].unique()
    groups = []

    if risk_metric == 'claim_frequency':
        for province in provinces:
            subset = data[data['Province'] == province]
            # Binary flag: 1 if a claim exists, 0 otherwise
            freq = subset['claim_indicator'].mean()
            groups.append(subset['claim_indicator'].values)
            results[province] = {'claim_frequency': freq}
    elif risk_metric == 'claim_severity':
        for province in provinces:
            subset = data[(data['Province'] == province) & (data['claim_indicator'] == 1)]
            severity = subset['TotalClaims'].mean() if not subset.empty else np.nan
            groups.append(subset['TotalClaims'].values if not subset.empty else np.array([0]))
            results[province] = {'claim_severity': severity}
    else:
        raise ValueError("Invalid risk metric specified.")

    stat, p_val = anova_test(groups)
    return {'results_by_province': results, 'anova_stat': stat, 'p_value': p_val}


def zip_risk_test(data, risk_metric='claim_frequency'):
    """
    Compares risk across zip codes using the specified risk metric (frequency or severity).
    Performs ANOVA across groups.
    """
    results = {}
    zip_codes = data['PostalCode'].dropna().unique()
    groups = []

    if risk_metric == 'claim_frequency':
        for z in zip_codes:
            subset = data[data['PostalCode'] == z]
            freq = subset['claim_indicator'].mean()
            groups.append(subset['claim_indicator'].values)
            results[z] = {'claim_frequency': freq}
    elif risk_metric == 'claim_severity':
        for z in zip_codes:
            subset = data[(data['PostalCode'] == z) & (data['claim_indicator'] == 1)]
            severity = subset['TotalClaims'].mean() if not subset.empty else np.nan
            groups.append(subset['TotalClaims'].values if not subset.empty else np.array([0]))
            results[z] = {'claim_severity': severity}
    else:
        raise ValueError("Invalid risk metric.")

    stat, p_val = anova_test(groups)
    return {'results_by_zip': results, 'anova_stat': stat, 'p_value': p_val}


def zip_margin_test(data):
    """
    Tests whether margin (TotalPremium - TotalClaims) differs by zip code using ANOVA.
    """
    data = data.copy()
    data['margin'] = data['TotalPremium'] - data['TotalClaims']

    results = {}
    zip_codes = data['PostalCode'].dropna().unique()
    groups = []

    for z in zip_codes:
        subset = data[data['PostalCode'] == z]
        margin_mean = subset['margin'].mean()
        groups.append(subset['margin'].values)
        results[z] = {'average_margin': margin_mean}

    stat, p_val = anova_test(groups)
    return {'results_by_zip': results, 'anova_stat': stat, 'p_value': p_val}


def gender_risk_test(data, risk_metric='claim_severity'):
    """
    Compare risk between genders using a two-sample t-test.
    """
    results = {}

    male_data = data[data['Gender'] == 'Male']
    female_data = data[data['Gender'] == 'Female']

    if risk_metric == 'claim_frequency':
        male_vals = male_data['claim_indicator']
        female_vals = female_data['claim_indicator']
    elif risk_metric == 'claim_severity':
        male_vals = male_data[male_data['claim_indicator'] == 1]['TotalClaims']
        female_vals = female_data[female_data['claim_indicator'] == 1]['TotalClaims']
    else:
        raise ValueError("Invalid risk metric.")

    results['Male'] = male_vals.mean()
    results['Female'] = female_vals.mean()

    stat, p_val = t_test_independent(male_vals, female_vals)
    return {'results_by_gender': results, 't_stat': stat, 'p_value': p_val}
```
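
These tests all expect a `claim_indicator` column to exist. A sketch that wires the pieces together end to end (the cleaned-data path is an assumption, not from the commit):

```python
import pandas as pd
from src.task_3.hypothesis_tests import province_risk_test
from src.task_3.business_analysis import interpret_test_result

data = pd.read_csv('data/cleaned/cleaned_data.csv')  # hypothetical path
data['claim_indicator'] = (data['TotalClaims'] > 0).astype(int)

res = province_risk_test(data, risk_metric='claim_frequency')
print(interpret_test_result('province claim frequency (ANOVA)', res['p_value']))
```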

src/task_3/segmentation_utils.py

Lines changed: 28 additions & 0 deletions
```python
import numpy as np
import pandas as pd


def calculate_claim_frequency(data):
    """
    Calculate Claim Frequency as the proportion of policies with at least one claim.
    """
    data['claim_indicator'] = data['TotalClaims'] > 0
    return data['claim_indicator'].mean()


def calculate_claim_severity(data):
    """
    Calculate Claim Severity as the average claim amount for policies with a claim.
    """
    data['claim_indicator'] = data['TotalClaims'] > 0
    claims = data[data['claim_indicator']]
    return claims['TotalClaims'].mean() if not claims.empty else np.nan


def calculate_margin(data):
    """
    Calculate margin as the sum of (TotalPremium - TotalClaims) for the data provided.
    """
    return (data['TotalPremium'] - data['TotalClaims']).sum()


def get_groups_by_feature(df, feature: str, value_a, value_b):
    """
    Split df into two groups by the values of a single feature.
    """
    group_a = df[df[feature] == value_a]
    group_b = df[df[feature] == value_b]
    return group_a, group_b
```
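
To make the three KPIs concrete, a small hand-checkable example (toy numbers, not repo data):

```python
import pandas as pd
from src.task_3.segmentation_utils import (
    calculate_claim_frequency, calculate_claim_severity, calculate_margin)

group = pd.DataFrame({
    'TotalClaims':  [0.0, 500.0, 0.0, 250.0],
    'TotalPremium': [300.0, 300.0, 300.0, 300.0],
})

print(calculate_claim_frequency(group))  # 0.5   (2 of 4 policies claimed)
print(calculate_claim_severity(group))   # 375.0 ((500 + 250) / 2)
print(calculate_margin(group))           # 450.0 (1200 premium - 750 claims)
```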

src/task_3/stats_helpers.py

Lines changed: 17 additions & 0 deletions
```python
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
import pandas as pd


def t_test_independent(group_a, group_b):
    # Welch's t-test (equal_var=False): does not assume equal group variances
    return ttest_ind(group_a.dropna(), group_b.dropna(), equal_var=False)


def chi_square_test(df, col1, col2):
    # Chi-square test of independence on the two columns' contingency table
    contingency = pd.crosstab(df[col1], df[col2])
    return chi2_contingency(contingency)


def anova_test(groups):
    """
    Perform one-way ANOVA.
    `groups` should be a list/tuple of arrays (or lists) containing your samples.
    Returns the ANOVA statistic and p-value.
    """
    return f_oneway(*groups)
```
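
A minimal sketch of the ANOVA helper on synthetic data (random draws, illustrative only):

```python
import numpy as np
from src.task_3.stats_helpers import anova_test

rng = np.random.default_rng(42)
# Three synthetic groups with slightly different means
groups = [rng.normal(loc, 1.0, size=100) for loc in (0.0, 0.1, 0.5)]
stat, p_val = anova_test(groups)
print(f"F = {stat:.2f}, p = {p_val:.4f}")
```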

tests/test_task_3/__init__.py

Whitespace-only changes.
