Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
661 changes: 661 additions & 0 deletions notebooks/task_3/05_hypothesis_testing.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def load_data(path: str) -> pd.DataFrame:
skiprows=0
)
def load_raw_data(path: str) -> pd.DataFrame:
    """
    Load the raw dataset from a CSV file.

    low_memory=False makes pandas read the whole file before inferring
    dtypes, avoiding chunk-wise mixed-dtype warnings on large files.
    """
    # Fix: diff residue left both the old call and the merged one; keep
    # only the merged low_memory=False version.
    return pd.read_csv(path, low_memory=False)

def check_structure(df: pd.DataFrame):
    # NOTE(review): df.info() prints its summary to stdout and returns None,
    # so this returns the tuple (None, dtypes) — confirm callers only rely
    # on the printed output and the dtypes element.
    return df.info(), df.dtypes
Expand Down
81 changes: 78 additions & 3 deletions src/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,85 @@
import pandas as pd
def clean_numeric_strings(df: pd.DataFrame, cols: list):
import numpy as np
from scipy import stats
from src.data_loader import extract_numeric_cols, extract_categorical_cols, extract_date_time_cols


def clean_numeric_strings(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Convert string-formatted numbers that use ',' as a thousands
    separator (e.g. "1,000") into numeric values, in place on *df*.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean (mutated and also returned for chaining).
    cols : list
        Column names to convert; non-object columns are left untouched.

    Returns
    -------
    pd.DataFrame
        The same frame with the listed columns coerced to numeric;
        unparseable values become NaN via errors='coerce'.
    """
    # Fix: diff residue duplicated the replace/to_numeric pair — the second
    # .str.replace on an already-numeric column would raise AttributeError.
    for col in cols:
        if df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '', regex=False)
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the dataset by performing the following steps:
    - Drop duplicate records.
    - Fill missing numerical values with the median.
    - Standardize and fill missing categorical values with the mode.
    - Clean numeric strings (e.g., "1,000").
    - Cap numerical outliers between 1st and 99th percentile
      (except 'TotalClaims', which keeps its raw tail for analysis).
    - Convert datetime columns.
    - Drop columns that are more than 80% missing.

    Returns the cleaned DataFrame.
    """
    # 1. Remove duplicate rows; copy so the caller's frame is untouched.
    # Fix: removed leftover debug statements — print(print(...)) printed
    # None, and they inspected the pre-clean `df` rather than df_clean.
    df_clean = df.drop_duplicates().copy()

    # 2. Numeric columns: impute missing values with the column median
    num_cols = extract_numeric_cols(df_clean)
    for col in num_cols:
        median_val = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_val)

    # 3. Categorical columns: normalise case/whitespace, then impute mode
    cat_cols = extract_categorical_cols(df_clean)
    for col in cat_cols:
        # astype(str) turns real NaN into the literal 'nan'; restore NaN
        df_clean[col] = df_clean[col].astype(str).str.strip().str.lower()
        df_clean[col] = df_clean[col].replace('nan', np.nan)

    for col in cat_cols:
        if df_clean[col].isnull().all():
            continue  # nothing to impute from
        mode_vals = df_clean[col].mode()
        if not mode_vals.empty:
            df_clean[col] = df_clean[col].fillna(mode_vals[0])

    # 4. Convert string-formatted numbers such as "1,000" to floats
    numeric_string_cols = [
        col for col in df_clean.columns
        if df_clean[col].dtype == 'object' and df_clean[col].str.contains(',', na=False).any()
    ]
    df_clean = clean_numeric_strings(df_clean, numeric_string_cols)

    # 5. Cap outliers to the 1st/99th percentiles. 'TotalClaims' is left
    # uncapped so downstream hypothesis tests see its true extremes.
    for col in num_cols:
        if col != 'TotalClaims':
            lower, upper = df_clean[col].quantile([0.01, 0.99])
            df_clean[col] = np.clip(df_clean[col], lower, upper)

    # 6. Parse date/time columns (unparseable values become NaT)
    date_time_cols = extract_date_time_cols(df_clean)
    for col in date_time_cols:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

    # 7. Drop columns with fewer than 20% non-missing values.
    # Fix: threshold is based on df_clean's row count — the original used
    # len(df), which over-counts once duplicates have been dropped.
    df_clean = df_clean.dropna(thresh=len(df_clean) * 0.2, axis=1)

    return df_clean
def save_cleaned_data(df: pd.DataFrame, file_path: str = '../../data/cleaned/cleaned_data.csv') -> None:
    """
    Persist *df* as a CSV at *file_path* (row index omitted) and report
    the destination on stdout.
    """
    df.to_csv(file_path, index=False)
    print(f"Cleaned data saved to {file_path}")
Empty file added src/task_3/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions src/task_3/business_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
def interpret_test_result(test_name, p_value, additional_info=""):
    """
    Translate a test's p-value into a plain-language business conclusion.

    When p_value < 0.05 the null hypothesis is rejected and
    *additional_info* is appended to the message; otherwise a
    no-significant-difference message is returned.
    """
    if p_value >= 0.05:
        return f"We fail to reject the null hypothesis for {test_name} (p = {p_value:.3f}). No significant difference observed."
    return f"We reject the null hypothesis for {test_name} (p = {p_value:.3f}). {additional_info}"
13 changes: 13 additions & 0 deletions src/task_3/data_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pandas as pd

def segment_data(data, feature, test_value):
    """
    Split *data* into A/B groups on *feature*.

    Group A (control): rows where feature != test_value.
    Group B (test):    rows where feature == test_value.

    Returns: control, test as two DataFrame objects (independent copies).
    """
    in_test = data[feature] == test_value
    control = data[~in_test].copy()
    test = data[in_test].copy()
    return control, test
108 changes: 108 additions & 0 deletions src/task_3/hypothesis_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import numpy as np
import pandas as pd
from src.task_3.stats_helpers import anova_test, t_test_independent, chi_square_test
def province_risk_test(data, risk_metric='claim_frequency'):
    """
    Compare insurance risk across provinces with a one-way ANOVA.

    risk_metric='claim_frequency': per-province mean of the binary claim
    indicator (share of policies with at least one claim).
    risk_metric='claim_severity': per-province mean TotalClaims over
    policies that actually claimed.

    Returns a dict with per-province metrics plus the ANOVA statistic
    and p-value.
    """
    if risk_metric not in ('claim_frequency', 'claim_severity'):
        raise ValueError("Invalid risk metric specified.")

    results = {}
    groups = []
    for province in data['Province'].unique():
        if risk_metric == 'claim_frequency':
            subset = data[data['Province'] == province]
            groups.append(subset['claim_indicator'].values)
            results[province] = {'claim_frequency': subset['claim_indicator'].mean()}
        else:
            subset = data[(data['Province'] == province) & (data['claim_indicator'] == 1)]
            severity = subset['TotalClaims'].mean() if not subset.empty else np.nan
            # provinces with no claims contribute a single 0 so ANOVA still runs
            groups.append(subset['TotalClaims'].values if not subset.empty else np.array([0]))
            results[province] = {'claim_severity': severity}

    stat, p_val = anova_test(groups)
    return {'results_by_province': results, 'anova_stat': stat, 'p_value': p_val}


def zip_risk_test(data, risk_metric='claim_frequency'):
    """
    Compare risk across postal codes with a one-way ANOVA on either
    claim frequency (mean claim indicator) or claim severity (mean
    TotalClaims among claiming policies).
    """
    if risk_metric not in ('claim_frequency', 'claim_severity'):
        raise ValueError("Invalid risk metric.")

    results = {}
    groups = []
    for z in data['PostalCode'].dropna().unique():
        if risk_metric == 'claim_frequency':
            subset = data[data['PostalCode'] == z]
            groups.append(subset['claim_indicator'].values)
            results[z] = {'claim_frequency': subset['claim_indicator'].mean()}
        else:
            subset = data[(data['PostalCode'] == z) & (data['claim_indicator'] == 1)]
            severity = subset['TotalClaims'].mean() if not subset.empty else np.nan
            # codes with no claims contribute a single 0 so ANOVA still runs
            groups.append(subset['TotalClaims'].values if not subset.empty else np.array([0]))
            results[z] = {'claim_severity': severity}

    stat, p_val = anova_test(groups)
    return {'results_by_zip': results, 'anova_stat': stat, 'p_value': p_val}

def zip_margin_test(data):
    """
    One-way ANOVA of whether margin (TotalPremium - TotalClaims)
    differs across postal codes.

    Returns per-zip average margins plus the ANOVA statistic and p-value.
    """
    data = data.copy()
    data['margin'] = data['TotalPremium'] - data['TotalClaims']

    results = {}
    groups = []
    for z in data['PostalCode'].dropna().unique():
        margins = data.loc[data['PostalCode'] == z, 'margin']
        groups.append(margins.values)
        results[z] = {'average_margin': margins.mean()}

    stat, p_val = anova_test(groups)
    return {'results_by_zip': results, 'anova_stat': stat, 'p_value': p_val}

def gender_risk_test(data, risk_metric='claim_severity'):
    """
    Two-sample (Welch) t-test of risk between genders on the chosen
    metric: claim frequency (mean claim indicator) or claim severity
    (TotalClaims among claiming policies).

    NOTE(review): compares the literal labels 'Male'/'Female' — confirm
    upstream cleaning has not lower-cased the Gender column.
    """
    males = data[data['Gender'] == 'Male']
    females = data[data['Gender'] == 'Female']

    if risk_metric == 'claim_frequency':
        male_vals = males['claim_indicator']
        female_vals = females['claim_indicator']
    elif risk_metric == 'claim_severity':
        male_vals = males.loc[males['claim_indicator'] == 1, 'TotalClaims']
        female_vals = females.loc[females['claim_indicator'] == 1, 'TotalClaims']
    else:
        raise ValueError("Invalid risk metric.")

    results = {'Male': male_vals.mean(), 'Female': female_vals.mean()}
    stat, p_val = t_test_independent(male_vals, female_vals)
    return {'results_by_gender': results, 't_stat': stat, 'p_value': p_val}
28 changes: 28 additions & 0 deletions src/task_3/segmentation_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import numpy as np
import pandas as pd

def calculate_claim_frequency(data):
    """
    Claim frequency: the share of policies whose claim indicator is set.

    Assumes a boolean/0-1 'claim_indicator' column already exists on *data*.
    """
    return data['claim_indicator'].mean()

def calculate_claim_severity(data):
    """
    Claim severity: average TotalClaims over policies with a claim.

    Returns NaN when no policy has a positive claim amount.

    Bug fix: the previous version wrote a 'claim_indicator' column onto
    the caller's DataFrame as a side effect; the mask is now kept local.
    """
    claims = data.loc[data['TotalClaims'] > 0, 'TotalClaims']
    return claims.mean() if not claims.empty else np.nan

def calculate_margin(data):
    """
    Total margin over the data: the sum of per-row
    (TotalPremium - TotalClaims).
    """
    per_row_margin = data['TotalPremium'] - data['TotalClaims']
    return per_row_margin.sum()

def get_groups_by_feature(df, feature: str, value_a, value_b):
    """
    Return the rows where *feature* equals value_a and value_b,
    respectively, as two DataFrames.
    """
    group_a = df.loc[df[feature] == value_a]
    group_b = df.loc[df[feature] == value_b]
    return group_a, group_b
17 changes: 17 additions & 0 deletions src/task_3/stats_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
import pandas as pd

def t_test_independent(group_a, group_b):
    """
    Welch's t-test (unequal variances) between two samples, with NaN
    values dropped from each side first.
    """
    a_clean = group_a.dropna()
    b_clean = group_b.dropna()
    return ttest_ind(a_clean, b_clean, equal_var=False)

def chi_square_test(df, col1, col2):
    """
    Chi-square test of independence between two categorical columns,
    computed from their contingency table.
    """
    table = pd.crosstab(df[col1], df[col2])
    return chi2_contingency(table)

def anova_test(groups):
    """
    Perform one-way ANOVA.
    `groups` should be a list/tuple of arrays (or lists) containing your samples.
    Returns the ANOVA statistic and p-value (a scipy F_onewayResult,
    which unpacks as a (statistic, pvalue) pair).
    """
    return f_oneway(*groups)
Empty file added tests/test_task_3/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions tests/test_task_3/test_business_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pytest
from src.task_3.business_analysis import interpret_test_result

def test_interpret_test_result_reject():
    # p < 0.05 -> rejection message carrying the extra context through
    result = interpret_test_result("Province Risk", 0.01, "Higher risk observed in Gauteng.")
    assert "Higher risk observed" in result
    assert "reject the null hypothesis" in result

def test_interpret_test_result_fail():
    # p >= 0.05 -> fail-to-reject wording with the default message
    result = interpret_test_result("Gender Risk", 0.2)
    assert "No significant difference" in result
    assert "fail to reject the null hypothesis" in result
11 changes: 11 additions & 0 deletions tests/test_task_3/test_data_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pandas as pd
from src.task_3.data_segmentation import segment_data

def test_segment_data():
    # control excludes the test value; test contains only it
    records = [
        {'Gender': 'Male', 'claim_amount': 100},
        {'Gender': 'Female', 'claim_amount': 200},
        {'Gender': 'Male', 'claim_amount': 150},
        {'Gender': 'Female', 'claim_amount': 250},
    ]
    control, test = segment_data(pd.DataFrame(records), 'Gender', 'Female')
    assert (test['Gender'] == 'Female').all()
    assert (control['Gender'] != 'Female').all()
33 changes: 33 additions & 0 deletions tests/test_task_3/test_hypothesis_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pandas as pd
import numpy as np
from src.task_3.hypothesis_tests import province_risk_test, zip_margin_test, gender_risk_test

def test_province_risk_test():
    # two provinces, mixed claim indicators -> result carries ANOVA fields
    frame = pd.DataFrame({
        'Province': ['A', 'A', 'B', 'B'],
        'claim_indicator': [1, 0, 1, 1],
        'TotalClaims': [100, 0, 200, 300],
    })
    outcome = province_risk_test(frame, 'claim_frequency')
    for key in ('anova_stat', 'p_value'):
        assert key in outcome

def test_zip_margin_test():
    # two postal codes with distinct premium/claim profiles
    frame = pd.DataFrame({
        'PostalCode': ['X', 'X', 'Y', 'Y'],
        'TotalPremium': [500, 600, 300, 400],
        'TotalClaims': [200, 300, 150, 200],
    })
    outcome = zip_margin_test(frame)
    assert 'results_by_zip' in outcome
    assert 'p_value' in outcome

def test_gender_risk_test():
    # severity comparison between genders returns the expected keys
    frame = pd.DataFrame({
        'Gender': ['Male', 'Female', 'Male', 'Female'],
        'claim_indicator': [1, 0, 1, 1],
        'TotalClaims': [1000, 0, 800, 300],
    })
    outcome = gender_risk_test(frame, 'claim_severity')
    assert 'results_by_gender' in outcome
    assert 'p_value' in outcome
20 changes: 20 additions & 0 deletions tests/test_task_3/test_segmentation_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pandas as pd
from src.task_3.segmentation_utils import calculate_claim_frequency, calculate_claim_severity, calculate_margin, get_groups_by_feature

def test_claim_frequency():
    # 3 of 4 policies claimed -> frequency 0.75
    frame = pd.DataFrame({'claim_indicator': [1, 0, 1, 1]})
    assert calculate_claim_frequency(frame) == 0.75

def test_claim_severity():
    # mean of the two positive claims (100, 300) is 200
    frame = pd.DataFrame({'claim_indicator': [1, 0, 1], 'TotalClaims': [100, 0, 300]})
    assert calculate_claim_severity(frame) == 200

def test_calculate_margin():
    # (1000-400) + (800-200) = 1200
    frame = pd.DataFrame({'TotalPremium': [1000, 800], 'TotalClaims': [400, 200]})
    assert calculate_margin(frame) == 1200

def test_get_groups_by_feature():
    # each returned group contains only its own feature value
    frame = pd.DataFrame({'Gender': ['M', 'F', 'F', 'M']})
    males, females = get_groups_by_feature(frame, 'Gender', 'M', 'F')
    assert set(males['Gender']) == {'M'}
    assert set(females['Gender']) == {'F'}
Empty file.