Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
661 changes: 661 additions & 0 deletions notebooks/task_3/05_hypothesis_testing.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def load_data(path: str) -> pd.DataFrame:
skiprows=0
)
def load_raw_data(path: str) -> pd.DataFrame:
    """
    Load the raw dataset from a CSV file.

    low_memory=False makes pandas read the whole file before inferring
    dtypes, avoiding chunk-wise mixed-dtype warnings on large files.
    """
    # Fix: diff residue left both the old call and the merged one; keep
    # only the merged low_memory=False version.
    return pd.read_csv(path, low_memory=False)

def check_structure(df: pd.DataFrame):
    # NOTE(review): df.info() prints its summary to stdout and returns None,
    # so this returns the tuple (None, dtypes) — confirm callers only rely
    # on the printed output and the dtypes element.
    return df.info(), df.dtypes
Expand Down
81 changes: 78 additions & 3 deletions src/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,85 @@
import pandas as pd
def clean_numeric_strings(df: pd.DataFrame, cols: list):
import numpy as np
from scipy import stats
from src.data_loader import extract_numeric_cols, extract_categorical_cols, extract_date_time_cols


def clean_numeric_strings(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Convert string-formatted numbers that use ',' as a thousands
    separator (e.g. "1,000") into numeric values, in place on *df*.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean (mutated and also returned for chaining).
    cols : list
        Column names to convert; non-object columns are left untouched.

    Returns
    -------
    pd.DataFrame
        The same frame with the listed columns coerced to numeric;
        unparseable values become NaN via errors='coerce'.
    """
    # Fix: diff residue duplicated the replace/to_numeric pair — the second
    # .str.replace on an already-numeric column would raise AttributeError.
    for col in cols:
        if df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '', regex=False)
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the dataset by performing the following steps:
    - Drop duplicate records.
    - Fill missing numerical values with the median.
    - Standardize and fill missing categorical values with the mode.
    - Clean numeric strings (e.g., "1,000").
    - Cap numerical outliers between 1st and 99th percentile
      (except 'TotalClaims', which keeps its raw tail for analysis).
    - Convert datetime columns.
    - Drop columns that are more than 80% missing.

    Returns the cleaned DataFrame.
    """
    # 1. Remove duplicate rows; copy so the caller's frame is untouched.
    # Fix: removed leftover debug statements — print(print(...)) printed
    # None, and they inspected the pre-clean `df` rather than df_clean.
    df_clean = df.drop_duplicates().copy()

    # 2. Numeric columns: impute missing values with the column median
    num_cols = extract_numeric_cols(df_clean)
    for col in num_cols:
        median_val = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_val)

    # 3. Categorical columns: normalise case/whitespace, then impute mode
    cat_cols = extract_categorical_cols(df_clean)
    for col in cat_cols:
        # astype(str) turns real NaN into the literal 'nan'; restore NaN
        df_clean[col] = df_clean[col].astype(str).str.strip().str.lower()
        df_clean[col] = df_clean[col].replace('nan', np.nan)

    for col in cat_cols:
        if df_clean[col].isnull().all():
            continue  # nothing to impute from
        mode_vals = df_clean[col].mode()
        if not mode_vals.empty:
            df_clean[col] = df_clean[col].fillna(mode_vals[0])

    # 4. Convert string-formatted numbers such as "1,000" to floats
    numeric_string_cols = [
        col for col in df_clean.columns
        if df_clean[col].dtype == 'object' and df_clean[col].str.contains(',', na=False).any()
    ]
    df_clean = clean_numeric_strings(df_clean, numeric_string_cols)

    # 5. Cap outliers to the 1st/99th percentiles. 'TotalClaims' is left
    # uncapped so downstream hypothesis tests see its true extremes.
    for col in num_cols:
        if col != 'TotalClaims':
            lower, upper = df_clean[col].quantile([0.01, 0.99])
            df_clean[col] = np.clip(df_clean[col], lower, upper)

    # 6. Parse date/time columns (unparseable values become NaT)
    date_time_cols = extract_date_time_cols(df_clean)
    for col in date_time_cols:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

    # 7. Drop columns with fewer than 20% non-missing values.
    # Fix: threshold is based on df_clean's row count — the original used
    # len(df), which over-counts once duplicates have been dropped.
    df_clean = df_clean.dropna(thresh=len(df_clean) * 0.2, axis=1)

    return df_clean
def save_cleaned_data(df: pd.DataFrame, file_path: str = '../../data/cleaned/cleaned_data.csv') -> None:
    """
    Persist *df* as a CSV at *file_path* (row index omitted) and report
    the destination on stdout.
    """
    df.to_csv(file_path, index=False)
    print(f"Cleaned data saved to {file_path}")
Empty file added src/task_3/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions src/task_3/business_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
def interpret_test_result(test_name, p_value, additional_info=""):
    """
    Translate a test's p-value into a plain-language business conclusion.

    When p_value < 0.05 the null hypothesis is rejected and
    *additional_info* is appended to the message; otherwise a
    no-significant-difference message is returned.
    """
    if p_value >= 0.05:
        return f"We fail to reject the null hypothesis for {test_name} (p = {p_value:.3f}). No significant difference observed."
    return f"We reject the null hypothesis for {test_name} (p = {p_value:.3f}). {additional_info}"
13 changes: 13 additions & 0 deletions src/task_3/data_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pandas as pd

def segment_data(data, feature, test_value):
    """
    Split *data* into A/B groups on *feature*.

    Group A (control): rows where feature != test_value.
    Group B (test):    rows where feature == test_value.

    Returns: control, test as two DataFrame objects (independent copies).
    """
    in_test = data[feature] == test_value
    control = data[~in_test].copy()
    test = data[in_test].copy()
    return control, test
108 changes: 108 additions & 0 deletions src/task_3/hypothesis_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import numpy as np
import pandas as pd
from src.task_3.stats_helpers import anova_test, t_test_independent, chi_square_test
def province_risk_test(data, risk_metric='claim_frequency'):
    """
    Compare insurance risk across provinces with a one-way ANOVA.

    risk_metric='claim_frequency': per-province mean of the binary claim
    indicator (share of policies with at least one claim).
    risk_metric='claim_severity': per-province mean TotalClaims over
    policies that actually claimed.

    Returns a dict with per-province metrics plus the ANOVA statistic
    and p-value.
    """
    if risk_metric not in ('claim_frequency', 'claim_severity'):
        raise ValueError("Invalid risk metric specified.")

    results = {}
    groups = []
    for province in data['Province'].unique():
        if risk_metric == 'claim_frequency':
            subset = data[data['Province'] == province]
            groups.append(subset['claim_indicator'].values)
            results[province] = {'claim_frequency': subset['claim_indicator'].mean()}
        else:
            subset = data[(data['Province'] == province) & (data['claim_indicator'] == 1)]
            severity = subset['TotalClaims'].mean() if not subset.empty else np.nan
            # provinces with no claims contribute a single 0 so ANOVA still runs
            groups.append(subset['TotalClaims'].values if not subset.empty else np.array([0]))
            results[province] = {'claim_severity': severity}

    stat, p_val = anova_test(groups)
    return {'results_by_province': results, 'anova_stat': stat, 'p_value': p_val}


def zip_risk_test(data, risk_metric='claim_frequency'):
    """
    Compare risk across postal codes with a one-way ANOVA on either
    claim frequency (mean claim indicator) or claim severity (mean
    TotalClaims among claiming policies).
    """
    if risk_metric not in ('claim_frequency', 'claim_severity'):
        raise ValueError("Invalid risk metric.")

    results = {}
    groups = []
    for z in data['PostalCode'].dropna().unique():
        if risk_metric == 'claim_frequency':
            subset = data[data['PostalCode'] == z]
            groups.append(subset['claim_indicator'].values)
            results[z] = {'claim_frequency': subset['claim_indicator'].mean()}
        else:
            subset = data[(data['PostalCode'] == z) & (data['claim_indicator'] == 1)]
            severity = subset['TotalClaims'].mean() if not subset.empty else np.nan
            # codes with no claims contribute a single 0 so ANOVA still runs
            groups.append(subset['TotalClaims'].values if not subset.empty else np.array([0]))
            results[z] = {'claim_severity': severity}

    stat, p_val = anova_test(groups)
    return {'results_by_zip': results, 'anova_stat': stat, 'p_value': p_val}

def zip_margin_test(data):
    """
    One-way ANOVA of whether margin (TotalPremium - TotalClaims)
    differs across postal codes.

    Returns per-zip average margins plus the ANOVA statistic and p-value.
    """
    data = data.copy()
    data['margin'] = data['TotalPremium'] - data['TotalClaims']

    results = {}
    groups = []
    for z in data['PostalCode'].dropna().unique():
        margins = data.loc[data['PostalCode'] == z, 'margin']
        groups.append(margins.values)
        results[z] = {'average_margin': margins.mean()}

    stat, p_val = anova_test(groups)
    return {'results_by_zip': results, 'anova_stat': stat, 'p_value': p_val}

def gender_risk_test(data, risk_metric='claim_severity'):
    """
    Two-sample (Welch) t-test of risk between genders on the chosen
    metric: claim frequency (mean claim indicator) or claim severity
    (TotalClaims among claiming policies).

    NOTE(review): compares the literal labels 'Male'/'Female' — confirm
    upstream cleaning has not lower-cased the Gender column.
    """
    males = data[data['Gender'] == 'Male']
    females = data[data['Gender'] == 'Female']

    if risk_metric == 'claim_frequency':
        male_vals = males['claim_indicator']
        female_vals = females['claim_indicator']
    elif risk_metric == 'claim_severity':
        male_vals = males.loc[males['claim_indicator'] == 1, 'TotalClaims']
        female_vals = females.loc[females['claim_indicator'] == 1, 'TotalClaims']
    else:
        raise ValueError("Invalid risk metric.")

    results = {'Male': male_vals.mean(), 'Female': female_vals.mean()}
    stat, p_val = t_test_independent(male_vals, female_vals)
    return {'results_by_gender': results, 't_stat': stat, 'p_value': p_val}
28 changes: 28 additions & 0 deletions src/task_3/segmentation_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import numpy as np
import pandas as pd

def calculate_claim_frequency(data):
    """
    Claim frequency: the share of policies whose claim indicator is set.

    Assumes a boolean/0-1 'claim_indicator' column already exists on *data*.
    """
    return data['claim_indicator'].mean()

def calculate_claim_severity(data):
    """
    Claim severity: average TotalClaims over policies with a claim.

    Returns NaN when no policy has a positive claim amount.

    Bug fix: the previous version wrote a 'claim_indicator' column onto
    the caller's DataFrame as a side effect; the mask is now kept local.
    """
    claims = data.loc[data['TotalClaims'] > 0, 'TotalClaims']
    return claims.mean() if not claims.empty else np.nan

def calculate_margin(data):
    """
    Total margin over the data: the sum of per-row
    (TotalPremium - TotalClaims).
    """
    per_row_margin = data['TotalPremium'] - data['TotalClaims']
    return per_row_margin.sum()

def get_groups_by_feature(df, feature: str, value_a, value_b):
    """
    Return the rows where *feature* equals value_a and value_b,
    respectively, as two DataFrames.
    """
    group_a = df.loc[df[feature] == value_a]
    group_b = df.loc[df[feature] == value_b]
    return group_a, group_b
17 changes: 17 additions & 0 deletions src/task_3/stats_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
import pandas as pd

def t_test_independent(group_a, group_b):
    """
    Welch's t-test (unequal variances) between two samples, with NaN
    values dropped from each side first.
    """
    a_clean = group_a.dropna()
    b_clean = group_b.dropna()
    return ttest_ind(a_clean, b_clean, equal_var=False)

def chi_square_test(df, col1, col2):
    """
    Chi-square test of independence between two categorical columns,
    computed from their contingency table.
    """
    table = pd.crosstab(df[col1], df[col2])
    return chi2_contingency(table)

def anova_test(groups):
    """
    Perform one-way ANOVA.
    `groups` should be a list/tuple of arrays (or lists) containing your samples.
    Returns the ANOVA statistic and p-value (a scipy F_onewayResult,
    which unpacks as a (statistic, pvalue) pair).
    """
    return f_oneway(*groups)
Empty file added tests/test_task_3/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions tests/test_task_3/test_business_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pytest
from src.task_3.business_analysis import interpret_test_result

def test_interpret_test_result_reject():
    # p < 0.05 -> rejection message carrying the extra context through
    result = interpret_test_result("Province Risk", 0.01, "Higher risk observed in Gauteng.")
    assert "Higher risk observed" in result
    assert "reject the null hypothesis" in result

def test_interpret_test_result_fail():
    # p >= 0.05 -> fail-to-reject wording with the default message
    result = interpret_test_result("Gender Risk", 0.2)
    assert "No significant difference" in result
    assert "fail to reject the null hypothesis" in result
11 changes: 11 additions & 0 deletions tests/test_task_3/test_data_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pandas as pd
from src.task_3.data_segmentation import segment_data

def test_segment_data():
    # control excludes the test value; test contains only it
    records = [
        {'Gender': 'Male', 'claim_amount': 100},
        {'Gender': 'Female', 'claim_amount': 200},
        {'Gender': 'Male', 'claim_amount': 150},
        {'Gender': 'Female', 'claim_amount': 250},
    ]
    control, test = segment_data(pd.DataFrame(records), 'Gender', 'Female')
    assert (test['Gender'] == 'Female').all()
    assert (control['Gender'] != 'Female').all()
33 changes: 33 additions & 0 deletions tests/test_task_3/test_hypothesis_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pandas as pd
import numpy as np
from src.task_3.hypothesis_tests import province_risk_test, zip_margin_test, gender_risk_test

def test_province_risk_test():
    # two provinces, mixed claim indicators -> result carries ANOVA fields
    frame = pd.DataFrame({
        'Province': ['A', 'A', 'B', 'B'],
        'claim_indicator': [1, 0, 1, 1],
        'TotalClaims': [100, 0, 200, 300],
    })
    outcome = province_risk_test(frame, 'claim_frequency')
    for key in ('anova_stat', 'p_value'):
        assert key in outcome

def test_zip_margin_test():
    # two postal codes with distinct premium/claim profiles
    frame = pd.DataFrame({
        'PostalCode': ['X', 'X', 'Y', 'Y'],
        'TotalPremium': [500, 600, 300, 400],
        'TotalClaims': [200, 300, 150, 200],
    })
    outcome = zip_margin_test(frame)
    assert 'results_by_zip' in outcome
    assert 'p_value' in outcome

def test_gender_risk_test():
    # severity comparison between genders returns the expected keys
    frame = pd.DataFrame({
        'Gender': ['Male', 'Female', 'Male', 'Female'],
        'claim_indicator': [1, 0, 1, 1],
        'TotalClaims': [1000, 0, 800, 300],
    })
    outcome = gender_risk_test(frame, 'claim_severity')
    assert 'results_by_gender' in outcome
    assert 'p_value' in outcome
20 changes: 20 additions & 0 deletions tests/test_task_3/test_segmentation_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pandas as pd
from src.task_3.segmentation_utils import calculate_claim_frequency, calculate_claim_severity, calculate_margin, get_groups_by_feature

def test_claim_frequency():
    # 3 of 4 policies claimed -> frequency 0.75
    frame = pd.DataFrame({'claim_indicator': [1, 0, 1, 1]})
    assert calculate_claim_frequency(frame) == 0.75

def test_claim_severity():
    # mean of the two positive claims (100, 300) is 200
    frame = pd.DataFrame({'claim_indicator': [1, 0, 1], 'TotalClaims': [100, 0, 300]})
    assert calculate_claim_severity(frame) == 200

def test_calculate_margin():
    # (1000-400) + (800-200) = 1200
    frame = pd.DataFrame({'TotalPremium': [1000, 800], 'TotalClaims': [400, 200]})
    assert calculate_margin(frame) == 1200

def test_get_groups_by_feature():
    # each returned group contains only its own feature value
    frame = pd.DataFrame({'Gender': ['M', 'F', 'F', 'M']})
    males, females = get_groups_by_feature(frame, 'Gender', 'M', 'F')
    assert set(males['Gender']) == {'M'}
    assert set(females['Gender']) == {'F'}
Empty file.