Skip to content

Commit d4cab73

Browse files
author
Ayana Samuel
committed
Update: task-4 - model training and interpretability
1 parent 2350640 commit d4cab73

10 files changed

Lines changed: 1823 additions & 0 deletions

notebooks/task_4/06_model_training_and_interpretability.ipynb

Lines changed: 1524 additions & 0 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ seaborn
66
scikit-learn
77
scipy
88
statsmodels
9+
xgboost
910

1011
# EDA & Plotting
1112
plotly

src/task_4/__init__.py

Whitespace-only changes.

src/task_4/data_processing.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import pandas as pd
2+
from sklearn.model_selection import train_test_split
3+
from sklearn.preprocessing import LabelEncoder
4+
# from src.preprocessing.cleaning import clean_data
5+
from src.task_4.feature_engineering import add_features # you defined this earlier
6+
7+
def encode_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Label encode all categorical columns, safely handling NaNs and type issues.

    Operates on a copy, so the caller's DataFrame is left untouched.
    """
    encoded = df.copy()
    categorical_columns = encoded.select_dtypes(include=['object', 'category']).columns

    for column in categorical_columns:
        # Replace NaNs with a sentinel so the encoder sees one consistent dtype.
        if encoded[column].isnull().any():
            encoded[column] = encoded[column].fillna('___missing___')

        try:
            encoder = LabelEncoder()
            encoded[column] = encoder.fit_transform(encoded[column].astype(str))
        except (ValueError, TypeError):
            # Fallback for very dirty columns: pandas categorical codes.
            encoded[column] = encoded[column].astype('category').cat.codes

    return encoded
27+
28+
def is_date_column(series: pd.Series) -> bool:
    """
    Heuristically decide whether a Series holds date-like values.

    A column qualifies when more than 90% of its values can be parsed as
    dates by ``pd.to_datetime``.

    Parameters:
    - series: candidate column (typically object dtype).

    Returns:
    - True if >90% of values parse as dates, else False.
    """
    try:
        parsed = pd.to_datetime(series, errors='coerce')
    except (ValueError, TypeError):
        # Narrowed from a bare `except:`, which would also have swallowed
        # KeyboardInterrupt / SystemExit.
        return False
    # mean() of the notna mask is the fraction of parseable values; cast so
    # callers get a plain bool rather than numpy.bool_.
    return bool(parsed.notna().mean() > 0.9)
35+
36+
37+
38+
39+
def prepare_claim_severity_data(df):
    """
    Prepare features/target for claim-severity regression and return a
    train/test split.

    Steps: extract year/month from date-like object columns, keep only rows
    with a positive claim, label-encode categoricals, drop ID/leakage
    columns, and split 80/20.

    Parameters:
    - df: raw policy DataFrame; must contain a 'TotalClaims' column.

    Returns:
    - (X_train, X_test, y_train, y_test) from sklearn's train_test_split.
    """
    target = 'TotalClaims'
    drop_cols = ['UnderwrittenCoverID', 'PolicyID', 'claim_indicator']

    # Work on a copy so the caller's DataFrame is not mutated by the
    # in-place date-column transformations below.
    df = df.copy()

    # Detect object columns that are (mostly) parseable as dates.
    date_cols = [col for col in df.columns
                 if df[col].dtype == 'object' and is_date_column(df[col])]

    # Convert each date column into year/month features, then drop it.
    # (The original repeated this loop a second time; the duplicate was
    # dead code because the columns had already been dropped.)
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df.drop(columns=[col], inplace=True)

    # Severity modelling only makes sense on rows that actually claimed.
    df = df[df[target] > 0]

    df = encode_categoricals(df)

    # IDs carry no signal and claim_indicator would leak the target;
    # errors='ignore' keeps this robust when a column is absent.
    df = df.drop(columns=drop_cols, errors='ignore')

    X = df.drop(columns=[target])
    y = df[target]

    return train_test_split(X, y, test_size=0.2, random_state=42)
75+
76+
77+
78+
79+
80+
def prepare_claim_probability_data(df: pd.DataFrame) -> tuple:
    """
    Clean, encode, and split the data for claim probability classification.

    Assumes a `claim_indicator` column exists (boolean or binary 0/1).

    Returns:
    - (X_train, X_test, y_train, y_test) from a stratified 80/20 split.
    """
    indicator = df['claim_indicator']

    # Normalise the label to integer 0/1.
    if indicator.dtype == 'bool':
        df['claim_indicator'] = indicator.astype(int)
    elif indicator.nunique() == 2 and sorted(indicator.unique()) == [False, True]:
        # Object-dtype column holding Python bools.
        df['claim_indicator'] = indicator.map({False: 0, True: 1})

    df = encode_categoricals(df)
    df = df.dropna()

    features = df.drop(columns=['claim_indicator'])
    labels = df['claim_indicator']

    # Stratify so the train/test class balance matches the full data.
    return train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)
99+

src/task_4/feature_engineering.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import pandas as pd
2+
def add_features(df: pd.DataFrame) -> pd.DataFrame:
3+
df['ClaimRatio'] = df['TotalClaims'] / df['TotalPremium'].replace(0, 1)
4+
df['VehicleAge'] = 2025 - df['RegistrationYear']
5+
df['IsNew'] = (df['VehicleAge'] <= 1).astype(int)
6+
df['PowerPerCylinder'] = df['kilowatts'] / df['Cylinders'].replace(0, 1)
7+
df['IsHighValue'] = (df['CustomValueEstimate'] > df['CustomValueEstimate'].median()).astype(int)
8+
9+
# 🔁 Map TermFrequency from string to numeric multiplier
10+
term_map = {
11+
'Monthly': 12,
12+
'Quarterly': 4,
13+
'Semi-Annual': 2,
14+
'Annual': 1
15+
}
16+
df['TermFrequency'] = df['TermFrequency'].map(term_map).fillna(1)
17+
18+
df['MonthlyPremium'] = df['CalculatedPremiumPerTerm'] / df['TermFrequency'].replace(0, 1)
19+
20+
return df

src/task_4/interpretability.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import lime
2+
import lime.lime_tabular
3+
import numpy as np
4+
import pandas as pd
5+
6+
def explain_model_with_lime(model, X_train, X_test, feature_names, instance_idx=0, mode='regression'):
    """
    Explain a single test-set prediction with LIME.

    Parameters:
    - model: trained model (regressor or classifier)
    - X_train: training features (DataFrame or ndarray) used as LIME's background distribution
    - X_test: test features (DataFrame or ndarray)
    - feature_names: list of feature (column) names
    - instance_idx: index of the test instance to explain
    - mode: 'regression' or 'classification'

    Returns:
    - LIME explanation object for the chosen instance
    """
    # LIME works on raw numpy arrays, so unwrap DataFrames first.
    training_data = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
    test_data = X_test.values if isinstance(X_test, pd.DataFrame) else X_test

    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data,
        feature_names=feature_names,
        mode=mode,
        discretize_continuous=True
    )

    # Explain the requested row, reporting the top 10 contributing features.
    return explainer.explain_instance(
        test_data[instance_idx],
        model.predict,
        num_features=10
    )
50+
51+
def show_lime_explanation(explanation):
    """
    Print a LIME explanation as one "feature: weight" line per feature.
    """
    print("Feature contributions to prediction:")
    contributions = explanation.as_list()
    for feature, weight in contributions:
        line = f"{feature}: {weight:.4f}"
        print(line)

src/task_4/model_training.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import pandas as pd
2+
import numpy as np
3+
from sklearn.linear_model import LinearRegression
4+
from sklearn.ensemble import RandomForestRegressor
5+
from xgboost import XGBRegressor
6+
from sklearn.metrics import mean_squared_error, r2_score
7+
from joblib import dump
8+
from src.task_4.data_processing import prepare_claim_severity_data
9+
10+
def evaluate_model(name, model, X_test, y_test):
    """
    Score `model` on the test split, print a summary line, and return
    the (rmse, r2) pair.
    """
    predictions = model.predict(X_test)
    # RMSE = sqrt(MSE); kept explicit for older sklearn versions without
    # a squared=False argument.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"{name} Results → RMSE: {rmse:.2f}, R²: {r2:.4f}")
    return rmse, r2
16+
17+
def train_and_compare_models(X_train, y_train, X_test, y_test):
    """
    Train Linear Regression, Random Forest and XGBoost regressors,
    evaluate each on the test split, and pick the best by RMSE.

    Returns:
    - best_model: fitted estimator with the lowest test RMSE
    - best_model_name: its key in `results`
    - results: dict mapping model key -> (rmse, r2)
    """
    # (dict key, display name, unfitted estimator) — trained in this order.
    candidates = [
        ('LinearRegression', "Linear Regression", LinearRegression()),
        ('RandomForest', "Random Forest",
         RandomForestRegressor(n_estimators=100, random_state=42)),
        ('XGBoost', "XGBoost",
         XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ]

    results = {}
    models = {}
    for key, display_name, estimator in candidates:
        estimator.fit(X_train, y_train)
        results[key] = evaluate_model(display_name, estimator, X_test, y_test)
        models[key] = estimator

    # Lowest RMSE (first element of each result tuple) wins.
    best_model_name = min(results, key=lambda k: results[k][0])
    best_model = models[best_model_name]

    return best_model, best_model_name, results

tests/test_task_4/__init__.py

Whitespace-only changes.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pandas as pd
2+
from src.task_4.feature_engineering import add_features
3+
4+
def test_add_features():
    """add_features: engineered columns match hand-computed expectations."""
    df = pd.DataFrame({
        'TotalClaims': [1000, 2000, 0, 5000],
        'TotalPremium': [5000, 4000, 0, 1000],
        'RegistrationYear': [2024, 2023, 2025, 2010],
        'kilowatts': [100, 200, 150, 300],
        'Cylinders': [4, 0, 3, 6],
        'CustomValueEstimate': [10000, 20000, 15000, 30000],
        'CalculatedPremiumPerTerm': [1200, 800, 0, 400],
        # add_features maps string term labels to numeric multipliers, so
        # the fixture must use the labels — the original numeric values
        # were all mapped to NaN and silently filled with 1.
        'TermFrequency': ['Monthly', 'Quarterly', 'Semi-Annual', 'Annual']
    })

    # Snapshot BEFORE the call: add_features mutates df in place, so
    # expectations derived from df afterwards would be vacuous.
    original = df.copy()

    df_new = add_features(df)

    expected_claim_ratio = original['TotalClaims'] / original['TotalPremium'].replace(0, 1)
    expected_vehicle_age = 2025 - original['RegistrationYear']
    expected_is_new = (expected_vehicle_age <= 1).astype(int)
    expected_power_per_cylinder = original['kilowatts'] / original['Cylinders'].replace(0, 1)
    median_value = original['CustomValueEstimate'].median()
    expected_is_high_value = (original['CustomValueEstimate'] > median_value).astype(int)
    # Monthly=12, Quarterly=4, Semi-Annual=2, Annual=1 payments per year.
    expected_term_frequency = pd.Series([12.0, 4.0, 2.0, 1.0])
    expected_monthly_premium = original['CalculatedPremiumPerTerm'] / expected_term_frequency

    assert all(df_new['ClaimRatio'] == expected_claim_ratio)
    assert all(df_new['VehicleAge'] == expected_vehicle_age)
    assert all(df_new['IsNew'] == expected_is_new)
    assert all(df_new['PowerPerCylinder'] == expected_power_per_cylinder)
    assert all(df_new['IsHighValue'] == expected_is_high_value)
    assert all(df_new['TermFrequency'] == expected_term_frequency)
    assert all(df_new['MonthlyPremium'] == expected_monthly_premium)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import os
2+
import pandas as pd
3+
import pytest
4+
from joblib import load
5+
from src.task_4.model_training import train_and_compare_models
6+
7+
# Create a small dummy CSV for testing
8+
dummy_csv_path = 'test_dummy_data.csv'
9+
10+
@pytest.fixture(scope="module", autouse=True)
11+
def setup_dummy_data():
12+
# Create a dummy dataset for claim severity with minimal rows and columns
13+
data = {
14+
'TotalClaims': [100, 200, 150, 300],
15+
'TotalPremium': [1000, 2000, 1500, 3000],
16+
'RegistrationYear': [2020, 2019, 2021, 2018],
17+
'kilowatts': [100, 150, 120, 130],
18+
'Cylinders': [4, 6, 4, 4],
19+
'CustomValueEstimate': [5000, 7000, 6000, 8000],
20+
'CalculatedPremiumPerTerm': [100, 150, 120, 130],
21+
'TermFrequency': [12, 12, 12, 12],
22+
# Add categorical columns as strings (with small number of categories)
23+
'Gender': ['male', 'female', 'male', 'female'],
24+
'claim_indicator': [1, 0, 1, 0] # Just to mimic full data format
25+
}
26+
df = pd.DataFrame(data)
27+
df.to_csv(dummy_csv_path, index=False)
28+
yield
29+
os.remove(dummy_csv_path)
30+
31+
def test_train_and_compare_models_runs():
    """
    train_and_compare_models fits all three models and returns the best
    one plus a results dict.

    NOTE(review): the original test called
    ``train_and_compare_models(dummy_csv_path)`` with a single path
    argument, but the function signature is
    ``(X_train, y_train, X_test, y_test)`` — it raised TypeError before
    asserting anything. Build the split here instead.
    """
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(dummy_csv_path)

    # Minimal preprocessing: encode the one string column, split the target off.
    df['Gender'] = df['Gender'].astype('category').cat.codes
    X = df.drop(columns=['TotalClaims'])
    y = df['TotalClaims']
    # test_size=0.5 keeps two test rows so R² is well-defined.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    best_model, best_name, results = train_and_compare_models(X_train, y_train, X_test, y_test)

    # Check return types
    assert best_name in results
    assert hasattr(best_model, 'predict')  # model should have predict method

    # Check results contain expected keys
    assert 'LinearRegression' in results
    assert 'RandomForest' in results
    assert 'XGBoost' in results

    # Check RMSE and R2 are floats
    for key, (rmse, r2) in results.items():
        assert isinstance(rmse, float)
        assert isinstance(r2, float)

0 commit comments

Comments
 (0)