Skip to content

Commit d4cab73

Browse files
author
Ayana Samuel
committed
Update: task-4 - model training and interpretability
1 parent 2350640 commit d4cab73

10 files changed

Lines changed: 1823 additions & 0 deletions

notebooks/task_4/06_model_training_and_interpretability.ipynb

Lines changed: 1524 additions & 0 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ seaborn
66
scikit-learn
77
scipy
88
statsmodels
9+
xgboost
910

1011
# EDA & Plotting
1112
plotly

src/task_4/__init__.py

Whitespace-only changes.

src/task_4/data_processing.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import pandas as pd
2+
from sklearn.model_selection import train_test_split
3+
from sklearn.preprocessing import LabelEncoder
4+
# from src.preprocessing.cleaning import clean_data
5+
from src.task_4.feature_engineering import add_features # you defined this earlier
6+
7+
def encode_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Label encode all categorical columns, safely handling NaNs and type issues.

    Operates on a copy, so the caller's DataFrame is left untouched.
    """
    encoded = df.copy()
    categorical_columns = encoded.select_dtypes(include=['object', 'category']).columns

    for column in categorical_columns:
        # Replace NaNs with a sentinel so the encoder sees one consistent dtype.
        if encoded[column].isnull().any():
            encoded[column] = encoded[column].fillna('___missing___')

        try:
            encoder = LabelEncoder()
            encoded[column] = encoder.fit_transform(encoded[column].astype(str))
        except (ValueError, TypeError):
            # Fallback for very dirty columns: pandas categorical codes.
            encoded[column] = encoded[column].astype('category').cat.codes

    return encoded
27+
28+
def is_date_column(series: pd.Series) -> bool:
    """
    Heuristically decide whether a Series holds date-like values.

    A column qualifies when more than 90% of its values can be parsed as
    dates by ``pd.to_datetime``.

    Parameters:
    - series: candidate column (typically object dtype).

    Returns:
    - True if >90% of values parse as dates, else False.
    """
    try:
        parsed = pd.to_datetime(series, errors='coerce')
    except (ValueError, TypeError):
        # Narrowed from a bare `except:`, which would also have swallowed
        # KeyboardInterrupt / SystemExit.
        return False
    # mean() of the notna mask is the fraction of parseable values; cast so
    # callers get a plain bool rather than numpy.bool_.
    return bool(parsed.notna().mean() > 0.9)
35+
36+
37+
38+
39+
def prepare_claim_severity_data(df):
    """
    Prepare features/target for claim-severity regression and return a
    train/test split.

    Steps: extract year/month from date-like object columns, keep only rows
    with a positive claim, label-encode categoricals, drop ID/leakage
    columns, and split 80/20.

    Parameters:
    - df: raw policy DataFrame; must contain a 'TotalClaims' column.

    Returns:
    - (X_train, X_test, y_train, y_test) from sklearn's train_test_split.
    """
    target = 'TotalClaims'
    drop_cols = ['UnderwrittenCoverID', 'PolicyID', 'claim_indicator']

    # Work on a copy so the caller's DataFrame is not mutated by the
    # in-place date-column transformations below.
    df = df.copy()

    # Detect object columns that are (mostly) parseable as dates.
    date_cols = [col for col in df.columns
                 if df[col].dtype == 'object' and is_date_column(df[col])]

    # Convert each date column into year/month features, then drop it.
    # (The original repeated this loop a second time; the duplicate was
    # dead code because the columns had already been dropped.)
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df.drop(columns=[col], inplace=True)

    # Severity modelling only makes sense on rows that actually claimed.
    df = df[df[target] > 0]

    df = encode_categoricals(df)

    # IDs carry no signal and claim_indicator would leak the target;
    # errors='ignore' keeps this robust when a column is absent.
    df = df.drop(columns=drop_cols, errors='ignore')

    X = df.drop(columns=[target])
    y = df[target]

    return train_test_split(X, y, test_size=0.2, random_state=42)
75+
76+
77+
78+
79+
80+
def prepare_claim_probability_data(df: pd.DataFrame) -> tuple:
    """
    Clean, encode, and split the data for claim probability classification.

    Assumes a `claim_indicator` column exists (boolean or binary 0/1).

    Returns:
    - (X_train, X_test, y_train, y_test) from a stratified 80/20 split.
    """
    indicator = df['claim_indicator']

    # Normalise the label to integer 0/1.
    if indicator.dtype == 'bool':
        df['claim_indicator'] = indicator.astype(int)
    elif indicator.nunique() == 2 and sorted(indicator.unique()) == [False, True]:
        # Object-dtype column holding Python bools.
        df['claim_indicator'] = indicator.map({False: 0, True: 1})

    df = encode_categoricals(df)
    df = df.dropna()

    features = df.drop(columns=['claim_indicator'])
    labels = df['claim_indicator']

    # Stratify so the train/test class balance matches the full data.
    return train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)
99+

src/task_4/feature_engineering.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import pandas as pd
2+
def add_features(df: pd.DataFrame) -> pd.DataFrame:
3+
df['ClaimRatio'] = df['TotalClaims'] / df['TotalPremium'].replace(0, 1)
4+
df['VehicleAge'] = 2025 - df['RegistrationYear']
5+
df['IsNew'] = (df['VehicleAge'] <= 1).astype(int)
6+
df['PowerPerCylinder'] = df['kilowatts'] / df['Cylinders'].replace(0, 1)
7+
df['IsHighValue'] = (df['CustomValueEstimate'] > df['CustomValueEstimate'].median()).astype(int)
8+
9+
# 🔁 Map TermFrequency from string to numeric multiplier
10+
term_map = {
11+
'Monthly': 12,
12+
'Quarterly': 4,
13+
'Semi-Annual': 2,
14+
'Annual': 1
15+
}
16+
df['TermFrequency'] = df['TermFrequency'].map(term_map).fillna(1)
17+
18+
df['MonthlyPremium'] = df['CalculatedPremiumPerTerm'] / df['TermFrequency'].replace(0, 1)
19+
20+
return df

src/task_4/interpretability.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import lime
2+
import lime.lime_tabular
3+
import numpy as np
4+
import pandas as pd
5+
6+
def explain_model_with_lime(model, X_train, X_test, feature_names, instance_idx=0, mode='regression'):
    """
    Explain a single test-set prediction with LIME.

    Parameters:
    - model: trained model (regressor or classifier)
    - X_train: training features (DataFrame or ndarray) used as LIME's background distribution
    - X_test: test features (DataFrame or ndarray)
    - feature_names: list of feature (column) names
    - instance_idx: index of the test instance to explain
    - mode: 'regression' or 'classification'

    Returns:
    - LIME explanation object for the chosen instance
    """
    # LIME works on raw numpy arrays, so unwrap DataFrames first.
    training_data = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
    test_data = X_test.values if isinstance(X_test, pd.DataFrame) else X_test

    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data,
        feature_names=feature_names,
        mode=mode,
        discretize_continuous=True
    )

    # Explain the requested row, reporting the top 10 contributing features.
    return explainer.explain_instance(
        test_data[instance_idx],
        model.predict,
        num_features=10
    )
50+
51+
def show_lime_explanation(explanation):
    """
    Print a LIME explanation as one "feature: weight" line per feature.
    """
    print("Feature contributions to prediction:")
    contributions = explanation.as_list()
    for feature, weight in contributions:
        line = f"{feature}: {weight:.4f}"
        print(line)

src/task_4/model_training.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import pandas as pd
2+
import numpy as np
3+
from sklearn.linear_model import LinearRegression
4+
from sklearn.ensemble import RandomForestRegressor
5+
from xgboost import XGBRegressor
6+
from sklearn.metrics import mean_squared_error, r2_score
7+
from joblib import dump
8+
from src.task_4.data_processing import prepare_claim_severity_data
9+
10+
def evaluate_model(name, model, X_test, y_test):
    """
    Score `model` on the test split, print a summary line, and return
    the (rmse, r2) pair.
    """
    predictions = model.predict(X_test)
    # RMSE = sqrt(MSE); kept explicit for older sklearn versions without
    # a squared=False argument.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"{name} Results → RMSE: {rmse:.2f}, R²: {r2:.4f}")
    return rmse, r2
16+
17+
def train_and_compare_models(X_train, y_train, X_test, y_test):
    """
    Train Linear Regression, Random Forest and XGBoost regressors,
    evaluate each on the test split, and pick the best by RMSE.

    Returns:
    - best_model: fitted estimator with the lowest test RMSE
    - best_model_name: its key in `results`
    - results: dict mapping model key -> (rmse, r2)
    """
    # (dict key, display name, unfitted estimator) — trained in this order.
    candidates = [
        ('LinearRegression', "Linear Regression", LinearRegression()),
        ('RandomForest', "Random Forest",
         RandomForestRegressor(n_estimators=100, random_state=42)),
        ('XGBoost', "XGBoost",
         XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ]

    results = {}
    models = {}
    for key, display_name, estimator in candidates:
        estimator.fit(X_train, y_train)
        results[key] = evaluate_model(display_name, estimator, X_test, y_test)
        models[key] = estimator

    # Lowest RMSE (first element of each result tuple) wins.
    best_model_name = min(results, key=lambda k: results[k][0])
    best_model = models[best_model_name]

    return best_model, best_model_name, results

tests/test_task_4/__init__.py

Whitespace-only changes.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pandas as pd
2+
from src.task_4.feature_engineering import add_features
3+
4+
def test_add_features():
    """add_features: engineered columns match hand-computed expectations."""
    df = pd.DataFrame({
        'TotalClaims': [1000, 2000, 0, 5000],
        'TotalPremium': [5000, 4000, 0, 1000],
        'RegistrationYear': [2024, 2023, 2025, 2010],
        'kilowatts': [100, 200, 150, 300],
        'Cylinders': [4, 0, 3, 6],
        'CustomValueEstimate': [10000, 20000, 15000, 30000],
        'CalculatedPremiumPerTerm': [1200, 800, 0, 400],
        # add_features maps string term labels to numeric multipliers, so
        # the fixture must use the labels — the original numeric values
        # were all mapped to NaN and silently filled with 1.
        'TermFrequency': ['Monthly', 'Quarterly', 'Semi-Annual', 'Annual']
    })

    # Snapshot BEFORE the call: add_features mutates df in place, so
    # expectations derived from df afterwards would be vacuous.
    original = df.copy()

    df_new = add_features(df)

    expected_claim_ratio = original['TotalClaims'] / original['TotalPremium'].replace(0, 1)
    expected_vehicle_age = 2025 - original['RegistrationYear']
    expected_is_new = (expected_vehicle_age <= 1).astype(int)
    expected_power_per_cylinder = original['kilowatts'] / original['Cylinders'].replace(0, 1)
    median_value = original['CustomValueEstimate'].median()
    expected_is_high_value = (original['CustomValueEstimate'] > median_value).astype(int)
    # Monthly=12, Quarterly=4, Semi-Annual=2, Annual=1 payments per year.
    expected_term_frequency = pd.Series([12.0, 4.0, 2.0, 1.0])
    expected_monthly_premium = original['CalculatedPremiumPerTerm'] / expected_term_frequency

    assert all(df_new['ClaimRatio'] == expected_claim_ratio)
    assert all(df_new['VehicleAge'] == expected_vehicle_age)
    assert all(df_new['IsNew'] == expected_is_new)
    assert all(df_new['PowerPerCylinder'] == expected_power_per_cylinder)
    assert all(df_new['IsHighValue'] == expected_is_high_value)
    assert all(df_new['TermFrequency'] == expected_term_frequency)
    assert all(df_new['MonthlyPremium'] == expected_monthly_premium)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import os
2+
import pandas as pd
3+
import pytest
4+
from joblib import load
5+
from src.task_4.model_training import train_and_compare_models
6+
7+
# Create a small dummy CSV for testing
8+
dummy_csv_path = 'test_dummy_data.csv'
9+
10+
@pytest.fixture(scope="module", autouse=True)
11+
def setup_dummy_data():
12+
# Create a dummy dataset for claim severity with minimal rows and columns
13+
data = {
14+
'TotalClaims': [100, 200, 150, 300],
15+
'TotalPremium': [1000, 2000, 1500, 3000],
16+
'RegistrationYear': [2020, 2019, 2021, 2018],
17+
'kilowatts': [100, 150, 120, 130],
18+
'Cylinders': [4, 6, 4, 4],
19+
'CustomValueEstimate': [5000, 7000, 6000, 8000],
20+
'CalculatedPremiumPerTerm': [100, 150, 120, 130],
21+
'TermFrequency': [12, 12, 12, 12],
22+
# Add categorical columns as strings (with small number of categories)
23+
'Gender': ['male', 'female', 'male', 'female'],
24+
'claim_indicator': [1, 0, 1, 0] # Just to mimic full data format
25+
}
26+
df = pd.DataFrame(data)
27+
df.to_csv(dummy_csv_path, index=False)
28+
yield
29+
os.remove(dummy_csv_path)
30+
31+
def test_train_and_compare_models_runs():
    """
    train_and_compare_models fits all three models and returns the best
    one plus a results dict.

    NOTE(review): the original test called
    ``train_and_compare_models(dummy_csv_path)`` with a single path
    argument, but the function signature is
    ``(X_train, y_train, X_test, y_test)`` — it raised TypeError before
    asserting anything. Build the split here instead.
    """
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(dummy_csv_path)

    # Minimal preprocessing: encode the one string column, split the target off.
    df['Gender'] = df['Gender'].astype('category').cat.codes
    X = df.drop(columns=['TotalClaims'])
    y = df['TotalClaims']
    # test_size=0.5 keeps two test rows so R² is well-defined.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    best_model, best_name, results = train_and_compare_models(X_train, y_train, X_test, y_test)

    # Check return types
    assert best_name in results
    assert hasattr(best_model, 'predict')  # model should have predict method

    # Check results contain expected keys
    assert 'LinearRegression' in results
    assert 'RandomForest' in results
    assert 'XGBoost' in results

    # Check RMSE and R2 are floats
    for key, (rmse, r2) in results.items():
        assert isinstance(rmse, float)
        assert isinstance(r2, float)

0 commit comments

Comments
 (0)