33import pytest
44from joblib import load
55from src .task_4 .model_training import train_and_compare_models
6+ from src .task_4 .data_processing import prepare_claim_severity_data
67
78# Create a small dummy CSV for testing
89dummy_csv_path = 'test_dummy_data.csv'
1112def setup_dummy_data ():
1213 # Create a dummy dataset for claim severity with minimal rows and columns
1314 data = {
15+ 'UnderwrittenCoverID' : [1 , 2 , 3 , 4 ],
16+ 'PolicyID' : [101 , 102 , 103 , 104 ],
1417 'TotalClaims' : [100 , 200 , 150 , 300 ],
1518 'TotalPremium' : [1000 , 2000 , 1500 , 3000 ],
1619 'RegistrationYear' : [2020 , 2019 , 2021 , 2018 ],
@@ -20,7 +23,7 @@ def setup_dummy_data():
2023 'CalculatedPremiumPerTerm' : [100 , 150 , 120 , 130 ],
2124 'TermFrequency' : [12 , 12 , 12 , 12 ],
2225 # Add categorical columns as strings (with small number of categories)
23- 'Gender' : ['male ' , 'female ' , 'male ' , 'female ' ],
26+ 'Gender' : ['Male ' , 'Female ' , 'Male ' , 'Female ' ],
2427 'claim_indicator' : [1 , 0 , 1 , 0 ] # Just to mimic full data format
2528 }
2629 df = pd .DataFrame (data )
@@ -29,7 +32,16 @@ def setup_dummy_data():
2932 os .remove (dummy_csv_path )
3033
3134def test_train_and_compare_models_runs ():
32- best_model , best_name , results = train_and_compare_models (dummy_csv_path )
35+ # Read the dummy data
36+ df = pd .read_csv (dummy_csv_path )
37+ # Assume 'TotalClaims' is the target
38+ # X = df.drop(columns=['TotalClaims'])
39+ # y = df['TotalClaims']
40+
41+ # Simple train-test split (2 train, 2 test)
42+ X_train , X_test , y_train , y_test = prepare_claim_severity_data (df )
43+
44+ best_model , best_name , results = train_and_compare_models (X_train , y_train , X_test , y_test )
3345
3446 # Check return types
3547 assert best_name in results
0 commit comments