Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
499 changes: 499 additions & 0 deletions notebooks/task_1/01_Data_understanding.ipynb

Large diffs are not rendered by default.

1,766 changes: 1,766 additions & 0 deletions notebooks/task_1/02_eda_univariate.ipynb

Large diffs are not rendered by default.

930 changes: 930 additions & 0 deletions notebooks/task_1/03_eda_bivariate.ipynb

Large diffs are not rendered by default.

320 changes: 320 additions & 0 deletions notebooks/task_1/04_visualizations.ipynb

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import os

# Paths
# NOTE(review): all three are relative paths, so they resolve against the
# current working directory of the importing process. RAW_DATA_PATH climbs two
# directory levels while the other two climb one — confirm this is intended.
RAW_DATA_PATH = "../../data/raw/raw_data.csv"  # raw dataset CSV file
CLEANED_DATA_PATH = '../data/cleaned'  # presumably a directory for cleaned data — TODO confirm
PROCESSED_DATA_PATH = '../data/processed'  # presumably a directory for processed data — TODO confirm
19 changes: 19 additions & 0 deletions src/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
import pandas as pd

def load_data(path: str) -> pd.DataFrame:
    """
    Read a pipe-delimited text file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the file, handed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed table, read with pandas' Python parsing engine.
    """
    read_options = {
        "sep": '|',
        "skipinitialspace": False,
        "engine": 'python',
        "skiprows": 0,
    }
    frame = pd.read_csv(path, **read_options)
    return frame
def load_raw_data(path: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame using pandas' default settings.

    Parameters
    ----------
    path : str
        Location of the CSV file, handed straight to ``pd.read_csv``.
    """
    raw_frame = pd.read_csv(path)
    return raw_frame

def check_structure(df: pd.DataFrame):
    """
    Print the frame's ``info()`` summary and return it alongside the dtypes.

    Returns
    -------
    tuple
        ``(df.info() result, df.dtypes)``. ``DataFrame.info`` prints its
        summary and returns ``None``, so the first element is ``None``.
    """
    info_result = df.info()
    column_dtypes = df.dtypes
    return info_result, column_dtypes

def save_raw_data(df: pd.DataFrame, path: str = '../../data/raw/raw_data.csv'):
    """
    Write the DataFrame to a CSV file without the index column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to persist.
    path : str, optional
        Destination file. Defaults to the location that was previously
        hard-coded, so existing one-argument calls behave as before.
    """
    df.to_csv(path, index=False)

def check_missing(df: pd.DataFrame):
    """
    Count missing values per column.

    Returns
    -------
    pd.Series
        Indexed by column name; each value is that column's null count.
    """
    null_mask = df.isna()
    return null_mask.sum()

def extract_numeric_cols(df: pd.DataFrame):
    """
    Return the names of all numeric columns, in column order.

    Every numeric dtype is selected (any integer or float width, including
    pandas' nullable numeric dtypes) rather than only ``int64``/``float64``,
    so narrower columns such as ``int32`` or ``float32`` are no longer
    silently skipped. Boolean columns are not selected.

    Returns
    -------
    list
        Column labels of the numeric columns.
    """
    # ``timedelta`` is excluded explicitly: numpy models timedelta64 as a
    # numeric subtype, so a bare "number" selection would pick it up.
    numeric_frame = df.select_dtypes(include=['number'], exclude=['timedelta'])
    return numeric_frame.columns.tolist()

def extract_categorical_cols(df: pd.DataFrame):
    """
    Return the names of columns whose dtype is ``object`` or ``category``.

    Returns
    -------
    list
        Matching column labels, in column order.
    """
    wanted_dtypes = ['object', 'category']
    categorical_frame = df.select_dtypes(include=wanted_dtypes)
    return categorical_frame.columns.tolist()

def extract_date_time_cols(df: pd.DataFrame):
    """
    Return the names of columns selected by the ``datetime64`` dtypes.

    Returns
    -------
    list
        Matching column labels, in column order.
    """
    wanted_dtypes = ['datetime64[ns]', 'datetime64']
    datetime_frame = df.select_dtypes(include=wanted_dtypes)
    return datetime_frame.columns.tolist()

10 changes: 10 additions & 0 deletions src/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pandas as pd
def clean_numeric_strings(df: pd.DataFrame, cols: list):
    """
    Convert string-formatted numbers with commas to proper numeric values.

    Only columns with an object or string dtype are touched; other columns
    are left as they are. Values that cannot be parsed become NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is modified in place.
    cols : list
        Labels of the columns to clean.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col in cols:
        column = df[col]
        is_textual = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_textual:
            continue
        # Strip the thousands separator from real strings only. Using
        # ``.str.replace`` here would turn every non-string entry (e.g. an
        # int that sits next to "1,234" in a mixed column) into NaN.
        stripped = column.map(
            lambda value: value.replace(',', '') if isinstance(value, str) else value
        )
        df[col] = pd.to_numeric(stripped, errors='coerce')  # unparseable -> NaN
    return df
Empty file added src/task_1/__init__.py
Empty file.
Empty file added src/task_1/eda/__init__.py
Empty file.
35 changes: 35 additions & 0 deletions src/task_1/eda/bivariate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def correlation_matrix(df):
    """
    Compute the correlation matrix of the numeric columns and show a heatmap.

    Returns
    -------
    pd.DataFrame
        The result of ``df.corr(numeric_only=True)``.
    """
    corr_table = df.corr(numeric_only=True)

    figure_size = (10, 8)
    plt.figure(figsize=figure_size)
    sns.heatmap(corr_table, cmap="coolwarm", annot=True)
    plt.title("Correlation Matrix")
    plt.show()

    return corr_table


def scatter_plot(df, x, y):
    """
    Show a scatter plot of column ``y`` against column ``x`` of ``df``.

    Parameters
    ----------
    df : DataFrame passed to seaborn as ``data``.
    x, y : Column names for the horizontal and vertical axes.
    """
    heading = f"{y} vs {x}"
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=x, y=y, data=df)
    plt.title(heading)
    plt.show()


def group_loss_ratio(df, by):
    """
    Sum claims and premium per group and derive each group's loss ratio.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``TotalClaims`` and ``TotalPremium`` columns.
    by : label or list of labels
        Grouping key(s), passed to ``DataFrame.groupby``.

    Returns
    -------
    pd.DataFrame
        Indexed by group, with columns ``TotalClaims``, ``TotalPremium`` and
        ``LossRatio`` (claims / premium). ``LossRatio`` is NaN for groups
        whose summed premium is zero.
    """
    totals = df.groupby(by).agg(
        TotalClaims=("TotalClaims", "sum"),
        TotalPremium=("TotalPremium", "sum"),
    )
    premium = totals["TotalPremium"]
    # Mask zero premiums with float NaN. Replacing them with ``pd.NA`` turns
    # the denominator into an object column, which makes LossRatio object
    # dtype and breaks later numeric work on it.
    safe_premium = premium.where(premium != 0)
    return totals.assign(LossRatio=totals["TotalClaims"] / safe_premium)


def line_plot(df, date_col, value_col):
    """
    Show a line plot of ``value_col`` against ``date_col``.

    The rows are sorted by ``date_col`` before plotting; the input frame is
    not modified.
    """
    ordered = df.sort_values(by=date_col)
    x_values = ordered[date_col]
    y_values = ordered[value_col]

    plt.figure(figsize=(10, 5))
    sns.lineplot(x=x_values, y=y_values)
    plt.title(f"Trend of {value_col} over time")
    plt.xticks(rotation=45)
    plt.show()

22 changes: 22 additions & 0 deletions src/task_1/eda/outlier_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def detect_outliers_iqr(df: pd.DataFrame, column: str):
    """
    Return the rows whose value in ``column`` is an IQR outlier.

    A value is an outlier when it lies more than 1.5 interquartile ranges
    below the first quartile or above the third quartile.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    too_low = values < (first_quartile - fence_width)
    too_high = values > (third_quartile + fence_width)
    return df[too_low | too_high]

def plot_outliers_box(df: pd.DataFrame, column: str, title: str = ""):
    """
    Show a horizontal boxplot of ``df[column]``.

    Parameters
    ----------
    title : str, optional
        Plot title. When empty, ``"Boxplot for <column>"`` is used.
    """
    heading = title if title else f"Boxplot for {column}"

    plt.figure(figsize=(8, 4))
    sns.boxplot(x=df[column])
    plt.title(heading)
    plt.tight_layout()
    plt.show()
8 changes: 8 additions & 0 deletions src/task_1/eda/summary_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pandas as pd
def describe_numerical(df: pd.DataFrame):
    """
    Return the summary statistics produced by ``DataFrame.describe``.
    """
    summary = df.describe()
    return summary


def calculate_loss_ratio(df: pd.DataFrame):
    """
    Add a ``LossRatio`` column equal to ``TotalClaims / TotalPremium``.

    Rows with a zero premium get NaN instead of a division by zero. The frame
    is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, now carrying the ``LossRatio`` column.
    """
    premium = df["TotalPremium"]
    # Mask zero premiums with float NaN. Replacing them with ``pd.NA`` turns
    # the denominator into an object column, so LossRatio would come out as
    # object dtype rather than float.
    df["LossRatio"] = df["TotalClaims"] / premium.where(premium != 0)
    return df
93 changes: 93 additions & 0 deletions src/task_1/eda/univariate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from src.data_loader import extract_numeric_cols, extract_categorical_cols, extract_date_time_cols

# Apply seaborn's 'whitegrid' style and 'muted' palette. This is a module-level
# statement, so it runs as soon as this module is imported.
sns.set(style='whitegrid', palette='muted')

def _save_fig(save_fig: bool, fig_path: str, filename: str):
    """
    Save the current matplotlib figure to ``fig_path/filename`` if requested.

    Does nothing when ``save_fig`` is falsy. Otherwise the target directory is
    created if needed and the figure is written with a tight bounding box.
    """
    if not save_fig:
        return
    os.makedirs(fig_path, exist_ok=True)
    target = os.path.join(fig_path, filename)
    plt.savefig(target, bbox_inches='tight')

# ---- Numeric Features ----
def analyze_numeric_features(df: pd.DataFrame, save_fig: bool = False, fig_path: str = "outputs/univariate/numeric/"):
    """
    Summarise and plot every numeric column of ``df``.

    For each column returned by ``extract_numeric_cols`` a histogram (with
    KDE) and a boxplot of the non-null values are shown, and optionally saved
    as ``hist_<col>.png`` / ``box_<col>.png`` under ``fig_path``.

    Returns
    -------
    dict
        Maps column name to ``{"stats": describe() output, "missing": null count}``.
    """
    summary = {}

    for name in extract_numeric_cols(df):
        series = df[name]
        summary[name] = {"stats": series.describe(), "missing": series.isna().sum()}
        observed = series.dropna()

        # Histogram
        plt.figure(figsize=(8, 4))
        sns.histplot(observed, kde=True, bins=30, color='skyblue')
        plt.title(f'Distribution of {name}')
        plt.xlabel(name)
        plt.ylabel("Frequency")
        _save_fig(save_fig, fig_path, f"hist_{name}.png")
        plt.show()

        # Boxplot
        plt.figure(figsize=(6, 2))
        sns.boxplot(x=observed, color='lightcoral')
        plt.title(f'Boxplot of {name}')
        plt.xlabel(name)
        _save_fig(save_fig, fig_path, f"box_{name}.png")
        plt.show()

    return summary


# ---- Categorical Features ----
def analyze_categorical_features(df: pd.DataFrame, top_n: int = 10, save_fig: bool = False, fig_path: str = "outputs/univariate/categorical/"):
    """
    Count and plot the values of every categorical column of ``df``.

    For each column returned by ``extract_categorical_cols`` the value counts
    (missing values included) are computed, and the first ``top_n`` of them
    are shown as a horizontal bar chart, optionally saved as ``bar_<col>.png``
    under ``fig_path``.

    Returns
    -------
    dict
        Maps column name to ``{"counts": full value_counts Series}``.
    """
    summary = {}

    for name in extract_categorical_cols(df):
        frequencies = df[name].value_counts(dropna=False)
        summary[name] = {"counts": frequencies}

        # value_counts sorts by frequency, so the leading slice is the top N.
        leading = frequencies.iloc[:top_n]

        plt.figure(figsize=(8, 4))
        sns.barplot(x=leading.values, y=leading.index, palette="viridis")
        plt.title(f'Top {top_n} Categories in {name}')
        plt.xlabel("Count")
        plt.ylabel(name)
        _save_fig(save_fig, fig_path, f"bar_{name}.png")
        plt.show()

    return summary


# ---- Datetime Features ----
def analyze_datetime_features(df: pd.DataFrame, save_fig: bool = False, fig_path: str = "outputs/univariate/datetime/"):
    """
    Summarise and plot every datetime column of ``df``.

    For each column returned by ``extract_date_time_cols`` the earliest date,
    latest date and null count are recorded, and a 30-bin histogram of the
    non-null dates is shown, optionally saved as ``dist_<col>.png`` under
    ``fig_path``. The input frame is not modified.

    Returns
    -------
    dict
        Maps column name to
        ``{"min_date": ..., "max_date": ..., "n_missing": ...}``.
    """
    datetime_cols = extract_date_time_cols(df)
    results = {}

    for col in datetime_cols:
        # Work on a local series instead of writing back into the caller's
        # frame. The previous version also copied the entire DataFrame for
        # every column to add year/month/weekday/day fields that were never
        # used; that copy is dropped.
        dates = pd.to_datetime(df[col], errors='coerce')  # Ensure datetime conversion

        results[col] = {
            "min_date": dates.min(),
            "max_date": dates.max(),
            "n_missing": dates.isna().sum()
        }

        # Distribution of the dates across 30 equal-width bins
        plt.figure(figsize=(10, 4))
        sns.histplot(dates.dropna(), bins=30, kde=False, color="slateblue")
        plt.title(f'Date Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel("Count")
        _save_fig(save_fig, fig_path, f"dist_{col}.png")
        plt.show()

    return results
Empty file added src/task_1/viz/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions src/task_1/viz/plot_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def plot_top_bottom_claims(df, group_col, value_col, top_n=5):
    """
    Show a bar chart of the groups with the lowest and highest mean ``value_col``.

    Parameters
    ----------
    df : DataFrame containing ``group_col`` and ``value_col``.
    group_col : Column to group by.
    value_col : Column whose per-group mean is ranked.
    top_n : int, optional
        Number of groups to take from each end of the ranking.
    """
    # Imported locally because this module has no top-level matplotlib import;
    # without it the body fails with NameError on ``plt``.
    import matplotlib.pyplot as plt

    grouped = df.groupby(group_col)[value_col].mean().sort_values()
    top = grouped.tail(top_n)
    bottom = grouped.head(top_n)

    _, ax = plt.subplots(figsize=(12, 6))
    # Use the group labels as categorical x positions so every group gets its
    # own bar. Calling Series.plot(kind="bar") twice drew both series at
    # positions 0..n-1, so the bars overlapped and the second call replaced
    # the first call's tick labels.
    ax.bar([str(label) for label in bottom.index], bottom.to_numpy(), color="red", label="Bottom")
    ax.bar([str(label) for label in top.index], top.to_numpy(), color="green", label="Top")
    ax.tick_params(axis="x", rotation=45)
    ax.set_title(f"Top and Bottom {top_n} {group_col}s by Average {value_col}")
    ax.set_xlabel(group_col)
    ax.set_ylabel(f"Avg {value_col}")
    ax.legend()
    plt.show()
6 changes: 6 additions & 0 deletions tests/test_data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import pandas as pd
from src.data_loader import check_missing

def test_missing():
    """check_missing reports exactly one null in a column holding one None."""
    frame = pd.DataFrame({"A": [1, None, 2]})
    null_counts = check_missing(frame)
    assert null_counts.A == 1