ayanasamuel8 · ayanasamuel8 · Jun 15, 2025 · Jun 15, 2025 · Jun 15, 2025
diff --git a/notebooks/task_1/01_Data_understanding.ipynb b/notebooks/task_1/01_Data_understanding.ipynb
diff --git a/notebooks/task_1/02_eda_univariate.ipynb b/notebooks/task_1/02_eda_univariate.ipynb
diff --git a/notebooks/task_1/03_eda_bivariate.ipynb b/notebooks/task_1/03_eda_bivariate.ipynb
diff --git a/notebooks/task_1/04_visualizations.ipynb b/notebooks/task_1/04_visualizations.ipynb
diff --git a/src/config.py b/src/config.py
@@ -0,0 +1,6 @@
+import os
+
+# Paths
+# Locations of the project's data file and data directories. All three are
+# plain relative strings, so they resolve against the current working
+# directory of whatever process uses them.
+# NOTE(review): RAW_DATA_PATH climbs two directory levels ("../../") while the
+# two directory paths climb one ("../") — confirm the differing depths are
+# intentional for the callers that consume each constant.
+RAW_DATA_PATH = "../../data/raw/raw_data.csv"
+CLEANED_DATA_PATH = '../data/cleaned'
+PROCESSED_DATA_PATH = '../data/processed'
diff --git a/src/data_loader.py b/src/data_loader.py
@@ -1,10 +1,29 @@
 import pandas as pd
 
 def load_data(path: str) -> pd.DataFrame:
+    """
+    Read a pipe-delimited text file into a DataFrame.
+
+    The file at ``path`` is parsed with pandas' Python engine, splitting
+    fields on ``|``. No leading rows are skipped and whitespace that follows
+    a delimiter is kept as part of the value.
+    """
+    read_options = {
+        "sep": '|',
+        "skipinitialspace": False,
+        "engine": 'python',
+        "skiprows": 0,
+    }
+    return pd.read_csv(path, **read_options)
+def load_raw_data(path: str) -> pd.DataFrame:
+    """
+    Read the file at ``path`` with ``pd.read_csv`` using its default options.
+    """
     return pd.read_csv(path)
 
 def check_structure(df: pd.DataFrame):
+    """
+    Return the pair ``(df.info(), df.dtypes)`` for a quick structural look.
+
+    NOTE(review): pandas documents ``DataFrame.info()`` as printing its
+    summary and returning ``None``, so the first element of the returned
+    tuple is presumably always ``None`` — confirm callers only rely on the
+    printed output and on the dtypes.
+    """
     return df.info(), df.dtypes
 
+def save_raw_data(df: pd.DataFrame, path: str = '../../data/raw/raw_data.csv'):
+    """
+    Write ``df`` to a CSV file without the index column.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Data to persist.
+    path : str, optional
+        Destination file. The default is the location this function wrote to
+        when the path was hard-coded, so existing ``save_raw_data(df)`` calls
+        behave exactly as before; passing a path lets callers (and tests)
+        write elsewhere.
+
+    The default is a relative path and therefore resolves against the
+    current working directory of the calling process.
+    """
+    df.to_csv(path, index=False)
+
 def check_missing(df: pd.DataFrame):
+    """
+    Return the number of null entries in each column of ``df``
+    (``df.isnull().sum()``), indexed by column label.
+    """
     return df.isnull().sum()
+
+def extract_numeric_cols(df: pd.DataFrame):
+    """
+    Return the labels of the numeric columns of ``df`` as a list.
+
+    Columns are matched with the generic ``'number'`` selector, so every
+    integer and float width counts (``int32``, ``float32``, ...), not just
+    ``int64``/``float64``; narrower numeric columns used to be skipped
+    silently. Boolean and timedelta columns are explicitly left out, as
+    they were before.
+    """
+    numeric = df.select_dtypes(include='number', exclude=['bool', 'timedelta'])
+    return numeric.columns.tolist()
+
+def extract_categorical_cols(df: pd.DataFrame):
+    """
+    List the labels of the ``object``- and ``category``-dtype columns of ``df``.
+    """
+    wanted_dtypes = ['object', 'category']
+    matching = df.select_dtypes(include=wanted_dtypes)
+    return matching.columns.tolist()
+
+def extract_date_time_cols(df: pd.DataFrame):
+    """
+    Return the labels of the datetime columns of ``df`` as a list.
+
+    Both timezone-naive (``'datetime64'``) and timezone-aware
+    (``'datetimetz'``) columns are selected. The previous selectors
+    (``'datetime64[ns]'`` and ``'datetime64'``) only ever matched naive
+    columns, so timezone-aware columns were silently ignored.
+    """
+    datetimes = df.select_dtypes(include=['datetime64', 'datetimetz'])
+    return datetimes.columns.tolist()
+
diff --git a/src/preprocessing.py b/src/preprocessing.py
@@ -0,0 +1,10 @@
+import pandas as pd
+def clean_numeric_strings(df: pd.DataFrame, cols: list):
+    """
+    Convert string-formatted numbers with commas to proper floats.
+
+    For each column in ``cols`` whose dtype is ``object``, thousands
+    separators are removed from string cells and the column is converted
+    with ``pd.to_numeric(errors='coerce')``, so text that is not a number
+    becomes NaN. Columns with any other dtype are left untouched.
+
+    Cells that are not strings (for example a real ``300`` sitting in an
+    object column) are passed to the numeric conversion unchanged. The
+    previous ``.str.replace`` turned every non-string cell into NaN, which
+    silently discarded valid numbers in mixed columns.
+
+    ``df`` is modified in place and also returned, so both
+    ``clean_numeric_strings(df, cols)`` and ``df = clean_numeric_strings(...)``
+    keep working. A label in ``cols`` that is not a column raises ``KeyError``.
+    """
+    for col in cols:
+        column = df[col]
+        if column.dtype != 'object':
+            continue
+        # Only strings can carry a thousands separator; leave other cells as-is.
+        without_commas = column.map(
+            lambda cell: cell.replace(',', '') if isinstance(cell, str) else cell
+        )
+        df[col] = pd.to_numeric(without_commas, errors='coerce')
+    return df
diff --git a/src/task_1/__init__.py b/src/task_1/__init__.py
diff --git a/src/task_1/eda/__init__.py b/src/task_1/eda/__init__.py
diff --git a/src/task_1/eda/bivariate.py b/src/task_1/eda/bivariate.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+def correlation_matrix(df):
+    """
+    Show an annotated heatmap of the pairwise correlations between the
+    numeric columns of ``df`` and return the correlation table.
+    """
+    corr_table = df.corr(numeric_only=True)
+    plt.figure(figsize=(10, 8))
+    heatmap_axes = sns.heatmap(corr_table, annot=True, cmap="coolwarm")
+    heatmap_axes.set_title("Correlation Matrix")
+    plt.show()
+    return corr_table
+
+
+def scatter_plot(df, x, y):
+    """
+    Show a scatter plot of column ``y`` against column ``x`` of ``df``.
+    """
+    plt.figure(figsize=(8, 5))
+    scatter_axes = sns.scatterplot(data=df, x=x, y=y)
+    scatter_axes.set_title(f"{y} vs {x}")
+    plt.show()
+
+
+def group_loss_ratio(df, by):
+    """
+    Aggregate claims and premium per group and compute the loss ratio.
+
+    ``df`` is grouped by ``by``; the ``TotalClaims`` and ``TotalPremium``
+    columns are summed per group and ``LossRatio`` is added as summed claims
+    divided by summed premium. The result is indexed by the group keys and
+    has the columns ``TotalClaims``, ``TotalPremium`` and ``LossRatio``.
+
+    Groups whose summed premium is zero get a NaN loss ratio instead of a
+    division by zero. The zero premiums are masked with ``Series.where``;
+    the earlier ``replace(0, pd.NA)`` upcast the premium to ``object`` dtype,
+    which made ``LossRatio`` a non-numeric column whenever a zero was present.
+    """
+    totals = df.groupby(by).agg(
+        TotalClaims=("TotalClaims", "sum"),
+        TotalPremium=("TotalPremium", "sum"),
+    )
+    premium = totals["TotalPremium"]
+    loss_ratio = totals["TotalClaims"] / premium.where(premium != 0)
+    return totals.assign(LossRatio=loss_ratio)
+
+
+def line_plot(df, date_col, value_col):
+    """
+    Show a line plot of ``value_col`` against ``date_col``.
+
+    The rows are sorted by ``date_col`` before plotting; ``df`` itself is
+    not reordered.
+    """
+    ordered = df.sort_values(by=date_col)
+    x_values = ordered[date_col]
+    y_values = ordered[value_col]
+
+    plt.figure(figsize=(10, 5))
+    sns.lineplot(x=x_values, y=y_values)
+    plt.title(f"Trend of {value_col} over time")
+    plt.xticks(rotation=45)
+    plt.show()
+
diff --git a/src/task_1/eda/outlier_detection.py b/src/task_1/eda/outlier_detection.py
@@ -0,0 +1,22 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+def detect_outliers_iqr(df: pd.DataFrame, column: str, factor: float = 1.5):
+    """
+    Returns rows where the column has outliers using IQR method.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Data to scan.
+    column : str
+        Column whose values are tested.
+    factor : float, optional
+        How many interquartile ranges beyond Q1/Q3 a value must lie to count
+        as an outlier. The default of 1.5 is the value that used to be
+        hard-coded; a larger factor flags only more extreme values.
+
+    Returns
+    -------
+    pd.DataFrame
+        The rows of ``df`` whose ``column`` value is strictly below
+        ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``.
+        Missing values compare false on both sides and are never returned.
+    """
+    values = df[column]
+    q1 = values.quantile(0.25)
+    q3 = values.quantile(0.75)
+    reach = factor * (q3 - q1)
+    lower, upper = q1 - reach, q3 + reach
+    return df[(values < lower) | (values > upper)]
+
+def plot_outliers_box(df: pd.DataFrame, column: str, title: str = ""):
+    """
+    Show a horizontal boxplot of ``df[column]``.
+
+    ``title`` is used as the plot title when it is non-empty; otherwise the
+    title falls back to ``"Boxplot for <column>"``.
+    """
+    heading = title if title else f"Boxplot for {column}"
+
+    plt.figure(figsize=(8, 4))
+    sns.boxplot(x=df[column])
+    plt.title(heading)
+    plt.tight_layout()
+    plt.show()
diff --git a/src/task_1/eda/summary_stats.py b/src/task_1/eda/summary_stats.py
@@ -0,0 +1,8 @@
+import pandas as pd
+def describe_numerical(df: pd.DataFrame):
+    """
+    Return the summary-statistics table produced by ``df.describe()``.
+    """
+    summary = df.describe()
+    return summary
+
+
+def calculate_loss_ratio(df: pd.DataFrame):
+    """
+    Add a ``LossRatio`` column (``TotalClaims / TotalPremium``) to ``df``.
+
+    Rows whose ``TotalPremium`` is zero get NaN instead of a division by
+    zero. The zero premiums are masked with ``Series.where``; the earlier
+    ``replace(0, pd.NA)`` upcast the premium to ``object`` dtype, which made
+    ``LossRatio`` a non-numeric column whenever a zero premium was present.
+
+    ``df`` is modified in place and the same object is returned.
+    """
+    premium = df["TotalPremium"]
+    df["LossRatio"] = df["TotalClaims"] / premium.where(premium != 0)
+    return df
diff --git a/src/task_1/eda/univariate.py b/src/task_1/eda/univariate.py
@@ -0,0 +1,93 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import os
+from src.data_loader import extract_numeric_cols, extract_categorical_cols, extract_date_time_cols
+
+sns.set(style='whitegrid', palette='muted')
+
+def _save_fig(save_fig: bool, fig_path: str, filename: str):
+    """
+    Save the current matplotlib figure to ``fig_path/filename`` when
+    ``save_fig`` is true; do nothing otherwise.
+
+    The directory ``fig_path`` is created if it does not exist yet.
+    """
+    if not save_fig:
+        return
+    os.makedirs(fig_path, exist_ok=True)
+    target = os.path.join(fig_path, filename)
+    plt.savefig(target, bbox_inches='tight')
+
+# ---- Numeric Features ----
+def analyze_numeric_features(df: pd.DataFrame, save_fig: bool = False, fig_path: str = "outputs/univariate/numeric/"):
+    """
+    Summarise and plot every numeric column of ``df``.
+
+    For each column returned by ``extract_numeric_cols(df)`` the function
+    records ``describe()`` statistics and the count of missing values, then
+    shows a histogram (with KDE) and a boxplot of the non-missing values.
+    When ``save_fig`` is true each figure is also written under ``fig_path``
+    as ``hist_<col>.png`` / ``box_<col>.png`` before it is shown.
+
+    Returns a dict mapping column label to ``{"stats": ..., "missing": ...}``.
+    """
+    results = {}
+
+    for col in extract_numeric_cols(df):
+        series = df[col]
+        results[col] = {"stats": series.describe(), "missing": series.isna().sum()}
+        observed = series.dropna()
+
+        # Distribution of the observed values
+        plt.figure(figsize=(8, 4))
+        sns.histplot(observed, kde=True, bins=30, color='skyblue')
+        plt.title(f'Distribution of {col}')
+        plt.xlabel(col)
+        plt.ylabel("Frequency")
+        _save_fig(save_fig, fig_path, f"hist_{col}.png")
+        plt.show()
+
+        # Spread and extreme values
+        plt.figure(figsize=(6, 2))
+        sns.boxplot(x=observed, color='lightcoral')
+        plt.title(f'Boxplot of {col}')
+        plt.xlabel(col)
+        _save_fig(save_fig, fig_path, f"box_{col}.png")
+        plt.show()
+
+    return results
+
+
+# ---- Categorical Features ----
+def analyze_categorical_features(df: pd.DataFrame, top_n: int = 10, save_fig: bool = False, fig_path: str = "outputs/univariate/categorical/"):
+    """
+    Count and plot the values of every categorical column of ``df``.
+
+    For each column returned by ``extract_categorical_cols(df)`` the full
+    value counts (missing values included) are recorded, and a horizontal
+    bar chart of the first ``top_n`` entries of those counts is shown. When
+    ``save_fig`` is true the chart is also written under ``fig_path`` as
+    ``bar_<col>.png`` before it is shown.
+
+    Returns a dict mapping column label to ``{"counts": ...}``.
+    """
+    results = {}
+
+    for col in extract_categorical_cols(df):
+        frequency = df[col].value_counts(dropna=False)
+        results[col] = {"counts": frequency}
+        leading = frequency[:top_n]
+
+        plt.figure(figsize=(8, 4))
+        sns.barplot(x=leading.values, y=leading.index, palette="viridis")
+        plt.title(f'Top {top_n} Categories in {col}')
+        plt.xlabel("Count")
+        plt.ylabel(col)
+        _save_fig(save_fig, fig_path, f"bar_{col}.png")
+        plt.show()
+
+    return results
+
+
+# ---- Datetime Features ----
+def analyze_datetime_features(df: pd.DataFrame, save_fig: bool = False, fig_path: str = "outputs/univariate/datetime/"):
+    """
+    Summarise and plot every datetime column of ``df``.
+
+    For each column returned by ``extract_date_time_cols(df)`` the earliest
+    date, latest date and number of missing values are recorded, and a
+    30-bin histogram of the non-missing timestamps is shown. When
+    ``save_fig`` is true the figure is also written under ``fig_path`` as
+    ``dist_<col>.png`` before it is shown.
+
+    ``df`` is only read. Earlier revisions re-ran ``pd.to_datetime`` on
+    columns that were selected because they already are datetimes and
+    assigned the result back into the caller's frame, and they copied the
+    whole frame on every iteration to add year/month/weekday/day columns
+    that were never used; both steps are gone.
+
+    Returns a dict mapping column label to
+    ``{"min_date": ..., "max_date": ..., "n_missing": ...}``.
+    """
+    results = {}
+
+    for col in extract_date_time_cols(df):
+        dates = df[col]
+        results[col] = {
+            "min_date": dates.min(),
+            "max_date": dates.max(),
+            "n_missing": dates.isna().sum(),
+        }
+
+        # Histogram of the raw timestamps (30 equal-width bins)
+        plt.figure(figsize=(10, 4))
+        sns.histplot(dates.dropna(), bins=30, kde=False, color="slateblue")
+        plt.title(f'Date Distribution of {col}')
+        plt.xlabel(col)
+        plt.ylabel("Count")
+        _save_fig(save_fig, fig_path, f"dist_{col}.png")
+        plt.show()
+
+    return results
diff --git a/src/task_1/viz/__init__.py b/src/task_1/viz/__init__.py
diff --git a/src/task_1/viz/plot_utils.py b/src/task_1/viz/plot_utils.py
@@ -0,0 +1,12 @@
+def plot_top_bottom_claims(df, group_col, value_col, top_n=5):
+    """
+    Show a bar chart of the groups with the lowest and highest mean value.
+
+    ``df`` is grouped by ``group_col`` and the mean of ``value_col`` is
+    taken per group. The ``top_n`` groups with the smallest means are drawn
+    in red ("Bottom") and the ``top_n`` groups with the largest means in
+    green ("Top"), side by side on one axis in ascending order.
+
+    When there are fewer than ``2 * top_n`` groups the two selections
+    overlap, and a group in both appears once in each colour.
+    """
+    # Local import: this module has no import header, so ``plt`` used to be
+    # an undefined name and every call raised NameError.
+    import matplotlib.pyplot as plt
+
+    grouped = df.groupby(group_col)[value_col].mean().sort_values()
+    top = grouped.tail(top_n)
+    bottom = grouped.head(top_n)
+
+    # Give each bar its own x position. Two Series.plot(kind="bar") calls
+    # both start at position 0, so the second set of bars covered the first
+    # and replaced its tick labels.
+    bottom_positions = list(range(len(bottom)))
+    top_positions = list(range(len(bottom), len(bottom) + len(top)))
+    tick_labels = [str(name) for name in list(bottom.index) + list(top.index)]
+
+    plt.figure(figsize=(12, 6))
+    plt.bar(top_positions, top.to_numpy(), color="green", label="Top")
+    plt.bar(bottom_positions, bottom.to_numpy(), color="red", label="Bottom")
+    plt.xticks(bottom_positions + top_positions, tick_labels, rotation=90)
+    plt.title(f"Top and Bottom {top_n} {group_col}s by Average {value_col}")
+    plt.ylabel(f"Avg {value_col}")
+    plt.legend()
+    plt.show()
diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py
@@ -0,0 +1,6 @@
+import pandas as pd
+from src.data_loader import check_missing
+
+def test_missing():
+    """check_missing reports one null for a column holding a single None."""
+    frame = pd.DataFrame({"A": [1, None, 2]})
+    missing_per_column = check_missing(frame)
+    assert missing_per_column.A == 1