Skip to content

Commit 659a5e0

Browse files
authored
after task-1 from ayanasamuel8/task-1
Task 1
2 parents 478d90d + b5ff0ab commit 659a5e0

16 files changed

Lines changed: 3726 additions & 0 deletions

notebooks/task_1/01_Data_understanding.ipynb

Lines changed: 499 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/task_1/02_eda_univariate.ipynb

Lines changed: 1766 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/task_1/03_eda_bivariate.ipynb

Lines changed: 930 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/task_1/04_visualizations.ipynb

Lines changed: 320 additions & 0 deletions
Large diffs are not rendered by default.

src/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import os
2+
3+
# Filesystem locations used by the project, all written as relative paths.
# NOTE(review): RAW_DATA_PATH climbs two directory levels while the other two
# climb only one -- confirm each against the working directory of its callers.
RAW_DATA_PATH = '../../data/raw/raw_data.csv'  # single raw CSV file
CLEANED_DATA_PATH = "../data/cleaned"
PROCESSED_DATA_PATH = "../data/processed"

src/data_loader.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,29 @@
11
import pandas as pd
22

33
def load_data(path: str) -> pd.DataFrame:
    """Read a pipe-delimited text file into a DataFrame.

    Args:
        path: Location of the file; handed straight to ``pandas.read_csv``.

    Returns:
        The parsed table.
    """
    read_options = {
        "sep": "|",
        "skipinitialspace": False,  # keep whitespace that follows a delimiter
        "engine": "python",
        "skiprows": 0,  # start parsing at the very first line
    }
    return pd.read_csv(path, **read_options)
9+
def load_raw_data(path: str) -> pd.DataFrame:
    """Read a CSV file with pandas' default parsing options.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed table.
    """
    raw_frame = pd.read_csv(path)
    return raw_frame
511

612
def check_structure(df: pd.DataFrame):
    """Print a structural summary of ``df`` and return its column dtypes.

    ``DataFrame.info()`` writes its report to standard output and returns
    ``None``, so the first element of the returned tuple is always ``None``.

    Returns:
        A 2-tuple ``(info_result, dtypes)``.
    """
    info_result = df.info()
    column_dtypes = df.dtypes
    return info_result, column_dtypes
814

15+
def save_raw_data(df: pd.DataFrame, path: str = "../../data/raw/raw_data.csv"):
    """Write ``df`` as CSV, without the index column.

    Args:
        df: Table to persist.
        path: Destination handed to ``DataFrame.to_csv``. Defaults to the
            location this function has always written to, so existing
            single-argument calls behave exactly as before.

    Returns:
        None.
    """
    df.to_csv(path, index=False)
17+
918
def check_missing(df: pd.DataFrame):
    """Count the missing values in each column of ``df``.

    Returns:
        A Series indexed by column name holding the per-column count.
    """
    missing_mask = df.isna()
    return missing_mask.sum()
20+
21+
def extract_numeric_cols(df: pd.DataFrame):
    """List the names of columns stored as ``int64`` or ``float64``.

    Only those two dtypes are matched; other numeric widths are not selected.
    """
    numeric_frame = df.select_dtypes(include=['int64', 'float64'])
    return numeric_frame.columns.tolist()
23+
24+
def extract_categorical_cols(df: pd.DataFrame):
    """List the names of columns whose dtype is ``object`` or ``category``."""
    categorical_frame = df.select_dtypes(include=['object', 'category'])
    return categorical_frame.columns.tolist()
26+
27+
def extract_date_time_cols(df: pd.DataFrame):
    """List the names of columns selected by the datetime64 dtype filters."""
    datetime_frame = df.select_dtypes(include=['datetime64[ns]', 'datetime64'])
    return datetime_frame.columns.tolist()
29+

src/preprocessing.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pandas as pd
2+
def clean_numeric_strings(df: pd.DataFrame, cols: list):
    """
    Convert string-formatted numbers with commas to proper floats.

    Each listed column that holds Python objects or pandas strings has its
    thousands separators stripped and is then converted with
    ``pd.to_numeric(errors='coerce')``, so unparseable entries become NaN.
    Columns of any other dtype are left untouched.

    Args:
        df: Table to clean. It is modified in place.
        cols: Names of the columns to process; each must exist in ``df``.

    Returns:
        The same ``df`` object, for call chaining.
    """
    def _strip_commas(value):
        # Only text can carry a thousands separator. Values that are already
        # numbers (or missing) pass through unchanged; running them through
        # ``.str.replace`` would turn them into NaN.
        return value.replace(',', '') if isinstance(value, str) else value

    for col in cols:
        column = df[col]
        # Cover both plain object columns and pandas' dedicated string dtype.
        is_text_like = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if is_text_like:
            df[col] = pd.to_numeric(column.map(_strip_commas), errors='coerce')
    return df

src/task_1/__init__.py

Whitespace-only changes.

src/task_1/eda/__init__.py

Whitespace-only changes.

src/task_1/eda/bivariate.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import pandas as pd
2+
import seaborn as sns
3+
import matplotlib.pyplot as plt
4+
5+
def correlation_matrix(df):
    """Draw an annotated heatmap of the correlations between numeric columns.

    Args:
        df: Table whose numeric columns are correlated with each other.

    Returns:
        The correlation DataFrame that was plotted.
    """
    correlations = df.corr(numeric_only=True)

    plt.figure(figsize=(10, 8))
    heatmap_axes = sns.heatmap(correlations, annot=True, cmap="coolwarm")
    heatmap_axes.set_title("Correlation Matrix")
    plt.show()

    return correlations
12+
13+
14+
def scatter_plot(df, x, y):
    """Show a scatter plot of column ``y`` against column ``x``.

    Args:
        df: Table holding both columns.
        x: Name of the column drawn on the horizontal axis.
        y: Name of the column drawn on the vertical axis.
    """
    plt.figure(figsize=(8, 5))
    scatter_axes = sns.scatterplot(data=df, x=x, y=y)
    scatter_axes.set_title(f"{y} vs {x}")
    plt.show()
19+
20+
21+
def group_loss_ratio(df, by):
    """Sum claims and premium per group and derive each group's loss ratio.

    Args:
        df: Table containing ``TotalClaims`` and ``TotalPremium`` columns.
        by: Column name (or list of names) to group on.

    Returns:
        A DataFrame indexed by group with ``TotalClaims``, ``TotalPremium``
        and ``LossRatio`` (claims divided by premium). Groups whose summed
        premium is zero get a missing ``LossRatio`` instead of an infinite one.
    """
    totals = df.groupby(by).agg(
        TotalClaims=("TotalClaims", "sum"),
        TotalPremium=("TotalPremium", "sum"),
    )
    premium = totals["TotalPremium"]
    # where() blanks zero premiums to NaN and keeps the column numeric;
    # substituting pd.NA would upcast to object dtype and make the ratio
    # column's dtype depend on whether any group happened to have zero premium.
    safe_premium = premium.where(premium != 0)
    return totals.assign(LossRatio=totals["TotalClaims"] / safe_premium)
26+
27+
28+
def line_plot(df, date_col, value_col):
    """Show a line chart of ``value_col`` ordered by ``date_col``.

    Args:
        df: Table holding both columns; it is not modified.
        date_col: Name of the column to sort by and draw on the x-axis.
        value_col: Name of the column drawn on the y-axis.
    """
    ordered = df.sort_values(by=date_col)
    dates = ordered[date_col]
    values = ordered[value_col]

    plt.figure(figsize=(10, 5))
    trend_axes = sns.lineplot(x=dates, y=values)
    trend_axes.set_title(f"Trend of {value_col} over time")
    plt.xticks(rotation=45)  # slant the tick labels by 45 degrees
    plt.show()
35+

0 commit comments

Comments
 (0)