Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 239 additions & 0 deletions src/mini_nn/karthik_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

from torch.utils.data import TensorDataset, DataLoader
from pathlib import Path

# =========================
# CONFIG
# =========================
EPOCHS = 150
PATIENCE = 15
LEARNING_RATE = 1e-4 # lower to prevent NaNs
BATCH_SIZE = 64
SAMPLE_SIZE = 100000 # reduce dataset for testing speed

# =========================
# MODEL
# =========================
class PredictorModel(nn.Module):
def __init__(self, input_size):
super().__init__()
self.net = nn.Sequential(
nn.Linear(input_size, 256),
nn.LeakyReLU(0.1),
nn.BatchNorm1d(256),
nn.Dropout(0.2),
nn.Linear(256, 128),
nn.LeakyReLU(0.1),
nn.BatchNorm1d(128),
nn.Dropout(0.2),
nn.Linear(128, 64),
nn.LeakyReLU(0.1),
nn.BatchNorm1d(64),
nn.Dropout(0.1),
nn.Linear(64, 1)
)

def forward(self, x):
return self.net(x)

# =========================
# DATA LOADING
# =========================
def load_data_from_csv():
CURRENT_DIR = Path(__file__).resolve().parent
DATA_DIR = CURRENT_DIR.parent.parent / "Data" # points to TransformerPredictionModel/Data/
player_file = DATA_DIR / "PlayerStatistics.csv"
if not player_file.exists():
raise FileNotFoundError(f"{player_file} does not exist. Make sure Data/ contains PlayerStatistics.csv")

# read CSV safely
df = pd.read_csv(player_file, low_memory=False)
df.columns = df.columns.str.strip() # remove whitespace
return df

# =========================
# TRAINING PIPELINE
# =========================
def main():
print("Loading data from CSVs...")
df = load_data_from_csv()
print(f"Total rows: {len(df)}")

# -------------------------
# Reduce dataset size for fast testing
# -------------------------
if len(df) > SAMPLE_SIZE:
df = df.sample(n=SAMPLE_SIZE, random_state=42)

# -------------------------
# Create binary target
# -------------------------
df['beats_line'] = (df['points'] >= 20).astype(int)

# -------------------------
# Numeric features
# -------------------------
drop_cols = ['beats_line', 'firstName', 'lastName', 'personId', 'gameId', 'gameDateTimeEst',
'playerteamCity','playerteamName','opponentteamCity','opponentteamName',
'gameType','gameLabel','gameSubLabel','seriesGameNumber','win','home']

X = df.select_dtypes(include=[np.number]).drop(columns=drop_cols, errors='ignore')

# Handle NaNs / infinities
X = X.replace([np.inf, -np.inf], 0)
X = X.fillna(0)

y = df['beats_line']

print(f"Features: {X.shape[1]}")
print("Label distribution:")
print(y.value_counts(normalize=True))

# -------------------------
# Class imbalance weighting
# -------------------------
classes = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
pos_weight = torch.tensor([class_weights[1] / class_weights[0]], dtype=torch.float)
print(f"Class weight for positive class: {pos_weight.item():.2f}")

# -------------------------
# SPLIT
# -------------------------
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

youre also not dropping any of the lines that leak data to the outcome. like youre not removing any stats like points or anything thats current game.

X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.176, random_state=42)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are definitely having leaking data here because you are having random split

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Create DataLoaders
train_ds = TensorDataset(torch.FloatTensor(X_train_scaled), torch.FloatTensor(y_train.values).unsqueeze(1))
val_ds = TensorDataset(torch.FloatTensor(X_val_scaled), torch.FloatTensor(y_val.values).unsqueeze(1))
test_ds = TensorDataset(torch.FloatTensor(X_test_scaled), torch.FloatTensor(y_test.values).unsqueeze(1))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

# -------------------------
# MODEL SETUP
# -------------------------
model = PredictorModel(input_size=X_train_scaled.shape[1])
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

best_f1 = 0
patience_counter = 0
best_state = None
best_epoch = 0

print("\nTraining...")
for epoch in range(EPOCHS):
model.train()
train_loss = 0

for xb, yb in train_loader:
optimizer.zero_grad()
preds = model(xb)
loss = criterion(preds, yb)
if torch.isnan(loss):
print("NaN loss detected, stopping...")
return
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
train_loss += loss.item()

# Validation
model.eval()
all_preds, all_probs, all_labels = [], [], []

with torch.no_grad():
for xb, yb in val_loader:
logits = model(xb)
probs = torch.sigmoid(logits)
preds = (probs > 0.5).float()
all_preds.extend(preds.numpy())
all_probs.extend(probs.numpy())
all_labels.extend(yb.numpy())

all_preds = np.array(all_preds).flatten()
all_probs = np.array(all_probs).flatten()
all_labels = np.array(all_labels).flatten()

val_f1 = f1_score(all_labels, all_preds)

# Early stopping
if val_f1 > best_f1:
best_f1 = val_f1
best_state = model.state_dict()
best_epoch = epoch + 1
patience_counter = 0
else:
patience_counter += 1

if (epoch + 1) % 10 == 0:
print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss/len(train_loader):.4f} | Val F1: {val_f1:.4f}")

if patience_counter >= PATIENCE:
print(f"\nEarly stopping at epoch {epoch+1}")
break

if best_state is None:
print("No valid model trained. Exiting...")
return

# -------------------------
# TEST
# -------------------------
model.load_state_dict(best_state)
model.eval()
test_preds, test_probs, test_labels = [], [], []

with torch.no_grad():
for xb, yb in test_loader:
logits = model(xb)
probs = torch.sigmoid(logits)
preds = (probs > 0.5).float()
test_preds.extend(preds.numpy())
test_probs.extend(probs.numpy())
test_labels.extend(yb.numpy())

test_preds = np.array(test_preds).flatten()
test_probs = np.array(test_probs).flatten()
test_labels = np.array(test_labels).flatten()

print("\n================ RESULTS ================")
print(f"Best Epoch: {best_epoch}")
print(f"Test Accuracy: {(test_preds == test_labels).mean():.4f}")
print(f"Test F1 Score: {f1_score(test_labels, test_preds):.4f}")
print(f"Test ROC-AUC: {roc_auc_score(test_labels, test_probs):.4f}")

# -------------------------
# SAVE OUTPUT
# -------------------------
out = pd.DataFrame({
"prob_beats_20_pts": test_probs,
"prediction": test_preds,
"actual": test_labels
})

out_path = Path(__file__).parent / "model_predictions.csv"
out.to_csv(out_path, index=False)
print(f"\nSaved predictions → {out_path}")

if __name__ == "__main__":
main()
Loading