From 1cbbb6d0ded8b0ad9cc153cc324c243ee2db8595 Mon Sep 17 00:00:00 2001 From: Karthik Ravi Date: Tue, 10 Feb 2026 00:32:27 -0800 Subject: [PATCH] feat(mini_nn): add karthik_model.py for NBA CSV neural network --- src/mini_nn/karthik_model.py | 239 +++++++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 src/mini_nn/karthik_model.py diff --git a/src/mini_nn/karthik_model.py b/src/mini_nn/karthik_model.py new file mode 100644 index 0000000..132fd0f --- /dev/null +++ b/src/mini_nn/karthik_model.py @@ -0,0 +1,239 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import pandas as pd +import numpy as np + +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import f1_score, roc_auc_score +from sklearn.utils.class_weight import compute_class_weight + +from torch.utils.data import TensorDataset, DataLoader +from pathlib import Path + +# ========================= +# CONFIG +# ========================= +EPOCHS = 150 +PATIENCE = 15 +LEARNING_RATE = 1e-4 # lower to prevent NaNs +BATCH_SIZE = 64 +SAMPLE_SIZE = 100000 # reduce dataset for testing speed + +# ========================= +# MODEL +# ========================= +class PredictorModel(nn.Module): + def __init__(self, input_size): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_size, 256), + nn.LeakyReLU(0.1), + nn.BatchNorm1d(256), + nn.Dropout(0.2), + nn.Linear(256, 128), + nn.LeakyReLU(0.1), + nn.BatchNorm1d(128), + nn.Dropout(0.2), + nn.Linear(128, 64), + nn.LeakyReLU(0.1), + nn.BatchNorm1d(64), + nn.Dropout(0.1), + nn.Linear(64, 1) + ) + + def forward(self, x): + return self.net(x) + +# ========================= +# DATA LOADING +# ========================= +def load_data_from_csv(): + CURRENT_DIR = Path(__file__).resolve().parent + DATA_DIR = CURRENT_DIR.parent.parent / "Data" # points to TransformerPredictionModel/Data/ + player_file = DATA_DIR / "PlayerStatistics.csv" + if not player_file.exists(): + raise FileNotFoundError(f"{player_file} does not exist. Make sure Data/ contains PlayerStatistics.csv") + + # read CSV safely + df = pd.read_csv(player_file, low_memory=False) + df.columns = df.columns.str.strip() # remove whitespace + return df + +# ========================= +# TRAINING PIPELINE +# ========================= +def main(): + print("Loading data from CSVs...") + df = load_data_from_csv() + print(f"Total rows: {len(df)}") + + # ------------------------- + # Reduce dataset size for fast testing + # ------------------------- + if len(df) > SAMPLE_SIZE: + df = df.sample(n=SAMPLE_SIZE, random_state=42) + + # ------------------------- + # Create binary target + # ------------------------- + df['beats_line'] = (df['points'] >= 20).astype(int) + + # ------------------------- + # Numeric features + # ------------------------- + drop_cols = ['beats_line', 'firstName', 'lastName', 'personId', 'gameId', 'gameDateTimeEst', + 'playerteamCity','playerteamName','opponentteamCity','opponentteamName', + 'gameType','gameLabel','gameSubLabel','seriesGameNumber','win','home'] + + X = df.select_dtypes(include=[np.number]).drop(columns=drop_cols, errors='ignore') + + # Handle NaNs / infinities + X = X.replace([np.inf, -np.inf], 0) + X = X.fillna(0) + + y = df['beats_line'] + + print(f"Features: {X.shape[1]}") + print("Label distribution:") + print(y.value_counts(normalize=True)) + + # ------------------------- + # Class imbalance weighting + # ------------------------- + classes = np.unique(y) + class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y) + pos_weight = torch.tensor([class_weights[1] / class_weights[0]], dtype=torch.float) + print(f"Class weight for positive class: {pos_weight.item():.2f}") + + # ------------------------- + # SPLIT + # ------------------------- + X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42) + X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.176, random_state=42) + + # Scale features + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train) + X_val_scaled = scaler.transform(X_val) + X_test_scaled = scaler.transform(X_test) + + # Create DataLoaders + train_ds = TensorDataset(torch.FloatTensor(X_train_scaled), torch.FloatTensor(y_train.values).unsqueeze(1)) + val_ds = TensorDataset(torch.FloatTensor(X_val_scaled), torch.FloatTensor(y_val.values).unsqueeze(1)) + test_ds = TensorDataset(torch.FloatTensor(X_test_scaled), torch.FloatTensor(y_test.values).unsqueeze(1)) + + train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True) + val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE) + test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE) + + # ------------------------- + # MODEL SETUP + # ------------------------- + model = PredictorModel(input_size=X_train_scaled.shape[1]) + optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3) + criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight) + + best_f1 = 0 + patience_counter = 0 + best_state = None + best_epoch = 0 + + print("\nTraining...") + for epoch in range(EPOCHS): + model.train() + train_loss = 0 + + for xb, yb in train_loader: + optimizer.zero_grad() + preds = model(xb) + loss = criterion(preds, yb) + if torch.isnan(loss): + print("NaN loss detected, stopping...") + return + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + train_loss += loss.item() + + # Validation + model.eval() + all_preds, all_probs, all_labels = [], [], [] + + with torch.no_grad(): + for xb, yb in val_loader: + logits = model(xb) + probs = torch.sigmoid(logits) + preds = (probs > 0.5).float() + all_preds.extend(preds.numpy()) + all_probs.extend(probs.numpy()) + all_labels.extend(yb.numpy()) + + all_preds = np.array(all_preds).flatten() + all_probs = np.array(all_probs).flatten() + all_labels = np.array(all_labels).flatten() + + val_f1 = f1_score(all_labels, all_preds) + + # Early stopping + if val_f1 > best_f1: + best_f1 = val_f1 + best_state = model.state_dict() + best_epoch = epoch + 1 + patience_counter = 0 + else: + patience_counter += 1 + + if (epoch + 1) % 10 == 0: + print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss/len(train_loader):.4f} | Val F1: {val_f1:.4f}") + + if patience_counter >= PATIENCE: + print(f"\nEarly stopping at epoch {epoch+1}") + break + + if best_state is None: + print("No valid model trained. Exiting...") + return + + # ------------------------- + # TEST + # ------------------------- + model.load_state_dict(best_state) + model.eval() + test_preds, test_probs, test_labels = [], [], [] + + with torch.no_grad(): + for xb, yb in test_loader: + logits = model(xb) + probs = torch.sigmoid(logits) + preds = (probs > 0.5).float() + test_preds.extend(preds.numpy()) + test_probs.extend(probs.numpy()) + test_labels.extend(yb.numpy()) + + test_preds = np.array(test_preds).flatten() + test_probs = np.array(test_probs).flatten() + test_labels = np.array(test_labels).flatten() + + print("\n================ RESULTS ================") + print(f"Best Epoch: {best_epoch}") + print(f"Test Accuracy: {(test_preds == test_labels).mean():.4f}") + print(f"Test F1 Score: {f1_score(test_labels, test_preds):.4f}") + print(f"Test ROC-AUC: {roc_auc_score(test_labels, test_probs):.4f}") + + # ------------------------- + # SAVE OUTPUT + # ------------------------- + out = pd.DataFrame({ + "prob_beats_20_pts": test_probs, + "prediction": test_preds, + "actual": test_labels + }) + + out_path = Path(__file__).parent / "model_predictions.csv" + out.to_csv(out_path, index=False) + print(f"\nSaved predictions → {out_path}") + +if __name__ == "__main__": + main()