From 544d38765c2a435b6bca6c80e78d6e28198de2b9 Mon Sep 17 00:00:00 2001 From: Lalit Yemireddy Date: Tue, 10 Feb 2026 00:57:43 -0800 Subject: [PATCH] feat(mini-nn): add baseline NN for player stats classification --- src/mini_nn/lalit_model.py | 261 +++++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 src/mini_nn/lalit_model.py diff --git a/src/mini_nn/lalit_model.py b/src/mini_nn/lalit_model.py new file mode 100644 index 0000000..597fd62 --- /dev/null +++ b/src/mini_nn/lalit_model.py @@ -0,0 +1,261 @@ +# src/mini_nn/lalit_model.py +# +# Mini neural net baseline on PlayerStatistics.csv +# IMPORTANT: avoids target leakage by dropping box-score scoring columns +# when the target is derived from points. + +import torch +import torch.nn as nn +import torch.optim as optim +import pandas as pd +import numpy as np + +from pathlib import Path +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import f1_score, roc_auc_score +from torch.utils.data import TensorDataset, DataLoader + + +# ---- training config ---- +EPOCHS = 50 +PATIENCE = 8 +LEARNING_RATE = 5e-4 +BATCH_SIZE = 256 +SEED = 42 + + +class MiniNet(nn.Module): + def __init__(self, input_size: int): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_size, 128), + nn.ReLU(), + nn.BatchNorm1d(128), + nn.Dropout(0.2), + nn.Linear(128, 64), + nn.ReLU(), + nn.BatchNorm1d(64), + nn.Dropout(0.1), + nn.Linear(64, 1), + ) + + def forward(self, x): + return self.net(x) + + +def load_player_stats_csv() -> pd.DataFrame: + # file: src/nba_dataset/Data/PlayerStatistics.csv + current_dir = Path(__file__).resolve().parent # .../src/mini_nn + data_path = current_dir.parent / "nba_dataset" / "Data" / "PlayerStatistics.csv" + if not data_path.exists(): + raise FileNotFoundError(f"Could not find: {data_path}") + + return pd.read_csv(data_path, low_memory=False) + + +def main(): + torch.manual_seed(SEED) + np.random.seed(SEED) + + df = load_player_stats_csv() + + # ---- define a target (binary classification) ---- + # Example: did the player score 20+ points in the game? + if "points" not in df.columns: + raise ValueError("Column 'points' not found in PlayerStatistics.csv") + + points = pd.to_numeric(df["points"], errors="coerce").fillna(0) + y = (points >= 20).astype(np.float32) + + # ---- build features ---- + X = df.copy() + + # IDs / text / labels that shouldn't be features + id_text_cols = [ + "firstName", + "lastName", + "personId", + "gameId", + "gameDateTimeEst", + "playerteamCity", + "playerteamName", + "opponentteamCity", + "opponentteamName", + "gameLabel", + "gameSubLabel", + ] + + # Target + leakage columns: + # These directly determine points (or are derived from it), + # so keeping them makes metrics unrealistically perfect. + leakage_cols = [ + "points", + "fieldGoalsMade", + "threePointersMade", + "freeThrowsMade", + "fieldGoalsAttempted", + "threePointersAttempted", + "freeThrowsAttempted", + "fieldGoalsPercentage", + "threePointersPercentage", + "freeThrowsPercentage", + # NOTE: "numMinutes" is still post-game, but not a direct function of points. + # If you want stricter "pre-game only", you should engineer rolling averages + # from prior games and drop any same-game box-score stats. + ] + + X = X.drop(columns=id_text_cols + leakage_cols, errors="ignore") + + # Keep numeric only + X = ( + X.select_dtypes(include=[np.number]) + .replace([np.inf, -np.inf], np.nan) + .fillna(0) + ) + + if X.shape[1] == 0: + raise ValueError("No numeric feature columns found after filtering. Inspect the CSV columns.") + + print(f"Rows: {len(df):,}") + print(f"Features: {X.shape[1]}") + print(f"Positive rate (>=20 pts): {y.mean():.3f}") + print(f"Using features: {list(X.columns)}") + + # ---- split (stratified) ---- + X_temp, X_test, y_temp, y_test = train_test_split( + X, y, test_size=0.15, random_state=SEED, stratify=y + ) + X_train, X_val, y_train, y_val = train_test_split( + X_temp, y_temp, test_size=0.176, random_state=SEED, stratify=y_temp + ) # ~15% val + + # ---- scale ---- + scaler = StandardScaler() + X_train_s = scaler.fit_transform(X_train) + X_val_s = scaler.transform(X_val) + X_test_s = scaler.transform(X_test) + + # ---- dataloaders ---- + train_ds = TensorDataset( + torch.tensor(X_train_s, dtype=torch.float32), + torch.tensor(np.array(y_train), dtype=torch.float32).unsqueeze(1), + ) + val_ds = TensorDataset( + torch.tensor(X_val_s, dtype=torch.float32), + torch.tensor(np.array(y_val), dtype=torch.float32).unsqueeze(1), + ) + test_ds = TensorDataset( + torch.tensor(X_test_s, dtype=torch.float32), + torch.tensor(np.array(y_test), dtype=torch.float32).unsqueeze(1), + ) + + train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True) + val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False) + test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False) + + # ---- model ---- + model = MiniNet(input_size=X_train.shape[1]) + optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3) + + # Handle class imbalance a bit (>=20 is ~13% positives) + pos = float(np.sum(y_train == 1)) + neg = float(np.sum(y_train == 0)) + if pos > 0: + pos_weight = torch.tensor([neg / pos], dtype=torch.float32) + criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight) + else: + criterion = nn.BCEWithLogitsLoss() + + best_val_f1 = -1.0 + best_state = None + patience = 0 + best_epoch = 0 + + for epoch in range(1, EPOCHS + 1): + model.train() + train_loss = 0.0 + + for xb, yb in train_loader: + optimizer.zero_grad() + logits = model(xb) + loss = criterion(logits, yb) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + train_loss += loss.item() + + # ---- validation ---- + model.eval() + probs_list, preds_list, labels_list = [], [], [] + with torch.no_grad(): + for xb, yb in val_loader: + logits = model(xb) + probs = torch.sigmoid(logits) + preds = (probs >= 0.5).float() + + probs_list.append(probs.cpu().numpy()) + preds_list.append(preds.cpu().numpy()) + labels_list.append(yb.cpu().numpy()) + + probs = np.concatenate(probs_list).ravel() + preds = np.concatenate(preds_list).ravel() + labels = np.concatenate(labels_list).ravel() + + val_f1 = f1_score(labels, preds, zero_division=0) + val_acc = (preds == labels).mean() + + if val_f1 > best_val_f1: + best_val_f1 = val_f1 + best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()} + best_epoch = epoch + patience = 0 + else: + patience += 1 + + if epoch % 5 == 0 or epoch == 1: + print( + f"Epoch {epoch:3d} | " + f"train_loss {train_loss/len(train_loader):.4f} | " + f"val_acc {val_acc:.4f} | val_f1 {val_f1:.4f}" + ) + + if patience >= PATIENCE: + print( + f"Early stopping at epoch {epoch} " + f"(best epoch {best_epoch}, best val_f1 {best_val_f1:.4f})" + ) + break + + # ---- test ---- + model.load_state_dict(best_state) + model.eval() + + probs_list, preds_list, labels_list = [], [], [] + with torch.no_grad(): + for xb, yb in test_loader: + logits = model(xb) + probs = torch.sigmoid(logits) + preds = (probs >= 0.5).float() + + probs_list.append(probs.cpu().numpy()) + preds_list.append(preds.cpu().numpy()) + labels_list.append(yb.cpu().numpy()) + + probs = np.concatenate(probs_list).ravel() + preds = np.concatenate(preds_list).ravel() + labels = np.concatenate(labels_list).ravel() + + test_acc = (preds == labels).mean() + test_f1 = f1_score(labels, preds, zero_division=0) + test_auc = roc_auc_score(labels, probs) if len(np.unique(labels)) == 2 else float("nan") + + print("\n=== FINAL TEST ===") + print(f"Best epoch: {best_epoch}") + print(f"Test Accuracy: {test_acc:.4f}") + print(f"Test F1: {test_f1:.4f}") + print(f"Test AUC: {test_auc:.4f}") + + +if __name__ == "__main__": + main()