From 118f5f222b31d1cf34feaf938c2efea41ddedd96 Mon Sep 17 00:00:00 2001 From: riley-1995 Date: Thu, 5 Mar 2026 05:18:42 -0800 Subject: [PATCH] Persist normalization stats JSON and ignore .github --- .gitignore | 1 + README.md | 5 +++++ .../convert_audio_to_spec_tfrecords.py | 9 ++++++++- .../data_creation/create_tfrecords.py | 8 ++++++++ src/elp_rumble/data_creation/utils.py | 18 ++++++++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2f0f1b3..7cbf4e9 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ *.egg-info/ .env /exported +.github/ /model_checkpoints __pycache__/ *.py[cod] diff --git a/README.md b/README.md index 566294d..553cd36 100644 --- a/README.md +++ b/README.md @@ -349,6 +349,11 @@ python3 -m elp_rumble.data_creation.create_tfrecords python3 -m elp_rumble.data_creation.convert_audio_to_spec_tfrecords ``` +During preprocessing, normalization statistics are saved to `data/normalization_stats.json`. +`create_tfrecords` writes `audio_mean` and `audio_std`, and +`convert_audio_to_spec_tfrecords` writes `spec_mean` and `spec_std`. +Running both scripts produces a single JSON file with all four keys. + Once TFRecords are created, no manual path edits are required for CNN. CNN data paths come from `src/elp_rumble/input_pipeline/spectrogram_tfrecords.py` and `src/elp_rumble/config/paths.py`. For RNN-only workflows, dataset file names are defined in `src/elp_rumble/models/rnn_config.py` diff --git a/src/elp_rumble/data_creation/convert_audio_to_spec_tfrecords.py b/src/elp_rumble/data_creation/convert_audio_to_spec_tfrecords.py index 01f1beb..c9e82dc 100644 --- a/src/elp_rumble/data_creation/convert_audio_to_spec_tfrecords.py +++ b/src/elp_rumble/data_creation/convert_audio_to_spec_tfrecords.py @@ -2,7 +2,8 @@ import os import numpy as np import tensorflow as tf -from elp_rumble.config.paths import TFRECORDS_AUDIO_DIR, TFRECORDS_SPECTROGRAM_DIR +from elp_rumble.config.paths import DATA_ROOT, TFRECORDS_AUDIO_DIR, TFRECORDS_SPECTROGRAM_DIR +from .utils import upsert_normalization_stats INPUT_AUDIO_TFR_FOLDER = TFRECORDS_AUDIO_DIR OUTPUT_SPEC_FOLDER = TFRECORDS_SPECTROGRAM_DIR @@ -119,6 +120,12 @@ def main(): datasets[i] = (apply_stft(dataset, frame_length, frame_step, sample_rate, max_frequency), name) global_mean, global_std = compute_global_stats(datasets) + normalization_stats_path = DATA_ROOT / "normalization_stats.json" + upsert_normalization_stats( + normalization_stats_path, + {"spec_mean": global_mean, "spec_std": global_std}, + ) + print(f"Saved spectrogram normalization stats to {normalization_stats_path}") for dataset, file_name in datasets: normalized_dataset = dataset.map(lambda spectrogram, label: (normalize_spectrogram(spectrogram, global_mean, global_std), label), num_parallel_calls=tf.data.AUTOTUNE) diff --git a/src/elp_rumble/data_creation/create_tfrecords.py b/src/elp_rumble/data_creation/create_tfrecords.py index 4f6c3a3..e16f7dc 100644 --- a/src/elp_rumble/data_creation/create_tfrecords.py +++ b/src/elp_rumble/data_creation/create_tfrecords.py @@ -7,9 +7,11 @@ load_wavs_into_dataset, normalize_dataset, stratified_split, + upsert_normalization_stats, write_tfrecords, ) from elp_rumble.config.paths import ( + DATA_ROOT, POS_TRAIN_VAL_CLIPS_DIR, TRAIN_VAL_NEG_CLIPS_DIR, POS_HOLDOUT_TEST_CLIPS_DIR, @@ -41,6 +43,12 @@ def main(): # Compute statistics global_mean, global_std = compute_statistics(combined_dataset) + normalization_stats_path = DATA_ROOT / "normalization_stats.json" + upsert_normalization_stats( + normalization_stats_path, + {"audio_mean": global_mean, "audio_std": global_std}, + ) + print(f"Saved audio normalization stats to {normalization_stats_path}") del combined_dataset diff --git a/src/elp_rumble/data_creation/utils.py b/src/elp_rumble/data_creation/utils.py index 475f4d5..3459ad6 100644 --- a/src/elp_rumble/data_creation/utils.py +++ b/src/elp_rumble/data_creation/utils.py @@ -2,6 +2,7 @@ from scipy.signal import resample, butter, lfilter import numpy as np import os +import json import tensorflow as tf from collections import Counter from sklearn.model_selection import train_test_split @@ -157,6 +158,23 @@ def compute_statistics(dataset): mean, std = total_sum / total_count, np.sqrt((total_sum_sq / total_count) - (total_sum / total_count) ** 2) return mean, std + +def upsert_normalization_stats(stats_path, updates): + """Merge normalization stats into JSON at stats_path, creating file if needed.""" + os.makedirs(os.path.dirname(os.fspath(stats_path)), exist_ok=True) + + existing = {} + if os.path.exists(stats_path): + with open(stats_path, "r", encoding="utf-8") as f: + existing = json.load(f) + + existing.update({k: float(v) for k, v in updates.items()}) + + tmp_path = f"{stats_path}.tmp" + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(existing, f, indent=2) + os.replace(tmp_path, stats_path) + def normalize_dataset(dataset, mean, std): return dataset.map(lambda audio: (audio - mean) / std)