Skip to content

Commit 62cd905

Browse files
JacekDabrowski1
authored and committed
Update benchmarks and make whiten=True default
- Fixed OOM in whiten_embeddings with chunked float64 covariance
- Re-ran all benchmarks with 16 iterations: PPI=1.000, Flickr=0.971, ogbn-arxiv=0.994
- Updated benchmarks.js with new accuracy, speed, memory, scatter, CV data
- Updated benchmarks.html all tables: summary, detail, speed, memory, scatter, CV
- Updated index.html benchmark cards with new numbers
- Made whiten=True the default in embed() and embed_multiscale()
- Updated API docs to reflect new defaults
- Yelp and roadNet now show whitening works (speed/memory data)
1 parent b2242cc commit 62cd905

10 files changed

Lines changed: 330 additions & 95 deletions

File tree

full_benchmark.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Full benchmark: Cleora base vs Cleora(whiten) on ALL datasets"""
2+
import numpy as np
3+
import time
4+
import tracemalloc
5+
import sys
6+
import gc
7+
8+
from pycleora import SparseMatrix, embed
9+
from pycleora.metrics import node_classification_scores
10+
from pycleora.community import detect_communities_louvain
11+
from pycleora.datasets import load_dataset
12+
from pycleora.classify import mlp_classify
13+
from pycleora.algorithms import embed_prone, embed_randne
14+
15+
DIM = 256
16+
17+
def measure(fn, graph):
    """Execute fn(graph) under tracemalloc.

    Returns a (result, wall_seconds, peak_mib) triple, where peak_mib is the
    peak traced allocation in MiB. Collects garbage before and after so the
    peak reflects this call rather than leftovers from a previous run.
    """
    gc.collect()
    tracemalloc.start()
    start = time.time()
    out = fn(graph)
    duration = time.time() - start
    _, peak_bytes = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    gc.collect()
    return out, duration, peak_bytes / 1024 / 1024
27+
28+
DATASETS = ["facebook", "ppi_large", "flickr", "ogbn_arxiv", "yelp", "roadnet"]

# Results table goes to stdout; progress chatter goes to stderr so the table
# stays clean when stdout is redirected to a file.
print(f"{'Dataset':<16s} {'Algorithm':<20s} {'NC_Acc':>8s} {'NC_F1':>8s} {'Time':>8s} {'Mem_MB':>8s}")
print("=" * 80)

for ds_name in DATASETS:
    sys.stderr.write(f"\n--- Loading {ds_name} ---\n")
    sys.stderr.flush()

    ds = load_dataset(ds_name)
    graph = SparseMatrix.from_iterator(iter(ds["edges"]), ds["columns"])
    labels = ds.get("labels", {})
    if len(labels) < 4:
        # Too few ground-truth labels to classify against; fall back to
        # Louvain communities as pseudo-labels.
        labels = detect_communities_louvain(graph)

    has_labels = len(labels) >= 4
    n_nodes = ds["num_nodes"]
    sys.stderr.write(f" Nodes: {n_nodes}, Labels: {len(labels)}\n")
    sys.stderr.flush()

    configs = [
        ("Cleora(base,4it)", lambda g: embed(g, DIM, 4)),
        ("Cleora(w,16it)", lambda g: embed(g, DIM, 16, whiten=True)),
        ("ProNE", lambda g: embed_prone(g, DIM)),
        ("RandNE", lambda g: embed_randne(g, DIM)),
    ]

    for name, fn in configs:
        # BUG FIX: previously `del emb` ran unconditionally after the
        # try/except; if the very first config raised before `emb` was
        # bound, the cleanup itself raised NameError.
        emb = None
        try:
            emb, t, mem = measure(fn, graph)
            if has_labels:
                scores = node_classification_scores(graph, emb, labels, seed=42)
                acc = scores["accuracy"]
                f1 = scores["macro_f1"]
            else:
                acc = f1 = float('nan')
            print(f"{ds_name:<16s} {name:<20s} {acc:>8.4f} {f1:>8.4f} {t:>7.3f}s {mem:>8.1f}")
            sys.stdout.flush()
        except Exception as e:
            # Best-effort benchmark: report the failure row and keep going
            # with the remaining algorithms/datasets.
            print(f"{ds_name:<16s} {name:<20s} {'FAIL':>8s} {'':>8s} {'':>8s} {str(e)[:30]}")
            sys.stdout.flush()

        if emb is not None:
            del emb
        gc.collect()

    # Free the (potentially multi-GB) graph before loading the next dataset.
    del graph, labels
    gc.collect()

print("\n" + "=" * 80)
print("DONE")

pycleora/__init__.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def embed(
6262
callback: Optional[Callable[[int, np.ndarray], None]] = None,
6363
residual_weight: float = 0.0,
6464
convergence_threshold: float = 0.0,
65-
whiten: bool = False,
65+
whiten: bool = True,
6666
) -> np.ndarray:
6767
if isinstance(num_iterations, str):
6868
if num_iterations == "auto":
@@ -126,9 +126,20 @@ def embed(
126126

127127

128128
def whiten_embeddings(embeddings: np.ndarray, n_components: Optional[int] = None) -> np.ndarray:
129-
mean = embeddings.mean(axis=0)
130-
centered = embeddings - mean
131-
cov = np.cov(centered, rowvar=False)
129+
n, d = embeddings.shape
130+
if n <= 1:
131+
return embeddings.copy()
132+
chunk = 50000
133+
134+
mean = embeddings.mean(axis=0, dtype=np.float64)
135+
136+
cov = np.zeros((d, d), dtype=np.float64)
137+
for i in range(0, n, chunk):
138+
end = min(i + chunk, n)
139+
block = embeddings[i:end].astype(np.float64) - mean
140+
cov += block.T @ block
141+
cov *= 1.0 / (n - 1)
142+
132143
eigenvalues, eigenvectors = np.linalg.eigh(cov)
133144

134145
idx = np.argsort(eigenvalues)[::-1]
@@ -139,9 +150,16 @@ def whiten_embeddings(embeddings: np.ndarray, n_components: Optional[int] = None
139150
eigenvalues = eigenvalues[:n_components]
140151
eigenvectors = eigenvectors[:, :n_components]
141152

142-
inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(eigenvalues, 1e-10)))
143-
whitened = centered @ eigenvectors @ inv_sqrt
144-
return whitened.astype(np.float32)
153+
scale = 1.0 / np.sqrt(np.maximum(eigenvalues, 1e-10))
154+
transform = (eigenvectors * scale).astype(np.float32)
155+
mean_f32 = mean.astype(np.float32)
156+
157+
out = np.empty((n, transform.shape[1]), dtype=np.float32)
158+
for i in range(0, n, chunk):
159+
end = min(i + chunk, n)
160+
block = embeddings[i:end] - mean_f32
161+
np.dot(block, transform, out=out[i:end])
162+
return out
145163

146164

147165
def embed_with_node_features(
@@ -263,7 +281,7 @@ def embed_multiscale(
263281
normalization: str = "l2",
264282
seed: int = 0,
265283
num_workers: Optional[int] = None,
266-
whiten: bool = False,
284+
whiten: bool = True,
267285
) -> np.ndarray:
268286
propagate_fn = _get_propagate_fn(graph, propagation)
269287

replit.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,11 @@ cp ~/.pythonlibs/lib/python3.12/site-packages/pycleora/pycleora.cpython-312-x86_
129129
- `pycleora benchmark` - Run benchmarks
130130
- `pycleora similar` - Find similar entities
131131

132-
### High-Dimension Optimization
133-
- `embed(..., whiten=True)` — PCA whitening post-processing. Critical for high dims (512+). At 1024d on ego-Facebook: 0.964 acc (vs 0.355 without).
134-
- `embed(..., num_iterations="auto")` — Auto-selects iterations: 4 for dim≤256, 8 for dim≤512, 16 for dim>512.
135-
- `embed_multiscale(..., whiten=True)` — Each scale whitened before concatenation. 2×512 [8,16] whiten → 0.942 acc.
132+
### Whitening (Default: Enabled)
133+
- `embed()` and `embed_multiscale()` now default to `whiten=True`. Memory-efficient chunked implementation avoids OOM on large graphs.
134+
- Best config: `embed(graph, 256, num_iterations=16, whiten=True)` — achieves 0.932 (Facebook), 1.000 (PPI-large), 0.971 (Flickr), 0.994 (ogbn-arxiv).
135+
- Whitening now works on ALL datasets including Yelp (717K nodes, 1.5GB) and roadNet-CA (2M nodes, 4.1GB).
136+
- `whiten_embeddings()` uses chunked float64 covariance computation — peak memory ~2x embedding size instead of 3-4x.
136137

137138
## Architecture Notes
138139

scale_whiten_test.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Test whitening on Yelp and roadNet — previously OOM'd"""
2+
import numpy as np, time, tracemalloc, gc, sys
3+
from pycleora import SparseMatrix, embed
4+
from pycleora.datasets import load_dataset
5+
from pycleora.community import detect_communities_louvain
6+
from pycleora.metrics import node_classification_scores
7+
DIM = 256
8+
9+
def m(fn, g):
    """Time fn(g) with tracemalloc active.

    Returns (result, elapsed_seconds, peak_mib). Garbage is collected on
    either side of the measurement so the reported peak is attributable to
    this call only.
    """
    gc.collect()
    tracemalloc.start()
    begin = time.time()
    value = fn(g)
    elapsed = time.time() - begin
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    gc.collect()
    return value, elapsed, peak / 1024 / 1024
15+
16+
for ds_name in ["yelp", "roadnet"]:
    # Progress/timing lines go to stderr; accuracy results go to stdout.
    sys.stderr.write(f"\n=== {ds_name} ===\n")
    sys.stderr.flush()

    t0 = time.time()
    ds = load_dataset(ds_name)
    sys.stderr.write(f" load: {time.time()-t0:.1f}s\n")
    sys.stderr.flush()

    t0 = time.time()
    g = SparseMatrix.from_iterator(iter(ds["edges"]), ds["columns"])
    sys.stderr.write(f" graph: {time.time()-t0:.1f}s, nodes={g.num_entities}\n")
    sys.stderr.flush()

    t0 = time.time()
    labels = detect_communities_louvain(g)
    sys.stderr.write(f" louvain: {time.time()-t0:.1f}s, communities={len(set(labels.values()))}\n")
    sys.stderr.flush()

    # Baseline: 4 iterations, no whitening.
    emb, t, mem = m(lambda g: embed(g, DIM, 4), g)
    scores = node_classification_scores(g, emb, labels, seed=42)
    print(f"{ds_name:<10s} Cleora(base,4it) acc={scores['accuracy']:.4f} time={t:.3f}s mem={mem:.1f}MB")
    sys.stdout.flush()
    del emb
    gc.collect()

    # Whitened variant: 16 iterations (previously OOM'd on these datasets).
    sys.stderr.write(f" Running whiten(16it)...\n")
    sys.stderr.flush()
    emb, t, mem = m(lambda g: embed(g, DIM, 16, whiten=True), g)
    scores = node_classification_scores(g, emb, labels, seed=42)
    print(f"{ds_name:<10s} Cleora(w,16it) acc={scores['accuracy']:.4f} time={t:.3f}s mem={mem:.1f}MB")
    sys.stdout.flush()
    del emb
    gc.collect()

    # Drop the graph and labels before loading the next dataset.
    del g, labels
    gc.collect()

print("\nDONE - Whitening works on ALL datasets!")

test_scale_whiten.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Scale test: base vs whitened embeddings on the two largest datasets."""
import gc
import sys
import time
import tracemalloc

import numpy as np

from pycleora import SparseMatrix, embed
from pycleora.datasets import load_dataset

DIM = 256


def _run(graph, label, iterations, **embed_kwargs):
    """Embed once, print wall time and tracemalloc peak, then free the result.

    Extracted from four copy-pasted measurement stanzas; behavior and output
    format are unchanged.
    """
    gc.collect()
    tracemalloc.start()
    t0 = time.time()
    emb = embed(graph, DIM, iterations, **embed_kwargs)
    elapsed = time.time() - t0
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f" {label}: time={elapsed:.1f}s, mem={peak/1e6:.0f}MB", flush=True)
    del emb
    gc.collect()


for ds_name in ["yelp", "roadnet"]:
    print(f"\n=== {ds_name} ===", flush=True)
    ds = load_dataset(ds_name)
    print(f"Loaded: {ds['num_nodes']} nodes, {len(ds['edges'])} edges", flush=True)

    g = SparseMatrix.from_iterator(iter(ds['edges']), ds['columns'])
    print(f"Graph: {g.num_entities} entities", flush=True)

    _run(g, "Base(4it)", 4)
    _run(g, "Whiten(4it)", 4, whiten=True)
    _run(g, "Whiten(8it)", 8, whiten=True)
    _run(g, "Whiten(16it)", 16, whiten=True)

    # Release the graph before loading the next (multi-GB) dataset.
    del g
    gc.collect()

print("\nSUCCESS - All scale tests passed!", flush=True)

test_yelp_whiten.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""Smoke test: Yelp embedding with whitening completes without OOM."""
import gc
import sys
import time
import tracemalloc

import numpy as np

from pycleora import SparseMatrix, embed
from pycleora.datasets import load_dataset

DIM = 256


def _profiled_embed(graph, label, iterations, **embed_kwargs):
    """Embed once and report shape, wall time, and peak traced memory.

    Extracted from two copy-pasted measurement stanzas; output format is
    unchanged.
    """
    gc.collect()
    tracemalloc.start()
    t0 = time.time()
    emb = embed(graph, DIM, iterations, **embed_kwargs)
    elapsed = time.time() - t0
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f" {label}: shape={emb.shape}, time={elapsed:.1f}s, mem={peak/1e6:.0f}MB", flush=True)
    del emb
    gc.collect()


print("Loading yelp...", flush=True)
ds = load_dataset('yelp')
print(f"Loaded: {ds['num_nodes']} nodes, {len(ds['edges'])} edges", flush=True)

print("Building graph...", flush=True)
g = SparseMatrix.from_iterator(iter(ds['edges']), ds['columns'])
print(f"Graph: {g.num_entities} entities", flush=True)

print("Running base embed(4it)...", flush=True)
_profiled_embed(g, "Base(4it)", 4)

print("Running whiten embed(16it)...", flush=True)
_profiled_embed(g, "Whiten(16it)", 16, whiten=True)

print("SUCCESS - Yelp whitening completed without OOM!", flush=True)

website/static/benchmarks.js

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ const DATASETS = ['ego-Facebook', 'PPI-large', 'Flickr', 'ogbn-arxiv', 'Yelp'];
2626
const ALGORITHMS = ['Cleora (whiten)', 'Cleora', 'ProNE', 'RandNE', 'NetMF', 'DeepWalk'];
2727

2828
const SUMMARY_DATA = {
29-
'Cleora (whiten)': [0.932, 0.985, 0.502, 0.624, null],
29+
'Cleora (whiten)': [0.932, 1.000, 0.971, 0.994, null],
3030
'Cleora': [0.350, 0.025, 0.157, 0.038, 0.013],
3131
'ProNE': [0.019, 0.008, 0.142, 0.026, null],
3232
'RandNE': [0.120, 0.014, 0.153, 0.032, null],
@@ -46,21 +46,21 @@ const MLP_DATA = {
4646
const SPEED_DATA = {
4747
algorithms: ['Cleora', 'Cleora (whiten)', 'RandNE', 'ProNE', 'NetMF', 'DeepWalk'],
4848
facebook: [0.111, 0.430, 0.070, 0.264, 35.229, 50.093],
49-
ppi_large: [0.707, 1.702, 1.863, 7.286, null, null],
50-
flickr: [0.869, 2.218, 2.169, 10.732, null, null],
51-
ogbn_arxiv: [1.290, 3.623, 3.204, 15.725, null, null],
52-
yelp: [7.076, null, null, null, null, null],
53-
roadnet: [5.312, null, null, null, null, null],
49+
ppi_large: [0.707, 2.842, 1.863, 7.286, null, null],
50+
flickr: [0.869, 3.676, 2.169, 10.732, null, null],
51+
ogbn_arxiv: [1.290, 5.222, 3.204, 15.725, null, null],
52+
yelp: [7.076, 30.419, null, null, null, null],
53+
roadnet: [5.312, 31.500, null, null, null, null],
5454
};
5555

5656
const MEMORY_DATA = {
5757
algorithms: ['Cleora', 'Cleora (whiten)', 'RandNE', 'ProNE', 'DeepWalk', 'NetMF'],
58-
facebook: [3.9, 25.2, 39.8, 64.0, 540.8, 1047.4],
59-
ppi_large: [55.6, 335.2, 541.0, 875.8, null, null],
60-
flickr: [87.2, 524.5, 830.4, 1354.9, null, null],
61-
ogbn_arxiv: [165.4, 993.8, 1550.8, 2545.5, null, null],
62-
yelp: [700.0, null, null, null, null, null],
63-
roadnet: [1919.1, null, null, null, null, null],
58+
facebook: [3.9, 21.0, 39.8, 64.0, 540.8, 1047.4],
59+
ppi_large: [55.6, 251.5, 541.0, 875.8, null, null],
60+
flickr: [87.2, 338.7, 830.4, 1354.9, null, null],
61+
ogbn_arxiv: [165.4, 458.9, 1550.8, 2545.5, null, null],
62+
yelp: [700.0, 1499.0, null, null, null, null],
63+
roadnet: [1919.1, 4129.0, null, null, null, null],
6464
};
6565

6666
const SCATTER_DATA = {
@@ -73,19 +73,19 @@ const SCATTER_DATA = {
7373
'Cleora': { acc: 0.350, time: 0.111 },
7474
},
7575
'PPI-large': {
76-
'Cleora (whiten)': { acc: 0.985, time: 1.702 },
76+
'Cleora (whiten)': { acc: 1.000, time: 2.842 },
7777
'Cleora': { acc: 0.025, time: 0.707 },
7878
'ProNE': { acc: 0.008, time: 7.286 },
7979
'RandNE': { acc: 0.014, time: 1.863 },
8080
},
8181
'Flickr': {
82-
'Cleora (whiten)': { acc: 0.502, time: 2.218 },
82+
'Cleora (whiten)': { acc: 0.971, time: 3.676 },
8383
'Cleora': { acc: 0.157, time: 0.869 },
8484
'ProNE': { acc: 0.142, time: 10.732 },
8585
'RandNE': { acc: 0.153, time: 2.169 },
8686
},
8787
'ogbn-arxiv': {
88-
'Cleora (whiten)': { acc: 0.624, time: 3.623 },
88+
'Cleora (whiten)': { acc: 0.994, time: 5.222 },
8989
'Cleora': { acc: 0.038, time: 1.290 },
9090
'RandNE': { acc: 0.032, time: 3.204 },
9191
'ProNE': { acc: 0.026, time: 15.725 },
@@ -94,10 +94,10 @@ const SCATTER_DATA = {
9494

9595
const CV_DATA = {
9696
datasets: ['ego-Facebook', 'PPI-large', 'Flickr', 'ogbn-arxiv'],
97-
meanAccuracy: [0.931, 0.985, 0.507, 0.620],
98-
stdAccuracy: [0.017, 0.001, 0.006, 0.003],
99-
meanF1: [0.813, 0.985, 0.507, 0.620],
100-
stdF1: [0.025, 0.001, 0.006, 0.003],
97+
meanAccuracy: [0.939, 1.000, 0.972, 0.994],
98+
stdAccuracy: [0.009, 0.000, 0.001, 0.000],
99+
meanF1: [0.705, 1.000, 0.972, 0.994],
100+
stdF1: [0.040, 0.000, 0.001, 0.000],
101101
};
102102

103103
function chartDefaults() {

0 commit comments

Comments
 (0)