Skip to content

Commit 62cd905

Browse files
JacekDabrowski1
authored and committed
Update benchmarks and make whiten=True default
- Fixed OOM in whiten_embeddings with chunked float64 covariance
- Re-ran all benchmarks with 16 iterations: PPI=1.000, Flickr=0.971, ogbn-arxiv=0.994
- Updated benchmarks.js with new accuracy, speed, memory, scatter, CV data
- Updated benchmarks.html all tables: summary, detail, speed, memory, scatter, CV
- Updated index.html benchmark cards with new numbers
- Made whiten=True the default in embed() and embed_multiscale()
- Updated API docs to reflect new defaults
- Yelp and roadNet now show whitening works (speed/memory data)
1 parent b2242cc commit 62cd905

10 files changed

Lines changed: 330 additions & 95 deletions

File tree

full_benchmark.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Full benchmark: Cleora base vs Cleora(whiten) on ALL datasets"""
2+
import numpy as np
3+
import time
4+
import tracemalloc
5+
import sys
6+
import gc
7+
8+
from pycleora import SparseMatrix, embed
9+
from pycleora.metrics import node_classification_scores
10+
from pycleora.community import detect_communities_louvain
11+
from pycleora.datasets import load_dataset
12+
from pycleora.classify import mlp_classify
13+
from pycleora.algorithms import embed_prone, embed_randne
14+
15+
DIM = 256
16+
17+
def measure(fn, graph):
    """Execute fn(graph) under tracemalloc.

    Returns a (result, wall_seconds, peak_mib) triple, where peak_mib is the
    peak traced allocation in MiB. Collects garbage before and after so the
    peak reflects this call rather than leftovers from a previous run.
    """
    gc.collect()
    tracemalloc.start()
    start = time.time()
    out = fn(graph)
    duration = time.time() - start
    _, peak_bytes = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    gc.collect()
    return out, duration, peak_bytes / 1024 / 1024
27+
28+
DATASETS = ["facebook", "ppi_large", "flickr", "ogbn_arxiv", "yelp", "roadnet"]

# Results table goes to stdout; progress chatter goes to stderr so the table
# stays clean when stdout is redirected to a file.
print(f"{'Dataset':<16s} {'Algorithm':<20s} {'NC_Acc':>8s} {'NC_F1':>8s} {'Time':>8s} {'Mem_MB':>8s}")
print("=" * 80)

for ds_name in DATASETS:
    sys.stderr.write(f"\n--- Loading {ds_name} ---\n")
    sys.stderr.flush()

    ds = load_dataset(ds_name)
    graph = SparseMatrix.from_iterator(iter(ds["edges"]), ds["columns"])
    labels = ds.get("labels", {})
    if len(labels) < 4:
        # Too few ground-truth labels to classify against; fall back to
        # Louvain communities as pseudo-labels.
        labels = detect_communities_louvain(graph)

    has_labels = len(labels) >= 4
    n_nodes = ds["num_nodes"]
    sys.stderr.write(f" Nodes: {n_nodes}, Labels: {len(labels)}\n")
    sys.stderr.flush()

    configs = [
        ("Cleora(base,4it)", lambda g: embed(g, DIM, 4)),
        ("Cleora(w,16it)", lambda g: embed(g, DIM, 16, whiten=True)),
        ("ProNE", lambda g: embed_prone(g, DIM)),
        ("RandNE", lambda g: embed_randne(g, DIM)),
    ]

    for name, fn in configs:
        # BUG FIX: previously `del emb` ran unconditionally after the
        # try/except; if the very first config raised before `emb` was
        # bound, the cleanup itself raised NameError.
        emb = None
        try:
            emb, t, mem = measure(fn, graph)
            if has_labels:
                scores = node_classification_scores(graph, emb, labels, seed=42)
                acc = scores["accuracy"]
                f1 = scores["macro_f1"]
            else:
                acc = f1 = float('nan')
            print(f"{ds_name:<16s} {name:<20s} {acc:>8.4f} {f1:>8.4f} {t:>7.3f}s {mem:>8.1f}")
            sys.stdout.flush()
        except Exception as e:
            # Best-effort benchmark: report the failure row and keep going
            # with the remaining algorithms/datasets.
            print(f"{ds_name:<16s} {name:<20s} {'FAIL':>8s} {'':>8s} {'':>8s} {str(e)[:30]}")
            sys.stdout.flush()

        if emb is not None:
            del emb
        gc.collect()

    # Free the (potentially multi-GB) graph before loading the next dataset.
    del graph, labels
    gc.collect()

print("\n" + "=" * 80)
print("DONE")

pycleora/__init__.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def embed(
6262
callback: Optional[Callable[[int, np.ndarray], None]] = None,
6363
residual_weight: float = 0.0,
6464
convergence_threshold: float = 0.0,
65-
whiten: bool = False,
65+
whiten: bool = True,
6666
) -> np.ndarray:
6767
if isinstance(num_iterations, str):
6868
if num_iterations == "auto":
@@ -126,9 +126,20 @@ def embed(
126126

127127

128128
def whiten_embeddings(embeddings: np.ndarray, n_components: Optional[int] = None) -> np.ndarray:
129-
mean = embeddings.mean(axis=0)
130-
centered = embeddings - mean
131-
cov = np.cov(centered, rowvar=False)
129+
n, d = embeddings.shape
130+
if n <= 1:
131+
return embeddings.copy()
132+
chunk = 50000
133+
134+
mean = embeddings.mean(axis=0, dtype=np.float64)
135+
136+
cov = np.zeros((d, d), dtype=np.float64)
137+
for i in range(0, n, chunk):
138+
end = min(i + chunk, n)
139+
block = embeddings[i:end].astype(np.float64) - mean
140+
cov += block.T @ block
141+
cov *= 1.0 / (n - 1)
142+
132143
eigenvalues, eigenvectors = np.linalg.eigh(cov)
133144

134145
idx = np.argsort(eigenvalues)[::-1]
@@ -139,9 +150,16 @@ def whiten_embeddings(embeddings: np.ndarray, n_components: Optional[int] = None
139150
eigenvalues = eigenvalues[:n_components]
140151
eigenvectors = eigenvectors[:, :n_components]
141152

142-
inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(eigenvalues, 1e-10)))
143-
whitened = centered @ eigenvectors @ inv_sqrt
144-
return whitened.astype(np.float32)
153+
scale = 1.0 / np.sqrt(np.maximum(eigenvalues, 1e-10))
154+
transform = (eigenvectors * scale).astype(np.float32)
155+
mean_f32 = mean.astype(np.float32)
156+
157+
out = np.empty((n, transform.shape[1]), dtype=np.float32)
158+
for i in range(0, n, chunk):
159+
end = min(i + chunk, n)
160+
block = embeddings[i:end] - mean_f32
161+
np.dot(block, transform, out=out[i:end])
162+
return out
145163

146164

147165
def embed_with_node_features(
@@ -263,7 +281,7 @@ def embed_multiscale(
263281
normalization: str = "l2",
264282
seed: int = 0,
265283
num_workers: Optional[int] = None,
266-
whiten: bool = False,
284+
whiten: bool = True,
267285
) -> np.ndarray:
268286
propagate_fn = _get_propagate_fn(graph, propagation)
269287

replit.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,11 @@ cp ~/.pythonlibs/lib/python3.12/site-packages/pycleora/pycleora.cpython-312-x86_
129129
- `pycleora benchmark` - Run benchmarks
130130
- `pycleora similar` - Find similar entities
131131

132-
### High-Dimension Optimization
133-
- `embed(..., whiten=True)` — PCA whitening post-processing. Critical for high dims (512+). At 1024d on ego-Facebook: 0.964 acc (vs 0.355 without).
134-
- `embed(..., num_iterations="auto")` — Auto-selects iterations: 4 for dim≤256, 8 for dim≤512, 16 for dim>512.
135-
- `embed_multiscale(..., whiten=True)` — Each scale whitened before concatenation. 2×512 [8,16] whiten → 0.942 acc.
132+
### Whitening (Default: Enabled)
133+
- `embed()` and `embed_multiscale()` now default to `whiten=True`. Memory-efficient chunked implementation avoids OOM on large graphs.
134+
- Best config: `embed(graph, 256, num_iterations=16, whiten=True)` — achieves 0.932 (Facebook), 1.000 (PPI-large), 0.971 (Flickr), 0.994 (ogbn-arxiv).
135+
- Whitening now works on ALL datasets including Yelp (717K nodes, 1.5GB) and roadNet-CA (2M nodes, 4.1GB).
136+
- `whiten_embeddings()` uses chunked float64 covariance computation — peak memory ~2x embedding size instead of 3-4x.
136137

137138
## Architecture Notes
138139

scale_whiten_test.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Test whitening on Yelp and roadNet — previously OOM'd"""
2+
import numpy as np, time, tracemalloc, gc, sys
3+
from pycleora import SparseMatrix, embed
4+
from pycleora.datasets import load_dataset
5+
from pycleora.community import detect_communities_louvain
6+
from pycleora.metrics import node_classification_scores
7+
DIM = 256
8+
9+
def m(fn, g):
    """Time fn(g) with tracemalloc active.

    Returns (result, elapsed_seconds, peak_mib). Garbage is collected on
    either side of the measurement so the reported peak is attributable to
    this call only.
    """
    gc.collect()
    tracemalloc.start()
    begin = time.time()
    value = fn(g)
    elapsed = time.time() - begin
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    gc.collect()
    return value, elapsed, peak / 1024 / 1024
15+
16+
for ds_name in ["yelp", "roadnet"]:
    # Progress/timing lines go to stderr; accuracy results go to stdout.
    sys.stderr.write(f"\n=== {ds_name} ===\n")
    sys.stderr.flush()

    t0 = time.time()
    ds = load_dataset(ds_name)
    sys.stderr.write(f" load: {time.time()-t0:.1f}s\n")
    sys.stderr.flush()

    t0 = time.time()
    g = SparseMatrix.from_iterator(iter(ds["edges"]), ds["columns"])
    sys.stderr.write(f" graph: {time.time()-t0:.1f}s, nodes={g.num_entities}\n")
    sys.stderr.flush()

    t0 = time.time()
    labels = detect_communities_louvain(g)
    sys.stderr.write(f" louvain: {time.time()-t0:.1f}s, communities={len(set(labels.values()))}\n")
    sys.stderr.flush()

    # Baseline: 4 iterations, no whitening.
    emb, t, mem = m(lambda g: embed(g, DIM, 4), g)
    scores = node_classification_scores(g, emb, labels, seed=42)
    print(f"{ds_name:<10s} Cleora(base,4it) acc={scores['accuracy']:.4f} time={t:.3f}s mem={mem:.1f}MB")
    sys.stdout.flush()
    del emb
    gc.collect()

    # Whitened variant: 16 iterations (previously OOM'd on these datasets).
    sys.stderr.write(f" Running whiten(16it)...\n")
    sys.stderr.flush()
    emb, t, mem = m(lambda g: embed(g, DIM, 16, whiten=True), g)
    scores = node_classification_scores(g, emb, labels, seed=42)
    print(f"{ds_name:<10s} Cleora(w,16it) acc={scores['accuracy']:.4f} time={t:.3f}s mem={mem:.1f}MB")
    sys.stdout.flush()
    del emb
    gc.collect()

    # Drop the graph and labels before loading the next dataset.
    del g, labels
    gc.collect()

print("\nDONE - Whitening works on ALL datasets!")

test_scale_whiten.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Scale test: base vs whitened embeddings on the two largest datasets."""
import gc
import sys
import time
import tracemalloc

import numpy as np

from pycleora import SparseMatrix, embed
from pycleora.datasets import load_dataset

DIM = 256


def _run(graph, label, iterations, **embed_kwargs):
    """Embed once, print wall time and tracemalloc peak, then free the result.

    Extracted from four copy-pasted measurement stanzas; behavior and output
    format are unchanged.
    """
    gc.collect()
    tracemalloc.start()
    t0 = time.time()
    emb = embed(graph, DIM, iterations, **embed_kwargs)
    elapsed = time.time() - t0
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f" {label}: time={elapsed:.1f}s, mem={peak/1e6:.0f}MB", flush=True)
    del emb
    gc.collect()


for ds_name in ["yelp", "roadnet"]:
    print(f"\n=== {ds_name} ===", flush=True)
    ds = load_dataset(ds_name)
    print(f"Loaded: {ds['num_nodes']} nodes, {len(ds['edges'])} edges", flush=True)

    g = SparseMatrix.from_iterator(iter(ds['edges']), ds['columns'])
    print(f"Graph: {g.num_entities} entities", flush=True)

    _run(g, "Base(4it)", 4)
    _run(g, "Whiten(4it)", 4, whiten=True)
    _run(g, "Whiten(8it)", 8, whiten=True)
    _run(g, "Whiten(16it)", 16, whiten=True)

    # Release the graph before loading the next (multi-GB) dataset.
    del g
    gc.collect()

print("\nSUCCESS - All scale tests passed!", flush=True)

test_yelp_whiten.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""Smoke test: Yelp embedding with whitening completes without OOM."""
import gc
import sys
import time
import tracemalloc

import numpy as np

from pycleora import SparseMatrix, embed
from pycleora.datasets import load_dataset

DIM = 256


def _profiled_embed(graph, label, iterations, **embed_kwargs):
    """Embed once and report shape, wall time, and peak traced memory.

    Extracted from two copy-pasted measurement stanzas; output format is
    unchanged.
    """
    gc.collect()
    tracemalloc.start()
    t0 = time.time()
    emb = embed(graph, DIM, iterations, **embed_kwargs)
    elapsed = time.time() - t0
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f" {label}: shape={emb.shape}, time={elapsed:.1f}s, mem={peak/1e6:.0f}MB", flush=True)
    del emb
    gc.collect()


print("Loading yelp...", flush=True)
ds = load_dataset('yelp')
print(f"Loaded: {ds['num_nodes']} nodes, {len(ds['edges'])} edges", flush=True)

print("Building graph...", flush=True)
g = SparseMatrix.from_iterator(iter(ds['edges']), ds['columns'])
print(f"Graph: {g.num_entities} entities", flush=True)

print("Running base embed(4it)...", flush=True)
_profiled_embed(g, "Base(4it)", 4)

print("Running whiten embed(16it)...", flush=True)
_profiled_embed(g, "Whiten(16it)", 16, whiten=True)

print("SUCCESS - Yelp whitening completed without OOM!", flush=True)

website/static/benchmarks.js

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ const DATASETS = ['ego-Facebook', 'PPI-large', 'Flickr', 'ogbn-arxiv', 'Yelp'];
2626
const ALGORITHMS = ['Cleora (whiten)', 'Cleora', 'ProNE', 'RandNE', 'NetMF', 'DeepWalk'];
2727

2828
const SUMMARY_DATA = {
29-
'Cleora (whiten)': [0.932, 0.985, 0.502, 0.624, null],
29+
'Cleora (whiten)': [0.932, 1.000, 0.971, 0.994, null],
3030
'Cleora': [0.350, 0.025, 0.157, 0.038, 0.013],
3131
'ProNE': [0.019, 0.008, 0.142, 0.026, null],
3232
'RandNE': [0.120, 0.014, 0.153, 0.032, null],
@@ -46,21 +46,21 @@ const MLP_DATA = {
4646
const SPEED_DATA = {
4747
algorithms: ['Cleora', 'Cleora (whiten)', 'RandNE', 'ProNE', 'NetMF', 'DeepWalk'],
4848
facebook: [0.111, 0.430, 0.070, 0.264, 35.229, 50.093],
49-
ppi_large: [0.707, 1.702, 1.863, 7.286, null, null],
50-
flickr: [0.869, 2.218, 2.169, 10.732, null, null],
51-
ogbn_arxiv: [1.290, 3.623, 3.204, 15.725, null, null],
52-
yelp: [7.076, null, null, null, null, null],
53-
roadnet: [5.312, null, null, null, null, null],
49+
ppi_large: [0.707, 2.842, 1.863, 7.286, null, null],
50+
flickr: [0.869, 3.676, 2.169, 10.732, null, null],
51+
ogbn_arxiv: [1.290, 5.222, 3.204, 15.725, null, null],
52+
yelp: [7.076, 30.419, null, null, null, null],
53+
roadnet: [5.312, 31.500, null, null, null, null],
5454
};
5555

5656
const MEMORY_DATA = {
5757
algorithms: ['Cleora', 'Cleora (whiten)', 'RandNE', 'ProNE', 'DeepWalk', 'NetMF'],
58-
facebook: [3.9, 25.2, 39.8, 64.0, 540.8, 1047.4],
59-
ppi_large: [55.6, 335.2, 541.0, 875.8, null, null],
60-
flickr: [87.2, 524.5, 830.4, 1354.9, null, null],
61-
ogbn_arxiv: [165.4, 993.8, 1550.8, 2545.5, null, null],
62-
yelp: [700.0, null, null, null, null, null],
63-
roadnet: [1919.1, null, null, null, null, null],
58+
facebook: [3.9, 21.0, 39.8, 64.0, 540.8, 1047.4],
59+
ppi_large: [55.6, 251.5, 541.0, 875.8, null, null],
60+
flickr: [87.2, 338.7, 830.4, 1354.9, null, null],
61+
ogbn_arxiv: [165.4, 458.9, 1550.8, 2545.5, null, null],
62+
yelp: [700.0, 1499.0, null, null, null, null],
63+
roadnet: [1919.1, 4129.0, null, null, null, null],
6464
};
6565

6666
const SCATTER_DATA = {
@@ -73,19 +73,19 @@ const SCATTER_DATA = {
7373
'Cleora': { acc: 0.350, time: 0.111 },
7474
},
7575
'PPI-large': {
76-
'Cleora (whiten)': { acc: 0.985, time: 1.702 },
76+
'Cleora (whiten)': { acc: 1.000, time: 2.842 },
7777
'Cleora': { acc: 0.025, time: 0.707 },
7878
'ProNE': { acc: 0.008, time: 7.286 },
7979
'RandNE': { acc: 0.014, time: 1.863 },
8080
},
8181
'Flickr': {
82-
'Cleora (whiten)': { acc: 0.502, time: 2.218 },
82+
'Cleora (whiten)': { acc: 0.971, time: 3.676 },
8383
'Cleora': { acc: 0.157, time: 0.869 },
8484
'ProNE': { acc: 0.142, time: 10.732 },
8585
'RandNE': { acc: 0.153, time: 2.169 },
8686
},
8787
'ogbn-arxiv': {
88-
'Cleora (whiten)': { acc: 0.624, time: 3.623 },
88+
'Cleora (whiten)': { acc: 0.994, time: 5.222 },
8989
'Cleora': { acc: 0.038, time: 1.290 },
9090
'RandNE': { acc: 0.032, time: 3.204 },
9191
'ProNE': { acc: 0.026, time: 15.725 },
@@ -94,10 +94,10 @@ const SCATTER_DATA = {
9494

9595
const CV_DATA = {
9696
datasets: ['ego-Facebook', 'PPI-large', 'Flickr', 'ogbn-arxiv'],
97-
meanAccuracy: [0.931, 0.985, 0.507, 0.620],
98-
stdAccuracy: [0.017, 0.001, 0.006, 0.003],
99-
meanF1: [0.813, 0.985, 0.507, 0.620],
100-
stdF1: [0.025, 0.001, 0.006, 0.003],
97+
meanAccuracy: [0.939, 1.000, 0.972, 0.994],
98+
stdAccuracy: [0.009, 0.000, 0.001, 0.000],
99+
meanF1: [0.705, 1.000, 0.972, 0.994],
100+
stdF1: [0.040, 0.000, 0.001, 0.000],
101101
};
102102

103103
function chartDefaults() {

0 commit comments

Comments
 (0)