Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/app/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def _get_env_int(
)
PICTO_CLUSTERING_MIN_SAMPLES = 2
PICTO_CLUSTERING_SIMILARITY_THRESHOLD = _get_env_float(
"PICTO_CLUSTERING_SIMILARITY_THRESHOLD", 0.85, min_value=0.0, max_value=1.0
"PICTO_CLUSTERING_SIMILARITY_THRESHOLD", 0.65, min_value=0.0, max_value=1.0
Comment thread
rohan-pandeyy marked this conversation as resolved.
)
PICTO_CLUSTERING_MERGE_THRESHOLD = _get_env_float(
"PICTO_CLUSTERING_MERGE_THRESHOLD", 0.7, min_value=0.0, max_value=1.0
Expand Down
16 changes: 13 additions & 3 deletions backend/app/utils/face_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def cluster_util_cluster_all_face_embeddings(
Args:
eps: DBSCAN epsilon parameter for maximum distance between samples (default: 0.75)
min_samples: DBSCAN minimum samples parameter for core points (default: 2)
similarity_threshold: Minimum similarity to consider same person (default: 0.85, range: 0.75-0.90)
similarity_threshold: Minimum similarity to consider same person (default: 0.65, range: 0.65-0.90)
merge_threshold: Similarity threshold for post-clustering merge (default: None, uses similarity_threshold)

Returns:
Expand Down Expand Up @@ -286,8 +286,18 @@ def cluster_util_cluster_all_face_embeddings(

estimated_eps = estimate_eps(embeddings_array, k=min_samples)
if estimated_eps is not None:
logger.info(f"Adaptive eps estimated: {estimated_eps:.4f}")
eps = estimated_eps
clamped_eps = min(estimated_eps, max_distance)
# DBSCAN requires eps to be strictly positive
clamped_eps = max(clamped_eps, 1e-6)
if clamped_eps < estimated_eps:
logger.warning(
f"Adaptive eps {estimated_eps:.4f} exceeded max_distance "
f"{max_distance:.4f} (similarity_threshold={similarity_threshold}); "
f"clamping to {clamped_eps:.4f}"
)
else:
logger.info(f"Adaptive eps estimated: {clamped_eps:.4f}")
eps = clamped_eps
Comment thread
coderabbitai[bot] marked this conversation as resolved.
else:
logger.warning(
f"Too few embeddings for eps estimation, using config default: {eps}"
Expand Down
81 changes: 80 additions & 1 deletion backend/tests/test_face_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,8 +551,87 @@ def test_estimate_eps_fallback(self):
# 2 elements
assert estimate_eps(np.random.randn(2, 512), k=2) is None

@patch("app.utils.face_clusters.db_get_all_faces_with_cluster_names")
def test_adaptive_eps_clamping_regression(self, mock_db_get):
    """Test 4: Adaptive eps clamping under sparse datasets with singletons

    Nine unit-length embeddings are generated: two tight pairs (identity A
    and identity B) and five singletons that are orthogonalised against
    both identities and against each other. They are served through the
    mocked database call, and the clustering result must put each pair in
    a cluster of its own, containing nothing else.
    """
    dim = 512
    # The seed pins the whole random stream, so the order of the draws
    # below is part of the fixture and must not be rearranged.
    np.random.seed(42)

    def unit(vector):
        # Rescale a vector to unit length.
        return vector / np.linalg.norm(vector)

    def jitter(anchor):
        # A renormalised copy of ``anchor`` with small Gaussian noise added.
        return unit(anchor + np.random.randn(dim) * 0.01)

    # Identity A: two points scattered tightly around a common centre.
    center_a = unit(np.random.randn(dim))
    a_first = jitter(center_a)
    a_second = jitter(center_a)

    # Identity B: first point made orthogonal to A's centre, second point
    # a tight perturbation of the first.
    b_first = np.random.randn(dim)
    b_first = unit(b_first - np.dot(b_first, center_a) * center_a)
    b_second = jitter(b_first)

    # Five singletons: each one has its component along A's centre, B's
    # first point and every earlier singleton removed, in that order.
    singletons = []
    for _ in range(5):
        candidate = np.random.randn(dim)
        for basis in (center_a, b_first, *singletons):
            candidate = candidate - np.dot(candidate, basis) * basis
        singletons.append(unit(candidate))

    all_embeddings = [a_first, a_second, b_first, b_second, *singletons]

    # Face ids are simply the positions in ``all_embeddings``:
    # 0-1 -> identity A, 2-3 -> identity B, 4-8 -> singletons.
    mock_db_get.return_value = [
        {"face_id": face_id, "embeddings": embedding, "cluster_name": None}
        for face_id, embedding in enumerate(all_embeddings)
    ]

    # NOTE(review): the original comment states similarity_threshold=0.85
    # corresponds to max_distance = 0.15 -- confirm against the
    # implementation.
    results, _ = cluster_util_cluster_all_face_embeddings(
        min_samples=2, similarity_threshold=0.85
    )

    # Bucket the returned face ids by the cluster UUID they were assigned.
    members_by_cluster = {}
    for result in results:
        members_by_cluster.setdefault(result.cluster_uuid, []).append(
            result.face_id
        )

    cluster_a_uuid = None
    cluster_b_uuid = None

    for cluster_uuid, face_ids in members_by_cluster.items():
        if 0 in face_ids:
            cluster_a_uuid = cluster_uuid
            assert 1 in face_ids, "Identity A faces should be grouped together"
            assert all(
                face_id in (0, 1) for face_id in face_ids
            ), f"Identity A cluster contains unexpected faces: {face_ids}"
        elif 2 in face_ids:
            cluster_b_uuid = cluster_uuid
            assert 3 in face_ids, "Identity B faces should be grouped together"
            assert all(
                face_id in (2, 3) for face_id in face_ids
            ), f"Identity B cluster contains unexpected faces: {face_ids}"

    assert cluster_a_uuid is not None, "Identity A was not clustered"
    assert cluster_b_uuid is not None, "Identity B was not clustered"
    assert (
        cluster_a_uuid != cluster_b_uuid
    ), "Identity A and Identity B should not be merged into the same cluster"

def test_quality_gate(self):
"""Test 4: Quality gate unit tests"""
"""Test 5: Quality gate unit tests"""
# A sharp, large face crop should pass
# Random noise image has high variance (sharp)
np.random.seed(42)
Expand Down
Loading
Loading