AOSSIE-Org · rohan-pandeyy · Jun 20, 2026 · Jun 20, 2026
diff --git a/backend/app/config/settings.py b/backend/app/config/settings.py
@@ -122,7 +122,7 @@ def _get_env_int(
     )
     PICTO_CLUSTERING_MIN_SAMPLES = 2
 PICTO_CLUSTERING_SIMILARITY_THRESHOLD = _get_env_float(
-    "PICTO_CLUSTERING_SIMILARITY_THRESHOLD", 0.85, min_value=0.0, max_value=1.0
+    "PICTO_CLUSTERING_SIMILARITY_THRESHOLD", 0.65, min_value=0.0, max_value=1.0
 )
 PICTO_CLUSTERING_MERGE_THRESHOLD = _get_env_float(
     "PICTO_CLUSTERING_MERGE_THRESHOLD", 0.7, min_value=0.0, max_value=1.0

diff --git a/backend/app/utils/face_clusters.py b/backend/app/utils/face_clusters.py
@@ -220,7 +220,7 @@ def cluster_util_cluster_all_face_embeddings(
     Args:
         eps: DBSCAN epsilon parameter for maximum distance between samples (default: 0.75)
         min_samples: DBSCAN minimum samples parameter for core points (default: 2)
-        similarity_threshold: Minimum similarity to consider same person (default: 0.85, range: 0.75-0.90)
+        similarity_threshold: Minimum similarity to consider same person (default: 0.65, range: 0.65-0.90)
         merge_threshold: Similarity threshold for post-clustering merge (default: None, uses similarity_threshold)
 
     Returns:
@@ -286,8 +286,18 @@ def cluster_util_cluster_all_face_embeddings(
 
     estimated_eps = estimate_eps(embeddings_array, k=min_samples)
     if estimated_eps is not None:
-        logger.info(f"Adaptive eps estimated: {estimated_eps:.4f}")
-        eps = estimated_eps
+        clamped_eps = min(estimated_eps, max_distance)
+        # DBSCAN requires eps to be strictly positive
+        clamped_eps = max(clamped_eps, 1e-6)
+        if clamped_eps < estimated_eps:
+            logger.warning(
+                f"Adaptive eps {estimated_eps:.4f} exceeded max_distance "
+                f"{max_distance:.4f} (similarity_threshold={similarity_threshold}); "
+                f"clamping to {clamped_eps:.4f}"
+            )
+        else:
+            logger.info(f"Adaptive eps estimated: {clamped_eps:.4f}")
+        eps = clamped_eps
     else:
         logger.warning(
             f"Too few embeddings for eps estimation, using config default: {eps}"

diff --git a/backend/tests/test_face_clusters.py b/backend/tests/test_face_clusters.py
@@ -551,8 +551,87 @@ def test_estimate_eps_fallback(self):
         # 2 elements
         assert estimate_eps(np.random.randn(2, 512), k=2) is None
 
+    @patch("app.utils.face_clusters.db_get_all_faces_with_cluster_names")
+    def test_adaptive_eps_clamping_regression(self, mock_db_get):
+        """Test 4: Two tight identities stay separate clusters amid orthogonal singletons"""
+        # Build 9 unit-norm embeddings (face_ids follow list order, 0-8):
+        # Identity A: 2 points (center_a plus 0.01-scaled Gaussian noise) -> ids 0, 1
+        # Identity B: 2 points (pt_b1 and a 0.01-noise copy of it) -> ids 2, 3
+        # 5 singleton points, orthogonalized against center_a, pt_b1 and each other
+        dim = 512
+        np.random.seed(42)  # fixed seed keeps the fixture deterministic
+
+        # Identity A: two noisy, re-normalized copies of a random unit vector
+        center_a = np.random.randn(dim)
+        center_a /= np.linalg.norm(center_a)
+        pt_a1 = center_a + np.random.randn(dim) * 0.01
+        pt_a1 /= np.linalg.norm(pt_a1)
+        pt_a2 = center_a + np.random.randn(dim) * 0.01
+        pt_a2 /= np.linalg.norm(pt_a2)
+
+        # Identity B (pt_b1 has its center_a component removed, so it is orthogonal to it)
+        pt_b1 = np.random.randn(dim)
+        pt_b1 -= np.dot(pt_b1, center_a) * center_a
+        pt_b1 /= np.linalg.norm(pt_b1)
+        pt_b2 = pt_b1 + np.random.randn(dim) * 0.01
+        pt_b2 /= np.linalg.norm(pt_b2)
+
+        # 5 singletons (components along center_a, pt_b1 and earlier singletons removed)
+        singletons = []
+        for _ in range(5):
+            vec = np.random.randn(dim)
+            vec -= np.dot(vec, center_a) * center_a
+            vec -= np.dot(vec, pt_b1) * pt_b1
+            for prev in singletons:
+                vec -= np.dot(vec, prev) * prev
+            vec /= np.linalg.norm(vec)
+            singletons.append(vec)
+
+        all_embeddings = [pt_a1, pt_a2, pt_b1, pt_b2] + singletons
+
+        # Stub the patched DB accessor: one row per embedding, face_id = list index
+        mock_db_get.return_value = [
+            {"face_id": i, "embeddings": emb, "cluster_name": None}
+            for i, emb in enumerate(all_embeddings)
+        ]
+
+        # Run clustering with similarity_threshold=0.85 -> max_distance = 0.15
+        results, _ = cluster_util_cluster_all_face_embeddings(
+            min_samples=2, similarity_threshold=0.85
+        )
+
+        # Group the returned face_ids by the cluster_uuid each result carries
+        clusters = {}
+        for r in results:
+            if r.cluster_uuid not in clusters:
+                clusters[r.cluster_uuid] = []
+            clusters[r.cluster_uuid].append(r.face_id)
+
+        cluster_a_uuid = None
+        cluster_b_uuid = None
+
+        for cluster_uuid, face_ids in clusters.items():
+            if 0 in face_ids:
+                cluster_a_uuid = cluster_uuid
+                assert 1 in face_ids, "Identity A faces should be grouped together"
+                assert all(
+                    f in [0, 1] for f in face_ids
+                ), f"Identity A cluster contains unexpected faces: {face_ids}"
+            elif 2 in face_ids:
+                cluster_b_uuid = cluster_uuid
+                assert 3 in face_ids, "Identity B faces should be grouped together"
+                assert all(
+                    f in [2, 3] for f in face_ids
+                ), f"Identity B cluster contains unexpected faces: {face_ids}"
+
+        assert cluster_a_uuid is not None, "Identity A was not clustered"
+        assert cluster_b_uuid is not None, "Identity B was not clustered"
+        assert (
+            cluster_a_uuid != cluster_b_uuid
+        ), "Identity A and Identity B should not be merged into the same cluster"
+
     def test_quality_gate(self):
-        """Test 4: Quality gate unit tests"""
+        """Test 5: Quality gate unit tests"""
         # A sharp, large face crop should pass
         # Random noise image has high variance (sharp)
         np.random.seed(42)