Skip to content

Commit 7d82e95

Browse files
joaopamaral and claude committed
fix: make kNN num_candidates configurable and improve default recall
- Add knnNumCandidatesMultiplier to the NaturalLanguageSearchConfiguration schema (default 2). Operators can tune recall vs. latency by adjusting this value in openmetadata.yaml under naturalLanguageSearch.
- Change buildNativeESQuery to accept a numCandidatesMultiplier parameter; keep the original five-argument signature as an overload that uses DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER (2).
- ElasticSearchVectorService.search() reads the multiplier from config via resolveNumCandidatesMultiplier(), falling back to the default.
- Remove the nestedTags path from appendFilterMustClauses: entity indices use flat tags objects, so a nested tags query is never correct here.
- Remove the now-dead appendNested/appendOneNestedQuery code.
- Update tests to match the new num_candidates = max(k * multiplier, 100).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent f81a21e commit 7d82e95

4 files changed

Lines changed: 49 additions & 53 deletions

File tree

openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
import lombok.Getter;
1919
import lombok.extern.slf4j.Slf4j;
2020
import org.openmetadata.schema.EntityInterface;
21+
import org.openmetadata.schema.service.configuration.elasticsearch.NaturalLanguageSearchConfiguration;
22+
import org.openmetadata.service.Entity;
2123
import org.openmetadata.service.events.lifecycle.EntityLifecycleEventDispatcher;
24+
import org.openmetadata.service.search.SearchRepository;
2225
import org.openmetadata.service.search.vector.client.EmbeddingClient;
2326
import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse;
2427

@@ -102,11 +105,12 @@ public VectorSearchResponse search(
102105
overFetchSize = Math.min(overFetchSize, k);
103106
}
104107

108+
int numCandidatesMultiplier = resolveNumCandidatesMultiplier();
105109
String indexName = getIndexAlias();
106110
while (!exhausted && byParent.size() < requestedParents) {
107111
String queryJson =
108112
VectorSearchQueryBuilder.buildNativeESQuery(
109-
queryVector, overFetchSize, rawOffset, k, filters);
113+
queryVector, overFetchSize, rawOffset, k, filters, numCandidatesMultiplier);
110114
String responseBody = executeGenericRequest("POST", "/" + indexName + "/_search", queryJson);
111115

112116
JsonNode root = MAPPER.readTree(responseBody);
@@ -300,6 +304,18 @@ public void partialUpdateEntity(
300304
}
301305
}
302306

307+
private static int resolveNumCandidatesMultiplier() {
308+
SearchRepository repo = Entity.getSearchRepository();
309+
if (repo == null) {
310+
return VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER;
311+
}
312+
NaturalLanguageSearchConfiguration cfg = repo.getSearchConfiguration().getNaturalLanguageSearch();
313+
if (cfg == null || cfg.getKnnNumCandidatesMultiplier() == null) {
314+
return VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER;
315+
}
316+
return cfg.getKnnNumCandidatesMultiplier();
317+
}
318+
303319
public void close() {
304320
try {
305321
if (client != null && client._transport() != null) {

openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java

Lines changed: 15 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ public class VectorSearchQueryBuilder {
1414
private static final Logger LOG = LoggerFactory.getLogger(VectorSearchQueryBuilder.class);
1515
private static final String ANY = "__ANY__";
1616
private static final String NONE = "__NONE__";
17+
static final int DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER = 2;
1718

1819
/** Build a full search request body (size + _source + query) for standalone vector search. */
1920
public static String build(
@@ -74,7 +75,17 @@ private static void appendKnnQuery(
7475

7576
public static String buildNativeESQuery(
7677
float[] vector, int size, int from, int k, Map<String, List<String>> filters) {
77-
int numCandidates = Math.max(k, 100);
78+
return buildNativeESQuery(vector, size, from, k, filters, DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER);
79+
}
80+
81+
public static String buildNativeESQuery(
82+
float[] vector,
83+
int size,
84+
int from,
85+
int k,
86+
Map<String, List<String>> filters,
87+
int numCandidatesMultiplier) {
88+
int numCandidates = Math.max(k * numCandidatesMultiplier, 100);
7889
StringBuilder sb =
7990
new StringBuilder(512)
8091
.append("{\"size\":")
@@ -92,24 +103,15 @@ public static String buildNativeESQuery(
92103
.append(numCandidates);
93104

94105
sb.append(",\"filter\":{\"bool\":{\"must\":[");
95-
appendFilterMustClauses(sb, filters, true);
106+
appendFilterMustClauses(sb, filters);
96107
sb.append("]}}"); // close must array and bool
97108

98109
sb.append("}}"); // close knn object
99110
return sb.toString();
100111
}
101112

102-
private static void appendFilterMustClauses(StringBuilder sb, Map<String, List<String>> filters) {
103-
appendFilterMustClauses(sb, filters, false);
104-
}
105-
106-
/**
107-
* @param nestedTags when true, emits a nested query for "tags" (required by the ES-native vector
108-
* index where tags is mapped as nested); when false, emits a flat terms query (used by
109-
* OpenSearch, which queries the regular entity indices where tags is a plain object).
110-
*/
111113
private static void appendFilterMustClauses(
112-
StringBuilder sb, Map<String, List<String>> filters, boolean nestedTags) {
114+
StringBuilder sb, Map<String, List<String>> filters) {
113115
sb.append("{\"term\":{\"deleted\":false}}");
114116
for (var e : filters.entrySet()) {
115117
String field = e.getKey();
@@ -126,11 +128,7 @@ private static void appendFilterMustClauses(
126128
}
127129
case "tags" -> {
128130
sb.append(',');
129-
if (nestedTags) {
130-
appendNested(sb, "tags", "tags.tagFQN", values);
131-
} else {
132-
appendFlat(sb, "tags.tagFQN", values);
133-
}
131+
appendFlat(sb, "tags.tagFQN", values);
134132
}
135133
case "domains" -> {
136134
sb.append(',');
@@ -170,35 +168,6 @@ private static void appendFilterMustClauses(
170168
}
171169
}
172170

173-
private static void appendNested(StringBuilder sb, String path, String field, List<String> vals) {
174-
sb.append("{\"nested\":{\"path\":\"").append(path).append("\",\"query\":");
175-
if (vals.size() == 1) {
176-
appendOneNestedQuery(sb, field, vals.get(0));
177-
} else {
178-
sb.append("{\"bool\":{\"should\":[");
179-
for (int i = 0; i < vals.size(); i++) {
180-
if (i > 0) sb.append(',');
181-
appendOneNestedQuery(sb, field, vals.get(i));
182-
}
183-
sb.append("]}}");
184-
}
185-
sb.append("}}");
186-
}
187-
188-
private static void appendOneNestedQuery(StringBuilder sb, String field, String val) {
189-
switch (val) {
190-
case ANY -> sb.append("{\"exists\":{\"field\":\"").append(field).append("\"}}");
191-
case NONE -> sb.append("{\"bool\":{\"must_not\":{\"exists\":{\"field\":\"")
192-
.append(field)
193-
.append("\"}}}}");
194-
default -> sb.append("{\"term\":{\"")
195-
.append(field)
196-
.append("\":\"")
197-
.append(escape(val))
198-
.append("\"}}");
199-
}
200-
}
201-
202171
private static void appendFlat(StringBuilder sb, String field, List<String> vals) {
203172
if (vals.size() == 1) {
204173
appendOneFlat(sb, field, vals.get(0));

openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -736,15 +736,20 @@ void testNativeESQueryTopLevelKnnStructure() throws Exception {
736736
void testNativeESQueryNumCandidates() throws Exception {
737737
float[] vector = {0.1f};
738738

739-
// k < 100 → num_candidates should be 100
740-
String query1 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 50, Map.of());
739+
// default multiplier (2): k * 2 < 100 → num_candidates should be 100
740+
String query1 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 30, Map.of());
741741
JsonNode root1 = MAPPER.readTree(query1);
742742
assertEquals(100, root1.get("knn").get("num_candidates").asInt());
743743

744-
// k > 100 → num_candidates should equal k
744+
// default multiplier (2): k * 2 > 100 → num_candidates should be k * 2
745745
String query2 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 200, Map.of());
746746
JsonNode root2 = MAPPER.readTree(query2);
747-
assertEquals(200, root2.get("knn").get("num_candidates").asInt());
747+
assertEquals(400, root2.get("knn").get("num_candidates").asInt());
748+
749+
// custom multiplier (5): num_candidates = max(k * 5, 100)
750+
String query3 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, Map.of(), 5);
751+
JsonNode root3 = MAPPER.readTree(query3);
752+
assertEquals(500, root3.get("knn").get("num_candidates").asInt());
748753
}
749754

750755
@Test
@@ -815,8 +820,8 @@ void testNativeESQueryWithTagsFilter() throws Exception {
815820

816821
assertEquals(2, mustFilters.size());
817822
JsonNode tagsFilter = mustFilters.get(1);
818-
assertTrue(tagsFilter.has("nested"));
819-
assertEquals("tags", tagsFilter.get("nested").get("path").asText());
823+
assertTrue(tagsFilter.has("term"));
824+
assertEquals("PII.Sensitive", tagsFilter.get("term").get("tags.tagFQN").asText());
820825
}
821826

822827
@Test

openmetadata-spec/src/main/resources/json/schema/configuration/elasticSearchConfiguration.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@
159159
"default": 10,
160160
"minimum": 1
161161
},
162+
"knnNumCandidatesMultiplier": {
163+
"description": "Multiplier applied to k when computing num_candidates for Elasticsearch kNN vector search. num_candidates = max(k * multiplier, 100). Higher values improve recall at the cost of latency. Defaults to 2.",
164+
"type": "integer",
165+
"default": 2,
166+
"minimum": 1
167+
},
162168
"providerClass": {
163169
"description": "Fully qualified class name of the NLQService implementation to use",
164170
"type": "string",

0 commit comments

Comments
 (0)