Skip to content

Commit b5c3916

Browse files
sonika-shahCopilot
andcommitted
Remove fuzzy match on ngram; merge SearchUtils into single class; add more test coverage (#27636)
* Remove fuzzy match on ngram; merge SearchUtils into single class; add more test coverage * Update openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/SearchResourceIT.java Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Fix tests * test(search): bump indexing-wait timeouts from 30s to 90s CI was timing out in the Awaitility loops that wait for newly-created tables to appear in the search index. Indexing is async via change events and can take noticeably longer under CI load than locally. 30s gave no margin; 90s is 3x cushion without slowing the happy path. * test(search): use distinctive xqz prefix and bump matrix size to 50 CI was failing on three short-prefix matrix scenarios that queried the seeded table's unique tag. The tag was pure hex from uniqueShortId(), which shares ngrams with every UUID/hash in a busy CI index — our table got pushed out of the top-15 hits by ngram-overlap noise from other tests. Two fixes: - Prefix the tag with "xqz", a trigraph rare in any real document. Now the first sub-token is uniquely ours regardless of index pollution. - Bump matrix size from 15 to 50. The matrix tests retrievability, not top-N ranking — testExactFullNameRanksSeededTableFirst already pins the production-UI ranking concern at size=10. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent be355e5 commit b5c3916

10 files changed

Lines changed: 1091 additions & 174 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ ingestion/.nox/
159159
_bmad/
160160

161161
# Claude Flow generated files
162-
.claude/settings.local.json
162+
.claude/*
163163
.mcp.json
164164
claude-flow.config.json
165165
.swarm/

openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/SearchResourceIT.java

Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,317 @@ void testTopicWithManySchemaFieldsDoesNotCauseClauseExplosion(TestNamespace ns)
358358
});
359359
}
360360

361+
/**
362+
* Matrix test that reproduces the {@code dataAsset}-alias regression and pins the behavior of
363+
* any fix across the query shapes users actually type.
364+
*
365+
* <p>The bug: composite config merges fuzzy fields from every asset type. The {@code name.ngram}
366+
* analyzer splits on non-alphanumeric characters, so a long multi-segment identifier yields many
367+
* sub-tokens that each expand into many ngrams, and each ngram becomes a fuzzy term (fuzziness=1,
368+
* maxExpansions=10). Clause count crosses Lucene's 1024 limit; in ES 7/OS only the table shards
369+
* overflow (silent drop); in ES 9 the whole query is rejected.
370+
*
371+
* <p>Every scenario must satisfy {@code _shards.failed == 0}. The {@code shouldFind} column pins
372+
* whether the seeded table is expected in {@code hits.hits}. Failures from every row are
373+
* collected and reported together rather than short-circuiting on the first one, so a single
374+
* run surfaces the whole regression surface.
375+
*/
376+
@Test
377+
void testDataAssetAliasSearchMatrix(TestNamespace ns) throws Exception {
378+
OpenMetadataClient client = SdkClients.adminClient();
379+
380+
// Use a production-realistic name length (~40 chars, 5-6 alnum sub-tokens) by bypassing
381+
// ns.prefix() — that helper appends RUN_ID + classId + methodId which balloons the name
382+
// to ~127 chars, and the sheer ngram cardinality of that long string exceeds
383+
// OpenSearch's 1024 max_clause_count even with fuzziness=0 + max_expansions=1.
384+
// Production names like kochi__expected_vessels__portcall_v1 are ~36 chars, which is
385+
// the length we want to pin behavior against.
386+
// Prefix the unique tag with a distinctive "xqz" marker. uniqueShortId() returns hex,
387+
// and pure-hex prefixes share ngrams with every UUID/hash in a busy CI index, which can
388+
// push our seeded table out of the top-N hits. "xqz" is rare in any real document and
389+
// makes the first sub-token uniquely ours.
390+
String uniq = "xqz" + ns.uniqueShortId().substring(0, 5);
391+
String longName = uniq + "_lhr__incoming_flights__arrivals_schedule_v1";
392+
Table table =
393+
createTestTableWithColumns(
394+
ns,
395+
longName,
396+
List.of(
397+
new Column().withName("id").withDataType(ColumnDataType.BIGINT),
398+
new Column()
399+
.withName("name")
400+
.withDataType(ColumnDataType.VARCHAR)
401+
.withDataLength(255)));
402+
String indexedName = table.getName();
403+
404+
// Wait for the table to appear in the table-only index using a real search call.
405+
// Query by the first alphanumeric segment of the indexed name — it's short (3-5 chars,
406+
// one alnum sub-token), so it won't itself trigger the clause-explosion path we're
407+
// about to stress in the matrix below. We still verify the specific seeded table is the
408+
// hit, so accidental matches on other docs with "lhr" in their name don't fool us.
409+
String waitQuery = indexedName.split("_+")[0];
410+
// 90s timeout: search indexing is async via change events and can lag noticeably under
411+
// CI load, especially the first time the index is warmed in a fresh test container.
412+
Awaitility.await()
413+
.atMost(90, TimeUnit.SECONDS)
414+
.pollInterval(500, TimeUnit.MILLISECONDS)
415+
.until(
416+
() -> {
417+
String r =
418+
client.search().query(waitQuery).index("table_search_index").size(25).execute();
419+
JsonNode root = OBJECT_MAPPER.readTree(r);
420+
for (JsonNode hit : root.path("hits").path("hits")) {
421+
if (indexedName.equals(hit.path("_source").path("name").asText())) {
422+
return true;
423+
}
424+
}
425+
return false;
426+
});
427+
428+
// Derive substrings from the seeded name. shouldFind reflects realistic user expectations
429+
// given that `name` has fuzziness via FUZZY_FIELDS and `name.ngram` handles substrings.
430+
String firstSegment = indexedName.split("_+")[0]; // the 8-char unique tag
431+
int midLen = Math.min(15, indexedName.length());
432+
String shortPrefix = indexedName.substring(0, Math.min(5, indexedName.length()));
433+
String midPrefix = indexedName.substring(0, midLen);
434+
String fullWithDots = indexedName.replace("_", ".");
435+
String typoInSegment = indexedName.replaceFirst("incoming", "incaming"); // 1-char typo
436+
String dropOneSegment = indexedName.replaceFirst("__arrivals_schedule_v1", "_v1");
437+
String trailingSegment = "schedule_v1";
438+
String middleSegment = "flights";
439+
440+
String firstTwoSegments = "lhr__incoming"; // exactly 2 alnum sub-tokens (boundary case)
441+
String firstThreeSegments = "lhr__incoming_flights"; // exactly 3 — first to trip fuzz=0
442+
String mixedSeparators = indexedName.replace("__", "-").replace("_", ".");
443+
String withTrailingWhitespace = " " + indexedName + " ";
444+
String withInternalWhitespace = indexedName.replace("__", " ");
445+
String camelCaseChunk = "LhrIncomingFlightsArrivalsScheduleV1"; // single alnum sub-token, long
446+
String slashSeparated = indexedName.replace("_", "/");
447+
448+
List<Scenario> scenarios =
449+
List.of(
450+
// --- the original repro and its immediate variants ---
451+
new Scenario("exact full name (the repro)", indexedName, true),
452+
new Scenario("short prefix (autocomplete early)", shortPrefix, true),
453+
new Scenario("medium prefix (autocomplete mid-type)", midPrefix, true),
454+
new Scenario("first segment alone", firstSegment, true),
455+
new Scenario("middle segment alone", middleSegment, true),
456+
new Scenario("trailing segment only", trailingSegment, true),
457+
new Scenario("dotted variant (FQN-ish)", fullWithDots, true),
458+
new Scenario("one-char typo inside a segment", typoInSegment, true),
459+
new Scenario("dropped middle segments", dropOneSegment, true),
460+
new Scenario("unrelated query", "totally_unrelated_zzzqqq_9999", false),
461+
// --- boundary cases for the sub-token-count heuristic ---
462+
// 2 sub-tokens → fuzziness=1 path still active; must not explode and must match
463+
new Scenario("exactly 2 sub-tokens (fuzzy path active)", firstTwoSegments, true),
464+
// 3 sub-tokens → first to flip to fuzziness=0; must not explode and must match
465+
new Scenario("exactly 3 sub-tokens (fuzzy path off)", firstThreeSegments, true),
466+
// --- separator variants: ngram tokenizer splits on ALL non-alnum the same way, so
467+
// dots / dashes / slashes must all behave equivalently to underscores ---
468+
new Scenario("mixed separators (- and .)", mixedSeparators, true),
469+
new Scenario("slash-separated (path-like)", slashSeparated, true),
470+
// --- whitespace handling: trim, and whitespace as a separator in the query ---
471+
new Scenario("leading/trailing whitespace", withTrailingWhitespace, true),
472+
new Scenario("whitespace-separated segments", withInternalWhitespace, true),
473+
// --- single-alnum-token stress: long camelCase that is one 36-char sub-token ---
474+
new Scenario("long camelCase single token", camelCaseChunk, false),
475+
// --- edge-case query shape that must never throw or blow shards ---
476+
new Scenario("only separators", "___", false));
477+
478+
List<String> failures = new ArrayList<>();
479+
for (Scenario s : scenarios) {
480+
evaluateScenario(client, s, indexedName, failures);
481+
}
482+
483+
assertTrue(
484+
failures.isEmpty(), "Matrix scenarios failed:\n - " + String.join("\n - ", failures));
485+
}
486+
487+
private record Scenario(String description, String query, boolean shouldFind) {}
488+
489+
private void evaluateScenario(
490+
OpenMetadataClient client, Scenario s, String seededName, List<String> failures) {
491+
JsonNode root;
492+
try {
493+
String response =
494+
client.search().query(s.query()).index("dataAsset").deleted(false).size(50).execute();
495+
root = OBJECT_MAPPER.readTree(response);
496+
} catch (Exception e) {
497+
// A thrown exception means the whole search was rejected (e.g. ES 9 "too many clauses"
498+
// blows the request). Treat that as a shard-level failure for reporting purposes.
499+
failures.add(
500+
s.description()
501+
+ " [query=\""
502+
+ s.query()
503+
+ "\"]: request threw "
504+
+ e.getClass().getSimpleName()
505+
+ " — "
506+
+ e.getMessage());
507+
return;
508+
}
509+
510+
int shardsFailed = root.path("_shards").path("failed").asInt(-1);
511+
if (shardsFailed != 0) {
512+
failures.add(
513+
s.description()
514+
+ " [query=\""
515+
+ s.query()
516+
+ "\"]: _shards.failed="
517+
+ shardsFailed
518+
+ ", failures="
519+
+ root.path("_shards").path("failures").toString());
520+
return;
521+
}
522+
523+
boolean found = false;
524+
for (JsonNode hit : root.path("hits").path("hits")) {
525+
if (seededName.equals(hit.path("_source").path("name").asText())) {
526+
found = true;
527+
break;
528+
}
529+
}
530+
if (found != s.shouldFind()) {
531+
failures.add(
532+
s.description()
533+
+ " [query=\""
534+
+ s.query()
535+
+ "\"]: expected shouldFind="
536+
+ s.shouldFind()
537+
+ " but got found="
538+
+ found);
539+
}
540+
}
541+
542+
/**
543+
* Guards against over-correction of the clause-explosion fix. The fix disables fuzziness
544+
* once the query analyzes to more than 2 sub-tokens; it must keep fuzziness on single-word
545+
* queries so normal typo tolerance ({@code custmer} → {@code customer}) keeps working.
546+
*/
547+
@Test
548+
void testSingleWordTypoStillMatchesViaFuzzy(TestNamespace ns) throws Exception {
549+
OpenMetadataClient client = SdkClients.adminClient();
550+
551+
Table table = createTestTable(ns, "customer_analytics");
552+
String indexedName = table.getName();
553+
String firstSeg = indexedName.split("_+")[0];
554+
555+
Awaitility.await()
556+
.atMost(90, TimeUnit.SECONDS)
557+
.pollInterval(500, TimeUnit.MILLISECONDS)
558+
.until(
559+
() -> {
560+
String r =
561+
client.search().query(firstSeg).index("table_search_index").size(25).execute();
562+
JsonNode root = OBJECT_MAPPER.readTree(r);
563+
for (JsonNode hit : root.path("hits").path("hits")) {
564+
if (indexedName.equals(hit.path("_source").path("name").asText())) {
565+
return true;
566+
}
567+
}
568+
return false;
569+
});
570+
571+
// "custmer" is a 1-char typo of "customer", 1 alnum sub-token → fuzziness path is active.
572+
String typoQuery = "custmer";
573+
String response =
574+
client.search().query(typoQuery).index("dataAsset").deleted(false).size(25).execute();
575+
JsonNode root = OBJECT_MAPPER.readTree(response);
576+
577+
assertEquals(
578+
0,
579+
root.path("_shards").path("failed").asInt(-1),
580+
"single-word fuzzy query must not cause shard failures: "
581+
+ root.path("_shards").path("failures").toString());
582+
583+
boolean found = false;
584+
for (JsonNode hit : root.path("hits").path("hits")) {
585+
if (indexedName.equals(hit.path("_source").path("name").asText())) {
586+
found = true;
587+
break;
588+
}
589+
}
590+
assertTrue(
591+
found,
592+
"Single-word typo query \""
593+
+ typoQuery
594+
+ "\" must still match seeded table \""
595+
+ indexedName
596+
+ "\" via fuzzy path; regression would indicate the clause-explosion fix "
597+
+ "over-corrected and killed normal typo tolerance.");
598+
}
599+
600+
/**
601+
* Pins the {@code name.keyword} exact-match boost for tables. This field was missing from
602+
* the {@code table} asset config (unlike most other asset types), which meant typing a
603+
* table's full name produced no exact-match boost. Regression guard: the seeded table must
604+
* be the top hit (or strictly above any accidental substring matches) when the full name is
605+
* queried.
606+
*/
607+
@Test
608+
void testExactFullNameRanksSeededTableFirst(TestNamespace ns) throws Exception {
609+
OpenMetadataClient client = SdkClients.adminClient();
610+
611+
// Seed two tables so ranking is observable: the exact-match query must prefer `target`
612+
// over the near-duplicate `decoy` that shares the same first segment. Use short unique
613+
// tags (bypassing ns.prefix()) so the seeded names stay at production-realistic lengths
614+
// and the exact-name query stays well under OpenSearch's default 1024-clause cap.
615+
String uniq = "xqz" + ns.uniqueShortId().substring(0, 5);
616+
String targetNameRaw = uniq + "_exact_rank_target_v1";
617+
String decoyNameRaw = uniq + "_exact_rank_target_v1_extended_suffix";
618+
List<Column> cols =
619+
List.of(
620+
new Column().withName("id").withDataType(ColumnDataType.BIGINT),
621+
new Column().withName("name").withDataType(ColumnDataType.VARCHAR).withDataLength(255));
622+
Table target = createTestTableWithColumns(ns, targetNameRaw, cols);
623+
Table decoy = createTestTableWithColumns(ns, decoyNameRaw, cols);
624+
String targetName = target.getName();
625+
String decoyName = decoy.getName();
626+
627+
Awaitility.await()
628+
.atMost(90, TimeUnit.SECONDS)
629+
.pollInterval(500, TimeUnit.MILLISECONDS)
630+
.until(
631+
() -> {
632+
String r =
633+
client
634+
.search()
635+
.query(targetName.split("_+")[0])
636+
.index("table_search_index")
637+
.size(50)
638+
.execute();
639+
JsonNode root = OBJECT_MAPPER.readTree(r);
640+
boolean sawTarget = false;
641+
boolean sawDecoy = false;
642+
for (JsonNode hit : root.path("hits").path("hits")) {
643+
String n = hit.path("_source").path("name").asText();
644+
if (targetName.equals(n)) sawTarget = true;
645+
if (decoyName.equals(n)) sawDecoy = true;
646+
}
647+
return sawTarget && sawDecoy;
648+
});
649+
650+
String response =
651+
client.search().query(targetName).index("dataAsset").deleted(false).size(10).execute();
652+
JsonNode root = OBJECT_MAPPER.readTree(response);
653+
654+
assertEquals(
655+
0, root.path("_shards").path("failed").asInt(-1), "exact-name query must not fail shards");
656+
657+
JsonNode hits = root.path("hits").path("hits");
658+
assertTrue(hits.size() > 0, "exact-name query must return at least one hit");
659+
String topName = hits.get(0).path("_source").path("name").asText();
660+
assertEquals(
661+
targetName,
662+
topName,
663+
"Exact full-name query must rank the exact-match table first, not the decoy. "
664+
+ "Got top hit \""
665+
+ topName
666+
+ "\" instead of \""
667+
+ targetName
668+
+ "\". This typically regresses when name.keyword exact-match is removed "
669+
+ "from the table asset config.");
670+
}
671+
361672
// ===================================================================
362673
// SEARCH CONSISTENCY TESTS
363674
// ===================================================================

openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SearchMetadataTool.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package org.openmetadata.mcp.tools;
22

33
import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty;
4-
import static org.openmetadata.service.search.SearchUtil.mapEntityTypesToIndexNames;
4+
import static org.openmetadata.service.search.SearchUtils.mapEntityTypesToIndexNames;
55
import static org.openmetadata.service.security.DefaultAuthorizer.getSubjectContext;
66

77
import com.fasterxml.jackson.databind.JsonNode;

openmetadata-service/src/main/java/org/openmetadata/service/search/SearchSourceBuilderFactory.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
package org.openmetadata.service.search;
22

3-
import static org.openmetadata.service.search.SearchUtil.isDataAssetIndex;
4-
import static org.openmetadata.service.search.SearchUtil.isDataQualityIndex;
5-
import static org.openmetadata.service.search.SearchUtil.isServiceIndex;
6-
import static org.openmetadata.service.search.SearchUtil.isTimeSeriesIndex;
7-
import static org.openmetadata.service.search.SearchUtil.mapEntityTypesToIndexNames;
3+
import static org.openmetadata.service.search.SearchUtils.isDataAssetIndex;
4+
import static org.openmetadata.service.search.SearchUtils.isDataQualityIndex;
5+
import static org.openmetadata.service.search.SearchUtils.isServiceIndex;
6+
import static org.openmetadata.service.search.SearchUtils.isTimeSeriesIndex;
7+
import static org.openmetadata.service.search.SearchUtils.mapEntityTypesToIndexNames;
88

99
import java.util.List;
1010
import java.util.Map;

0 commit comments

Comments
 (0)