@@ -358,6 +358,317 @@ void testTopicWithManySchemaFieldsDoesNotCauseClauseExplosion(TestNamespace ns)
358358 });
359359 }
360360
361+ /**
362+ * Matrix test that reproduces the {@code dataAsset}-alias regression and pins the behavior of
363+ * any fix across the query shapes users actually type.
364+ *
365+ * <p>The bug: composite config merges fuzzy fields from every asset type. The {@code name.ngram}
366+ * analyzer splits on non-alphanumeric characters, so a long multi-segment identifier yields many
367+ * sub-tokens that each expand into many ngrams, and each ngram becomes a fuzzy term (fuzziness=1,
368+ * maxExpansions=10). Clause count crosses Lucene's 1024 limit; in ES 7/OS only the table shards
369+ * overflow (silent drop); in ES 9 the whole query is rejected.
370+ *
371+ * <p>Every scenario must satisfy {@code _shards.failed == 0}. The {@code shouldFind} column pins
372+ * whether the seeded table is expected in {@code hits.hits}. Failures from every row are
373+ * collected and reported together rather than short-circuiting on the first one, so a single
374+ * run surfaces the whole regression surface.
375+ */
376+ @ Test
377+ void testDataAssetAliasSearchMatrix (TestNamespace ns ) throws Exception {
378+ OpenMetadataClient client = SdkClients .adminClient ();
379+
380+ // Use a production-realistic name length (~40 chars, 5-6 alnum sub-tokens) by bypassing
381+ // ns.prefix() — that helper appends RUN_ID + classId + methodId which balloons the name
382+ // to ~127 chars, and the sheer ngram cardinality of that long string exceeds
383+ // OpenSearch's 1024 max_clause_count even with fuzziness=0 + max_expansions=1.
384+ // Production names like kochi__expected_vessels__portcall_v1 are ~36 chars, which is
385+ // the length we want to pin behavior against.
386+ // Prefix the unique tag with a distinctive "xqz" marker. uniqueShortId() returns hex,
387+ // and pure-hex prefixes share ngrams with every UUID/hash in a busy CI index, which can
388+ // push our seeded table out of the top-N hits. "xqz" is rare in any real document and
389+ // makes the first sub-token uniquely ours.
390+ String uniq = "xqz" + ns .uniqueShortId ().substring (0 , 5 );
391+ String longName = uniq + "_lhr__incoming_flights__arrivals_schedule_v1" ;
392+ Table table =
393+ createTestTableWithColumns (
394+ ns ,
395+ longName ,
396+ List .of (
397+ new Column ().withName ("id" ).withDataType (ColumnDataType .BIGINT ),
398+ new Column ()
399+ .withName ("name" )
400+ .withDataType (ColumnDataType .VARCHAR )
401+ .withDataLength (255 )));
402+ String indexedName = table .getName ();
403+
404+ // Wait for the table to appear in the table-only index using a real search call.
405+ // Query by the first alphanumeric segment of the indexed name — it's short (3-5 chars,
406+ // one alnum sub-token), so it won't itself trigger the clause-explosion path we're
407+ // about to stress in the matrix below. We still verify the specific seeded table is the
408+ // hit, so accidental matches on other docs with "lhr" in their name don't fool us.
409+ String waitQuery = indexedName .split ("_+" )[0 ];
410+ // 90s timeout: search indexing is async via change events and can lag noticeably under
411+ // CI load, especially the first time the index is warmed in a fresh test container.
412+ Awaitility .await ()
413+ .atMost (90 , TimeUnit .SECONDS )
414+ .pollInterval (500 , TimeUnit .MILLISECONDS )
415+ .until (
416+ () -> {
417+ String r =
418+ client .search ().query (waitQuery ).index ("table_search_index" ).size (25 ).execute ();
419+ JsonNode root = OBJECT_MAPPER .readTree (r );
420+ for (JsonNode hit : root .path ("hits" ).path ("hits" )) {
421+ if (indexedName .equals (hit .path ("_source" ).path ("name" ).asText ())) {
422+ return true ;
423+ }
424+ }
425+ return false ;
426+ });
427+
428+ // Derive substrings from the seeded name. shouldFind reflects realistic user expectations
429+ // given that `name` has fuzziness via FUZZY_FIELDS and `name.ngram` handles substrings.
430+ String firstSegment = indexedName .split ("_+" )[0 ]; // the 8-char unique tag
431+ int midLen = Math .min (15 , indexedName .length ());
432+ String shortPrefix = indexedName .substring (0 , Math .min (5 , indexedName .length ()));
433+ String midPrefix = indexedName .substring (0 , midLen );
434+ String fullWithDots = indexedName .replace ("_" , "." );
435+ String typoInSegment = indexedName .replaceFirst ("incoming" , "incaming" ); // 1-char typo
436+ String dropOneSegment = indexedName .replaceFirst ("__arrivals_schedule_v1" , "_v1" );
437+ String trailingSegment = "schedule_v1" ;
438+ String middleSegment = "flights" ;
439+
440+ String firstTwoSegments = "lhr__incoming" ; // exactly 2 alnum sub-tokens (boundary case)
441+ String firstThreeSegments = "lhr__incoming_flights" ; // exactly 3 — first to trip fuzz=0
442+ String mixedSeparators = indexedName .replace ("__" , "-" ).replace ("_" , "." );
443+ String withTrailingWhitespace = " " + indexedName + " " ;
444+ String withInternalWhitespace = indexedName .replace ("__" , " " );
445+ String camelCaseChunk = "LhrIncomingFlightsArrivalsScheduleV1" ; // single alnum sub-token, long
446+ String slashSeparated = indexedName .replace ("_" , "/" );
447+
448+ List <Scenario > scenarios =
449+ List .of (
450+ // --- the original repro and its immediate variants ---
451+ new Scenario ("exact full name (the repro)" , indexedName , true ),
452+ new Scenario ("short prefix (autocomplete early)" , shortPrefix , true ),
453+ new Scenario ("medium prefix (autocomplete mid-type)" , midPrefix , true ),
454+ new Scenario ("first segment alone" , firstSegment , true ),
455+ new Scenario ("middle segment alone" , middleSegment , true ),
456+ new Scenario ("trailing segment only" , trailingSegment , true ),
457+ new Scenario ("dotted variant (FQN-ish)" , fullWithDots , true ),
458+ new Scenario ("one-char typo inside a segment" , typoInSegment , true ),
459+ new Scenario ("dropped middle segments" , dropOneSegment , true ),
460+ new Scenario ("unrelated query" , "totally_unrelated_zzzqqq_9999" , false ),
461+ // --- boundary cases for the sub-token-count heuristic ---
462+ // 2 sub-tokens → fuzziness=1 path still active; must not explode and must match
463+ new Scenario ("exactly 2 sub-tokens (fuzzy path active)" , firstTwoSegments , true ),
464+ // 3 sub-tokens → first to flip to fuzziness=0; must not explode and must match
465+ new Scenario ("exactly 3 sub-tokens (fuzzy path off)" , firstThreeSegments , true ),
466+ // --- separator variants: ngram tokenizer splits on ALL non-alnum the same way, so
467+ // dots / dashes / slashes must all behave equivalently to underscores ---
468+ new Scenario ("mixed separators (- and .)" , mixedSeparators , true ),
469+ new Scenario ("slash-separated (path-like)" , slashSeparated , true ),
470+ // --- whitespace handling: trim, and whitespace as a separator in the query ---
471+ new Scenario ("leading/trailing whitespace" , withTrailingWhitespace , true ),
472+ new Scenario ("whitespace-separated segments" , withInternalWhitespace , true ),
473+ // --- single-alnum-token stress: long camelCase that is one 36-char sub-token ---
474+ new Scenario ("long camelCase single token" , camelCaseChunk , false ),
475+ // --- edge-case query shape that must never throw or blow shards ---
476+ new Scenario ("only separators" , "___" , false ));
477+
478+ List <String > failures = new ArrayList <>();
479+ for (Scenario s : scenarios ) {
480+ evaluateScenario (client , s , indexedName , failures );
481+ }
482+
483+ assertTrue (
484+ failures .isEmpty (), "Matrix scenarios failed:\n - " + String .join ("\n - " , failures ));
485+ }
486+
487+ private record Scenario (String description , String query , boolean shouldFind ) {}
488+
489+ private void evaluateScenario (
490+ OpenMetadataClient client , Scenario s , String seededName , List <String > failures ) {
491+ JsonNode root ;
492+ try {
493+ String response =
494+ client .search ().query (s .query ()).index ("dataAsset" ).deleted (false ).size (50 ).execute ();
495+ root = OBJECT_MAPPER .readTree (response );
496+ } catch (Exception e ) {
497+ // A thrown exception means the whole search was rejected (e.g. ES 9 "too many clauses"
498+ // blows the request). Treat that as a shard-level failure for reporting purposes.
499+ failures .add (
500+ s .description ()
501+ + " [query=\" "
502+ + s .query ()
503+ + "\" ]: request threw "
504+ + e .getClass ().getSimpleName ()
505+ + " — "
506+ + e .getMessage ());
507+ return ;
508+ }
509+
510+ int shardsFailed = root .path ("_shards" ).path ("failed" ).asInt (-1 );
511+ if (shardsFailed != 0 ) {
512+ failures .add (
513+ s .description ()
514+ + " [query=\" "
515+ + s .query ()
516+ + "\" ]: _shards.failed="
517+ + shardsFailed
518+ + ", failures="
519+ + root .path ("_shards" ).path ("failures" ).toString ());
520+ return ;
521+ }
522+
523+ boolean found = false ;
524+ for (JsonNode hit : root .path ("hits" ).path ("hits" )) {
525+ if (seededName .equals (hit .path ("_source" ).path ("name" ).asText ())) {
526+ found = true ;
527+ break ;
528+ }
529+ }
530+ if (found != s .shouldFind ()) {
531+ failures .add (
532+ s .description ()
533+ + " [query=\" "
534+ + s .query ()
535+ + "\" ]: expected shouldFind="
536+ + s .shouldFind ()
537+ + " but got found="
538+ + found );
539+ }
540+ }
541+
542+ /**
543+ * Guards against over-correction of the clause-explosion fix. The fix disables fuzziness
544+ * once the query analyzes to more than 2 sub-tokens; it must keep fuzziness on single-word
545+ * queries so normal typo tolerance ({@code custmer} → {@code customer}) keeps working.
546+ */
547+ @ Test
548+ void testSingleWordTypoStillMatchesViaFuzzy (TestNamespace ns ) throws Exception {
549+ OpenMetadataClient client = SdkClients .adminClient ();
550+
551+ Table table = createTestTable (ns , "customer_analytics" );
552+ String indexedName = table .getName ();
553+ String firstSeg = indexedName .split ("_+" )[0 ];
554+
555+ Awaitility .await ()
556+ .atMost (90 , TimeUnit .SECONDS )
557+ .pollInterval (500 , TimeUnit .MILLISECONDS )
558+ .until (
559+ () -> {
560+ String r =
561+ client .search ().query (firstSeg ).index ("table_search_index" ).size (25 ).execute ();
562+ JsonNode root = OBJECT_MAPPER .readTree (r );
563+ for (JsonNode hit : root .path ("hits" ).path ("hits" )) {
564+ if (indexedName .equals (hit .path ("_source" ).path ("name" ).asText ())) {
565+ return true ;
566+ }
567+ }
568+ return false ;
569+ });
570+
571+ // "custmer" is a 1-char typo of "customer", 1 alnum sub-token → fuzziness path is active.
572+ String typoQuery = "custmer" ;
573+ String response =
574+ client .search ().query (typoQuery ).index ("dataAsset" ).deleted (false ).size (25 ).execute ();
575+ JsonNode root = OBJECT_MAPPER .readTree (response );
576+
577+ assertEquals (
578+ 0 ,
579+ root .path ("_shards" ).path ("failed" ).asInt (-1 ),
580+ "single-word fuzzy query must not cause shard failures: "
581+ + root .path ("_shards" ).path ("failures" ).toString ());
582+
583+ boolean found = false ;
584+ for (JsonNode hit : root .path ("hits" ).path ("hits" )) {
585+ if (indexedName .equals (hit .path ("_source" ).path ("name" ).asText ())) {
586+ found = true ;
587+ break ;
588+ }
589+ }
590+ assertTrue (
591+ found ,
592+ "Single-word typo query \" "
593+ + typoQuery
594+ + "\" must still match seeded table \" "
595+ + indexedName
596+ + "\" via fuzzy path; regression would indicate the clause-explosion fix "
597+ + "over-corrected and killed normal typo tolerance." );
598+ }
599+
600+ /**
601+ * Pins the {@code name.keyword} exact-match boost for tables. This field was missing from
602+ * the {@code table} asset config (unlike most other asset types), which meant typing a
603+ * table's full name produced no exact-match boost. Regression guard: the seeded table must
604+ * be the top hit (or strictly above any accidental substring matches) when the full name is
605+ * queried.
606+ */
607+ @ Test
608+ void testExactFullNameRanksSeededTableFirst (TestNamespace ns ) throws Exception {
609+ OpenMetadataClient client = SdkClients .adminClient ();
610+
611+ // Seed two tables so ranking is observable: the exact-match query must prefer `target`
612+ // over the near-duplicate `decoy` that shares the same first segment. Use short unique
613+ // tags (bypassing ns.prefix()) so the seeded names stay at production-realistic lengths
614+ // and the exact-name query stays well under OpenSearch's default 1024-clause cap.
615+ String uniq = "xqz" + ns .uniqueShortId ().substring (0 , 5 );
616+ String targetNameRaw = uniq + "_exact_rank_target_v1" ;
617+ String decoyNameRaw = uniq + "_exact_rank_target_v1_extended_suffix" ;
618+ List <Column > cols =
619+ List .of (
620+ new Column ().withName ("id" ).withDataType (ColumnDataType .BIGINT ),
621+ new Column ().withName ("name" ).withDataType (ColumnDataType .VARCHAR ).withDataLength (255 ));
622+ Table target = createTestTableWithColumns (ns , targetNameRaw , cols );
623+ Table decoy = createTestTableWithColumns (ns , decoyNameRaw , cols );
624+ String targetName = target .getName ();
625+ String decoyName = decoy .getName ();
626+
627+ Awaitility .await ()
628+ .atMost (90 , TimeUnit .SECONDS )
629+ .pollInterval (500 , TimeUnit .MILLISECONDS )
630+ .until (
631+ () -> {
632+ String r =
633+ client
634+ .search ()
635+ .query (targetName .split ("_+" )[0 ])
636+ .index ("table_search_index" )
637+ .size (50 )
638+ .execute ();
639+ JsonNode root = OBJECT_MAPPER .readTree (r );
640+ boolean sawTarget = false ;
641+ boolean sawDecoy = false ;
642+ for (JsonNode hit : root .path ("hits" ).path ("hits" )) {
643+ String n = hit .path ("_source" ).path ("name" ).asText ();
644+ if (targetName .equals (n )) sawTarget = true ;
645+ if (decoyName .equals (n )) sawDecoy = true ;
646+ }
647+ return sawTarget && sawDecoy ;
648+ });
649+
650+ String response =
651+ client .search ().query (targetName ).index ("dataAsset" ).deleted (false ).size (10 ).execute ();
652+ JsonNode root = OBJECT_MAPPER .readTree (response );
653+
654+ assertEquals (
655+ 0 , root .path ("_shards" ).path ("failed" ).asInt (-1 ), "exact-name query must not fail shards" );
656+
657+ JsonNode hits = root .path ("hits" ).path ("hits" );
658+ assertTrue (hits .size () > 0 , "exact-name query must return at least one hit" );
659+ String topName = hits .get (0 ).path ("_source" ).path ("name" ).asText ();
660+ assertEquals (
661+ targetName ,
662+ topName ,
663+ "Exact full-name query must rank the exact-match table first, not the decoy. "
664+ + "Got top hit \" "
665+ + topName
666+ + "\" instead of \" "
667+ + targetName
668+ + "\" . This typically regresses when name.keyword exact-match is removed "
669+ + "from the table asset config." );
670+ }
671+
361672 // ===================================================================
362673 // SEARCH CONSISTENCY TESTS
363674 // ===================================================================
0 commit comments