From 53338f39de3880b62c3c0c93993d539265584b74 Mon Sep 17 00:00:00 2001 From: jupblb Date: Tue, 19 May 2026 16:05:40 +0200 Subject: [PATCH 1/3] Add 'scip merge' command to combine multiple SCIP indexes The merge command takes two or more SCIP indexes and produces a single combined index. The output project_root is automatically inferred as the common URI ancestor of the inputs' project_root values, and each input document's relative_path is rewritten to be relative to that root. Use --project-root to override the inferred root (e.g. to root the merged index at a parent directory). Example: scip merge --output merged.scip a.scip b.scip c.scip Documents with the same rewritten relative_path and external symbols with the same name are deduplicated via the existing FlattenDocuments and FlattenSymbols helpers. --- cmd/scip/main.go | 3 +- cmd/scip/merge.go | 336 +++++++++++++++++++++++++++++++++++ cmd/scip/merge_test.go | 388 +++++++++++++++++++++++++++++++++++++++++ docs/CLI.md | 43 +++++ 4 files changed, 769 insertions(+), 1 deletion(-) create mode 100644 cmd/scip/merge.go create mode 100644 cmd/scip/merge_test.go diff --git a/cmd/scip/main.go b/cmd/scip/main.go index c801b029..d9fc5441 100644 --- a/cmd/scip/main.go +++ b/cmd/scip/main.go @@ -26,7 +26,8 @@ func commands() []*cli.Command { stats := statsCommand() test := testCommand() convert := convertCommand() - return []*cli.Command{&lint, &print, &snapshot, &stats, &test, &convert} + merge := mergeCommand() + return []*cli.Command{&lint, &print, &snapshot, &stats, &test, &convert, &merge} } //go:embed version.txt diff --git a/cmd/scip/merge.go b/cmd/scip/merge.go new file mode 100644 index 00000000..594454e1 --- /dev/null +++ b/cmd/scip/merge.go @@ -0,0 +1,336 @@ +package main + +import ( + "context" + "errors" + "fmt" + "net/url" + "os" + "path" + "strings" + + "github.com/urfave/cli/v3" + "google.golang.org/protobuf/proto" + + "github.com/scip-code/scip/bindings/go/scip" +) + +type mergeFlags struct { + output string + projectRootOverride string +} + +func mergeCommand() cli.Command { + var flags mergeFlags + command := cli.Command{ + Name: "merge", + Usage: "Merge multiple SCIP indexes into a single SCIP index", + Description: `Merges two or more SCIP indexes into one. + +The output project_root is inferred as the common URI ancestor of the input +indexes' project_root values. Each input document's relative_path is rewritten +to be relative to that common root. + +For example, given: + + a.scip with project_root file:///repo/frontend, document src/a.ts + b.scip with project_root file:///repo/backend, document src/b.go + +The merged index will have: + + project_root file:///repo + documents frontend/src/a.ts and backend/src/b.go + +Documents and symbols are deduplicated after rewriting. + +Use --project-root to override the inferred root (each input's root must then +be a descendant of the override). + +Example usage: + + scip merge --output merged.scip a.scip b.scip c.scip`, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "output", + Usage: "Path to write the merged SCIP index", + Destination: &flags.output, + Value: "index.scip", + }, + &cli.StringFlag{ + Name: "project-root", + Usage: "Override the inferred output project_root URI. Each input project_root must be a descendant of this URI.", + Destination: &flags.projectRootOverride, + }, + }, + Action: func(ctx context.Context, cmd *cli.Command) error { + inputs := cmd.Args().Slice() + if len(inputs) == 0 { + return errors.New("at least one input SCIP index path is required") + } + return mergeMain(inputs, flags) + }, + } + return command +} + +func mergeMain(inputs []string, flags mergeFlags) error { + indexes := make([]*scip.Index, 0, len(inputs)) + for _, p := range inputs { + idx, err := readFromOption(p) + if err != nil { + return fmt.Errorf("reading %s: %w", p, err) + } + indexes = append(indexes, idx) + } + + merged, err := mergeIndexes(indexes, flags.projectRootOverride) + if err != nil { + return err + } + + bytes, err := proto.Marshal(merged) + if err != nil { + return fmt.Errorf("marshaling merged index: %w", err) + } + if err := os.WriteFile(flags.output, bytes, 0o644); err != nil { + return fmt.Errorf("writing %s: %w", flags.output, err) + } + return nil +} + +// mergeIndexes combines multiple SCIP indexes into one. It determines the +// output project_root by computing the common URI ancestor of the inputs' +// project_root values (unless projectRootOverride is non-empty), rewrites +// each Document.relative_path to be relative to that root, and deduplicates +// documents and external symbols. +func mergeIndexes(indexes []*scip.Index, projectRootOverride string) (*scip.Index, error) { + if len(indexes) == 0 { + return nil, errors.New("no indexes to merge") + } + + // Validate metadata is present everywhere. + for i, idx := range indexes { + if idx.Metadata == nil { + return nil, fmt.Errorf("index %d has no metadata", i) + } + } + + // Validate compatible ProtocolVersion and TextDocumentEncoding. + protoVersion := indexes[0].Metadata.Version + encoding := indexes[0].Metadata.TextDocumentEncoding + for i, idx := range indexes[1:] { + if idx.Metadata.Version != protoVersion { + return nil, fmt.Errorf( + "index %d has incompatible protocol version %v (expected %v)", + i+1, idx.Metadata.Version, protoVersion) + } + if !encodingsCompatible(encoding, idx.Metadata.TextDocumentEncoding) { + return nil, fmt.Errorf( + "index %d has incompatible text encoding %v (expected %v)", + i+1, idx.Metadata.TextDocumentEncoding, encoding) + } + // Promote from Unspecified to a concrete encoding if one input has it. + if encoding == scip.TextEncoding_UnspecifiedTextEncoding { + encoding = idx.Metadata.TextDocumentEncoding + } + } + + // Determine the output project_root and per-input prefix. + outputRoot, prefixes, err := computeOutputRootAndPrefixes(indexes, projectRootOverride) + if err != nil { + return nil, err + } + + // Aggregate documents (rewriting paths) and external symbols. + var allDocuments []*scip.Document + var allExternalSymbols []*scip.SymbolInformation + for i, idx := range indexes { + for _, doc := range idx.Documents { + if prefixes[i] != "" { + if doc.RelativePath == "" { + doc.RelativePath = prefixes[i] + } else { + doc.RelativePath = path.Join(prefixes[i], doc.RelativePath) + } + } + allDocuments = append(allDocuments, doc) + } + allExternalSymbols = append(allExternalSymbols, idx.ExternalSymbols...) + } + + mergedDocuments := scip.FlattenDocuments(allDocuments) + mergedExternalSymbols := scip.FlattenSymbols(allExternalSymbols) + + return &scip.Index{ + Metadata: &scip.Metadata{ + Version: protoVersion, + ToolInfo: &scip.ToolInfo{ + Name: "scip", + Version: strings.TrimSpace(version), + Arguments: []string{"merge"}, + }, + ProjectRoot: outputRoot, + TextDocumentEncoding: encoding, + }, + Documents: mergedDocuments, + ExternalSymbols: mergedExternalSymbols, + }, nil +} + +// encodingsCompatible reports whether two TextEncoding values can be merged. +// Unspecified is treated as compatible with any value. +func encodingsCompatible(a, b scip.TextEncoding) bool { + if a == scip.TextEncoding_UnspecifiedTextEncoding || + b == scip.TextEncoding_UnspecifiedTextEncoding { + return true + } + return a == b +} + +// computeOutputRootAndPrefixes returns the URI to use as the merged index's +// project_root, together with one path-prefix per input index. The prefix for +// input i is what should be prepended to each document's relative_path so that +// it remains relative to the output root. +// +// If projectRootOverride is non-empty, it is used directly and each input root +// must be a descendant of it. Otherwise, the output root is the common URI +// ancestor of all inputs. +func computeOutputRootAndPrefixes(indexes []*scip.Index, projectRootOverride string) (string, []string, error) { + roots := make([]string, len(indexes)) + for i, idx := range indexes { + roots[i] = idx.Metadata.ProjectRoot + } + + var outputRoot string + if projectRootOverride != "" { + outputRoot = projectRootOverride + } else { + var err error + outputRoot, err = commonAncestorURI(roots) + if err != nil { + return "", nil, err + } + } + + prefixes := make([]string, len(indexes)) + for i, r := range roots { + prefix, err := relativePrefix(outputRoot, r) + if err != nil { + return "", nil, fmt.Errorf( + "index %d project_root %q is not under output root %q: %w", + i, r, outputRoot, err) + } + prefixes[i] = prefix + } + return outputRoot, prefixes, nil +} + +// commonAncestorURI returns the longest URI that is an ancestor of every input +// URI. All inputs must share the same scheme and host. +func commonAncestorURI(roots []string) (string, error) { + if len(roots) == 0 { + return "", errors.New("no project roots provided") + } + + first, err := url.Parse(roots[0]) + if err != nil { + return "", fmt.Errorf("parsing project_root %q: %w", roots[0], err) + } + + commonScheme := first.Scheme + commonHost := first.Host + commonPath := normalizeURIPath(first.Path) + + for _, r := range roots[1:] { + u, err := url.Parse(r) + if err != nil { + return "", fmt.Errorf("parsing project_root %q: %w", r, err) + } + if u.Scheme != commonScheme { + return "", fmt.Errorf( + "incompatible URI schemes %q and %q across inputs; "+ + "pass --project-root to override", + commonScheme, u.Scheme) + } + if u.Host != commonHost { + return "", fmt.Errorf( + "incompatible URI hosts %q and %q across inputs; "+ + "pass --project-root to override", + commonHost, u.Host) + } + commonPath = commonPathPrefix(commonPath, normalizeURIPath(u.Path)) + } + + out := url.URL{ + Scheme: commonScheme, + Host: commonHost, + Path: commonPath, + } + return out.String(), nil +} + +// normalizeURIPath trims trailing slashes while preserving the root "/". +func normalizeURIPath(p string) string { + if p == "" || p == "/" { + return p + } + return strings.TrimRight(p, "/") +} + +// commonPathPrefix returns the longest path prefix of a and b along "/" +// component boundaries. +func commonPathPrefix(a, b string) string { + aParts := strings.Split(a, "/") + bParts := strings.Split(b, "/") + n := len(aParts) + if len(bParts) < n { + n = len(bParts) + } + i := 0 + for ; i < n; i++ { + if aParts[i] != bParts[i] { + break + } + } + common := strings.Join(aParts[:i], "/") + // If only the leading empty element matched, both paths were absolute and + // the only shared ancestor is the root. + if common == "" && i > 0 && strings.HasPrefix(a, "/") && strings.HasPrefix(b, "/") { + return "/" + } + return common +} + +// relativePrefix returns the path that, when prepended to a relative_path under +// the input root, makes it relative to the output root. +func relativePrefix(outputRoot, inputRoot string) (string, error) { + outURL, err := url.Parse(outputRoot) + if err != nil { + return "", fmt.Errorf("parsing output project_root %q: %w", outputRoot, err) + } + inURL, err := url.Parse(inputRoot) + if err != nil { + return "", fmt.Errorf("parsing input project_root %q: %w", inputRoot, err) + } + if outURL.Scheme != inURL.Scheme { + return "", fmt.Errorf("scheme mismatch: %q vs %q", outURL.Scheme, inURL.Scheme) + } + if outURL.Host != inURL.Host { + return "", fmt.Errorf("host mismatch: %q vs %q", outURL.Host, inURL.Host) + } + + outPath := normalizeURIPath(outURL.Path) + inPath := normalizeURIPath(inURL.Path) + + if outPath == inPath { + return "", nil + } + // outPath must be a directory prefix of inPath. + if outPath == "/" { + return strings.TrimLeft(inPath, "/"), nil + } + if !strings.HasPrefix(inPath, outPath+"/") { + return "", fmt.Errorf("%q is not under %q", inPath, outPath) + } + return strings.TrimPrefix(inPath, outPath+"/"), nil +} diff --git a/cmd/scip/merge_test.go b/cmd/scip/merge_test.go new file mode 100644 index 00000000..c7961037 --- /dev/null +++ b/cmd/scip/merge_test.go @@ -0,0 +1,388 @@ +package main + +import ( + "os" + "path/filepath" + "sort" + "testing" + + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/proto" + + "github.com/scip-code/scip/bindings/go/scip" +) + +func TestMerge_CommonAncestorURI(t *testing.T) { + cases := []struct { + name string + roots []string + want string + wantErr bool + }{ + { + name: "identical roots", + roots: []string{"file:///repo", "file:///repo"}, + want: "file:///repo", + }, + { + name: "sibling subdirs", + roots: []string{"file:///repo/frontend", "file:///repo/backend"}, + want: "file:///repo", + }, + { + name: "one is ancestor of the other", + roots: []string{"file:///repo", "file:///repo/sub"}, + want: "file:///repo", + }, + { + name: "deep nesting", + roots: []string{"file:///repo/a/b/c", "file:///repo/a/b/d", "file:///repo/a/e"}, + want: "file:///repo/a", + }, + { + name: "common component is filesystem root", + roots: []string{"file:///alpha/x", "file:///beta/y"}, + want: "file:///", + }, + { + name: "trailing slashes normalized", + roots: []string{"file:///repo/", "file:///repo/sub"}, + want: "file:///repo", + }, + { + name: "different schemes error", + roots: []string{"file:///repo", "https://example.com/repo"}, + wantErr: true, + }, + { + name: "different hosts error", + roots: []string{"https://a.com/repo", "https://b.com/repo"}, + wantErr: true, + }, + { + name: "non-overlapping but same scheme is ancestor=/", + roots: []string{"file:///repo/frontend", "file:///different-name/backend"}, + want: "file:///", + }, + { + name: "prefix-like names don't share parent", + roots: []string{"file:///repo/frontend", "file:///repo/frontend-v2"}, + want: "file:///repo", + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := commonAncestorURI(c.roots) + if c.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + require.Equal(t, c.want, got) + }) + } +} + +func TestMerge_RelativePrefix(t *testing.T) { + cases := []struct { + name string + outputRoot string + inputRoot string + want string + wantErr bool + }{ + { + name: "same root", + outputRoot: "file:///repo", + inputRoot: "file:///repo", + want: "", + }, + { + name: "direct child", + outputRoot: "file:///repo", + inputRoot: "file:///repo/sub", + want: "sub", + }, + { + name: "deep child", + outputRoot: "file:///repo", + inputRoot: "file:///repo/a/b/c", + want: "a/b/c", + }, + { + name: "input is ancestor (error)", + outputRoot: "file:///repo/sub", + inputRoot: "file:///repo", + wantErr: true, + }, + { + name: "siblings (error)", + outputRoot: "file:///repo/frontend", + inputRoot: "file:///repo/backend", + wantErr: true, + }, + { + name: "prefix-like but not actual ancestor", + outputRoot: "file:///repo/frontend", + inputRoot: "file:///repo/frontend-v2", + wantErr: true, + }, + { + name: "output is filesystem root", + outputRoot: "file:///", + inputRoot: "file:///alpha/x", + want: "alpha/x", + }, + { + name: "trailing slash on input", + outputRoot: "file:///repo", + inputRoot: "file:///repo/sub/", + want: "sub", + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := relativePrefix(c.outputRoot, c.inputRoot) + if c.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + require.Equal(t, c.want, got) + }) + } +} + +func TestMergeIndexes_AutoInferredRoot(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo/frontend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + { + RelativePath: "src/a.ts", + Occurrences: []*scip.Occurrence{ + {Symbol: "scip-ts npm pkg-a 1.0 A#", Range: []int32{0, 0, 1}, SymbolRoles: int32(scip.SymbolRole_Definition)}, + }, + Symbols: []*scip.SymbolInformation{{Symbol: "scip-ts npm pkg-a 1.0 A#"}}, + }, + }, + ExternalSymbols: []*scip.SymbolInformation{ + {Symbol: "scip-ts npm shared 1.0 Util#"}, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo/backend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + { + RelativePath: "src/b.go", + Occurrences: []*scip.Occurrence{ + {Symbol: "scip-go go . . pkg/B#", Range: []int32{0, 0, 1}, SymbolRoles: int32(scip.SymbolRole_Definition)}, + }, + Symbols: []*scip.SymbolInformation{{Symbol: "scip-go go . . pkg/B#"}}, + }, + }, + ExternalSymbols: []*scip.SymbolInformation{ + {Symbol: "scip-ts npm shared 1.0 Util#"}, + }, + } + + merged, err := mergeIndexes([]*scip.Index{a, b}, "") + require.NoError(t, err) + + require.NotNil(t, merged.Metadata) + require.Equal(t, "file:///repo", merged.Metadata.ProjectRoot) + require.Equal(t, scip.TextEncoding_UTF8, merged.Metadata.TextDocumentEncoding) + require.NotNil(t, merged.Metadata.ToolInfo) + require.Equal(t, "scip", merged.Metadata.ToolInfo.Name) + + paths := docPaths(merged) + require.ElementsMatch(t, []string{"frontend/src/a.ts", "backend/src/b.go"}, paths) + + // External symbols should be deduped. + require.Len(t, merged.ExternalSymbols, 1) + require.Equal(t, "scip-ts npm shared 1.0 Util#", merged.ExternalSymbols[0].Symbol) +} + +func TestMergeIndexes_SameRootDeduplicatesDocuments(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + { + RelativePath: "src/shared.go", + Symbols: []*scip.SymbolInformation{{Symbol: "shared.A"}}, + Occurrences: []*scip.Occurrence{ + {Symbol: "shared.A", Range: []int32{0, 0, 1}, SymbolRoles: int32(scip.SymbolRole_Definition)}, + }, + }, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + { + RelativePath: "src/shared.go", + Symbols: []*scip.SymbolInformation{{Symbol: "shared.B"}}, + Occurrences: []*scip.Occurrence{ + {Symbol: "shared.B", Range: []int32{1, 0, 1}, SymbolRoles: int32(scip.SymbolRole_Definition)}, + }, + }, + }, + } + + merged, err := mergeIndexes([]*scip.Index{a, b}, "") + require.NoError(t, err) + + require.Equal(t, "file:///repo", merged.Metadata.ProjectRoot) + require.Len(t, merged.Documents, 1, "documents with the same relative_path should be merged") + doc := merged.Documents[0] + require.Equal(t, "src/shared.go", doc.RelativePath) + require.Len(t, doc.Symbols, 2) + require.Len(t, doc.Occurrences, 2) +} + +func TestMergeIndexes_OverrideProjectRoot(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///workspace/repo/frontend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + {RelativePath: "x.ts"}, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///workspace/repo/backend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + {RelativePath: "y.go"}, + }, + } + + merged, err := mergeIndexes([]*scip.Index{a, b}, "file:///workspace") + require.NoError(t, err) + + require.Equal(t, "file:///workspace", merged.Metadata.ProjectRoot) + paths := docPaths(merged) + require.ElementsMatch(t, []string{"repo/frontend/x.ts", "repo/backend/y.go"}, paths) +} + +func TestMergeIndexes_OverrideMustBeAncestor(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo/frontend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///elsewhere/backend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + } + + _, err := mergeIndexes([]*scip.Index{a, b}, "file:///repo") + require.Error(t, err) +} + +func TestMergeIndexes_IncompatibleEncoding(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF16, + }, + } + + _, err := mergeIndexes([]*scip.Index{a, b}, "") + require.Error(t, err) +} + +func TestMergeIndexes_UnspecifiedEncodingPromotes(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UnspecifiedTextEncoding, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + } + + merged, err := mergeIndexes([]*scip.Index{a, b}, "") + require.NoError(t, err) + require.Equal(t, scip.TextEncoding_UTF8, merged.Metadata.TextDocumentEncoding) +} + +func TestMergeMain_EndToEnd(t *testing.T) { + tempDir := t.TempDir() + + write := func(name string, idx *scip.Index) string { + p := filepath.Join(tempDir, name) + data, err := proto.Marshal(idx) + require.NoError(t, err) + require.NoError(t, os.WriteFile(p, data, 0o644)) + return p + } + + aPath := write("a.scip", &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo/frontend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{{RelativePath: "src/a.ts"}}, + }) + bPath := write("b.scip", &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo/backend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{{RelativePath: "src/b.go"}}, + }) + + outPath := filepath.Join(tempDir, "merged.scip") + err := mergeMain( + []string{aPath, bPath}, + mergeFlags{output: outPath}, + ) + require.NoError(t, err) + + merged, err := readFromOption(outPath) + require.NoError(t, err) + + require.Equal(t, "file:///repo", merged.Metadata.ProjectRoot) + require.ElementsMatch(t, + []string{"frontend/src/a.ts", "backend/src/b.go"}, + docPaths(merged)) +} + +func docPaths(idx *scip.Index) []string { + out := make([]string, 0, len(idx.Documents)) + for _, d := range idx.Documents { + out = append(out, d.RelativePath) + } + sort.Strings(out) + return out +} diff --git a/docs/CLI.md b/docs/CLI.md index 719fe0f4..051529b5 100644 --- a/docs/CLI.md +++ b/docs/CLI.md @@ -8,6 +8,7 @@ - [`scip snapshot`](#scip-snapshot) - [`scip stats`](#scip-stats) - [`scip expt-convert`](#scip-convert) + - [`scip merge`](#scip-merge) ``` @@ -32,6 +33,7 @@ COMMANDS: stats Output useful statistics about a SCIP index test Validate a SCIP index against test files expt-convert [EXPERIMENTAL] Convert a SCIP index to a SQLite database + merge Merge multiple SCIP indexes into a single SCIP index help, h Shows a list of commands or help for one command GLOBAL OPTIONS: @@ -177,3 +179,44 @@ OPTIONS: --cpu-profile string Path to output prof file --help, -h show help ``` + +## `scip merge` + +``` +NAME: + scip merge - Merge multiple SCIP indexes into a single SCIP index + +USAGE: + scip merge [options] + +DESCRIPTION: + Merges two or more SCIP indexes into one. + + The output project_root is inferred as the common URI ancestor of the input + indexes' project_root values. Each input document's relative_path is rewritten + to be relative to that common root. + + For example, given: + + a.scip with project_root file:///repo/frontend, document src/a.ts + b.scip with project_root file:///repo/backend, document src/b.go + + The merged index will have: + + project_root file:///repo + documents frontend/src/a.ts and backend/src/b.go + + Documents and symbols are deduplicated after rewriting. + + Use --project-root to override the inferred root (each input's root must then + be a descendant of the override). + + Example usage: + + scip merge --output merged.scip a.scip b.scip c.scip + +OPTIONS: + --output string Path to write the merged SCIP index (default: "index.scip") + --project-root string Override the inferred output project_root URI. Each input project_root must be a descendant of this URI. + --help, -h show help +``` From 46b1ffb0b5c3fb2e8370217a469aa4b648d1e2d2 Mon Sep 17 00:00:00 2001 From: jupblb Date: Tue, 19 May 2026 16:33:01 +0200 Subject: [PATCH 2/3] Simplify scip merge implementation - Fold metadata/version/encoding validation and root-URI parsing into a single pass over the input indexes. - Replace four URI-path helpers (commonAncestorURI, normalizeURIPath, commonPathPrefix, relativePrefix) with three plain slash-path helpers (commonPath, isAncestor, relativeTo) operating on cleaned paths. - Normalize project_root paths once at parse time via parseRootURI, so the path helpers do not need to special-case trailing slashes, dot segments, or empty paths. - Drop encodingsCompatible and computeOutputRootAndPrefixes wrappers; inline their bodies. - Use strings.CutPrefix in relativeTo. - Drop the empty-RelativePath special case in document rewriting; rely on path.Join's behavior. Output is unchanged: end-to-end merge of three real Go-project indexes produces the same 62 documents / 15,340 occurrences / project_root. Adds tests for protocol-version mismatch, missing metadata, and empty RelativePath with a non-empty prefix. --- cmd/scip/merge.go | 271 +++++++++++++++-------------------------- cmd/scip/merge_test.go | 220 ++++++++++++++++++++------------- 2 files changed, 232 insertions(+), 259 deletions(-) diff --git a/cmd/scip/merge.go b/cmd/scip/merge.go index 594454e1..57645149 100644 --- a/cmd/scip/merge.go +++ b/cmd/scip/merge.go @@ -16,13 +16,13 @@ import ( ) type mergeFlags struct { - output string + output string projectRootOverride string } func mergeCommand() cli.Command { var flags mergeFlags - command := cli.Command{ + return cli.Command{ Name: "merge", Usage: "Merge multiple SCIP indexes into a single SCIP index", Description: `Merges two or more SCIP indexes into one. @@ -70,17 +70,16 @@ Example usage: return mergeMain(inputs, flags) }, } - return command } func mergeMain(inputs []string, flags mergeFlags) error { - indexes := make([]*scip.Index, 0, len(inputs)) - for _, p := range inputs { + indexes := make([]*scip.Index, len(inputs)) + for i, p := range inputs { idx, err := readFromOption(p) if err != nil { return fmt.Errorf("reading %s: %w", p, err) } - indexes = append(indexes, idx) + indexes[i] = idx } merged, err := mergeIndexes(indexes, flags.projectRootOverride) @@ -88,79 +87,74 @@ func mergeMain(inputs []string, flags mergeFlags) error { return err } - bytes, err := proto.Marshal(merged) + data, err := proto.Marshal(merged) if err != nil { return fmt.Errorf("marshaling merged index: %w", err) } - if err := os.WriteFile(flags.output, bytes, 0o644); err != nil { + if err := os.WriteFile(flags.output, data, 0o644); err != nil { return fmt.Errorf("writing %s: %w", flags.output, err) } return nil } // mergeIndexes combines multiple SCIP indexes into one. It determines the -// output project_root by computing the common URI ancestor of the inputs' -// project_root values (unless projectRootOverride is non-empty), rewrites -// each Document.relative_path to be relative to that root, and deduplicates +// output project_root as the common URI ancestor of the inputs' project_root +// values (or projectRootOverride if non-empty), rewrites each +// Document.relative_path to be relative to that root, and deduplicates // documents and external symbols. func mergeIndexes(indexes []*scip.Index, projectRootOverride string) (*scip.Index, error) { if len(indexes) == 0 { return nil, errors.New("no indexes to merge") } - // Validate metadata is present everywhere. + // Validate metadata and pick the merged ProtocolVersion / TextEncoding. + protoVersion := indexes[0].Metadata.GetVersion() + encoding := scip.TextEncoding_UnspecifiedTextEncoding + roots := make([]*url.URL, len(indexes)) for i, idx := range indexes { if idx.Metadata == nil { return nil, fmt.Errorf("index %d has no metadata", i) } - } - - // Validate compatible ProtocolVersion and TextDocumentEncoding. - protoVersion := indexes[0].Metadata.Version - encoding := indexes[0].Metadata.TextDocumentEncoding - for i, idx := range indexes[1:] { if idx.Metadata.Version != protoVersion { return nil, fmt.Errorf( "index %d has incompatible protocol version %v (expected %v)", - i+1, idx.Metadata.Version, protoVersion) + i, idx.Metadata.Version, protoVersion) } - if !encodingsCompatible(encoding, idx.Metadata.TextDocumentEncoding) { - return nil, fmt.Errorf( - "index %d has incompatible text encoding %v (expected %v)", - i+1, idx.Metadata.TextDocumentEncoding, encoding) + if e := idx.Metadata.TextDocumentEncoding; e != scip.TextEncoding_UnspecifiedTextEncoding { + if encoding == scip.TextEncoding_UnspecifiedTextEncoding { + encoding = e + } else if encoding != e { + return nil, fmt.Errorf( + "index %d has incompatible text encoding %v (expected %v)", + i, e, encoding) + } } - // Promote from Unspecified to a concrete encoding if one input has it. - if encoding == scip.TextEncoding_UnspecifiedTextEncoding { - encoding = idx.Metadata.TextDocumentEncoding + u, err := parseRootURI(idx.Metadata.ProjectRoot) + if err != nil { + return nil, fmt.Errorf("index %d: %w", i, err) } + roots[i] = u } - // Determine the output project_root and per-input prefix. - outputRoot, prefixes, err := computeOutputRootAndPrefixes(indexes, projectRootOverride) + // Determine the output project_root and the per-input prefix. + outputURL, prefixes, err := planPaths(roots, projectRootOverride) if err != nil { return nil, err } - // Aggregate documents (rewriting paths) and external symbols. + // Aggregate documents (rewriting relative_path) and external symbols. var allDocuments []*scip.Document var allExternalSymbols []*scip.SymbolInformation for i, idx := range indexes { for _, doc := range idx.Documents { if prefixes[i] != "" { - if doc.RelativePath == "" { - doc.RelativePath = prefixes[i] - } else { - doc.RelativePath = path.Join(prefixes[i], doc.RelativePath) - } + doc.RelativePath = path.Join(prefixes[i], doc.RelativePath) } allDocuments = append(allDocuments, doc) } allExternalSymbols = append(allExternalSymbols, idx.ExternalSymbols...) } - mergedDocuments := scip.FlattenDocuments(allDocuments) - mergedExternalSymbols := scip.FlattenSymbols(allExternalSymbols) - return &scip.Index{ Metadata: &scip.Metadata{ Version: protoVersion, @@ -169,168 +163,99 @@ func mergeIndexes(indexes []*scip.Index, projectRootOverride string) (*scip.Inde Version: strings.TrimSpace(version), Arguments: []string{"merge"}, }, - ProjectRoot: outputRoot, + ProjectRoot: outputURL.String(), TextDocumentEncoding: encoding, }, - Documents: mergedDocuments, - ExternalSymbols: mergedExternalSymbols, + Documents: scip.FlattenDocuments(allDocuments), + ExternalSymbols: scip.FlattenSymbols(allExternalSymbols), }, nil } -// encodingsCompatible reports whether two TextEncoding values can be merged. -// Unspecified is treated as compatible with any value. -func encodingsCompatible(a, b scip.TextEncoding) bool { - if a == scip.TextEncoding_UnspecifiedTextEncoding || - b == scip.TextEncoding_UnspecifiedTextEncoding { - return true - } - return a == b -} - -// computeOutputRootAndPrefixes returns the URI to use as the merged index's -// project_root, together with one path-prefix per input index. The prefix for -// input i is what should be prepended to each document's relative_path so that -// it remains relative to the output root. -// -// If projectRootOverride is non-empty, it is used directly and each input root -// must be a descendant of it. Otherwise, the output root is the common URI -// ancestor of all inputs. -func computeOutputRootAndPrefixes(indexes []*scip.Index, projectRootOverride string) (string, []string, error) { - roots := make([]string, len(indexes)) - for i, idx := range indexes { - roots[i] = idx.Metadata.ProjectRoot +// parseRootURI parses a project_root URI and normalizes its path component +// (trailing slashes trimmed, "." / ".." resolved, "" replaced with "/"). +func parseRootURI(raw string) (*url.URL, error) { + u, err := url.Parse(raw) + if err != nil { + return nil, fmt.Errorf("parsing project_root %q: %w", raw, err) } - - var outputRoot string - if projectRootOverride != "" { - outputRoot = projectRootOverride + if u.Path == "" { + u.Path = "/" } else { - var err error - outputRoot, err = commonAncestorURI(roots) - if err != nil { - return "", nil, err - } - } - - prefixes := make([]string, len(indexes)) - for i, r := range roots { - prefix, err := relativePrefix(outputRoot, r) - if err != nil { - return "", nil, fmt.Errorf( - "index %d project_root %q is not under output root %q: %w", - i, r, outputRoot, err) - } - prefixes[i] = prefix + u.Path = path.Clean(u.Path) } - return outputRoot, prefixes, nil + return u, nil } -// commonAncestorURI returns the longest URI that is an ancestor of every input -// URI. All inputs must share the same scheme and host. -func commonAncestorURI(roots []string) (string, error) { - if len(roots) == 0 { - return "", errors.New("no project roots provided") - } - - first, err := url.Parse(roots[0]) - if err != nil { - return "", fmt.Errorf("parsing project_root %q: %w", roots[0], err) - } - - commonScheme := first.Scheme - commonHost := first.Host - commonPath := normalizeURIPath(first.Path) - - for _, r := range roots[1:] { - u, err := url.Parse(r) +// planPaths returns the output project_root URL and the prefix that must be +// prepended to each input's document relative_paths. If override is non-empty +// it becomes the output root and every input root must be a descendant of it; +// otherwise the output root is the common URI ancestor of all inputs. +func planPaths(inputs []*url.URL, override string) (*url.URL, []string, error) { + var root *url.URL + if override != "" { + u, err := parseRootURI(override) if err != nil { - return "", fmt.Errorf("parsing project_root %q: %w", r, err) + return nil, nil, fmt.Errorf("invalid --project-root: %w", err) } - if u.Scheme != commonScheme { - return "", fmt.Errorf( - "incompatible URI schemes %q and %q across inputs; "+ - "pass --project-root to override", - commonScheme, u.Scheme) - } - if u.Host != commonHost { - return "", fmt.Errorf( - "incompatible URI hosts %q and %q across inputs; "+ - "pass --project-root to override", - commonHost, u.Host) + root = u + } else { + root = inputs[0] + for _, u := range inputs[1:] { + if u.Scheme != root.Scheme || u.Host != root.Host { + return nil, nil, fmt.Errorf( + "inputs have incompatible URI scheme/host: %q vs %q; "+ + "pass --project-root to override", + root, u) + } + root = &url.URL{Scheme: root.Scheme, Host: root.Host, Path: commonPath(root.Path, u.Path)} } - commonPath = commonPathPrefix(commonPath, normalizeURIPath(u.Path)) } - out := url.URL{ - Scheme: commonScheme, - Host: commonHost, - Path: commonPath, + prefixes := make([]string, len(inputs)) + for i, u := range inputs { + if u.Scheme != root.Scheme || u.Host != root.Host { + return nil, nil, fmt.Errorf( + "index %d project_root %q has different scheme/host than output root %q", + i, u, root) + } + rel, err := relativeTo(root.Path, u.Path) + if err != nil { + return nil, nil, fmt.Errorf("index %d project_root %q: %w", i, u, err) + } + prefixes[i] = rel } - return out.String(), nil + return root, prefixes, nil } -// normalizeURIPath trims trailing slashes while preserving the root "/". -func normalizeURIPath(p string) string { - if p == "" || p == "/" { - return p +// commonPath returns the longest slash-bounded ancestor of both a and b. +// Inputs must be absolute (start with "/") and cleaned. +func commonPath(a, b string) string { + for !isAncestor(a, b) { + a = path.Dir(a) } - return strings.TrimRight(p, "/") + return a } -// commonPathPrefix returns the longest path prefix of a and b along "/" -// component boundaries. -func commonPathPrefix(a, b string) string { - aParts := strings.Split(a, "/") - bParts := strings.Split(b, "/") - n := len(aParts) - if len(bParts) < n { - n = len(bParts) - } - i := 0 - for ; i < n; i++ { - if aParts[i] != bParts[i] { - break - } - } - common := strings.Join(aParts[:i], "/") - // If only the leading empty element matched, both paths were absolute and - // the only shared ancestor is the root. - if common == "" && i > 0 && strings.HasPrefix(a, "/") && strings.HasPrefix(b, "/") { - return "/" +// isAncestor reports whether parent is an ancestor of (or equal to) child. +// Inputs must be cleaned. +func isAncestor(parent, child string) bool { + if parent == "/" { + return strings.HasPrefix(child, "/") } - return common + return child == parent || strings.HasPrefix(child, parent+"/") } -// relativePrefix returns the path that, when prepended to a relative_path under -// the input root, makes it relative to the output root. -func relativePrefix(outputRoot, inputRoot string) (string, error) { - outURL, err := url.Parse(outputRoot) - if err != nil { - return "", fmt.Errorf("parsing output project_root %q: %w", outputRoot, err) - } - inURL, err := url.Parse(inputRoot) - if err != nil { - return "", fmt.Errorf("parsing input project_root %q: %w", inputRoot, err) - } - if outURL.Scheme != inURL.Scheme { - return "", fmt.Errorf("scheme mismatch: %q vs %q", outURL.Scheme, inURL.Scheme) - } - if outURL.Host != inURL.Host { - return "", fmt.Errorf("host mismatch: %q vs %q", outURL.Host, inURL.Host) - } - - outPath := normalizeURIPath(outURL.Path) - inPath := normalizeURIPath(inURL.Path) - - if outPath == inPath { +// relativeTo returns child's path relative to parent (no leading slash). +// Returns an error if parent is not an ancestor of child. +func relativeTo(parent, child string) (string, error) { + if child == parent { return "", nil } - // outPath must be a directory prefix of inPath. - if outPath == "/" { - return strings.TrimLeft(inPath, "/"), nil + if parent == "/" { + return strings.TrimPrefix(child, "/"), nil } - if !strings.HasPrefix(inPath, outPath+"/") { - return "", fmt.Errorf("%q is not under %q", inPath, outPath) + if rel, ok := strings.CutPrefix(child, parent+"/"); ok { + return rel, nil } - return strings.TrimPrefix(inPath, outPath+"/"), nil + return "", fmt.Errorf("%q is not under %q", child, parent) } diff --git a/cmd/scip/merge_test.go b/cmd/scip/merge_test.go index c7961037..d58c6502 100644 --- a/cmd/scip/merge_test.go +++ b/cmd/scip/merge_test.go @@ -1,6 +1,7 @@ package main import ( + "net/url" "os" "path/filepath" "sort" @@ -12,42 +13,56 @@ import ( "github.com/scip-code/scip/bindings/go/scip" ) -func TestMerge_CommonAncestorURI(t *testing.T) { +func TestMerge_PlanPaths(t *testing.T) { cases := []struct { - name string - roots []string - want string - wantErr bool + name string + roots []string + override string + wantRoot string + wantPrefixes []string + wantErr bool }{ { - name: "identical roots", - roots: []string{"file:///repo", "file:///repo"}, - want: "file:///repo", + name: "identical roots", + roots: []string{"file:///repo", "file:///repo"}, + wantRoot: "file:///repo", + wantPrefixes: []string{"", ""}, + }, + { + name: "sibling subdirs", + roots: []string{"file:///repo/frontend", "file:///repo/backend"}, + wantRoot: "file:///repo", + wantPrefixes: []string{"frontend", "backend"}, }, { - name: "sibling subdirs", - roots: []string{"file:///repo/frontend", "file:///repo/backend"}, - want: "file:///repo", + name: "one is ancestor of the other", + roots: []string{"file:///repo", "file:///repo/sub"}, + wantRoot: "file:///repo", + wantPrefixes: []string{"", "sub"}, }, { - name: "one is ancestor of the other", - roots: []string{"file:///repo", "file:///repo/sub"}, - want: "file:///repo", + name: "deep nesting", + roots: []string{"file:///repo/a/b/c", "file:///repo/a/b/d", "file:///repo/a/e"}, + wantRoot: "file:///repo/a", + wantPrefixes: []string{"b/c", "b/d", "e"}, }, { - name: "deep nesting", - roots: []string{"file:///repo/a/b/c", "file:///repo/a/b/d", "file:///repo/a/e"}, - want: "file:///repo/a", + name: "common component is filesystem root", + roots: []string{"file:///alpha/x", "file:///beta/y"}, + wantRoot: "file:///", + wantPrefixes: []string{"alpha/x", "beta/y"}, }, { - name: "common component is filesystem root", - roots: []string{"file:///alpha/x", "file:///beta/y"}, - want: "file:///", + name: "trailing slashes normalized", + roots: []string{"file:///repo/", "file:///repo/sub"}, + wantRoot: "file:///repo", + wantPrefixes: []string{"", "sub"}, }, { - name: "trailing slashes normalized", - roots: []string{"file:///repo/", "file:///repo/sub"}, - want: "file:///repo", + name: "prefix-like names don't share parent", + roots: []string{"file:///repo/frontend", "file:///repo/frontend-v2"}, + wantRoot: "file:///repo", + wantPrefixes: []string{"frontend", "frontend-v2"}, }, { name: "different schemes error", @@ -60,91 +75,61 @@ func TestMerge_CommonAncestorURI(t *testing.T) { wantErr: true, }, { - name: "non-overlapping but same scheme is ancestor=/", - roots: []string{"file:///repo/frontend", "file:///different-name/backend"}, - want: "file:///", + name: "override at a common ancestor", + roots: []string{"file:///workspace/repo/frontend", "file:///workspace/repo/backend"}, + override: "file:///workspace", + wantRoot: "file:///workspace", + wantPrefixes: []string{"repo/frontend", "repo/backend"}, }, { - name: "prefix-like names don't share parent", - roots: []string{"file:///repo/frontend", "file:///repo/frontend-v2"}, - want: "file:///repo", + name: "override must be ancestor of all inputs", + roots: []string{"file:///repo/frontend", "file:///elsewhere/backend"}, + override: "file:///repo", + wantErr: true, }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { - got, err := commonAncestorURI(c.roots) + parsed := make([]*url.URL, len(c.roots)) + for i, r := range c.roots { + u, err := parseRootURI(r) + require.NoError(t, err) + parsed[i] = u + } + gotURL, gotPrefixes, err := planPaths(parsed, c.override) if c.wantErr { require.Error(t, err) return } require.NoError(t, err) - require.Equal(t, c.want, got) + require.Equal(t, c.wantRoot, gotURL.String()) + require.Equal(t, c.wantPrefixes, gotPrefixes) }) } } -func TestMerge_RelativePrefix(t *testing.T) { +func TestMerge_RelativeTo(t *testing.T) { cases := []struct { - name string - outputRoot string - inputRoot string - want string - wantErr bool + name string + parent string + child string + want string + wantErr bool }{ - { - name: "same root", - outputRoot: "file:///repo", - inputRoot: "file:///repo", - want: "", - }, - { - name: "direct child", - outputRoot: "file:///repo", - inputRoot: "file:///repo/sub", - want: "sub", - }, - { - name: "deep child", - outputRoot: "file:///repo", - inputRoot: "file:///repo/a/b/c", - want: "a/b/c", - }, - { - name: "input is ancestor (error)", - outputRoot: "file:///repo/sub", - inputRoot: "file:///repo", - wantErr: true, - }, - { - name: "siblings (error)", - outputRoot: "file:///repo/frontend", - inputRoot: "file:///repo/backend", - wantErr: true, - }, - { - name: "prefix-like but not actual ancestor", - outputRoot: "file:///repo/frontend", - inputRoot: "file:///repo/frontend-v2", - wantErr: true, - }, - { - name: "output is filesystem root", - outputRoot: "file:///", - inputRoot: "file:///alpha/x", - want: "alpha/x", - }, - { - name: "trailing slash on input", - outputRoot: "file:///repo", - inputRoot: "file:///repo/sub/", - want: "sub", - }, + {"same", "/repo", "/repo", "", false}, + {"direct child", "/repo", "/repo/sub", "sub", false}, + {"deep child", "/repo", "/repo/a/b/c", "a/b/c", false}, + {"parent is filesystem root", "/", "/alpha/x", "alpha/x", false}, + {"parent equals root and child equals root", "/", "/", "", false}, + {"input is ancestor (error)", "/repo/sub", "/repo", "", true}, + {"siblings (error)", "/repo/frontend", "/repo/backend", "", true}, + {"prefix-like but not ancestor", "/repo/frontend", "/repo/frontend-v2", "", true}, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { - got, err := relativePrefix(c.outputRoot, c.inputRoot) + got, err := relativeTo(c.parent, c.child) if c.wantErr { require.Error(t, err) return @@ -317,6 +302,69 @@ func TestMergeIndexes_IncompatibleEncoding(t *testing.T) { require.Error(t, err) } +func TestMergeIndexes_IncompatibleProtocolVersion(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF8, + Version: scip.ProtocolVersion_UnspecifiedProtocolVersion, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF8, + Version: scip.ProtocolVersion(42), // imaginary future version + }, + } + + _, err := mergeIndexes([]*scip.Index{a, b}, "") + require.Error(t, err) + require.Contains(t, err.Error(), "protocol version") +} + +func TestMergeIndexes_MissingMetadata(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + } + b := &scip.Index{Metadata: nil} + + _, err := mergeIndexes([]*scip.Index{a, b}, "") + require.Error(t, err) + require.Contains(t, err.Error(), "metadata") +} + +func TestMergeIndexes_EmptyRelativePathGetsPrefix(t *testing.T) { + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo/frontend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + {RelativePath: ""}, // empty -- represents the project root itself + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo/backend", + TextDocumentEncoding: scip.TextEncoding_UTF8, + }, + Documents: []*scip.Document{ + {RelativePath: "main.go"}, + }, + } + + merged, err := mergeIndexes([]*scip.Index{a, b}, "") + require.NoError(t, err) + require.Equal(t, "file:///repo", merged.Metadata.ProjectRoot) + require.ElementsMatch(t, + []string{"frontend", "backend/main.go"}, + docPaths(merged)) +} + func TestMergeIndexes_UnspecifiedEncodingPromotes(t *testing.T) { a := &scip.Index{ Metadata: &scip.Metadata{ From bbdd1590ffbc3882bd554e216dd517da13cb29d1 Mon Sep 17 00:00:00 2001 From: jupblb Date: Tue, 19 May 2026 17:21:28 +0200 Subject: [PATCH 3/3] Treat TextEncoding Unspecified as a distinct value in scip merge Previously mergeIndexes treated TextEncoding_UnspecifiedTextEncoding as a wildcard: an Unspecified input would be silently 'promoted' to the encoding of another input, and the merged metadata would be labeled with that concrete encoding. This silently mislabels source files whose encoding was never actually known. Now all inputs must agree exactly on TextDocumentEncoding (Unspecified counts as itself). Same for ProtocolVersion. Tests updated to cover both the new error case (Unspecified mixed with a concrete encoding) and the preservation case (all Unspecified). --- cmd/scip/merge.go | 29 +++++++++++++---------------- cmd/scip/merge_test.go | 29 +++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/cmd/scip/merge.go b/cmd/scip/merge.go index 57645149..428c5d13 100644 --- a/cmd/scip/merge.go +++ b/cmd/scip/merge.go @@ -97,19 +97,20 @@ func mergeMain(inputs []string, flags mergeFlags) error { return nil } -// mergeIndexes combines multiple SCIP indexes into one. It determines the -// output project_root as the common URI ancestor of the inputs' project_root -// values (or projectRootOverride if non-empty), rewrites each -// Document.relative_path to be relative to that root, and deduplicates -// documents and external symbols. func mergeIndexes(indexes []*scip.Index, projectRootOverride string) (*scip.Index, error) { if len(indexes) == 0 { return nil, errors.New("no indexes to merge") } - // Validate metadata and pick the merged ProtocolVersion / TextEncoding. - protoVersion := indexes[0].Metadata.GetVersion() - encoding := scip.TextEncoding_UnspecifiedTextEncoding + // Validate metadata. All inputs must agree exactly on ProtocolVersion and + // TextDocumentEncoding (Unspecified is treated as a distinct value, not a + // wildcard, to avoid silently mislabeling an index whose encoding was + // unknown). + if indexes[0].Metadata == nil { + return nil, errors.New("index 0 has no metadata") + } + protoVersion := indexes[0].Metadata.Version + encoding := indexes[0].Metadata.TextDocumentEncoding roots := make([]*url.URL, len(indexes)) for i, idx := range indexes { if idx.Metadata == nil { @@ -120,14 +121,10 @@ func mergeIndexes(indexes []*scip.Index, projectRootOverride string) (*scip.Inde "index %d has incompatible protocol version %v (expected %v)", i, idx.Metadata.Version, protoVersion) } - if e := idx.Metadata.TextDocumentEncoding; e != scip.TextEncoding_UnspecifiedTextEncoding { - if encoding == scip.TextEncoding_UnspecifiedTextEncoding { - encoding = e - } else if encoding != e { - return nil, fmt.Errorf( - "index %d has incompatible text encoding %v (expected %v)", - i, e, encoding) - } + if idx.Metadata.TextDocumentEncoding != encoding { + return nil, fmt.Errorf( + "index %d has incompatible text encoding %v (expected %v)", + i, idx.Metadata.TextDocumentEncoding, encoding) } u, err := parseRootURI(idx.Metadata.ProjectRoot) if err != nil { diff --git a/cmd/scip/merge_test.go b/cmd/scip/merge_test.go index d58c6502..20e3d961 100644 --- a/cmd/scip/merge_test.go +++ b/cmd/scip/merge_test.go @@ -365,7 +365,11 @@ func TestMergeIndexes_EmptyRelativePathGetsPrefix(t *testing.T) { docPaths(merged)) } -func TestMergeIndexes_UnspecifiedEncodingPromotes(t *testing.T) { +func TestMergeIndexes_UnspecifiedAndConcreteEncodingMismatch(t *testing.T) { + // Unspecified is treated as a distinct value, not a wildcard: mixing it + // with a concrete encoding must error rather than silently labeling the + // merged output with the concrete encoding (which could mislabel the + // Unspecified index's source files). a := &scip.Index{ Metadata: &scip.Metadata{ ProjectRoot: "file:///repo", @@ -379,9 +383,30 @@ func TestMergeIndexes_UnspecifiedEncodingPromotes(t *testing.T) { }, } + _, err := mergeIndexes([]*scip.Index{a, b}, "") + require.Error(t, err) + require.Contains(t, err.Error(), "text encoding") +} + +func TestMergeIndexes_AllUnspecifiedEncodingPreserved(t *testing.T) { + // When every input has the same encoding (including Unspecified), the + // merged output preserves it. + a := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UnspecifiedTextEncoding, + }, + } + b := &scip.Index{ + Metadata: &scip.Metadata{ + ProjectRoot: "file:///repo", + TextDocumentEncoding: scip.TextEncoding_UnspecifiedTextEncoding, + }, + } + merged, err := mergeIndexes([]*scip.Index{a, b}, "") require.NoError(t, err) - require.Equal(t, scip.TextEncoding_UTF8, merged.Metadata.TextDocumentEncoding) + require.Equal(t, scip.TextEncoding_UnspecifiedTextEncoding, merged.Metadata.TextDocumentEncoding) } func TestMergeMain_EndToEnd(t *testing.T) {