Skip to content

Commit d4ecb22

Browse files
aksOps and claude committed
fix(enrich): explicit QUOTE/ESCAPE so Kuzu COPY honors RFC-4180
PR #150 switched the staging file delimiter to '|' to avoid JSON-property comma collisions. That fixes comma-bearing values but breaks when an ID itself contains a literal '|' — Istio's EDS cluster names are exactly this shape:

    json:istio/none_cds.json:inbound|7070|tcplocal|s1tcp.none

Go's encoding/csv writer DOES wrap such fields in '"' per RFC-4180. But Kuzu's CSV reader defaults to BACKSLASH escaping, not the RFC-4180 doubled-quote form Go produces. With the default Kuzu escape rule, the pipe-bearing quoted field is parsed as multiple fields and the COPY aborts:

    Copy exception: Error in file ... expected 6 values per row, but got more.

Fix: pass `QUOTE='"', ESCAPE='"'` explicitly so Kuzu interprets the RFC-4180 form Go writes. Applies to both copyNodeBatch and copyEdgeBatch.

End-to-end: `codeiq enrich ~/projects/polyglot-bench/istio` now exits 0 (was exit 2 pre-fix): 36k nodes, 55k edges, 20 services.

Regression test TestBulkLoadEdgesPipeInTargetID covers the exact Istio cluster-name shape.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 4a6d82a commit d4ecb22

2 files changed

Lines changed: 59 additions & 4 deletions

File tree

go/internal/graph/bulk.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,17 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error {
101101

102102
// Kuzu COPY FROM with explicit column list. ToSlash for Windows path
103103
// portability — Kuzu's parser accepts forward slashes on all platforms.
104-
// DELIM='|' matches the pipe-separated staging file written above.
104+
//
105+
// DELIM='|' matches the pipe-separated staging file written above. The
106+
// explicit QUOTE/ESCAPE pair overrides Kuzu's default backslash-escape
107+
// behaviour with RFC-4180 (doubled-quote) escaping so that Go's
108+
// encoding/csv writer (which emits "field""with""quotes" form) round-
109+
// trips correctly. Fields containing the delimiter (e.g. Istio service
110+
// names like "inbound|7070|tcplocal|s1tcp.none") are wrapped by the Go
111+
// writer; Kuzu then dequotes them only when the matching escape rule is
112+
// set.
105113
q := fmt.Sprintf(
106-
"COPY CodeNode(%s) FROM '%s' (header=false, DELIM='|')",
114+
`COPY CodeNode(%s) FROM '%s' (header=false, DELIM='|', QUOTE='"', ESCAPE='"')`,
107115
strings.Join(nodeColumns, ", "),
108116
filepath.ToSlash(tmp.Name()),
109117
)
@@ -263,9 +271,10 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro
263271
return fmt.Errorf("graph: csv close: %w", err)
264272
}
265273

266-
// DELIM='|' matches the pipe-separated staging file written above.
274+
// DELIM/QUOTE/ESCAPE — see copyNodeBatch for the rationale (RFC-4180
275+
// round-trip with Go's encoding/csv).
267276
q := fmt.Sprintf(
268-
"COPY %s FROM '%s' (header=false, DELIM='|')",
277+
`COPY %s FROM '%s' (header=false, DELIM='|', QUOTE='"', ESCAPE='"')`,
269278
relTableName(kind),
270279
filepath.ToSlash(tmp.Name()),
271280
)

go/internal/graph/bulk_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,52 @@ func TestBulkLoadNodesCommaInProperties(t *testing.T) {
223223
}
224224
}
225225

226+
// TestBulkLoadEdgesPipeInTargetID is a regression test for Istio-style IDs
227+
// that contain the field delimiter '|' literally (e.g. EDS cluster names
228+
// "inbound|7070|tcplocal|s1tcp.none" parsed from JSON config). Go's csv.Writer
229+
// RFC-4180-wraps such fields in '"', but Kuzu's default ESCAPE is backslash
230+
// not doubled-quote — so without explicit QUOTE='"', ESCAPE='"' the COPY
231+
// FROM splits the wrapped field on each interior '|' and aborts with
232+
// "expected N values per row, but got more". Fix: explicit QUOTE/ESCAPE in
233+
// the COPY FROM clause.
234+
func TestBulkLoadEdgesPipeInTargetID(t *testing.T) {
235+
s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))
236+
if err != nil {
237+
t.Fatal(err)
238+
}
239+
defer s.Close()
240+
if err := s.ApplySchema(); err != nil {
241+
t.Fatal(err)
242+
}
243+
// Istio-flavoured target ID with literal pipes.
244+
target := "json:istio/none_cds.json:inbound|7070|tcplocal|s1tcp.none"
245+
nodes := []*model.CodeNode{
246+
{ID: "json:istio/none_cds.json", Kind: model.NodeModule, Label: "none_cds.json"},
247+
{ID: target, Kind: model.NodeConfigKey, Label: "inbound|7070|tcplocal|s1tcp.none"},
248+
}
249+
if err := s.BulkLoadNodes(nodes); err != nil {
250+
t.Fatalf("BulkLoadNodes with pipe-bearing ID: %v", err)
251+
}
252+
edges := []*model.CodeEdge{{
253+
ID: "json:istio/none_cds.json->" + target,
254+
Kind: model.EdgeContains,
255+
SourceID: "json:istio/none_cds.json",
256+
TargetID: target,
257+
Confidence: model.ConfidenceSyntactic,
258+
Source: "JsonStructureDetector",
259+
}}
260+
if err := s.BulkLoadEdges(edges); err != nil {
261+
t.Fatalf("BulkLoadEdges with pipe-bearing target ID: %v", err)
262+
}
263+
rows, err := s.Cypher("MATCH ()-[r:CONTAINS]->() RETURN r.id AS id")
264+
if err != nil {
265+
t.Fatal(err)
266+
}
267+
if len(rows) != 1 {
268+
t.Fatalf("want 1 CONTAINS row, got %d: %v", len(rows), rows)
269+
}
270+
}
271+
226272
// TestBulkLoadEdgesEmpty — zero edges is a no-op like the node path.
227273
func TestBulkLoadEdgesEmpty(t *testing.T) {
228274
s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))

0 commit comments

Comments
 (0)