Skip to content

Commit 7867a79

Browse files
aksOps and claude authored
fix(enrich): use pipe '|' delimiter for Kuzu COPY staging files (#150)
Kuzu's CSV parser doesn't respect RFC-4180 quoting and counts commas inside JSON property values as field separators. On real-world inputs this aborted BulkLoadEdges with "Copy exception: expected 6 values per row, but got more" — observed on Markdown depends_on edges and Python imports whose properties include {"language":"python", "module":"glob"}.

Switch the staging file delimiter from comma to pipe '|'. JSON syntax never uses '|' as a structural character, so the separator no longer collides with the commas that json.Marshal emits between object members (a string value that itself contains '|' is still written verbatim). Both copyNodeBatch and copyEdgeBatch flip together.

Adds two regression tests with comma-bearing Properties JSON (TestBulkLoadEdgesCommaInProperties + TestBulkLoadNodesCommaInProperties) that fail on main and pass after the fix.

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 91e34c3 commit 7867a79

2 files changed

Lines changed: 89 additions & 2 deletions

File tree

go/internal/graph/bulk.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,12 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error {
7373
// Cleanup runs whether COPY succeeds or fails.
7474
defer os.Remove(tmp.Name())
7575

76+
// Use pipe '|' as the field delimiter so that JSON property values
77+
// containing commas (e.g. {"language":"python","module":"glob"}) are not
78+
// mis-parsed by Kuzu's CSV reader. JSON syntax never uses '|' structurally,
79+
// so it collides far less often than ',' (string values may still contain one).
7680
w := csv.NewWriter(tmp)
81+
w.Comma = '|'
7782
for _, n := range batch {
7883
row, err := encodeNodeRow(n)
7984
if err != nil {
@@ -96,8 +101,9 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error {
96101

97102
// Kuzu COPY FROM with explicit column list. ToSlash for Windows path
98103
// portability — Kuzu's parser accepts forward slashes on all platforms.
104+
// DELIM='|' matches the pipe-separated staging file written above.
99105
q := fmt.Sprintf(
100-
"COPY CodeNode(%s) FROM '%s' (header=false)",
106+
"COPY CodeNode(%s) FROM '%s' (header=false, DELIM='|')",
101107
strings.Join(nodeColumns, ", "),
102108
filepath.ToSlash(tmp.Name()),
103109
)
@@ -226,7 +232,9 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro
226232
}
227233
defer os.Remove(tmp.Name())
228234

235+
// Use pipe '|' as the field delimiter — see copyNodeBatch for the rationale.
229236
w := csv.NewWriter(tmp)
237+
w.Comma = '|'
230238
for _, e := range batch {
231239
props, err := json.Marshal(e.Properties)
232240
if err != nil {
@@ -255,8 +263,9 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro
255263
return fmt.Errorf("graph: csv close: %w", err)
256264
}
257265

266+
// DELIM='|' matches the pipe-separated staging file written above.
258267
q := fmt.Sprintf(
259-
"COPY %s FROM '%s' (header=false)",
268+
"COPY %s FROM '%s' (header=false, DELIM='|')",
260269
relTableName(kind),
261270
filepath.ToSlash(tmp.Name()),
262271
)

go/internal/graph/bulk_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,84 @@ func TestBulkLoadEdgesGroupedByKind(t *testing.T) {
145145
}
146146
}
147147

148+
// TestBulkLoadEdgesCommaInProperties is a regression test for the bug where
149+
// Properties JSON containing commas (e.g. {"language":"python","module":"glob"})
150+
// caused Kuzu's CSV parser to count more fields than expected and abort with
151+
// "Copy exception: expected 6 values per row, but got more". The fix switches
152+
// the staging file to pipe-separated (DELIM='|'), which avoids the collision because
153+
// JSON syntax never uses '|' structurally (string values may still contain one).
154+
func TestBulkLoadEdgesCommaInProperties(t *testing.T) {
155+
s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))
156+
if err != nil {
157+
t.Fatal(err)
158+
}
159+
defer s.Close()
160+
if err := s.ApplySchema(); err != nil {
161+
t.Fatal(err)
162+
}
163+
nodes := []*model.CodeNode{
164+
{ID: "py:file:check_structure.py", Kind: model.NodeModule, Label: "check_structure.py"},
165+
{ID: "py:external:glob", Kind: model.NodeExternal, Label: "glob"},
166+
}
167+
if err := s.BulkLoadNodes(nodes); err != nil {
168+
t.Fatal(err)
169+
}
170+
edges := []*model.CodeEdge{{
171+
ID: "py:file:check_structure.py->py:external:glob:imports",
172+
Kind: model.EdgeImports,
173+
SourceID: "py:file:check_structure.py",
174+
TargetID: "py:external:glob",
175+
Confidence: model.ConfidenceLexical,
176+
Source: "GenericImportsDetector",
177+
Properties: map[string]any{
178+
"language": "python",
179+
"module": "glob",
180+
},
181+
}}
182+
if err := s.BulkLoadEdges(edges); err != nil {
183+
t.Fatalf("BulkLoadEdges with comma-bearing Properties: %v", err)
184+
}
185+
rows, err := s.Cypher("MATCH ()-[r:IMPORTS]->() RETURN r.id AS id")
186+
if err != nil {
187+
t.Fatal(err)
188+
}
189+
if len(rows) != 1 {
190+
t.Fatalf("want 1 IMPORTS row, got %d: %v", len(rows), rows)
191+
}
192+
}
193+
194+
// TestBulkLoadNodesCommaInProperties is a regression test for nodes whose
195+
// props JSON column contains commas — same root cause as the edge variant.
196+
func TestBulkLoadNodesCommaInProperties(t *testing.T) {
197+
s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))
198+
if err != nil {
199+
t.Fatal(err)
200+
}
201+
defer s.Close()
202+
if err := s.ApplySchema(); err != nil {
203+
t.Fatal(err)
204+
}
205+
nodes := []*model.CodeNode{{
206+
ID: "py:file:app.py",
207+
Kind: model.NodeModule,
208+
Label: "app.py",
209+
Properties: map[string]any{
210+
"language": "python",
211+
"module": "flask,requests,os", // value itself contains commas
212+
},
213+
}}
214+
if err := s.BulkLoadNodes(nodes); err != nil {
215+
t.Fatalf("BulkLoadNodes with comma-bearing Properties: %v", err)
216+
}
217+
rows, err := s.Cypher("MATCH (n:CodeNode {id: 'py:file:app.py'}) RETURN n.id AS id")
218+
if err != nil {
219+
t.Fatal(err)
220+
}
221+
if len(rows) != 1 {
222+
t.Fatalf("want 1 node, got %d: %v", len(rows), rows)
223+
}
224+
}
225+
148226
// TestBulkLoadEdgesEmpty — zero edges is a no-op like the node path.
149227
func TestBulkLoadEdgesEmpty(t *testing.T) {
150228
s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))

0 commit comments

Comments
 (0)