Skip to content

Commit 94841e0

Browse files
fix(k8s): address QA round 1 findings
- [Major 1+2] pass_k8s: always re-extract K8s manifests with CBM_LANG_K8S, discarding any cached YAML result; pass already-read source buffer to handle_k8s_manifest to eliminate the double file read - [Minor 3] CONTRIBUTING.md: fix extractor path to internal/cbm/extract_k8s.c and clarify tree-sitter YAML grammar usage - [Minor 4] language.c: add comment documenting intentional case-sensitive FILENAME_TABLE and case-insensitive cbm_is_kustomize_file() split behaviour - [Minor 5] test_pipeline.c: add k8s_extract_manifest_multidoc test pinning single-document-per-file extraction behaviour - [Minor 6] pass_infrascan: replace strlen with strnlen(content, 4096) in cbm_is_k8s_manifest to bound the scan - [Minor 7] extract_k8s: add "crds" to is_kustomize_list_key - [Nit 8] extract_k8s: remove dead second block_node unwrap in extract_k8s_scalars metadata descent - [Nit 10] extract_k8s: use cbm_arena_strdup for def.label "Resource" Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent e696fcd commit 94841e0

6 files changed

Lines changed: 69 additions & 58 deletions

File tree

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ Language support is split between two layers:
8989
Languages like **Dockerfile**, **docker-compose**, **Kubernetes manifests**, and **Kustomize** do not use tree-sitter grammars. Instead they follow an *infra-pass* pattern:
9090

9191
1. **Detection helpers** in `src/pipeline/pass_infrascan.c` — functions like `cbm_is_dockerfile()`, `cbm_is_k8s_manifest()`, `cbm_is_kustomize_file()` identify files by name and/or content heuristics (e.g., presence of `apiVersion:`).
92-
2. **Custom extractors** in `src/pipeline/extract_k8s.c` (or `extract_infra.c`) — hand-written parsers that walk the raw YAML/text and populate `CBMFileResult` with imports and definitions.
92+
2. **Custom extractors** in `internal/cbm/extract_k8s.c` — tree-sitter-based parsers that walk the YAML AST (using the tree-sitter YAML grammar) and populate `CBMFileResult` with imports and definitions.
9393
3. **Pipeline pass** (`pass_k8s.c`, `pass_infrascan.c`) — calls the extractor and emits graph nodes/edges. K8s manifests emit `Resource` nodes; Kustomize files emit `Module` nodes with `IMPORTS` edges to referenced resource files.
9494

9595
**When adding a new infrastructure language:**

internal/cbm/extract_k8s.c

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ static const char *get_scalar_text(CBMArena *a, TSNode node, const char *source)
5757
static int is_kustomize_list_key(const char *key) {
5858
return (strcmp(key, "resources") == 0 || strcmp(key, "bases") == 0 ||
5959
strcmp(key, "patches") == 0 || strcmp(key, "components") == 0 ||
60-
strcmp(key, "patchesStrategicMerge") == 0);
60+
strcmp(key, "patchesStrategicMerge") == 0 || strcmp(key, "crds") == 0);
6161
}
6262

6363
// ---------------------------------------------------------------------------
@@ -196,10 +196,8 @@ static void extract_k8s_scalars(CBMExtractCtx *ctx, TSNode mapping, char *kind_b
196196
}
197197
} else if (strcmp(key, "metadata") == 0) {
198198
// Descend into metadata block_mapping to find "name"
199+
// val_node is already unwrapped from block_node above.
199200
TSNode meta_mapping = val_node;
200-
if (strcmp(ts_node_type(meta_mapping), "block_node") == 0) {
201-
meta_mapping = ts_node_named_child(meta_mapping, 0);
202-
}
203201
if (ts_node_is_null(meta_mapping) ||
204202
strcmp(ts_node_type(meta_mapping), "block_mapping") != 0) {
205203
continue;
@@ -269,7 +267,7 @@ static void extract_k8s_manifest(CBMExtractCtx *ctx) {
269267
CBMDefinition def = {0};
270268
def.name = cbm_arena_strdup(a, def_name);
271269
def.qualified_name = cbm_arena_sprintf(a, "%s.%s", ctx->module_qn, def_name);
272-
def.label = "Resource";
270+
def.label = cbm_arena_strdup(a, "Resource");
273271
def.file_path = ctx->rel_path;
274272
def.start_line = ts_node_start_point(mapping).row + 1;
275273
def.end_line = ts_node_end_point(mapping).row + 1;

src/discover/language.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ static const filename_entry_t FILENAME_TABLE[] = {
279279
{"meson_options.txt", CBM_LANG_MESON},
280280
{"kustomization.yaml", CBM_LANG_KUSTOMIZE},
281281
{"kustomization.yml", CBM_LANG_KUSTOMIZE},
282+
/* Note: FILENAME_TABLE uses case-sensitive strcmp, so mixed-case variants
283+
* (e.g. "Kustomization.yaml") are not matched here. They fall through to
284+
* CBM_LANG_YAML and are re-classified by cbm_is_kustomize_file() in
285+
* pass_k8s.c, which performs a case-insensitive comparison. This is the
286+
* intended behaviour — no additional entries are needed. */
282287
{".vimrc", CBM_LANG_VIMSCRIPT},
283288
};
284289

src/pipeline/pass_infrascan.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,7 @@ bool cbm_is_k8s_manifest(const char *name, const char *content) {
207207
return false;
208208
}
209209
char buf[4097];
210-
size_t n = strlen(content);
211-
if (n > 4096) {
212-
n = 4096;
213-
}
210+
size_t n = strnlen(content, 4096);
214211
memcpy(buf, content, n);
215212
buf[n] = '\0';
216213
return ci_strstr(buf, "apiVersion:") != NULL;

src/pipeline/pass_k8s.c

Lines changed: 19 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -142,24 +142,15 @@ static void handle_kustomize(cbm_pipeline_ctx_t *ctx, const char *path, const ch
142142

143143
/* ── K8s manifest handler ────────────────────────────────────────── */
144144

145+
/* source/src_len are the already-read file bytes (caller retains ownership and
146+
* must free after this call returns). */
145147
static void handle_k8s_manifest(cbm_pipeline_ctx_t *ctx, const char *path, const char *rel_path,
146-
CBMFileResult *result) {
148+
const char *source, int src_len) {
149+
(void)path; /* retained for symmetry; source is always provided now */
147150
int resource_count = 0;
148-
CBMFileResult *res = result;
149-
bool allocated = false;
150-
151-
if (!res) {
152-
/* Fall back to re-extraction */
153-
int src_len = 0;
154-
char *source = k8s_read_file(path, &src_len);
155-
if (source) {
156-
res = cbm_extract_file(source, src_len, CBM_LANG_K8S, ctx->project_name, rel_path,
157-
CBM_EXTRACT_BUDGET, NULL, NULL);
158-
free(source);
159-
allocated = true;
160-
}
161-
}
162151

152+
CBMFileResult *res = cbm_extract_file(source, src_len, CBM_LANG_K8S, ctx->project_name,
153+
rel_path, CBM_EXTRACT_BUDGET, NULL, NULL);
163154
if (!res) {
164155
return;
165156
}
@@ -191,9 +182,7 @@ static void handle_k8s_manifest(cbm_pipeline_ctx_t *ctx, const char *path, const
191182
resource_count++;
192183
}
193184

194-
if (allocated) {
195-
cbm_free_result(res);
196-
}
185+
cbm_free_result(res);
197186

198187
cbm_log_info("pass.k8s.manifest", "file", rel_path, "resources", itoa_k8s(resource_count));
199188
}
@@ -226,38 +215,20 @@ int cbm_pipeline_pass_k8s(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files,
226215
handle_kustomize(ctx, path, rel, cached);
227216
kustomize_count++;
228217
} else if (lang == CBM_LANG_YAML || lang == CBM_LANG_K8S) {
229-
/* Need source content for cbm_is_k8s_manifest check */
230-
if (cached) {
231-
/* Use cached result — the file is a k8s manifest if lang is CBM_LANG_K8S,
232-
* or if we check the source. With a cached result available, trust the
233-
* language field set during discovery. */
234-
if (lang == CBM_LANG_K8S) {
235-
handle_k8s_manifest(ctx, path, rel, cached);
218+
/* Read source once to classify (and reuse for uncached extraction). */
219+
int src_len = 0;
220+
char *source = k8s_read_file(path, &src_len);
221+
if (source) {
222+
if (cbm_is_k8s_manifest(base, source)) {
223+
/* Always re-extract with CBM_LANG_K8S regardless of any cached
224+
* result: cached results were produced during the parallel YAML
225+
* pass and contain no "Resource" definitions. Pass the already-
226+
* read source buffer so handle_k8s_manifest does not re-read. */
227+
(void)cached; /* cached YAML result intentionally discarded */
228+
handle_k8s_manifest(ctx, path, rel, source, src_len);
236229
manifest_count++;
237-
} else {
238-
/* CBM_LANG_YAML: need source to confirm apiVersion presence */
239-
int src_len = 0;
240-
char *source = k8s_read_file(path, &src_len);
241-
if (source) {
242-
if (cbm_is_k8s_manifest(base, source)) {
243-
handle_k8s_manifest(ctx, path, rel, cached);
244-
manifest_count++;
245-
}
246-
free(source);
247-
}
248-
}
249-
} else {
250-
/* No cached result — read source to classify */
251-
int src_len = 0;
252-
char *source = k8s_read_file(path, &src_len);
253-
if (source) {
254-
if (cbm_is_k8s_manifest(base, source)) {
255-
/* Pass NULL result — handle_k8s_manifest will re-extract */
256-
handle_k8s_manifest(ctx, path, rel, NULL);
257-
manifest_count++;
258-
}
259-
free(source);
260230
}
231+
free(source);
261232
}
262233
}
263234
}

tests/test_pipeline.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4232,6 +4232,45 @@ TEST(k8s_extract_manifest_no_name) {
42324232
PASS();
42334233
}
42344234

4235+
TEST(k8s_extract_manifest_multidoc) {
4236+
/* Two-document YAML separated by "---".
4237+
* extract_k8s_manifest contains a "break" after the first successful push,
4238+
* so it processes only the first document that has both kind and
4239+
* metadata.name. This test pins that behaviour: the first document's
4240+
* resource must be present and no crash must occur.
4241+
*
4242+
* Note: with some tree-sitter YAML grammar versions the root stream may
4243+
* expose both documents as siblings; the break still fires after the first
4244+
* successful def push, so defs.count must be exactly 1. */
4245+
const char *src =
4246+
"apiVersion: apps/v1\n"
4247+
"kind: Deployment\n"
4248+
"metadata:\n"
4249+
" name: my-app\n"
4250+
"---\n"
4251+
"apiVersion: v1\n"
4252+
"kind: Service\n"
4253+
"metadata:\n"
4254+
" name: my-svc\n";
4255+
CBMFileResult *r = cbm_extract_file(src, (int)strlen(src), CBM_LANG_K8S,
4256+
"myproj", "k8s/multi.yaml", 0, NULL, NULL);
4257+
ASSERT(r != NULL);
4258+
ASSERT(!r->has_error);
4259+
/* First document's resource must be present */
4260+
int found = 0;
4261+
for (int i = 0; i < r->defs.count; i++) {
4262+
if (r->defs.items[i].label && strcmp(r->defs.items[i].label, "Resource") == 0 &&
4263+
r->defs.items[i].name && strcmp(r->defs.items[i].name, "Deployment/my-app") == 0) {
4264+
found = 1;
4265+
}
4266+
}
4267+
ASSERT(found);
4268+
/* At least one def, no more than one (only first document processed) */
4269+
ASSERT(r->defs.count >= 1);
4270+
cbm_free_result(r);
4271+
PASS();
4272+
}
4273+
42354274
/* ── Envscan tests (port of envscan_test.go) ───────────────────── */
42364275

42374276
/* Helper: write a file inside a temp dir */
@@ -5182,6 +5221,7 @@ SUITE(pipeline) {
51825221
RUN_TEST(k8s_extract_kustomize);
51835222
RUN_TEST(k8s_extract_manifest);
51845223
RUN_TEST(k8s_extract_manifest_no_name);
5224+
RUN_TEST(k8s_extract_manifest_multidoc);
51855225
/* Env URL scanning */
51865226
RUN_TEST(envscan_dockerfile_env_urls);
51875227
RUN_TEST(envscan_shell_env_urls);

0 commit comments

Comments
 (0)