Skip to content

Commit 88ddb63

Browse files
feat(k8s): K8s and Kustomize YAML extractors in the CBM layer
- lang_specs.c: add LangSpec entries for CBM_LANG_KUSTOMIZE and CBM_LANG_K8S (both reuse tree_sitter_yaml()); add cbm_ts_language() switch cases
- extract_k8s.c: new file implementing cbm_extract_k8s(); kustomize path walks block_sequence items under resources/bases/patches/components/patchesStrategicMerge and emits CBMImport per scalar; k8s path extracts apiVersion/kind/metadata.name and emits CBMDefinition with label "Resource" and name "Kind/metadata-name"; malformed manifests (missing kind or name) produce zero definitions
- cbm.h: declare cbm_extract_k8s() alongside other sub-extractor entry points
- cbm.c: call cbm_extract_k8s() after unified extraction for the two new langs
- Makefile.cbm: add extract_k8s.c to EXTRACTION_SRCS

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent ee769d5 commit 88ddb63

5 files changed

Lines changed: 306 additions & 0 deletions

File tree

Makefile.cbm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ EXTRACTION_SRCS = \
115115
$(CBM_DIR)/extract_type_refs.c \
116116
$(CBM_DIR)/extract_type_assigns.c \
117117
$(CBM_DIR)/extract_env_accesses.c \
118+
$(CBM_DIR)/extract_k8s.c \
118119
$(CBM_DIR)/helpers.c \
119120
$(CBM_DIR)/lang_specs.c
120121

internal/cbm/cbm.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,11 @@ CBMFileResult *cbm_extract_file(const char *source, int source_len, CBMLanguage
316316
cbm_extract_imports(&ctx);
317317
cbm_extract_unified(&ctx);
318318

319+
// K8s / Kustomize semantic pass (additional structured extraction for YAML-based infra files).
320+
if (ctx.language == CBM_LANG_KUSTOMIZE || ctx.language == CBM_LANG_K8S) {
321+
cbm_extract_k8s(&ctx);
322+
}
323+
319324
// LSP type-aware call resolution
320325
uint64_t lsp_start = now_ns();
321326
if (language == CBM_LANG_GO) {

internal/cbm/cbm.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,4 +363,7 @@ void cbm_extract_type_assigns(CBMExtractCtx *ctx);
363363
// Single-pass unified extraction (replaces the 7 calls above except defs+imports).
364364
void cbm_extract_unified(CBMExtractCtx *ctx);
365365

366+
// K8s / Kustomize semantic extractor (called when language is CBM_LANG_K8S or CBM_LANG_KUSTOMIZE).
367+
void cbm_extract_k8s(CBMExtractCtx *ctx);
368+
366369
#endif // CBM_H

internal/cbm/extract_k8s.c

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
// extract_k8s.c — K8s manifest and Kustomize file extractor.
2+
//
3+
// For CBM_LANG_KUSTOMIZE: walks top-level block_mapping_pair nodes whose key
4+
// matches "resources", "bases", "patches", "components", or
5+
// "patchesStrategicMerge", then emits one CBMImport per block_sequence item.
6+
//
7+
// For CBM_LANG_K8S: finds the kind and metadata.name scalars (apiVersion is not read) in the
8+
// first document's block_mapping and emits one CBMDefinition with label
9+
// "Resource" and name "Kind/metadata-name".
10+
11+
#include "cbm.h"
12+
#include "arena.h"
13+
#include "helpers.h"
14+
#include "tree_sitter/api.h"
15+
#include <stdint.h>
16+
#include <stdio.h>
17+
#include <string.h>
18+
19+
// ---------------------------------------------------------------------------
20+
// Internal helpers
21+
// ---------------------------------------------------------------------------
22+
23+
// Return the raw source text for a scalar node (plain, single-quoted, or
24+
// double-quoted). Surrounding quote characters are stripped for quoted forms.
25+
// Returns NULL for non-scalar node types.
26+
static const char *get_scalar_text(CBMArena *a, TSNode node, const char *source) {
27+
const char *type = ts_node_type(node);
28+
if (strcmp(type, "plain_scalar") == 0) {
29+
return cbm_node_text(a, node, source);
30+
}
31+
if (strcmp(type, "double_quote_scalar") == 0 || strcmp(type, "single_quote_scalar") == 0) {
32+
const char *raw = cbm_node_text(a, node, source);
33+
if (!raw) {
34+
return NULL;
35+
}
36+
size_t len = strlen(raw);
37+
if (len >= 2) {
38+
return cbm_arena_strndup(a, raw + 1, len - 2);
39+
}
40+
return raw;
41+
}
42+
return NULL;
43+
}
44+
45+
// Report whether `key` is one of the Kustomize field names whose value is a
// list of referenced paths: "resources", "bases", "patches", "components",
// or "patchesStrategicMerge". The comparison is exact and case-sensitive.
// Returns 1 on a match and 0 otherwise. `key` must not be NULL.
static int is_kustomize_list_key(const char *key) {
    static const char *const list_keys[] = {
        "resources", "bases", "patches", "components", "patchesStrategicMerge",
    };
    const size_t key_count = sizeof(list_keys) / sizeof(list_keys[0]);

    for (size_t idx = 0; idx < key_count; idx++) {
        if (strcmp(key, list_keys[idx]) == 0) {
            return 1;
        }
    }
    return 0;
}
52+
53+
// ---------------------------------------------------------------------------
54+
// Kustomize extraction
55+
// ---------------------------------------------------------------------------
56+
57+
// Walk a block_sequence node and emit one CBMImport per block_sequence_item
58+
// scalar child, using key_name as the local_name.
59+
static void emit_kustomize_sequence(CBMExtractCtx *ctx, TSNode seq_node,
60+
const char *key_name) {
61+
CBMArena *a = ctx->arena;
62+
uint32_t n = ts_node_child_count(seq_node);
63+
for (uint32_t i = 0; i < n; i++) {
64+
TSNode item = ts_node_child(seq_node, i);
65+
if (strcmp(ts_node_type(item), "block_sequence_item") != 0) {
66+
continue;
67+
}
68+
// block_sequence_item has one named child: the value
69+
uint32_t ic = ts_node_child_count(item);
70+
for (uint32_t j = 0; j < ic; j++) {
71+
TSNode val = ts_node_child(item, j);
72+
const char *scalar = get_scalar_text(a, val, ctx->source);
73+
if (!scalar) {
74+
continue;
75+
}
76+
CBMImport imp = {
77+
.local_name = cbm_arena_strdup(a, key_name),
78+
.module_path = cbm_arena_strdup(a, scalar),
79+
};
80+
cbm_imports_push(&ctx->result->imports, a, imp);
81+
}
82+
}
83+
}
84+
85+
// Kustomize pass: for every "document" child of ctx->root, locate its
// top-level block_mapping and, for each block_mapping_pair whose key is a
// Kustomize list field (see is_kustomize_list_key), hand the pair's
// block_sequence value to emit_kustomize_sequence().
//
// Expected shape: stream -> document -> [block_node ->] block_mapping ->
// block_mapping_pair. Documents or pairs that do not match this shape are
// skipped without emitting anything.
static void extract_kustomize(CBMExtractCtx *ctx) {
    CBMArena *arena = ctx->arena;
    TSNode stream = ctx->root;
    const uint32_t stream_len = ts_node_child_count(stream);

    for (uint32_t doc_idx = 0; doc_idx < stream_len; doc_idx++) {
        TSNode doc = ts_node_child(stream, doc_idx);
        if (strcmp(ts_node_type(doc), "document") != 0) {
            continue;
        }

        // The document's first named child is either the mapping itself or a
        // block_node wrapping it.
        TSNode top = ts_node_named_child(doc, 0);
        if (!ts_node_is_null(top) && strcmp(ts_node_type(top), "block_node") == 0) {
            top = ts_node_named_child(top, 0);
        }
        if (ts_node_is_null(top) || strcmp(ts_node_type(top), "block_mapping") != 0) {
            continue;
        }

        const uint32_t entry_count = ts_node_child_count(top);
        for (uint32_t entry_idx = 0; entry_idx < entry_count; entry_idx++) {
            TSNode entry = ts_node_child(top, entry_idx);
            if (strcmp(ts_node_type(entry), "block_mapping_pair") != 0) {
                continue;
            }

            // Named child 0 of the pair is the key.
            TSNode field = ts_node_named_child(entry, 0);
            if (ts_node_is_null(field)) {
                continue;
            }
            const char *field_name = get_scalar_text(arena, field, ctx->source);
            if (field_name == NULL || !is_kustomize_list_key(field_name)) {
                continue;
            }

            // Named child 1 is the value: a block_sequence, possibly wrapped
            // in a block_node.
            TSNode list = ts_node_named_child(entry, 1);
            if (!ts_node_is_null(list) && strcmp(ts_node_type(list), "block_node") == 0) {
                list = ts_node_named_child(list, 0);
            }
            if (ts_node_is_null(list) || strcmp(ts_node_type(list), "block_sequence") != 0) {
                continue;
            }

            emit_kustomize_sequence(ctx, list, field_name);
        }
    }
}
143+
144+
// ---------------------------------------------------------------------------
145+
// K8s manifest extraction
146+
// ---------------------------------------------------------------------------
147+
148+
// Descend into the first block_mapping of a document and extract apiVersion,
149+
// kind, and metadata.name. Returns void; fills kind_buf and meta_name_buf.
150+
static void extract_k8s_scalars(CBMExtractCtx *ctx, TSNode mapping,
151+
char *kind_buf, size_t kind_sz,
152+
char *meta_name_buf, size_t meta_sz) {
153+
CBMArena *a = ctx->arena;
154+
kind_buf[0] = '\0';
155+
meta_name_buf[0] = '\0';
156+
157+
uint32_t n = ts_node_child_count(mapping);
158+
for (uint32_t i = 0; i < n; i++) {
159+
TSNode pair = ts_node_child(mapping, i);
160+
if (strcmp(ts_node_type(pair), "block_mapping_pair") != 0) {
161+
continue;
162+
}
163+
TSNode key_node = ts_node_named_child(pair, 0);
164+
if (ts_node_is_null(key_node)) {
165+
continue;
166+
}
167+
const char *key = get_scalar_text(a, key_node, ctx->source);
168+
if (!key) {
169+
continue;
170+
}
171+
172+
TSNode val_node = ts_node_named_child(pair, 1);
173+
if (ts_node_is_null(val_node)) {
174+
continue;
175+
}
176+
// Unwrap block_node if present
177+
if (strcmp(ts_node_type(val_node), "block_node") == 0) {
178+
val_node = ts_node_named_child(val_node, 0);
179+
}
180+
if (ts_node_is_null(val_node)) {
181+
continue;
182+
}
183+
184+
if (strcmp(key, "kind") == 0) {
185+
const char *v = get_scalar_text(a, val_node, ctx->source);
186+
if (v) {
187+
snprintf(kind_buf, kind_sz, "%s", v);
188+
}
189+
} else if (strcmp(key, "metadata") == 0) {
190+
// Descend into metadata block_mapping to find "name"
191+
TSNode meta_mapping = val_node;
192+
if (strcmp(ts_node_type(meta_mapping), "block_node") == 0) {
193+
meta_mapping = ts_node_named_child(meta_mapping, 0);
194+
}
195+
if (ts_node_is_null(meta_mapping) ||
196+
strcmp(ts_node_type(meta_mapping), "block_mapping") != 0) {
197+
continue;
198+
}
199+
uint32_t mn = ts_node_child_count(meta_mapping);
200+
for (uint32_t mi = 0; mi < mn; mi++) {
201+
TSNode mpair = ts_node_child(meta_mapping, mi);
202+
if (strcmp(ts_node_type(mpair), "block_mapping_pair") != 0) {
203+
continue;
204+
}
205+
TSNode mkey = ts_node_named_child(mpair, 0);
206+
if (ts_node_is_null(mkey)) {
207+
continue;
208+
}
209+
const char *mkey_text = get_scalar_text(a, mkey, ctx->source);
210+
if (!mkey_text || strcmp(mkey_text, "name") != 0) {
211+
continue;
212+
}
213+
TSNode mval = ts_node_named_child(mpair, 1);
214+
if (ts_node_is_null(mval)) {
215+
continue;
216+
}
217+
const char *meta_name = get_scalar_text(a, mval, ctx->source);
218+
if (meta_name) {
219+
snprintf(meta_name_buf, meta_sz, "%s", meta_name);
220+
}
221+
}
222+
}
223+
}
224+
}
225+
226+
// K8s manifest pass: walk the "document" children of ctx->root and emit a
// single CBMDefinition for the first document whose top-level block_mapping
// yields both a kind and a metadata.name (via extract_k8s_scalars).
//
// The definition is named "<kind>/<metadata.name>", qualified as
// "<ctx->module_qn>.<name>", labelled "Resource", and spans the 1-based
// start/end rows of the mapping node. Documents without a block_mapping, or
// missing either field, are skipped; scanning stops after the first emitted
// definition.
static void extract_k8s_manifest(CBMExtractCtx *ctx) {
    CBMArena *arena = ctx->arena;
    TSNode stream = ctx->root;
    const uint32_t stream_len = ts_node_child_count(stream);

    for (uint32_t doc_idx = 0; doc_idx < stream_len; doc_idx++) {
        TSNode doc = ts_node_child(stream, doc_idx);
        if (strcmp(ts_node_type(doc), "document") != 0) {
            continue;
        }

        // First named child is the mapping, possibly wrapped in a block_node.
        TSNode top = ts_node_named_child(doc, 0);
        if (!ts_node_is_null(top) && strcmp(ts_node_type(top), "block_node") == 0) {
            top = ts_node_named_child(top, 0);
        }
        if (ts_node_is_null(top) || strcmp(ts_node_type(top), "block_mapping") != 0) {
            continue;
        }

        char kind[256] = {0};
        char resource_name[256] = {0};
        extract_k8s_scalars(ctx, top, kind, sizeof(kind), resource_name, sizeof(resource_name));

        // A manifest without kind or metadata.name is treated as malformed.
        const int has_kind = kind[0] != '\0';
        const int has_name = resource_name[0] != '\0';
        if (!(has_kind && has_name)) {
            continue;
        }

        char display_name[512];
        snprintf(display_name, sizeof(display_name), "%s/%s", kind, resource_name);

        CBMDefinition resource = {0};
        resource.name = cbm_arena_strdup(arena, display_name);
        resource.qualified_name = cbm_arena_sprintf(arena, "%s.%s", ctx->module_qn, display_name);
        resource.label = "Resource";
        resource.file_path = ctx->rel_path;
        resource.start_line = ts_node_start_point(top).row + 1;
        resource.end_line = ts_node_end_point(top).row + 1;
        cbm_defs_push(&ctx->result->defs, arena, resource);

        // Only one Resource definition is emitted per file.
        return;
    }
}
273+
274+
// ---------------------------------------------------------------------------
275+
// Public entry point
276+
// ---------------------------------------------------------------------------
277+
278+
// Public entry point for the K8s / Kustomize semantic pass. Dispatches on
// ctx->language: CBM_LANG_KUSTOMIZE runs extract_kustomize(), CBM_LANG_K8S
// runs extract_k8s_manifest(); any other language is a no-op.
void cbm_extract_k8s(CBMExtractCtx *ctx) {
    switch (ctx->language) {
    case CBM_LANG_KUSTOMIZE:
        extract_kustomize(ctx);
        break;
    case CBM_LANG_K8S:
        extract_k8s_manifest(ctx);
        break;
    default:
        break;
    }
}

internal/cbm/lang_specs.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,6 +1041,16 @@ static const CBMLangSpec lang_specs[CBM_LANG_COUNT] = {
10411041
{CBM_LANG_WOLFRAM, wolfram_func_types, empty_types, empty_types, wolfram_module_types,
10421042
wolfram_call_types, wolfram_import_types, empty_types, empty_types, empty_types, empty_types,
10431043
empty_types, NULL, empty_types, NULL, NULL},
1044+
1045+
// CBM_LANG_KUSTOMIZE — reuses YAML grammar; semantic extraction via cbm_extract_k8s()
1046+
{CBM_LANG_KUSTOMIZE, empty_types, empty_types, empty_types, yaml_module_types, empty_types,
1047+
empty_types, empty_types, empty_types, empty_types, empty_types, empty_types, NULL,
1048+
empty_types, NULL, NULL},
1049+
1050+
// CBM_LANG_K8S — reuses YAML grammar; semantic extraction via cbm_extract_k8s()
1051+
{CBM_LANG_K8S, empty_types, empty_types, empty_types, yaml_module_types, empty_types,
1052+
empty_types, empty_types, empty_types, empty_types, empty_types, empty_types, NULL,
1053+
empty_types, NULL, NULL},
10441054
};
10451055

10461056
const CBMLangSpec *cbm_lang_spec(CBMLanguage lang) {
@@ -1180,6 +1190,9 @@ const TSLanguage *cbm_ts_language(CBMLanguage lang) {
11801190
return tree_sitter_magma();
11811191
case CBM_LANG_WOLFRAM:
11821192
return tree_sitter_wolfram();
1193+
case CBM_LANG_KUSTOMIZE:
1194+
case CBM_LANG_K8S:
1195+
return tree_sitter_yaml();
11831196
default:
11841197
return NULL;
11851198
}

0 commit comments

Comments
 (0)