Skip to content

Commit 6e4ca93

Browse files
committed
Split 168 functions to cognitive complexity 25, zero lint errors
Lower thresholds to industry defaults: cognitive-complexity 25 (was 250), statements 200 (was 400), lines 400 (was 800). All 168 functions are split into smaller helpers across 44 files. Zero NOLINTNEXTLINE suppressions remain, and there are zero clang-tidy errors. Add clang-tidy to scripts/lint.sh (--ci to skip where unavailable). Fix sqlite_writer B-tree PageRef initialization. Fix Terraform struct parsing, Louvain null guards, const qualifiers, and shadow variables. 2741 tests pass.
1 parent 87c2e14 commit 6e4ca93

40 files changed

Lines changed: 10975 additions & 11119 deletions

internal/cbm/ac.c

Lines changed: 83 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -61,60 +61,11 @@ static void queue_free(Queue *q) {
6161
free(q->data);
6262
}
6363

64-
// cbm_ac_build constructs an Aho-Corasick automaton from a set of patterns.
65-
//
66-
// Parameters:
67-
// patterns — array of pattern pointers (not necessarily NUL-terminated)
68-
// lengths — length of each pattern
69-
// count — number of patterns (max 64 for bitmask mode)
70-
// alpha_map — byte→index mapping (NULL = identity/256). For compact alphabets,
71-
// map relevant chars to 1..N and everything else to 0.
72-
// alpha_size — alphabet size (256 if alpha_map is NULL)
73-
//
74-
// Returns a heap-allocated automaton. Caller must call cbm_ac_free().
75-
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
76-
CBMAutomaton *cbm_ac_build(const char **patterns, const int *lengths, int count,
77-
const uint8_t *alpha_map, int alpha_size) {
78-
if (count <= 0) {
79-
return NULL;
80-
}
81-
if (alpha_size <= 0) {
82-
alpha_size = 256;
83-
}
84-
85-
// Estimate max states: sum of pattern lengths + 1 (root).
86-
int max_states = 1;
87-
for (int i = 0; i < count; i++) {
88-
max_states += lengths[i];
89-
}
90-
91-
CBMAutomaton *ac = (CBMAutomaton *)calloc(1, sizeof(CBMAutomaton));
92-
ac->alpha_size = alpha_size;
93-
ac->num_patterns = count;
94-
95-
// Set up alphabet mapping.
96-
if (alpha_map) {
97-
memcpy(ac->alpha_map, alpha_map, 256);
98-
} else {
99-
for (int i = 0; i < 256; i++) {
100-
ac->alpha_map[i] = (uint8_t)i;
101-
}
102-
}
103-
104-
// Allocate goto table and output arrays.
105-
ac->go_table = (int *)malloc((size_t)max_states * alpha_size * sizeof(int));
106-
memset(ac->go_table, -1, (size_t)max_states * alpha_size * sizeof(int));
107-
ac->output = (uint64_t *)calloc(max_states, sizeof(uint64_t));
108-
ac->output_list = (int *)malloc(max_states * sizeof(int));
109-
ac->output_next = (int *)malloc(max_states * sizeof(int));
110-
for (int i = 0; i < max_states; i++) {
111-
ac->output_list[i] = -1;
112-
ac->output_next[i] = -1;
113-
}
114-
64+
// Phase 1: Build trie (goto function) from patterns. Returns state count.
65+
static int ac_build_trie(CBMAutomaton *ac, const char **patterns, const int *lengths, int count) {
66+
int alpha_size = ac->alpha_size;
11567
int num_states = 1; // state 0 = root
11668

117-
// Phase 1: Build trie (goto function) from patterns.
11869
for (int p = 0; p < count; p++) {
11970
int state = 0;
12071
for (int j = 0; j < lengths[p]; j++) {
@@ -125,11 +76,9 @@ CBMAutomaton *cbm_ac_build(const char **patterns, const int *lengths, int count,
12576
}
12677
state = ac->go_table[idx];
12778
}
128-
// Mark this state as accepting pattern p.
12979
if (p < CBM_AC_MAX_BITMASK) {
13080
ac->output[state] |= (1ULL << p);
13181
}
132-
// Append to output list.
13382
ac->output_list[state] = p;
13483
}
13584

@@ -139,72 +88,124 @@ CBMAutomaton *cbm_ac_build(const char **patterns, const int *lengths, int count,
13988
ac->go_table[c] = 0;
14089
}
14190
}
91+
return num_states;
92+
}
14293

143-
// Phase 2: Build failure function via BFS + compute full goto table.
144-
// We store failure links temporarily in a separate array.
94+
// Phase 2: Build failure function via BFS + compute full goto table.
95+
static void ac_build_failure(CBMAutomaton *ac, int num_states) {
96+
int alpha_size = ac->alpha_size;
14597
int *fail = (int *)calloc(num_states, sizeof(int));
14698

14799
Queue q;
148100
queue_init(&q, num_states);
149101

150-
// Depth-1 states: failure → root.
151102
for (int c = 0; c < alpha_size; c++) {
152-
int s = ac->go_table[c]; // root's goto for c
103+
int s = ac->go_table[c];
153104
if (s != 0) {
154105
fail[s] = 0;
155106
queue_push(&q, s);
156107
}
157108
}
158109

159-
// BFS: compute failure links and fill in missing goto entries.
160110
while (!queue_empty(&q)) {
161111
int r = queue_pop(&q);
162112
for (int c = 0; c < alpha_size; c++) {
163113
int idx = (r * alpha_size) + c;
164114
int s = ac->go_table[idx];
165115
if (s != -1) {
166-
// s exists in trie
167116
fail[s] = ac->go_table[(fail[r] * alpha_size) + c];
168-
// Merge output: dictionary suffix links.
169117
ac->output[s] |= ac->output[fail[s]];
170-
// Chain output list (for >64 pattern mode).
171118
if (ac->output_next[s] == -1 && ac->output_list[fail[s]] != -1) {
172119
ac->output_next[s] = fail[s];
173120
}
174121
queue_push(&q, s);
175122
} else {
176-
// Fill missing transition: follow failure link.
177123
ac->go_table[idx] = ac->go_table[(fail[r] * alpha_size) + c];
178124
}
179125
}
180126
}
181127

182128
free(fail);
183129
queue_free(&q);
130+
}
184131

185-
ac->num_states = num_states;
132+
// Shrink allocations to exact state count.
133+
static void ac_shrink_tables(CBMAutomaton *ac, int num_states, int max_states) {
134+
if (num_states >= max_states) {
135+
return;
136+
}
137+
int alpha_size = ac->alpha_size;
138+
void *tmp;
139+
tmp = realloc(ac->go_table, (size_t)num_states * alpha_size * sizeof(int));
140+
if (tmp) {
141+
ac->go_table = (int *)tmp;
142+
}
143+
tmp = realloc(ac->output, (size_t)num_states * sizeof(uint64_t));
144+
if (tmp) {
145+
ac->output = (uint64_t *)tmp;
146+
}
147+
tmp = realloc(ac->output_list, (size_t)num_states * sizeof(int));
148+
if (tmp) {
149+
ac->output_list = (int *)tmp;
150+
}
151+
tmp = realloc(ac->output_next, (size_t)num_states * sizeof(int));
152+
if (tmp) {
153+
ac->output_next = (int *)tmp;
154+
}
155+
}
186156

187-
// Reallocate to exact size (optional, saves memory for large automatons).
188-
if (num_states < max_states) {
189-
void *tmp;
190-
tmp = realloc(ac->go_table, (size_t)num_states * alpha_size * sizeof(int));
191-
if (tmp) {
192-
ac->go_table = (int *)tmp;
193-
}
194-
tmp = realloc(ac->output, (size_t)num_states * sizeof(uint64_t));
195-
if (tmp) {
196-
ac->output = (uint64_t *)tmp;
197-
}
198-
tmp = realloc(ac->output_list, (size_t)num_states * sizeof(int));
199-
if (tmp) {
200-
ac->output_list = (int *)tmp;
201-
}
202-
tmp = realloc(ac->output_next, (size_t)num_states * sizeof(int));
203-
if (tmp) {
204-
ac->output_next = (int *)tmp;
157+
// cbm_ac_build constructs an Aho-Corasick automaton from a set of patterns.
158+
//
159+
// Parameters:
160+
// patterns — array of pattern pointers (not necessarily NUL-terminated)
161+
// lengths — length of each pattern
162+
// count — number of patterns (max 64 for bitmask mode)
163+
// alpha_map — byte→index mapping (NULL = identity/256). For compact alphabets,
164+
// map relevant chars to 1..N and everything else to 0.
165+
// alpha_size — alphabet size (256 if alpha_map is NULL)
166+
//
167+
// Returns a heap-allocated automaton. Caller must call cbm_ac_free().
168+
CBMAutomaton *cbm_ac_build(const char **patterns, const int *lengths, int count,
169+
const uint8_t *alpha_map, int alpha_size) {
170+
if (count <= 0) {
171+
return NULL;
172+
}
173+
if (alpha_size <= 0) {
174+
alpha_size = 256;
175+
}
176+
177+
int max_states = 1;
178+
for (int i = 0; i < count; i++) {
179+
max_states += lengths[i];
180+
}
181+
182+
CBMAutomaton *ac = (CBMAutomaton *)calloc(1, sizeof(CBMAutomaton));
183+
ac->alpha_size = alpha_size;
184+
ac->num_patterns = count;
185+
186+
if (alpha_map) {
187+
memcpy(ac->alpha_map, alpha_map, 256);
188+
} else {
189+
for (int i = 0; i < 256; i++) {
190+
ac->alpha_map[i] = (uint8_t)i;
205191
}
206192
}
207193

194+
ac->go_table = (int *)malloc((size_t)max_states * alpha_size * sizeof(int));
195+
memset(ac->go_table, -1, (size_t)max_states * alpha_size * sizeof(int));
196+
ac->output = (uint64_t *)calloc(max_states, sizeof(uint64_t));
197+
ac->output_list = (int *)malloc(max_states * sizeof(int));
198+
ac->output_next = (int *)malloc(max_states * sizeof(int));
199+
for (int i = 0; i < max_states; i++) {
200+
ac->output_list[i] = -1;
201+
ac->output_next[i] = -1;
202+
}
203+
204+
int num_states = ac_build_trie(ac, patterns, lengths, count);
205+
ac_build_failure(ac, num_states);
206+
ac->num_states = num_states;
207+
ac_shrink_tables(ac, num_states, max_states);
208+
208209
return ac;
209210
}
210211

0 commit comments

Comments
 (0)