@@ -61,60 +61,11 @@ static void queue_free(Queue *q) {
6161 free (q -> data );
6262}
6363
64- // cbm_ac_build constructs an Aho-Corasick automaton from a set of patterns.
65- //
66- // Parameters:
67- // patterns — array of pattern pointers (not necessarily NUL-terminated)
68- // lengths — length of each pattern
69- // count — number of patterns (max 64 for bitmask mode)
70- // alpha_map — byte→index mapping (NULL = identity/256). For compact alphabets,
71- // map relevant chars to 1..N and everything else to 0.
72- // alpha_size — alphabet size (256 if alpha_map is NULL)
73- //
74- // Returns a heap-allocated automaton. Caller must call cbm_ac_free().
75- // NOLINTNEXTLINE(readability-function-cognitive-complexity)
76- CBMAutomaton * cbm_ac_build (const char * * patterns , const int * lengths , int count ,
77- const uint8_t * alpha_map , int alpha_size ) {
78- if (count <= 0 ) {
79- return NULL ;
80- }
81- if (alpha_size <= 0 ) {
82- alpha_size = 256 ;
83- }
84-
85- // Estimate max states: sum of pattern lengths + 1 (root).
86- int max_states = 1 ;
87- for (int i = 0 ; i < count ; i ++ ) {
88- max_states += lengths [i ];
89- }
90-
91- CBMAutomaton * ac = (CBMAutomaton * )calloc (1 , sizeof (CBMAutomaton ));
92- ac -> alpha_size = alpha_size ;
93- ac -> num_patterns = count ;
94-
95- // Set up alphabet mapping.
96- if (alpha_map ) {
97- memcpy (ac -> alpha_map , alpha_map , 256 );
98- } else {
99- for (int i = 0 ; i < 256 ; i ++ ) {
100- ac -> alpha_map [i ] = (uint8_t )i ;
101- }
102- }
103-
104- // Allocate goto table and output arrays.
105- ac -> go_table = (int * )malloc ((size_t )max_states * alpha_size * sizeof (int ));
106- memset (ac -> go_table , -1 , (size_t )max_states * alpha_size * sizeof (int ));
107- ac -> output = (uint64_t * )calloc (max_states , sizeof (uint64_t ));
108- ac -> output_list = (int * )malloc (max_states * sizeof (int ));
109- ac -> output_next = (int * )malloc (max_states * sizeof (int ));
110- for (int i = 0 ; i < max_states ; i ++ ) {
111- ac -> output_list [i ] = -1 ;
112- ac -> output_next [i ] = -1 ;
113- }
114-
64+ // Phase 1: Build trie (goto function) from patterns. Returns state count.
65+ static int ac_build_trie (CBMAutomaton * ac , const char * * patterns , const int * lengths , int count ) {
66+ int alpha_size = ac -> alpha_size ;
11567 int num_states = 1 ; // state 0 = root
11668
117- // Phase 1: Build trie (goto function) from patterns.
11869 for (int p = 0 ; p < count ; p ++ ) {
11970 int state = 0 ;
12071 for (int j = 0 ; j < lengths [p ]; j ++ ) {
@@ -125,11 +76,9 @@ CBMAutomaton *cbm_ac_build(const char **patterns, const int *lengths, int count,
12576 }
12677 state = ac -> go_table [idx ];
12778 }
128- // Mark this state as accepting pattern p.
12979 if (p < CBM_AC_MAX_BITMASK ) {
13080 ac -> output [state ] |= (1ULL << p );
13181 }
132- // Append to output list.
13382 ac -> output_list [state ] = p ;
13483 }
13584
@@ -139,72 +88,124 @@ CBMAutomaton *cbm_ac_build(const char **patterns, const int *lengths, int count,
13988 ac -> go_table [c ] = 0 ;
14089 }
14190 }
91+ return num_states ;
92+ }
14293
143- // Phase 2: Build failure function via BFS + compute full goto table.
144- // We store failure links temporarily in a separate array.
94+ // Phase 2: Build failure function via BFS + compute full goto table.
95+ static void ac_build_failure (CBMAutomaton * ac , int num_states ) {
96+ int alpha_size = ac -> alpha_size ;
14597 int * fail = (int * )calloc (num_states , sizeof (int ));
14698
14799 Queue q ;
148100 queue_init (& q , num_states );
149101
150- // Depth-1 states: failure → root.
151102 for (int c = 0 ; c < alpha_size ; c ++ ) {
152- int s = ac -> go_table [c ]; // root's goto for c
103+ int s = ac -> go_table [c ];
153104 if (s != 0 ) {
154105 fail [s ] = 0 ;
155106 queue_push (& q , s );
156107 }
157108 }
158109
159- // BFS: compute failure links and fill in missing goto entries.
160110 while (!queue_empty (& q )) {
161111 int r = queue_pop (& q );
162112 for (int c = 0 ; c < alpha_size ; c ++ ) {
163113 int idx = (r * alpha_size ) + c ;
164114 int s = ac -> go_table [idx ];
165115 if (s != -1 ) {
166- // s exists in trie
167116 fail [s ] = ac -> go_table [(fail [r ] * alpha_size ) + c ];
168- // Merge output: dictionary suffix links.
169117 ac -> output [s ] |= ac -> output [fail [s ]];
170- // Chain output list (for >64 pattern mode).
171118 if (ac -> output_next [s ] == -1 && ac -> output_list [fail [s ]] != -1 ) {
172119 ac -> output_next [s ] = fail [s ];
173120 }
174121 queue_push (& q , s );
175122 } else {
176- // Fill missing transition: follow failure link.
177123 ac -> go_table [idx ] = ac -> go_table [(fail [r ] * alpha_size ) + c ];
178124 }
179125 }
180126 }
181127
182128 free (fail );
183129 queue_free (& q );
130+ }
184131
185- ac -> num_states = num_states ;
132+ // Shrink allocations to exact state count.
133+ static void ac_shrink_tables (CBMAutomaton * ac , int num_states , int max_states ) {
134+ if (num_states >= max_states ) {
135+ return ;
136+ }
137+ int alpha_size = ac -> alpha_size ;
138+ void * tmp ;
139+ tmp = realloc (ac -> go_table , (size_t )num_states * alpha_size * sizeof (int ));
140+ if (tmp ) {
141+ ac -> go_table = (int * )tmp ;
142+ }
143+ tmp = realloc (ac -> output , (size_t )num_states * sizeof (uint64_t ));
144+ if (tmp ) {
145+ ac -> output = (uint64_t * )tmp ;
146+ }
147+ tmp = realloc (ac -> output_list , (size_t )num_states * sizeof (int ));
148+ if (tmp ) {
149+ ac -> output_list = (int * )tmp ;
150+ }
151+ tmp = realloc (ac -> output_next , (size_t )num_states * sizeof (int ));
152+ if (tmp ) {
153+ ac -> output_next = (int * )tmp ;
154+ }
155+ }
186156
187- // Reallocate to exact size (optional, saves memory for large automatons).
188- if (num_states < max_states ) {
189- void * tmp ;
190- tmp = realloc (ac -> go_table , (size_t )num_states * alpha_size * sizeof (int ));
191- if (tmp ) {
192- ac -> go_table = (int * )tmp ;
193- }
194- tmp = realloc (ac -> output , (size_t )num_states * sizeof (uint64_t ));
195- if (tmp ) {
196- ac -> output = (uint64_t * )tmp ;
197- }
198- tmp = realloc (ac -> output_list , (size_t )num_states * sizeof (int ));
199- if (tmp ) {
200- ac -> output_list = (int * )tmp ;
201- }
202- tmp = realloc (ac -> output_next , (size_t )num_states * sizeof (int ));
203- if (tmp ) {
204- ac -> output_next = (int * )tmp ;
157+ // cbm_ac_build constructs an Aho-Corasick automaton from a set of patterns.
158+ //
159+ // Parameters:
160+ // patterns — array of pattern pointers (not necessarily NUL-terminated)
161+ // lengths — length of each pattern
162+ // count — number of patterns (max 64 for bitmask mode)
163+ // alpha_map — byte→index mapping (NULL = identity/256). For compact alphabets,
164+ // map relevant chars to 1..N and everything else to 0.
165+ // alpha_size — alphabet size (256 if alpha_map is NULL)
166+ //
167+ // Returns a heap-allocated automaton. Caller must call cbm_ac_free().
168+ CBMAutomaton * cbm_ac_build (const char * * patterns , const int * lengths , int count ,
169+ const uint8_t * alpha_map , int alpha_size ) {
170+ if (count <= 0 ) {
171+ return NULL ;
172+ }
173+ if (alpha_size <= 0 ) {
174+ alpha_size = 256 ;
175+ }
176+
177+ int max_states = 1 ;
178+ for (int i = 0 ; i < count ; i ++ ) {
179+ max_states += lengths [i ];
180+ }
181+
182+ CBMAutomaton * ac = (CBMAutomaton * )calloc (1 , sizeof (CBMAutomaton ));
183+ ac -> alpha_size = alpha_size ;
184+ ac -> num_patterns = count ;
185+
186+ if (alpha_map ) {
187+ memcpy (ac -> alpha_map , alpha_map , 256 );
188+ } else {
189+ for (int i = 0 ; i < 256 ; i ++ ) {
190+ ac -> alpha_map [i ] = (uint8_t )i ;
205191 }
206192 }
207193
194+ ac -> go_table = (int * )malloc ((size_t )max_states * alpha_size * sizeof (int ));
195+ memset (ac -> go_table , -1 , (size_t )max_states * alpha_size * sizeof (int ));
196+ ac -> output = (uint64_t * )calloc (max_states , sizeof (uint64_t ));
197+ ac -> output_list = (int * )malloc (max_states * sizeof (int ));
198+ ac -> output_next = (int * )malloc (max_states * sizeof (int ));
199+ for (int i = 0 ; i < max_states ; i ++ ) {
200+ ac -> output_list [i ] = -1 ;
201+ ac -> output_next [i ] = -1 ;
202+ }
203+
204+ int num_states = ac_build_trie (ac , patterns , lengths , count );
205+ ac_build_failure (ac , num_states );
206+ ac -> num_states = num_states ;
207+ ac_shrink_tables (ac , num_states , max_states );
208+
208209 return ac ;
209210}
210211
0 commit comments