Skip to content

Commit e0c09f8

Browse files
committed
Add regex-based grammar matcher experiment
Experiment: compile the grammar to a single PCRE2 pattern using: - each token id encoded as a Unicode codepoint at offset 0x4000 - each rule emitted as (?<rN>...) named subroutine - (*THEN) on each branch's first symbol of *single-candidate* rules (where sibling-branch FIRST sets are disjoint, so committing is safe) - aggressive transitive inlining of single-use non-recursive rules to shrink the bytecode below PCRE2's compiled-pattern size limit Result on the 69,576-query MySQL test corpus (PCRE2 JIT enabled): - Pattern: ~76 KB source, 1127 named subroutines after 789 rules inlined - Match throughput: ~97,600 QPS, vs the optimised interpreter's ~62k. - 99.82% accuracy: ~120 spurious failures, mostly the 'SELECT ... INTO' ambiguity that the interpreter handles via a runtime negative lookahead the regex doesn't model. Trade-offs: 1. Match-only - the regex doesn't build an AST, so it's not a drop-in replacement for the recursive-descent parser the SQLite driver needs. 2. Without (*THEN) the matcher backtracks catastrophically on nested compound statements (CREATE TRIGGER ... BEGIN ... IF ...). 3. With (*THEN) on every branch (not just single-candidate) the regex gives spurious failures because PCRE commits to the first first-symbol match and can't try a sibling alternative. 4. Pattern size is constrained by PCRE2's default LINK_SIZE=2 bytecode limit; aggressive rule inlining is needed to fit a non-trivial grammar. Kept as documentation: an interesting upper bound on PHP-side parsing speed when the AST shape is not required.
1 parent b9b64a6 commit e0c09f8

1 file changed

Lines changed: 288 additions & 0 deletions

File tree

Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
<?php
2+
/**
3+
* Regex grammar compiler v3: aggressively inline single-use rules and
4+
* use (*THEN) on the first symbol of each branch of single-candidate
5+
* rules, so the matcher can't backtrack into a sibling alternative once a token has been consumed.
6+
*/
7+
8+
/*
 * Promote PHP errors (warnings, notices, ...) to ErrorException so that
 * problems stop the script instead of being silently ignored.
 *
 * PHP calls a custom error handler even when the failing expression is
 * prefixed with the @ operator. Inside such an expression error_reporting()
 * no longer contains the severity of warnings and notices, so check it
 * explicitly; otherwise every @-suppressed call in this script would still
 * throw. Returning false hands the error back to PHP's built-in handler.
 */
set_error_handler(
	function ( $severity, $message, $file, $line ) {
		if ( 0 === ( error_reporting() & $severity ) ) {
			return false;
		}
		throw new ErrorException( $message, 0, $severity, $file, $line );
	}
);
13+
14+
require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php';
15+
require_once __DIR__ . '/../../src/parser/class-wp-parser-token.php';
16+
require_once __DIR__ . '/../../src/mysql/class-wp-mysql-token.php';
17+
require_once __DIR__ . '/../../src/mysql/class-wp-mysql-lexer.php';
18+
19+
/**
 * Code point that token id 0 is mapped to. Token ids are shifted by this
 * amount before being encoded as a character.
 */
const TOKEN_OFFSET = 0x4000;

/**
 * Encode a token id as a single UTF-8 character.
 *
 * @param int $tid Token id; the character used is code point $tid + TOKEN_OFFSET.
 * @return string The UTF-8 encoding of that code point.
 * @throws InvalidArgumentException When the shifted id is not an encodable
 *                                  Unicode scalar value.
 */
function token_char( $tid ) {
	$code_point = $tid + TOKEN_OFFSET;

	// Surrogates and values outside 0..0x10FFFF have no UTF-8 encoding.
	// mb_chr() signals that by returning false, which would silently vanish
	// when concatenated into a pattern or subject string, so reject it here.
	$is_surrogate = $code_point >= 0xD800 && $code_point <= 0xDFFF;
	if ( $code_point < 0 || $code_point > 0x10FFFF || $is_surrogate ) {
		throw new InvalidArgumentException(
			sprintf( 'Token id %d cannot be encoded as a Unicode character.', $tid )
		);
	}

	return mb_chr( $code_point, 'UTF-8' );
}
24+
25+
// Build the grammar object from the value returned by requiring mysql-grammar.php.
$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' );
26+
// Symbol ids below this value are handled as terminals (token ids) by the
// code in this script; ids at or above it are looked up as rules.
$low_nt = $grammar->lowest_non_terminal_id;
27+
28+
/**
 * Tally how often each non-terminal symbol occurs inside rule branches.
 *
 * Every rule id present in $g->rules starts at zero, so rules that nobody
 * refers to still show up in the result. Symbols below
 * $g->lowest_non_terminal_id are skipped.
 *
 * @param WP_Parser_Grammar $g Grammar whose rules are scanned.
 * @return array Map of symbol id => number of occurrences across all branches.
 */
function ref_counts( WP_Parser_Grammar $g ) {
	$first_rule_id = $g->lowest_non_terminal_id;
	$tally         = array_fill_keys( array_keys( $g->rules ), 0 );

	foreach ( $g->rules as $alternatives ) {
		foreach ( $alternatives as $symbols ) {
			foreach ( $symbols as $symbol ) {
				if ( $symbol < $first_rule_id ) {
					continue;
				}
				// A referenced id may be missing from $g->rules; start it at zero.
				if ( ! isset( $tally[ $symbol ] ) ) {
					$tally[ $symbol ] = 0;
				}
				++$tally[ $symbol ];
			}
		}
	}

	return $tally;
}
46+
47+
// FIRST and NULLABLE, computed by iterating to a fixpoint:
// - $first[ rule ] collects, as array keys, the token ids a match of the
// rule can start with;
// - $nullable[ rule ] becomes true once some branch consists solely of
// nullable rules (an empty branch qualifies).
$rules    = $grammar->rules;
$nullable = array_fill_keys( array_keys( $rules ), false );
$first    = array_fill_keys( array_keys( $rules ), array() );

do {
	$grew = false;
	foreach ( $rules as $rule_id => $alternatives ) {
		foreach ( $alternatives as $symbols ) {
			$all_nullable = true;
			foreach ( $symbols as $symbol ) {
				$is_terminal = $symbol < $low_nt;

				// A terminal contributes itself; a rule contributes its
				// current FIRST set (a snapshot, so self-references are safe).
				$candidates = $is_terminal ? array( $symbol => true ) : $first[ $symbol ];
				foreach ( $candidates as $token_id => $unused ) {
					if ( isset( $first[ $rule_id ][ $token_id ] ) ) {
						continue;
					}
					$first[ $rule_id ][ $token_id ] = true;
					$grew                           = true;
				}

				// Symbols after the first non-nullable one cannot start the branch.
				if ( $is_terminal || ! $nullable[ $symbol ] ) {
					$all_nullable = false;
					break;
				}
			}
			if ( $all_nullable && ! $nullable[ $rule_id ] ) {
				$nullable[ $rule_id ] = true;
				$grew                 = true;
			}
		}
	}
} while ( $grew );
87+
88+
// Compile each rule into a "regex body" string: a non-capturing group with
// one alternative per branch. Token ids become their token character and
// rule references become RREF<id>RREF placeholders, which the later passes
// either inline or turn into named-subroutine calls.
$single_candidate_rules = $grammar->single_candidate_rules ?? array();
$select_rid             = $grammar->get_rule_id( 'selectStatement' );
$into_char              = token_char( WP_MySQL_Lexer::INTO_SYMBOL );
$compiled               = array();

/*
 * Returns the regex body for rule $rid and memoizes it in $compiled.
 * The closure does not recurse into referenced rules; it only emits
 * placeholders for them.
 */
$compile_rule = function ( $rid ) use ( &$compiled, $rules, $low_nt, $single_candidate_rules, $select_rid, $into_char ) {
	if ( isset( $compiled[ $rid ] ) ) {
		return $compiled[ $rid ];
	}

	// presumably $grammar->single_candidate_rules is keyed by rule id - verify
	// against WP_Parser_Grammar.
	$safe_then = isset( $single_candidate_rules[ $rid ] );

	$alts = array();
	foreach ( $rules[ $rid ] as $branch ) {
		$alt = '';
		foreach ( $branch as $i => $sym ) {
			$alt .= $sym < $low_nt ? token_char( $sym ) : "RREF{$sym}RREF";

			// (*THEN) commits the alternative once the first symbol matches.
			// Only safe when sibling branches of this rule have disjoint
			// FIRST sets - that property is captured by
			// $grammar->single_candidate_rules. Outside that set, multiple
			// branches can share a first token and committing prematurely
			// would yield spurious match failures.
			//
			// NOTE(review): PCRE2 documents that a group without "|" is part
			// of the enclosing alternative, so for a single-branch rule the
			// verb acts on the caller's alternation once the body is inlined,
			// but stays confined to the group when called as a subroutine.
			// Confirm that inlining such rules does not change match results.
			if ( 0 === $i && $safe_then ) {
				$alt .= '(*THEN)';
			}
		}
		$alts[] = $alt;
	}

	$body = '(?:' . implode( '|', $alts ) . ')';
	if ( $rid === $select_rid ) {
		// Mirror the negative lookahead the parser uses: a successful
		// selectStatement match must not be followed by INTO. Otherwise
		// the surrounding rule should pick a different alternative.
		$body .= '(?!' . $into_char . ')';
	}

	$compiled[ $rid ] = $body;
	return $compiled[ $rid ];
};

// First pass: compile every rule once.
foreach ( array_keys( $rules ) as $rid ) {
	$compile_rule( $rid );
}
138+
139+
// Second pass: fold every rule that is referenced exactly once, and does
// not reference itself, into the body that refers to it. Bodies change
// with each fold (a caller can become self-referencing), so references
// are recounted and the scan restarts after every single fold until no
// candidate is left.
$inlined_count = 0;
while ( true ) {
	// Count placeholder occurrences over the current set of bodies.
	$refs = array_fill_keys( array_keys( $compiled ), 0 );
	foreach ( $compiled as $body ) {
		if ( ! preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) {
			continue;
		}
		foreach ( $m[1] as $r ) {
			$target          = (int) $r;
			$refs[ $target ] = ( $refs[ $target ] ?? 0 ) + 1;
		}
	}

	$changed = false;
	foreach ( $compiled as $rid => $body ) {
		$marker = "RREF{$rid}RREF";

		// Candidates have exactly one reference and are not directly recursive.
		if ( 1 !== ( $refs[ $rid ] ?? 0 ) || false !== strpos( $body, $marker ) ) {
			continue;
		}

		// Find the body holding that single reference and splice the rule in.
		foreach ( $compiled as $caller_rid => $caller_body ) {
			if ( false === strpos( $caller_body, $marker ) ) {
				continue;
			}
			$compiled[ $caller_rid ] = str_replace( $marker, $body, $caller_body );
			unset( $compiled[ $rid ] );
			++$inlined_count;
			$changed = true;
			break 2; // recount against the new state
		}
	}

	if ( ! $changed ) {
		break;
	}
}
176+
177+
// Number the rules that survived inlining (0, 1, 2, ... in $compiled order)
// and emit each one as a named group r<N> inside a (?(DEFINE)...) block.
$idx_to_rule = array_keys( $compiled );
$rule_to_idx = array_flip( $idx_to_rule );

// Turns an RREF<id>RREF placeholder into a call of the matching named group.
$placeholder_to_call = function ( $m ) use ( $rule_to_idx ) {
	return '(?&r' . $rule_to_idx[ (int) $m[1] ] . ')';
};

$define = '';
foreach ( $idx_to_rule as $idx => $rid ) {
	$resolved = preg_replace_callback( '/RREF(\d+)RREF/', $placeholder_to_call, $compiled[ $rid ] );
	$define  .= "(?<r{$idx}>{$resolved})";
}

// The whole subject must match the 'query' rule from start (\A) to end (\z).
$start_rid = $grammar->get_rule_id( 'query' );
$start_idx = $rule_to_idx[ $start_rid ];
$pattern   = '/(?(DEFINE)' . $define . ')\\A(?&r' . $start_idx . ')\\z/u';

$pattern_bytes = number_format( strlen( $pattern ) );
printf(
	"Inlined %d rules. Final rules: %d. Pattern: %s bytes\n",
	$inlined_count,
	count( $idx_to_rule ),
	$pattern_bytes
);
208+
209+
// Raise the PCRE limits and make sure the JIT is enabled before matching.
ini_set( 'pcre.backtrack_limit', '1000000000' );
ini_set( 'pcre.recursion_limit', '10000000' );
ini_set( 'pcre.jit', '1' );

// Probe: run the pattern once against a subject that is not valid UTF-8.
// With the /u modifier that match is expected to fail with
// PREG_BAD_UTF8_ERROR, so any other failure is treated as a problem with
// the pattern itself.
//
// A pattern that fails to compile raises a PHP warning, which this script's
// error handler may convert into an ErrorException. Catch it here so the
// failure is reported below instead of aborting with an uncaught exception.
$t             = microtime( true );
$compile_error = null;
try {
	$ok = preg_match( $pattern, "\xff" );
} catch ( ErrorException $e ) {
	$ok            = false;
	$compile_error = $e->getMessage();
}
printf(
	"Compile: %.2fms, ok=%s, err=%s\n",
	( microtime( true ) - $t ) * 1000,
	var_export( $ok, true ),
	$compile_error ?? preg_last_error_msg()
);
if ( false === $ok && ( null !== $compile_error || PREG_BAD_UTF8_ERROR !== preg_last_error() ) ) {
	echo "Pattern doesn't compile cleanly. Bailing.\n";
	exit( 1 );
}
225+
226+
// Load the test corpus: first column of every CSV row after the header,
// limited to the number of queries given as the first CLI argument
// (default 5000).
$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' );
if ( false === $handle ) {
	echo "Cannot open the query corpus. Bailing.\n";
	exit( 1 );
}

$queries = array();

// Read and discard the header row.
fgetcsv( $handle, null, ',', '"', '\\' );

while ( false !== ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) ) {
	// fgetcsv() yields a single null field for a blank line; skip those.
	if ( null !== $r[0] ) {
		$queries[] = $r[0];
	}
}

// Release the file handle once the corpus is in memory.
fclose( $handle );

$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 5000 ) );
238+
239+
// Lex every query and turn its token stream into a string holding one
// token character per token; $encoded[ $i ] corresponds to $queries[ $i ].
$encoded = array();
foreach ( $queries as $query ) {
	$lexer = new WP_MySQL_Lexer( $query );
	$chars = array();
	foreach ( $lexer->remaining_tokens() as $token ) {
		$chars[] = token_char( $token->id );
	}
	$encoded[] = implode( '', $chars );
}
248+
249+
// Benchmark: match every encoded query against the pattern and tally the
// outcomes. preg_match() returns 1 for a match, 0 for no match and false
// on an error; up to 10 non-matching queries and up to 3 queries that took
// longer than 5ms are kept for the report.
$t               = microtime( true );
$matched         = 0;
$failed          = 0;
$errors          = 0;
$failed_examples = array();
$slow            = array();
foreach ( $encoded as $i => $s ) {
	$qstart = microtime( true );
	$r      = @preg_match( $pattern, $s );
	$qd     = microtime( true ) - $qstart;

	if ( 1 === $r ) {
		++$matched;
	} elseif ( 0 === $r ) {
		++$failed;
		if ( count( $failed_examples ) < 10 ) {
			$failed_examples[] = substr( str_replace( "\n", ' ', $queries[ $i ] ), 0, 120 );
		}
	} else {
		++$errors;
	}

	if ( $qd > 0.005 && count( $slow ) < 3 ) {
		$slow[] = sprintf( '%6.0fms: %s', $qd * 1000, substr( str_replace( "\n", ' ', $queries[ $i ] ), 0, 100 ) );
	}
}
$d = microtime( true ) - $t;

// With an empty corpus (or a very coarse clock) the elapsed time can be
// 0.0, and dividing by it throws a DivisionByZeroError; report 0 QPS then.
$qps = $d > 0 ? count( $encoded ) / $d : 0;

printf(
	"Matched=%d, Failed=%d, Errors=%d, time=%.4fs (%d QPS)\n",
	$matched,
	$failed,
	$errors,
	$d,
	$qps
);
echo "\nFailed queries:\n";
foreach ( $failed_examples as $e ) {
	echo " $e\n";
}
echo "\nSlow queries:\n";
foreach ( $slow as $e ) {
	echo " $e\n";
}

0 commit comments

Comments
 (0)