Skip to content

Commit 9d36df4

Browse files
committed
Add hybrid regex-pre-validate + parser experiment
Tests whether running the regex match as a pre-validator before the AST-building parser is faster than the parser alone.

Result on the 69,576-query MySQL corpus, tracing JIT enabled:

    regex only (no AST):  0.752 s, 92,519 QPS
    parser only (AST):    1.136 s, 61,240 QPS
    regex + parser:       1.480 s, 47,008 QPS

The hybrid is *slower* than the parser alone because the regex is pure overhead: 99.99% of corpus queries are valid SQL, so the parser still has to run on each query to build the AST. The pre-check only pays off when many inputs are invalid; that is not our workload.

This confirms that the regex experiment is a recogniser, not a parser replacement: PCRE2 in PHP cannot return a structured tree from a recursive named-group match (last-match-wins semantics), and PHP does not expose user PCRE callouts that could intercept the match to record structural events. It is useful as a fast 'does this query parse?' gate; it is not useful in workloads that need the AST.
1 parent e0c09f8 commit 9d36df4

1 file changed

Lines changed: 231 additions & 0 deletions

File tree

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
<?php
2+
/**
3+
* Hybrid: regex pre-validation followed by the AST-building parser.
4+
*
5+
* Hypothesis: a PCRE2 match is a fast yes/no gate; if regex confirms
6+
* the input parses, the AST builder can run. Tests whether this
7+
* hybrid is faster than just running the parser.
8+
*/
9+
10+
// Promote PHP warnings/notices to ErrorException so that problems in the
// experiment fail loudly instead of silently skewing the numbers.
set_error_handler(
	/**
	 * @param int    $severity Error level of the raised error.
	 * @param string $message  Error message.
	 * @param string $file     File in which the error was raised.
	 * @param int    $line     Line on which the error was raised.
	 * @return bool False to fall through to PHP's standard handling.
	 * @throws ErrorException For every error that is not masked out.
	 */
	function ( $severity, $message, $file, $line ) {
		// PHP calls user error handlers even for expressions prefixed with
		// "@" and for levels excluded from error_reporting. Honour the
		// current mask so that "@"-suppressed calls stay suppressed.
		if ( ! ( error_reporting() & $severity ) ) {
			return false;
		}
		throw new ErrorException( $message, 0, $severity, $file, $line );
	}
);
15+
16+
require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php';
17+
require_once __DIR__ . '/../../src/parser/class-wp-parser-node.php';
18+
require_once __DIR__ . '/../../src/parser/class-wp-parser-token.php';
19+
require_once __DIR__ . '/../../src/parser/class-wp-parser.php';
20+
require_once __DIR__ . '/../../src/mysql/class-wp-mysql-token.php';
21+
require_once __DIR__ . '/../../src/mysql/class-wp-mysql-lexer.php';
22+
require_once __DIR__ . '/../../src/mysql/class-wp-mysql-parser.php';
23+
24+
// Offset added to every token / terminal ID before it is converted to a UTF-8
// character with mb_chr(), i.e. ID n is encoded as code point 0x4000 + n. The
// same offset is used for the pattern and for the encoded token streams.
// NOTE(review): presumably chosen to keep the encoded characters clear of
// ASCII and regex metacharacters — confirm against exp-regex-v3.
const TOKEN_OFFSET = 0x4000;
25+
26+
/**
 * Compiles the grammar into one recursive PCRE pattern that recognises a
 * complete, encoded token stream.
 *
 * This is a simplified inline copy of the regex compiler from exp-regex-v3.
 *
 * How symbols are emitted:
 *  - A terminal becomes the single UTF-8 character whose code point is the
 *    terminal ID plus TOKEN_OFFSET.
 *  - A non-terminal becomes a temporary "RREF<id>RREF" marker, which is later
 *    either inlined or rewritten to a "(?&r<n>)" subroutine call.
 *  - For rules listed in $grammar->single_candidate_rules, a (*THEN) verb is
 *    placed after the first symbol of every branch.
 *  - The "selectStatement" rule is followed by a negative lookahead for the
 *    encoded INTO token.
 *
 * @param WP_Parser_Grammar $grammar Grammar whose rules are compiled.
 * @return string Delimited pattern: a (?(DEFINE)...) block holding one named
 *                group per remaining rule, followed by \A, a call to the
 *                "query" rule and \z, with the "u" modifier.
 */
function compile_regex( WP_Parser_Grammar $grammar ): string {
	$low_nt                 = $grammar->lowest_non_terminal_id;
	$rules                  = $grammar->rules;
	$single_candidate_rules = $grammar->single_candidate_rules ?? array();
	$select_rid             = $grammar->get_rule_id( 'selectStatement' );
	$start_rid              = $grammar->get_rule_id( 'query' );
	$into_char              = mb_chr( WP_MySQL_Lexer::INTO_SYMBOL + TOKEN_OFFSET, 'UTF-8' );

	// Step 1: turn every rule into a non-capturing alternation of its branches.
	$compiled = array();
	foreach ( $rules as $rid => $branches ) {
		$then_after_first = isset( $single_candidate_rules[ $rid ] );
		$alts             = array();
		foreach ( $branches as $branch ) {
			$alt = '';
			foreach ( $branch as $i => $sym ) {
				$alt .= $sym < $low_nt
					? mb_chr( $sym + TOKEN_OFFSET, 'UTF-8' )
					: "RREF{$sym}RREF";
				if ( 0 === $i && $then_after_first ) {
					$alt .= '(*THEN)';
				}
			}
			$alts[] = $alt;
		}
		$body = '(?:' . implode( '|', $alts ) . ')';
		if ( $rid === $select_rid ) {
			$body .= '(?!' . $into_char . ')';
		}
		$compiled[ $rid ] = $body;
	}

	// Step 2: inline rules that are referenced exactly once.
	//
	// Reference counts are computed a single time: inlining moves the inlined
	// body (and every marker in it) into the one rule that referenced it, so
	// the totals for all remaining rules stay the same.
	$refs = array();
	foreach ( $compiled as $body ) {
		if ( preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) {
			foreach ( $m[1] as $r ) {
				$refs[ (int) $r ] = ( $refs[ (int) $r ] ?? 0 ) + 1;
			}
		}
	}

	// One pass in rule order is enough: bodies only ever grow, so a rule that
	// was skipped (wrong reference count or self-reference) cannot become
	// eligible later. Bodies are read from $compiled at visit time because
	// earlier inlines may have extended them.
	foreach ( array_keys( $compiled ) as $rid ) {
		// The start rule must survive as a named group; it is called from the
		// top level of the final pattern.
		if ( $rid === $start_rid || 1 !== ( $refs[ $rid ] ?? 0 ) ) {
			continue;
		}
		$marker = "RREF{$rid}RREF";
		$body   = $compiled[ $rid ];
		if ( false !== strpos( $body, $marker ) ) {
			// Directly self-recursive rules cannot be inlined.
			continue;
		}
		foreach ( $compiled as $holder => $holder_body ) {
			if ( false !== strpos( $holder_body, $marker ) ) {
				$compiled[ $holder ] = str_replace( $marker, $body, $holder_body );
				unset( $compiled[ $rid ] );
				break;
			}
		}
	}

	// Step 3: number the surviving rules densely and emit the DEFINE block,
	// rewriting the remaining markers to named subroutine calls.
	$rule_to_idx = array();
	foreach ( $compiled as $rid => $_ ) {
		$rule_to_idx[ $rid ] = count( $rule_to_idx );
	}
	$define = '';
	foreach ( $compiled as $rid => $body ) {
		$body = preg_replace_callback(
			'/RREF(\d+)RREF/',
			function ( $m ) use ( $rule_to_idx ) {
				return '(?&r' . $rule_to_idx[ (int) $m[1] ] . ')';
			},
			$body
		);
		$define .= "(?<r{$rule_to_idx[$rid]}>{$body})";
	}

	return '/(?(DEFINE)' . $define . ')\\A(?&r' . $rule_to_idx[ $start_rid ] . ')\\z/u';
}
152+
153+
// ---------------------------------------------------------------------------
// Benchmark driver.
//
// Loads the query corpus, tokenizes every query once, then times three
// strategies over the same pre-tokenized input:
//   1. regex match only (no AST),
//   2. parser only (builds the AST),
//   3. regex match first, parser only when the regex matched.
//
// Usage: php <this script> [max-queries]   (default: 10000)
// ---------------------------------------------------------------------------

$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' );
$pattern = compile_regex( $grammar );

ini_set( 'pcre.backtrack_limit', '1000000000' );
ini_set( 'pcre.recursion_limit', '10000000' );
ini_set( 'pcre.jit', '1' );
// NOTE(review): "pcre.jit_stacksize" does not appear among PHP's documented
// PCRE ini directives (backtrack_limit, recursion_limit, jit), so this call is
// most likely a no-op that returns false — confirm. JIT stack exhaustion, if
// it occurs, is surfaced through the PCRE error counters reported below.
ini_set( 'pcre.jit_stacksize', '32M' );

// Load the corpus: first CSV column of every row after the header.
$corpus_path = __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv';
$handle      = fopen( $corpus_path, 'r' );
if ( false === $handle ) {
	throw new RuntimeException( "Cannot open query corpus: {$corpus_path}" );
}
$queries = array();
$header  = true;
try {
	while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) {
		if ( $header ) {
			$header = false;
			continue;
		}
		// A blank line is returned by fgetcsv() as array( null ).
		if ( null !== $r[0] ) {
			$queries[] = $r[0];
		}
	}
} finally {
	fclose( $handle );
}
$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 10000 ) );

// Pre-tokenize and pre-encode, so that lexing is excluded from all timings.
$pairs = array();
foreach ( $queries as $q ) {
	$tokens = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens();
	$enc    = '';
	foreach ( $tokens as $t ) {
		$enc .= mb_chr( $t->id + TOKEN_OFFSET, 'UTF-8' );
	}
	$pairs[] = array( $tokens, $enc );
}
printf( "Loaded %d queries\n", count( $pairs ) );

// Queries per second; guards against a zero elapsed time (e.g. empty input),
// which would otherwise raise a DivisionByZeroError.
$qps = static function ( $count, $seconds ) {
	return $seconds > 0 ? $count / $seconds : 0;
};

// Warm-up: PHP caches compiled patterns per process, so compile (and JIT) the
// pattern once up front. Otherwise the first timed loop alone would pay the
// one-off compilation cost and the comparison would be skewed.
if ( false === preg_match( $pattern, '' ) ) {
	throw new RuntimeException( 'Regex failed to compile or run, PCRE error code ' . preg_last_error() );
}

// 1. Just regex match.
$start        = microtime( true );
$ok           = 0;
$regex_errors = 0;
foreach ( $pairs as $p ) {
	$matched = preg_match( $pattern, $p[1] );
	if ( 1 === $matched ) {
		++$ok;
	} elseif ( false === $matched ) {
		// A PCRE runtime error (backtrack limit, JIT stack limit, ...) is not
		// the same thing as "does not parse"; track it separately.
		++$regex_errors;
	}
}
$d = microtime( true ) - $start;
printf( "regex only: %.4fs (%d QPS, %d/%d match)\n", $d, $qps( count( $pairs ), $d ), $ok, count( $pairs ) );
if ( $regex_errors > 0 ) {
	printf( "  note: %d queries hit a PCRE error and were counted as non-matching\n", $regex_errors );
}

// 2. Just parser (build AST).
$start = microtime( true );
$ok    = 0;
foreach ( $pairs as $p ) {
	if ( ( new WP_MySQL_Parser( $grammar, $p[0] ) )->parse() ) {
		++$ok;
	}
}
$d = microtime( true ) - $start;
printf( "parser only (AST): %.4fs (%d QPS, %d/%d match)\n", $d, $qps( count( $pairs ), $d ), $ok, count( $pairs ) );

// 3. Hybrid: regex first; on success run the parser to build AST. Pure
// overhead: same parser runs, plus the regex.
$start        = microtime( true );
$ok           = 0;
$regex_failed = 0;
$regex_errors = 0;
foreach ( $pairs as $p ) {
	$matched = preg_match( $pattern, $p[1] );
	if ( 1 !== $matched ) {
		// Errors are still treated as rejections, but counted on the side.
		++$regex_failed;
		if ( false === $matched ) {
			++$regex_errors;
		}
		continue;
	}
	if ( ( new WP_MySQL_Parser( $grammar, $p[0] ) )->parse() ) {
		++$ok;
	}
}
$d = microtime( true ) - $start;
printf(
	"regex + parser: %.4fs (%d QPS, %d/%d match, %d regex-rejected)\n",
	$d,
	$qps( count( $pairs ), $d ),
	$ok,
	count( $pairs ),
	$regex_failed
);
if ( $regex_errors > 0 ) {
	printf( "  note: %d of the regex rejections were PCRE errors, not genuine mismatches\n", $regex_errors );
}

0 commit comments

Comments
 (0)