|
| 1 | +<?php |
/**
 * Regex grammar compiler v3: aggressively inline single-use rules and
 * place (*THEN) after the first symbol of each branch - but only in rules
 * listed in the grammar's single_candidate_rules - so the matcher can't
 * backtrack into a sibling alternative once that symbol has matched.
 */
| 7 | + |
/*
 * Promote PHP warnings and notices to ErrorException so that problems in
 * this script fail loudly instead of being printed and ignored.
 *
 * PHP still invokes a custom handler for calls prefixed with `@`; while
 * such a call runs, error_reporting() has the raised severity masked out.
 * Returning false in that case hands the error back to PHP's built-in
 * handler, which honors the suppression. Without this check the
 * `@preg_match()` calls further down would throw instead of returning
 * false, making their error branches unreachable for warnings.
 */
set_error_handler(
	function ( $severity, $message, $file, $line ) {
		if ( 0 === ( error_reporting() & $severity ) ) {
			return false;
		}
		throw new ErrorException( $message, 0, $severity, $file, $line );
	}
);
| 13 | + |
| 14 | +require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php'; |
| 15 | +require_once __DIR__ . '/../../src/parser/class-wp-parser-token.php'; |
| 16 | +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-token.php'; |
| 17 | +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-lexer.php'; |
| 18 | + |
// Token IDs are shifted by this amount before being turned into code points.
const TOKEN_OFFSET = 0x4000;

/**
 * Return the single UTF-8 character that stands for a token ID.
 *
 * The character's code point is the token ID plus TOKEN_OFFSET.
 *
 * @param int $tid Token ID.
 * @return string|false Result of mb_chr() for the shifted code point.
 */
function token_char( $tid ) {
	$code_point = TOKEN_OFFSET + $tid;

	return mb_chr( $code_point, 'UTF-8' );
}
| 24 | + |
// Build the grammar object from the array returned by the MySQL grammar file.
$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' );
// The code below treats symbol IDs under this threshold as terminals
// (tokens) and IDs at or above it as references to grammar rules.
$low_nt = $grammar->lowest_non_terminal_id;
| 27 | + |
/**
 * Count how many times each rule is referenced from any branch of any rule.
 *
 * A symbol counts as a rule reference when its ID is at or above the
 * grammar's lowest_non_terminal_id. Every key of $g->rules appears in the
 * result, with 0 for rules that nothing references.
 *
 * @param WP_Parser_Grammar $g Grammar to inspect.
 * @return array Map of rule ID => number of references.
 */
function ref_counts( WP_Parser_Grammar $g ) {
	$first_rule_id = $g->lowest_non_terminal_id;

	// Seed every declared rule with zero so unreferenced rules are listed too.
	$counts = array_fill_keys( array_keys( $g->rules ), 0 );

	foreach ( $g->rules as $branches ) {
		foreach ( $branches as $symbols ) {
			foreach ( $symbols as $symbol ) {
				if ( $symbol < $first_rule_id ) {
					continue; // Terminal - not a rule reference.
				}
				if ( ! isset( $counts[ $symbol ] ) ) {
					$counts[ $symbol ] = 0;
				}
				++$counts[ $symbol ];
			}
		}
	}

	return $counts;
}
| 46 | + |
// FIRST and NULLABLE, computed by iterating until nothing changes.
//
// $first[ rule ]    is a set (token ID => true) of the terminals that can
//                   begin a match of the rule.
// $nullable[ rule ] is true when at least one branch of the rule consists
//                   only of nullable rules (or is empty).
$rules    = $grammar->rules;
$nullable = array_fill_keys( array_keys( $rules ), false );
$first    = array_fill_keys( array_keys( $rules ), array() );

do {
	$changed = false;

	foreach ( $rules as $rid => $branches ) {
		foreach ( $branches as $branch ) {
			// Stays true only if every symbol of the branch turns out nullable.
			$branch_nullable = true;

			foreach ( $branch as $sym ) {
				if ( $sym < $low_nt ) {
					// A terminal starts the rest of the branch: record it and stop.
					if ( ! isset( $first[ $rid ][ $sym ] ) ) {
						$first[ $rid ][ $sym ] = true;
						$changed               = true;
					}
					$branch_nullable = false;
					break;
				}

				// A rule reference contributes everything in its FIRST set.
				$missing = array_diff_key( $first[ $sym ], $first[ $rid ] );
				if ( $missing ) {
					$first[ $rid ] += $missing;
					$changed        = true;
				}

				// Only look past this symbol if it can match nothing.
				if ( ! $nullable[ $sym ] ) {
					$branch_nullable = false;
					break;
				}
			}

			if ( $branch_nullable && ! $nullable[ $rid ] ) {
				$nullable[ $rid ] = true;
				$changed          = true;
			}
		}
	}
} while ( $changed );
| 87 | + |
// Compile each rule into a "regex body" string: one non-capturing group
// holding one alternative per branch. Terminals become their token
// character; references to other rules become RREF<id>RREF placeholders,
// which the later passes either inline or turn into subroutine calls.
// Results are memoized in $compiled, keyed by rule ID.
$single_candidate_rules = $grammar->single_candidate_rules ?? array();
$select_rid             = $grammar->get_rule_id( 'selectStatement' );
$into_char              = token_char( WP_MySQL_Lexer::INTO_SYMBOL );
$compiled               = array();
$compile_rule           = function ( $rid ) use ( &$compiled, $rules, $low_nt, $single_candidate_rules, $select_rid, $into_char ) {
	if ( isset( $compiled[ $rid ] ) ) {
		return $compiled[ $rid ];
	}

	$safe_then = isset( $single_candidate_rules[ $rid ] );
	$alts      = array();
	foreach ( $rules[ $rid ] as $branch ) {
		$alt = '';
		foreach ( $branch as $i => $sym ) {
			$alt .= $sym < $low_nt ? token_char( $sym ) : "RREF{$sym}RREF";

			// (*THEN) commits the alternative once the first symbol matches.
			// Only safe when sibling branches of this rule have disjoint
			// FIRST sets - that property is captured by
			// $grammar->single_candidate_rules. Outside that set, multiple
			// branches can share a first token and committing prematurely
			// would yield spurious match failures.
			if ( 0 === $i && $safe_then ) {
				$alt .= '(*THEN)';
			}
		}
		$alts[] = $alt;
	}

	$body = '(?:' . implode( '|', $alts ) . ')';
	if ( $rid === $select_rid ) {
		// Mirror the negative lookahead the parser uses: a successful
		// selectStatement match must not be followed by INTO. Otherwise
		// the surrounding rule should pick a different alternative.
		$body .= '(?!' . $into_char . ')';
	}

	$compiled[ $rid ] = $body;
	return $compiled[ $rid ];
};

// First pass: compile every rule once.
foreach ( array_keys( $rules ) as $rid ) {
	$compile_rule( $rid );
}
| 138 | + |
// Second pass: inline rules that are referenced exactly once. A rule is
// skipped while its own body still contains a reference to itself (direct
// recursion), and the start rule is never inlined because the final
// pattern calls it by index. Each successful inline restarts the scan
// with freshly counted references, until a full scan inlines nothing.
$start_rid     = $grammar->get_rule_id( 'query' );
$inlined_count = 0;
do {
	$changed = false;

	// Count placeholder occurrences across all remaining bodies.
	$refs = array_fill_keys( array_keys( $compiled ), 0 );
	foreach ( $compiled as $body ) {
		if ( preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) {
			foreach ( $m[1] as $r ) {
				$refs[ (int) $r ] = ( $refs[ (int) $r ] ?? 0 ) + 1;
			}
		}
	}

	foreach ( $compiled as $rid => $body ) {
		// The entry point must survive so $rule_to_idx can resolve it later.
		if ( $rid === $start_rid ) {
			continue;
		}
		if ( 1 !== $refs[ $rid ] ) {
			continue;
		}

		$placeholder = "RREF{$rid}RREF";

		// Don't inline directly recursive rules.
		if ( false !== strpos( $body, $placeholder ) ) {
			continue;
		}

		// Find the one body holding the reference and substitute it.
		foreach ( $compiled as $caller_rid => $caller_body ) {
			if ( false === strpos( $caller_body, $placeholder ) ) {
				continue;
			}
			$compiled[ $caller_rid ] = str_replace( $placeholder, $body, $caller_body );
			unset( $compiled[ $rid ] );
			++$inlined_count;
			$changed = true;
			break 2; // Restart from the top so refs are recounted on the new state.
		}
	}
} while ( $changed );
| 176 | + |
// Give every rule that survived inlining a dense index; the index names
// the rule's group in the pattern (r0, r1, ...).
$idx_to_rule = array_keys( $compiled );
$rule_to_idx = array_flip( $idx_to_rule );

// Turns one RREF<id>RREF placeholder into a call to the matching group.
$to_subroutine_call = static function ( $match ) use ( $rule_to_idx ) {
	return '(?&r' . $rule_to_idx[ (int) $match[1] ] . ')';
};

// Emit one named group per rule, in index order.
$groups = array();
foreach ( $idx_to_rule as $idx => $rid ) {
	$resolved = preg_replace_callback( '/RREF(\d+)RREF/', $to_subroutine_call, $compiled[ $rid ] );
	$groups[] = "(?<r{$idx}>{$resolved})";
}
$define = implode( '', $groups );
| 199 | + |
// Final pattern: all rule groups inside a (?(DEFINE)...) section, then a
// call to the `query` rule anchored to the start and end of the subject.
$start_rid = $grammar->get_rule_id( 'query' );
$pattern   = sprintf( '/(?(DEFINE)%s)\\A(?&r%s)\\z/u', $define, $rule_to_idx[ $start_rid ] );
printf(
	"Inlined %d rules. Final rules: %d. Pattern: %s bytes\n",
	$inlined_count,
	count( $idx_to_rule ),
	number_format( strlen( $pattern ) )
);

// Raise the PCRE limits and enable the JIT before running the pattern.
$pcre_settings = array(
	'pcre.backtrack_limit' => '1000000000',
	'pcre.recursion_limit' => '10000000',
	'pcre.jit'             => '1',
);
foreach ( $pcre_settings as $option => $value ) {
	ini_set( $option, $value );
}

// Probe run against a lone 0xFF byte. A PREG_BAD_UTF8_ERROR result is
// accepted below; any other failure is treated as a broken pattern.
$t          = microtime( true );
$ok         = @preg_match( $pattern, "\xff", $m );
$elapsed_ms = ( microtime( true ) - $t ) * 1000;
printf(
	"Compile: %.2fms, ok=%s, err=%s\n",
	$elapsed_ms,
	var_export( $ok, true ),
	preg_last_error_msg()
);

$only_bad_utf8 = ( PREG_BAD_UTF8_ERROR === preg_last_error() );
if ( false === $ok && ! $only_bad_utf8 ) {
	echo "Pattern doesn't compile cleanly. Bailing.\n";
	exit( 1 );
}
| 225 | + |
// Load the test queries: first CSV column of every row after the header.
$handle  = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' );
$queries = array();
try {
	$header = true;
	while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) {
		if ( $header ) {
			// The first row holds column names, not a query.
			$header = false;
			continue;
		}
		// fgetcsv() yields array( null ) for a blank line; skip those.
		if ( null !== $r[0] ) {
			$queries[] = $r[0];
		}
	}
} finally {
	// Release the file handle even if reading throws.
	fclose( $handle );
}

// Optional first CLI argument caps how many queries are benchmarked.
$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 5000 ) );
| 238 | + |
// Encode every query as a string with one token character per lexer token,
// so the pattern can be matched against the token stream.
$encoded = array();
foreach ( $queries as $query ) {
	$lexer = new WP_MySQL_Lexer( $query );
	$chars = array();
	foreach ( $lexer->remaining_tokens() as $token ) {
		$chars[] = token_char( $token->id );
	}
	$encoded[] = implode( '', $chars );
}
| 248 | + |
// Match every encoded query against the pattern, timing each one.
$t               = microtime( true );
$matched         = 0;
$failed          = 0;
$errors          = 0;
$failed_examples = array();
$slow            = array();

// One-line preview of a query for the report, cut to $length bytes.
$preview = static function ( $query, $length ) {
	return substr( str_replace( "\n", ' ', $query ), 0, $length );
};

foreach ( $encoded as $i => $s ) {
	$qstart = microtime( true );
	$r      = @preg_match( $pattern, $s );
	$qd     = microtime( true ) - $qstart;

	if ( 1 === $r ) {
		++$matched;
	} elseif ( 0 === $r ) {
		++$failed;
		// Keep only the first ten non-matching queries as examples.
		if ( count( $failed_examples ) < 10 ) {
			$failed_examples[] = $preview( $queries[ $i ], 120 );
		}
	} else {
		// preg_match() returned false: a PCRE error, not a plain mismatch.
		++$errors;
	}

	// Record the first three queries that took longer than 5ms.
	if ( $qd > 0.005 && count( $slow ) < 3 ) {
		$slow[] = sprintf( '%6.0fms: %s', $qd * 1000, $preview( $queries[ $i ], 100 ) );
	}
}
$d = microtime( true ) - $t;

// With no queries the elapsed time can be exactly zero; dividing by it
// would throw DivisionByZeroError, so report 0 QPS instead.
$qps = $d > 0 ? count( $encoded ) / $d : 0;
printf(
	"Matched=%d, Failed=%d, Errors=%d, time=%.4fs (%d QPS)\n",
	$matched,
	$failed,
	$errors,
	$d,
	$qps
);
echo "\nFailed queries:\n";
foreach ( $failed_examples as $e ) {
	echo "  $e\n";
}
echo "\nSlow queries:\n";
foreach ( $slow as $e ) {
	echo "  $e\n";
}
0 commit comments