|
| 1 | +<?php |
| 2 | +/** |
| 3 | + * Hybrid: regex pre-validation followed by the AST-building parser. |
| 4 | + * |
| 5 | + * Hypothesis: a PCRE2 match is a fast yes/no gate; if regex confirms |
| 6 | + * the input parses, the AST builder can run. Tests whether this |
| 7 | + * hybrid is faster than just running the parser. |
| 8 | + */ |
| 9 | + |
// Promote every PHP error (notice, warning, deprecation, ...) to an
// ErrorException so the script stops at the first problem.
set_error_handler(
	static function ( $severity, $message, $file, $line ) {
		// Code is fixed at 0; the original severity is kept on the exception.
		throw new ErrorException( $message, 0, $severity, $file, $line );
	}
);
| 15 | + |
| 16 | +require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php'; |
| 17 | +require_once __DIR__ . '/../../src/parser/class-wp-parser-node.php'; |
| 18 | +require_once __DIR__ . '/../../src/parser/class-wp-parser-token.php'; |
| 19 | +require_once __DIR__ . '/../../src/parser/class-wp-parser.php'; |
| 20 | +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-token.php'; |
| 21 | +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-lexer.php'; |
| 22 | +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-parser.php'; |
| 23 | + |
| 24 | +const TOKEN_OFFSET = 0x4000; |
| 25 | + |
/**
 * Compile the grammar into one recursive PCRE pattern.
 *
 * Reuses the regex compiler from exp-regex-v3 (a simplified inline copy).
 *
 * Each grammar rule becomes a named group inside a `(?(DEFINE)...)` block.
 * A terminal is emitted as the single UTF-8 character whose code point is
 * `token id + TOKEN_OFFSET`; a non-terminal becomes a `(?&rN)` subroutine
 * call. The pattern is anchored with `\A ... \z` and enters at the `query`
 * rule, so the subject must be a token stream encoded the same way.
 *
 * @param WP_Parser_Grammar $grammar Grammar exposing `rules`, `lowest_non_terminal_id`,
 *                                   optionally `single_candidate_rules`, and `get_rule_id()`.
 * @return string Delimited PCRE pattern using the `u` modifier.
 */
function compile_regex( WP_Parser_Grammar $grammar ): string {
	$low_nt                 = $grammar->lowest_non_terminal_id;
	$rules                  = $grammar->rules;
	$single_candidate_rules = $grammar->single_candidate_rules ?? array();
	$select_rid             = $grammar->get_rule_id( 'selectStatement' );
	$start_rid              = $grammar->get_rule_id( 'query' );
	$into_char              = mb_chr( WP_MySQL_Lexer::INTO_SYMBOL + TOKEN_OFFSET, 'UTF-8' );

	// Step 1: turn every rule into an alternation. References to other rules
	// are left as textual "RREF<id>RREF" placeholders for the later steps.
	$compiled = array();
	foreach ( $rules as $rid => $branches ) {
		$use_then = isset( $single_candidate_rules[ $rid ] );
		$alts     = array();
		foreach ( $branches as $branch ) {
			$alt = '';
			foreach ( $branch as $i => $sym ) {
				$alt .= $sym < $low_nt
					? mb_chr( $sym + TOKEN_OFFSET, 'UTF-8' )
					: "RREF{$sym}RREF";
				// For rules flagged as single-candidate, commit to a branch once
				// its first symbol matched: (*THEN) makes a later failure skip to
				// the next alternative instead of backtracking into this one.
				if ( 0 === $i && $use_then ) {
					$alt .= '(*THEN)';
				}
			}
			$alts[] = $alt;
		}
		$body = '(?:' . implode( '|', $alts ) . ')';
		if ( $rid === $select_rid ) {
			// A selectStatement must not be directly followed by the INTO token.
			$body .= '(?!' . $into_char . ')';
		}
		$compiled[ $rid ] = $body;
	}

	// Step 2: inline rules that are referenced exactly once and do not refer
	// to themselves. One rule is inlined per pass; the reference counts are
	// then rebuilt from scratch. The start rule is never inlined because the
	// final pattern has to call it by name.
	do {
		$changed = false;
		$refs    = array_fill_keys( array_keys( $compiled ), 0 );
		foreach ( $compiled as $body ) {
			if ( preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) {
				foreach ( $m[1] as $r ) {
					$refs[ (int) $r ] = ( $refs[ (int) $r ] ?? 0 ) + 1;
				}
			}
		}
		foreach ( $compiled as $rid => $body ) {
			$marker = "RREF{$rid}RREF";
			if ( $rid === $start_rid || 1 !== ( $refs[ $rid ] ?? 0 ) || false !== strpos( $body, $marker ) ) {
				continue;
			}
			foreach ( $compiled as $cr => $cb ) {
				if ( false !== strpos( $cb, $marker ) ) {
					$compiled[ $cr ] = str_replace( $marker, $body, $cb );
					unset( $compiled[ $rid ] );
					$changed = true;
					break 2;
				}
			}
		}
	} while ( $changed );

	// Step 3: number the surviving rules densely (r0, r1, ...) and replace the
	// remaining placeholders with subroutine calls to those groups.
	$rule_to_idx = array_flip( array_keys( $compiled ) );
	$define      = '';
	foreach ( $compiled as $rid => $body ) {
		$body    = preg_replace_callback(
			'/RREF(\d+)RREF/',
			static function ( $m ) use ( $rule_to_idx ) {
				return '(?&r' . $rule_to_idx[ (int) $m[1] ] . ')';
			},
			$body
		);
		$define .= "(?<r{$rule_to_idx[$rid]}>{$body})";
	}

	return '/(?(DEFINE)' . $define . ')\\A(?&r' . $rule_to_idx[ $start_rid ] . ')\\z/u';
}
| 152 | + |
// Build the grammar from the bundled MySQL grammar file and compile it into
// the validation pattern used by the regex benchmarks below.
$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' );
$pattern = compile_regex( $grammar );

// Raise the PCRE limits and request JIT before any benchmark match runs.
// NOTE(review): 'pcre.jit_stacksize' does not seem to be listed among PHP's
// PCRE runtime directives - confirm this setting has any effect.
$pcre_settings = array(
	'pcre.backtrack_limit' => '1000000000',
	'pcre.recursion_limit' => '10000000',
	'pcre.jit'             => '1',
	'pcre.jit_stacksize'   => '32M',
);
foreach ( $pcre_settings as $directive => $value ) {
	ini_set( $directive, $value );
}
| 160 | + |
// Load the benchmark corpus: the first CSV column of every row after the
// header row. The optional first CLI argument caps the number of queries
// (default 10000).
$csv_path = __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv';
$handle   = fopen( $csv_path, 'r' );
if ( false === $handle ) {
	throw new RuntimeException( "Cannot open query corpus: {$csv_path}" );
}
$queries = array();
$header  = true;
try {
	while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) {
		if ( $header ) {
			$header = false;
			continue;
		}
		// fgetcsv() returns array( null ) for a blank line; skip those.
		if ( null !== $r[0] ) {
			$queries[] = $r[0];
		}
	}
} finally {
	// Release the file handle even if reading throws.
	fclose( $handle );
}
$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 10000 ) );
| 174 | + |
// Lex every query up front so the timed sections below measure matching and
// parsing only. Each entry of $pairs is array( token list, encoded string ),
// where the encoded string holds one UTF-8 character per token with code
// point token id + TOKEN_OFFSET (the same encoding compile_regex() emits).
$pairs = array();
foreach ( $queries as $sql ) {
	$lexer  = new WP_MySQL_Lexer( $sql );
	$tokens = $lexer->remaining_tokens();

	$chars = array();
	foreach ( $tokens as $token ) {
		$chars[] = mb_chr( $token->id + TOKEN_OFFSET, 'UTF-8' );
	}

	$pairs[] = array( $tokens, implode( '', $chars ) );
}
printf( "Loaded %d queries\n", count( $pairs ) );
| 186 | + |
// 1. Just regex match.
//
// preg_match() returns 1 for a match, 0 for no match and false when PCRE
// itself failed (for example on a backtrack or JIT stack limit). The result
// is inspected explicitly so that failures are reported instead of being
// silently counted as non-matching queries.
// NOTE(review): the first call presumably also pays the one-time compilation
// of the pattern, which later phases may get from PHP's pattern cache -
// consider a warm-up match if that skews the comparison.
$start           = microtime( true );
$ok              = 0;
$regex_errors    = 0;
$last_pcre_error = PREG_NO_ERROR;
foreach ( $pairs as $p ) {
	$matched = preg_match( $pattern, $p[1] );
	if ( 1 === $matched ) {
		++$ok;
	} elseif ( false === $matched ) {
		++$regex_errors;
		$last_pcre_error = preg_last_error();
	}
}
$d = microtime( true ) - $start;
printf( "regex only: %.4fs (%d QPS, %d/%d match)\n", $d, count( $pairs ) / $d, $ok, count( $pairs ) );
if ( $regex_errors > 0 ) {
	// Only printed when something went wrong, so the normal output is unchanged.
	printf( "regex only: %d matches aborted by a PCRE error (last preg_last_error() code: %d)\n", $regex_errors, $last_pcre_error );
}
| 197 | + |
// 2. Just parser (build AST).
//
// Baseline: run the parser over every pre-lexed token list and count the
// inputs for which parse() returns a truthy result.
$started_at = microtime( true );
$parsed     = 0;
foreach ( $pairs as $pair ) {
	if ( ( new WP_MySQL_Parser( $grammar, $pair[0] ) )->parse() ) {
		++$parsed;
	}
}
$elapsed = microtime( true ) - $started_at;
$total   = count( $pairs );
printf( "parser only (AST): %.4fs (%d QPS, %d/%d match)\n", $elapsed, $total / $elapsed, $parsed, $total );
| 208 | + |
// 3. Hybrid: regex first; on success run the parser to build AST. Pure
// overhead: same parser runs, plus the regex.
//
// A query is "regex-rejected" whenever preg_match() does not return 1. PCRE
// failures (false) are additionally counted on their own so they cannot be
// mistaken for genuine rejections by the grammar pattern.
$start           = microtime( true );
$ok              = 0;
$regex_failed    = 0;
$regex_errors    = 0;
$last_pcre_error = PREG_NO_ERROR;
foreach ( $pairs as $p ) {
	$matched = preg_match( $pattern, $p[1] );
	if ( 1 !== $matched ) {
		++$regex_failed;
		if ( false === $matched ) {
			++$regex_errors;
			$last_pcre_error = preg_last_error();
		}
		continue;
	}
	if ( ( new WP_MySQL_Parser( $grammar, $p[0] ) )->parse() ) {
		++$ok;
	}
}
$d = microtime( true ) - $start;
printf(
	"regex + parser: %.4fs (%d QPS, %d/%d match, %d regex-rejected)\n",
	$d,
	count( $pairs ) / $d,
	$ok,
	count( $pairs ),
	$regex_failed
);
if ( $regex_errors > 0 ) {
	// Only printed when something went wrong, so the normal output is unchanged.
	printf(
		"regex + parser: %d of the regex rejections were PCRE errors (last preg_last_error() code: %d)\n",
		$regex_errors,
		$last_pcre_error
	);
}
0 commit comments