Skip to content

Commit 7875ef6

Browse files
committed
Inline terminal matching and defer parse node allocation
Hot-path changes in WP_Parser::parse_recursive():

- Inline the terminal match in the branch loop instead of recursing into parse_recursive() for every token. Over the full MySQL test suite this eliminates ~1.6M function calls.
- Hoist grammar, rules, fragment_ids, rule_names, tokens, and token_count into local variables so the inner loops avoid repeated property lookups on $this->grammar.
- Cache the token count on the instance to avoid a count() per call.
- Build branch children in a local array and only instantiate the WP_Parser_Node once the branch has matched; on the MySQL corpus ~75% of speculative nodes were previously created and thrown away.
- Drop a dead is_array($subnode) check that never fires in practice (subnodes are false, true, tokens, or nodes, never arrays).
- Inline fragment merging: read the fragment's children directly instead of building a fragment node and immediately merging it.

End-to-end parser benchmark on the MySQL server test corpus:

Before: ~11,500 QPS
After: ~14,900 QPS (+29%)
1 parent 104a872 commit 7875ef6

3 files changed

Lines changed: 87 additions & 28 deletions

File tree

packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class WP_MySQL_Parser extends WP_Parser {
2929
* @return bool Whether a query was successfully parsed.
3030
*/
3131
public function next_query(): bool {
32-
if ( $this->position >= count( $this->tokens ) ) {
32+
if ( $this->position >= $this->token_count ) {
3333
return false;
3434
}
3535
$this->current_ast = $this->parse();

packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,30 @@ public function append_child( $node ) {
2626
$this->children[] = $node;
2727
}
2828

29+
/**
30+
* Replace all children with the given array.
31+
*
32+
* This is used by the parser to attach a batch of children built up in a
33+
* local array while trying branches, without allocating a node per attempt.
34+
*
35+
* @param array<WP_Parser_Node|WP_Parser_Token> $children The new children.
36+
*/
37+
public function set_children( array $children ): void {
38+
$this->children = $children;
39+
}
40+
41+
/**
42+
* Return the children array by reference for efficient fragment inlining.
43+
*
44+
* Returning a reference lets the parser iterate children without copying
45+
* the array. The returned reference must not be mutated by callers.
46+
*
47+
* @return array<WP_Parser_Node|WP_Parser_Token>
48+
*/
49+
public function &get_children_ref(): array {
50+
return $this->children;
51+
}
52+
2953
/**
3054
* Flatten the matched rule fragments as if their children were direct
3155
* descendants of the current rule.

packages/mysql-on-sqlite/src/parser/class-wp-parser.php

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@
1111
class WP_Parser {
1212
protected $grammar;
1313
protected $tokens;
14+
protected $token_count;
1415
protected $position;
1516

1617
public function __construct( WP_Parser_Grammar $grammar, array $tokens ) {
17-
$this->grammar = $grammar;
18-
$this->tokens = $tokens;
19-
$this->position = 0;
18+
$this->grammar = $grammar;
19+
$this->tokens = $tokens;
20+
$this->token_count = count( $tokens );
21+
$this->position = 0;
2022
}
2123

2224
public function parse() {
@@ -27,9 +29,11 @@ public function parse() {
2729
}
2830

2931
private function parse_recursive( $rule_id ) {
30-
$is_terminal = $rule_id <= $this->grammar->highest_terminal_id;
31-
if ( $is_terminal ) {
32-
if ( $this->position >= count( $this->tokens ) ) {
32+
$grammar = $this->grammar;
33+
$highest_terminal_id = $grammar->highest_terminal_id;
34+
35+
if ( $rule_id <= $highest_terminal_id ) {
36+
if ( $this->position >= $this->token_count ) {
3337
return false;
3438
}
3539

@@ -38,58 +42,83 @@ private function parse_recursive( $rule_id ) {
3842
}
3943

4044
if ( $this->tokens[ $this->position ]->id === $rule_id ) {
45+
$token = $this->tokens[ $this->position ];
4146
++$this->position;
42-
return $this->tokens[ $this->position - 1 ];
47+
return $token;
4348
}
4449
return false;
4550
}
4651

47-
$branches = $this->grammar->rules[ $rule_id ];
48-
if ( ! count( $branches ) ) {
52+
$branches = $grammar->rules[ $rule_id ];
53+
if ( ! $branches ) {
4954
return false;
5055
}
5156

5257
// Bale out from processing the current branch if none of its rules can
5358
// possibly match the current token.
54-
if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) {
59+
$rule_lookahead = $grammar->lookahead_is_match_possible[ $rule_id ] ?? null;
60+
if ( null !== $rule_lookahead ) {
5561
$token_id = $this->tokens[ $this->position ]->id;
5662
if (
57-
! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) &&
58-
! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ WP_Parser_Grammar::EMPTY_RULE_ID ] )
63+
! isset( $rule_lookahead[ $token_id ] ) &&
64+
! isset( $rule_lookahead[ WP_Parser_Grammar::EMPTY_RULE_ID ] )
5965
) {
6066
return false;
6167
}
6268
}
6369

64-
$rule_name = $this->grammar->rule_names[ $rule_id ];
70+
$rule_name = $grammar->rule_names[ $rule_id ];
71+
$fragment_ids = $grammar->fragment_ids;
72+
$rules = $grammar->rules;
73+
$tokens = $this->tokens;
74+
$token_count = $this->token_count;
6575
$starting_position = $this->position;
76+
$branch_matches = false;
6677
foreach ( $branches as $branch ) {
6778
$this->position = $starting_position;
68-
$node = new WP_Parser_Node( $rule_id, $rule_name );
79+
$children = array();
6980
$branch_matches = true;
7081
foreach ( $branch as $subrule_id ) {
82+
// Inline terminal matching to avoid a recursive call per token.
83+
if ( $subrule_id <= $highest_terminal_id ) {
84+
if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) {
85+
// Epsilon rule: matches without consuming input.
86+
continue;
87+
}
88+
if (
89+
$this->position < $token_count
90+
&& $tokens[ $this->position ]->id === $subrule_id
91+
) {
92+
$children[] = $tokens[ $this->position ];
93+
++$this->position;
94+
continue;
95+
}
96+
$branch_matches = false;
97+
break;
98+
}
99+
71100
$subnode = $this->parse_recursive( $subrule_id );
72101
if ( false === $subnode ) {
73102
$branch_matches = false;
74103
break;
75-
} elseif ( true === $subnode ) {
104+
}
105+
if ( true === $subnode ) {
76106
/*
77107
* The subrule was matched without actually matching a token.
78108
* This means a special empty "ε" (epsilon) rule was matched.
79109
* An "ε" rule in a grammar matches an empty input of 0 bytes.
80110
* It is used to represent optional grammar productions.
81111
*/
82112
continue;
83-
} elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) {
84-
continue;
85-
}
86-
if ( is_array( $subnode ) && ! count( $subnode ) ) {
87-
continue;
88113
}
89-
if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) {
90-
$node->merge_fragment( $subnode );
114+
if ( isset( $fragment_ids[ $subrule_id ] ) ) {
115+
// Fragments: inline their children directly to avoid building
116+
// a throwaway WP_Parser_Node that would be merged afterwards.
117+
foreach ( $subnode->get_children_ref() as $c ) {
118+
$children[] = $c;
119+
}
91120
} else {
92-
$node->append_child( $subnode );
121+
$children[] = $subnode;
93122
}
94123
}
95124

@@ -100,12 +129,16 @@ private function parse_recursive( $rule_id ) {
100129
// for right-associative rules, which could solve this.
101130
// See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994
102131
// See: https://github.com/antlr/antlr4/issues/488
103-
$la = $this->tokens[ $this->position ] ?? null;
104-
if ( $la && 'selectStatement' === $rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $la->id ) {
132+
if (
133+
$branch_matches
134+
&& 'selectStatement' === $rule_name
135+
&& $this->position < $token_count
136+
&& WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id
137+
) {
105138
$branch_matches = false;
106139
}
107140

108-
if ( true === $branch_matches ) {
141+
if ( $branch_matches ) {
109142
break;
110143
}
111144
}
@@ -115,10 +148,12 @@ private function parse_recursive( $rule_id ) {
115148
return false;
116149
}
117150

118-
if ( ! $node->has_child() ) {
151+
if ( ! $children ) {
119152
return true;
120153
}
121154

155+
$node = new WP_Parser_Node( $rule_id, $rule_name );
156+
$node->set_children( $children );
122157
return $node;
123158
}
124159
}

0 commit comments

Comments
 (0)