Skip to content

Commit 04191de

Browse files
committed
Short-circuit nullable-fallback and inline single-branch fragments
Two grammar/parser refinements that both reduce recursive calls:

* In parse_recursive(): when the rule has a per-token branch selector, the current token is not in any branch's FIRST set, and the rule itself is nullable, return "matched empty" immediately instead of descending into nullable branches that would recursively do the same thing. This alone eliminates ~460k recursive calls on the MySQL corpus.

* At grammar build time, expand every single-branch fragment rule into its call sites. Fragments exist only to factor out shared sub-sequences, and their children are already flattened into the parent AST node, so splicing them directly into parent branches is a no-op for the resulting tree but removes an entire recursive call per use. 480 of the grammar's fragments qualify.

Also drops the dead terminal branch at the top of parse_recursive() (the branch loop inlines terminal matching, so parse_recursive() is only ever called with non-terminal rule ids) and the always-false empty-branches guard.

End-to-end parser benchmark:

Before: ~22,400 QPS
After: ~27,500 QPS (+23%)
1 parent 6812717 commit 04191de

2 files changed

Lines changed: 92 additions & 39 deletions

File tree

packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,80 @@ private function inflate( $grammar ) {
110110
$this->rules[ $rule_id ] = $branches;
111111
}
112112

113+
$this->inline_single_branch_fragments();
113114
$this->build_branch_selectors();
114115
}
115116

117+
/**
118+
* Inline single-branch fragment rules into their call sites.
119+
*
120+
* The grammar contains many single-branch fragment rules that exist only
121+
* to factor shared sub-sequences out of larger productions. At runtime
122+
* the parser would descend into each such fragment via a recursive call
123+
* just to walk the same symbol sequence and splice the results back into
124+
* the parent. Expanding them in-place at build time eliminates that call
125+
* chain without changing the resulting AST because fragment children are
126+
* already flattened into the parent node.
127+
*
128+
* Fragments with two or more alternatives (e.g., `%EOF_zero_or_one`) are
129+
* left intact because they represent real choices that must be evaluated
130+
* against the current token.
131+
*/
132+
private function inline_single_branch_fragments() {
	// Snapshot the grammar so every expansion reads the original branches,
	// independent of the order in which rules are rewritten below.
	$source_rules       = $this->rules;
	$known_fragments    = $this->fragment_ids ?? array();
	$first_non_terminal = $this->lowest_non_terminal_id;

	// Collect the fragments that have exactly one branch; only those are
	// spliced into their call sites.
	$single_branch = array();
	foreach ( $known_fragments as $fragment_id => $unused ) {
		if ( ! isset( $source_rules[ $fragment_id ] ) || 1 !== count( $source_rules[ $fragment_id ] ) ) {
			continue;
		}
		$single_branch[ $fragment_id ] = true;
	}

	// Recursive splice: $spliced caches the finished expansion of each
	// fragment, $in_progress marks fragments currently being expanded so a
	// reference back to one of them is kept instead of recursing forever.
	$spliced     = array();
	$in_progress = array();
	$splice      = function ( array $symbols ) use ( &$splice, &$spliced, &$in_progress, $source_rules, $first_non_terminal, $single_branch ) {
		$result = array();
		foreach ( $symbols as $symbol ) {
			// Copy the symbol through untouched when it is a terminal, is
			// not a single-branch fragment, or is already being expanded
			// further up the call chain (a cycle).
			$keep_as_is = $symbol < $first_non_terminal
				|| ! isset( $single_branch[ $symbol ] )
				|| isset( $in_progress[ $symbol ] );
			if ( $keep_as_is ) {
				$result[] = $symbol;
				continue;
			}

			// First encounter: expand the fragment's only branch and
			// remember the outcome for later call sites.
			if ( ! isset( $spliced[ $symbol ] ) ) {
				$in_progress[ $symbol ] = true;
				$spliced[ $symbol ]     = $splice( $source_rules[ $symbol ][0] );
				unset( $in_progress[ $symbol ] );
			}

			// Append the fragment's symbols in place of the reference.
			foreach ( $spliced[ $symbol ] as $inlined_symbol ) {
				$result[] = $inlined_symbol;
			}
		}
		return $result;
	};

	// Replace each rule's branches with their spliced counterparts.
	foreach ( $source_rules as $rule_id => $alternatives ) {
		$rewritten = array();
		foreach ( $alternatives as $alternative ) {
			$rewritten[] = $splice( $alternative );
		}
		$this->rules[ $rule_id ] = $rewritten;
	}
}
186+
116187
/**
117188
* Compute FIRST and NULLABLE sets for every non-terminal, then denormalize
118189
* them into a per-rule map of `token_id => branch_index[]` so the parser

packages/mysql-on-sqlite/src/parser/class-wp-parser.php

Lines changed: 21 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -28,54 +28,36 @@ public function parse() {
2828
return false === $ast ? null : $ast;
2929
}
3030

31+
/**
32+
* Parse a single non-terminal rule.
33+
*
34+
* This function is only called for non-terminal rule ids. Terminals are
35+
* matched inline inside the branch loop below to avoid a function-call
36+
* round trip per consumed token.
37+
*/
3138
private function parse_recursive( $rule_id ) {
32-
$grammar = $this->grammar;
33-
$highest_terminal_id = $grammar->highest_terminal_id;
34-
35-
if ( $rule_id <= $highest_terminal_id ) {
36-
if ( $this->position >= $this->token_count ) {
37-
return false;
38-
}
39-
40-
if ( WP_Parser_Grammar::EMPTY_RULE_ID === $rule_id ) {
41-
return true;
42-
}
43-
44-
if ( $this->tokens[ $this->position ]->id === $rule_id ) {
45-
$token = $this->tokens[ $this->position ];
46-
++$this->position;
47-
return $token;
48-
}
49-
return false;
50-
}
51-
52-
$branches = $grammar->rules[ $rule_id ];
53-
if ( ! $branches ) {
54-
return false;
55-
}
56-
39+
$grammar = $this->grammar;
5740
$tokens = $this->tokens;
5841
$token_count = $this->token_count;
5942
$position = $this->position;
6043

6144
// Narrow the set of branches worth trying using the precomputed FIRST
62-
// sets. When no entry exists for the current token, fall back to the
63-
// rule's nullable branches (if any); if both are empty the rule cannot
64-
// match here.
65-
$branch_selector = $grammar->branches_for_token[ $rule_id ] ?? null;
66-
if ( null !== $branch_selector ) {
67-
$tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID;
68-
if ( isset( $branch_selector[ $tid ] ) ) {
69-
$candidate_branches = $branch_selector[ $tid ];
70-
} elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) {
71-
$candidate_branches = $grammar->nullable_branches[ $rule_id ];
72-
} else {
73-
return false;
74-
}
45+
// sets. When no entry exists for the current token but the rule is
46+
// nullable, all candidate branches would match empty, so we return
47+
// immediately without entering any branch.
48+
$branch_selector = $grammar->branches_for_token[ $rule_id ];
49+
$tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID;
50+
if ( isset( $branch_selector[ $tid ] ) ) {
51+
$candidate_branches = $branch_selector[ $tid ];
52+
} elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) {
53+
return true;
7554
} else {
76-
$candidate_branches = array_keys( $branches );
55+
return false;
7756
}
7857

58+
$highest_terminal_id = $grammar->highest_terminal_id;
59+
$branches = $grammar->rules[ $rule_id ];
60+
7961
$rule_name = $grammar->rule_names[ $rule_id ];
8062
$fragment_ids = $grammar->fragment_ids;
8163
$is_select_statement = 'selectStatement' === $rule_name;

0 commit comments

Comments
 (0)