Skip to content

Commit 6c9b869

Browse files
committed
Strip epsilon markers and cache grammar refs on the parser
Three minor reductions in per-call work:

* Strip explicit EMPTY_RULE_ID symbols out of rule branches at grammar build time. The parser loop would have 'continue'd over them anyway, so removing them ahead of time lets the hot symbol loop drop the epsilon check. Pure-epsilon branches become empty branches and still match empty via the existing empty-children fast path.
* Cache the grammar's rules, fragment_ids, rule_names, branches_for_token, nullable_branches, and highest_terminal_id as direct parser instance fields so parse_recursive() no longer pays for a $this->grammar->... double hop on every call.
* Collapse the two-step node construction (new + set_children) into a single constructor call that takes the children array directly. This saves a method call per allocated node (~820k across the MySQL corpus).

End-to-end parser benchmark: ~27,500 QPS -> ~28,500 QPS (+3.5%).
1 parent 04191de commit 6c9b869

3 files changed

Lines changed: 61 additions & 35 deletions

File tree

packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,39 @@ private function inflate( $grammar ) {
111111
}
112112

113113
$this->inline_single_branch_fragments();
114+
$this->strip_epsilon_markers();
114115
$this->build_branch_selectors();
115116
}
116117

118+
/**
119+
* Remove explicit `EMPTY_RULE_ID` markers from branches.
120+
*
121+
* The epsilon marker is a zero-width, always-matching symbol used in the
122+
* grammar to express optional productions. At parse time it would still
123+
* be walked and "continued" over for no effect, so stripping it ahead of
124+
* time removes a per-symbol branch in the hot loop.
125+
*
126+
* A pure-epsilon branch (`[EMPTY_RULE_ID]`) becomes an empty branch (`[]`)
127+
* which the parser already handles: the inner symbol loop does nothing and
128+
* the rule returns a successful empty match.
129+
*/
130+
private function strip_epsilon_markers() {
131+
foreach ( $this->rules as $rule_id => $branches ) {
132+
foreach ( $branches as $i => $branch ) {
133+
if ( in_array( self::EMPTY_RULE_ID, $branch, true ) ) {
134+
$this->rules[ $rule_id ][ $i ] = array_values(
135+
array_filter(
136+
$branch,
137+
static function ( $s ) {
138+
return self::EMPTY_RULE_ID !== $s;
139+
}
140+
)
141+
);
142+
}
143+
}
144+
}
145+
}
146+
117147
/**
118148
* Inline single-branch fragment rules into their call sites.
119149
*

packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,29 +15,18 @@ class WP_Parser_Node {
1515
*/
1616
public $rule_id;
1717
public $rule_name;
18-
private $children = array();
18+
private $children;
1919

20-
public function __construct( $rule_id, $rule_name ) {
20+
public function __construct( $rule_id, $rule_name, array $children = array() ) {
2121
$this->rule_id = $rule_id;
2222
$this->rule_name = $rule_name;
23+
$this->children = $children;
2324
}
2425

2526
public function append_child( $node ) {
2627
$this->children[] = $node;
2728
}
2829

29-
/**
30-
* Replace all children with the given array.
31-
*
32-
* This is used by the parser to attach a batch of children built up in a
33-
* local array while trying branches, without allocating a node per attempt.
34-
*
35-
* @param array<WP_Parser_Node|WP_Parser_Token> $children The new children.
36-
*/
37-
public function set_children( array $children ): void {
38-
$this->children = $children;
39-
}
40-
4130
/**
4231
* Return the children array by reference for efficient fragment inlining.
4332
*

packages/mysql-on-sqlite/src/parser/class-wp-parser.php

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,26 @@ class WP_Parser {
1414
protected $token_count;
1515
protected $position;
1616

17+
// Grammar data cached as instance fields so the hot path avoids an extra
18+
// property hop via $this->grammar on every recursive call.
19+
private $rules;
20+
private $rule_names;
21+
private $fragment_ids;
22+
private $branches_for_token;
23+
private $nullable_branches;
24+
private $highest_terminal_id;
25+
1726
public function __construct( WP_Parser_Grammar $grammar, array $tokens ) {
18-
$this->grammar = $grammar;
19-
$this->tokens = $tokens;
20-
$this->token_count = count( $tokens );
21-
$this->position = 0;
27+
$this->grammar = $grammar;
28+
$this->tokens = $tokens;
29+
$this->token_count = count( $tokens );
30+
$this->position = 0;
31+
$this->rules = $grammar->rules;
32+
$this->rule_names = $grammar->rule_names;
33+
$this->fragment_ids = $grammar->fragment_ids ?? array();
34+
$this->branches_for_token = $grammar->branches_for_token;
35+
$this->nullable_branches = $grammar->nullable_branches;
36+
$this->highest_terminal_id = $grammar->highest_terminal_id;
2237
}
2338

2439
public function parse() {
@@ -36,7 +51,6 @@ public function parse() {
3651
* round trip per consumed token.
3752
*/
3853
private function parse_recursive( $rule_id ) {
39-
$grammar = $this->grammar;
4054
$tokens = $this->tokens;
4155
$token_count = $this->token_count;
4256
$position = $this->position;
@@ -45,21 +59,19 @@ private function parse_recursive( $rule_id ) {
4559
// sets. When no entry exists for the current token but the rule is
4660
// nullable, all candidate branches would match empty, so we return
4761
// immediately without entering any branch.
48-
$branch_selector = $grammar->branches_for_token[ $rule_id ];
49-
$tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID;
50-
if ( isset( $branch_selector[ $tid ] ) ) {
51-
$candidate_branches = $branch_selector[ $tid ];
52-
} elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) {
62+
$tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID;
63+
if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) {
64+
$candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ];
65+
} elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) {
5366
return true;
5467
} else {
5568
return false;
5669
}
5770

58-
$highest_terminal_id = $grammar->highest_terminal_id;
59-
$branches = $grammar->rules[ $rule_id ];
60-
61-
$rule_name = $grammar->rule_names[ $rule_id ];
62-
$fragment_ids = $grammar->fragment_ids;
71+
$highest_terminal_id = $this->highest_terminal_id;
72+
$branches = $this->rules[ $rule_id ];
73+
$fragment_ids = $this->fragment_ids;
74+
$rule_name = $this->rule_names[ $rule_id ];
6375
$is_select_statement = 'selectStatement' === $rule_name;
6476
$branch_matches = false;
6577
$children = array();
@@ -70,9 +82,6 @@ private function parse_recursive( $rule_id ) {
7082
$branch_matches = true;
7183
foreach ( $branch as $subrule_id ) {
7284
if ( $subrule_id <= $highest_terminal_id ) {
73-
if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) {
74-
continue;
75-
}
7685
if (
7786
$this->position < $token_count
7887
&& $tokens[ $this->position ]->id === $subrule_id
@@ -132,8 +141,6 @@ private function parse_recursive( $rule_id ) {
132141
return true;
133142
}
134143

135-
$node = new WP_Parser_Node( $rule_id, $rule_name );
136-
$node->set_children( $children );
137-
return $node;
144+
return new WP_Parser_Node( $rule_id, $rule_name, $children );
138145
}
139146
}

0 commit comments

Comments
 (0)