Skip to content

Commit a786fe2

Browse files
committed
Embed branch symbol sequences directly in the per-token selector
Previously the per-(rule, token) selector stored a list of branch indexes that the parser then had to look up in $rules[$rule_id] on every branch attempt. Store the branch symbol sequences themselves so the hot loop can iterate candidate branches directly.

PHP arrays are copy-on-write, so sharing the same branch sequence across selector entries for many tokens costs negligible extra memory. The nullable_branches map shrinks to a bool marker since the parser only uses it for existence checks.

Also cache the start rule id on the grammar so parse() skips its array_search() across rule_names on every call.

End-to-end parser benchmark:
Before: ~29,800 QPS (avg)
After: ~31,700 QPS (+6%)
1 parent 04786fb commit a786fe2

2 files changed

Lines changed: 51 additions & 12 deletions

File tree

packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,27 +34,40 @@ class WP_Parser_Grammar {
3434
* Per-rule branch selector keyed by the next token id.
3535
*
3636
* When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list
37-
* of branch indexes in `$rules[$rule_id]` that can possibly match when the
38-
* current token has the given id. Nullable branches appear in every entry.
37+
* of candidate branch symbol sequences (drawn from `$rules[$rule_id]`)
38+
* that can possibly match when the current token has the given id.
39+
* Nullable branches appear in every entry.
3940
*
4041
* If an entry does not exist for the current token, `$nullable_branches`
4142
* is consulted. If neither has an entry for this rule, the rule cannot
4243
* match and the parser returns immediately.
4344
*
44-
* @var array<int,array<int,int[]>>
45+
* @var array<int,array<int,int[][]>>
4546
*/
4647
public $branches_for_token = array();
4748

4849
/**
49-
* Per-rule list of nullable branch indexes.
50+
* Per-rule marker indicating the rule has at least one nullable branch.
5051
*
51-
* @var array<int,int[]>
52+
* @var array<int,true>
5253
*/
5354
public $nullable_branches = array();
5455

5556
public $lowest_non_terminal_id;
5657
public $highest_terminal_id;
5758

59+
/**
60+
* Memoized rule-id lookups, keyed by rule name.
61+
*
62+
* `get_rule_id()` is a linear `array_search` over `$rule_names` and
63+
* costs a few microseconds per call on the MySQL grammar. The parser
64+
* looks up its start rule and the `selectStatement` rule on a hot path,
65+
* so the results are memoized via `get_or_cache_rule_id()`.
66+
*
67+
* @var array<string,int|false>
68+
*/
69+
private $cached_rule_ids = array();
70+
5871
public function __construct( array $rules ) {
5972
$this->inflate( $rules );
6073
}
@@ -67,6 +80,25 @@ public function get_rule_id( $rule_name ) {
6780
return array_search( $rule_name, $this->rule_names, true );
6881
}
6982

83+
/**
84+
* Return the rule id for a given rule name, memoizing the result.
85+
*
86+
* Equivalent to `get_rule_id()` but caches the lookup so repeated
87+
* queries for the same rule name (typically the start rule and a few
88+
* grammar-specific rules consulted on the parser hot path) avoid
89+
* the linear scan over `$rule_names`. Returns `false` for unknown
90+
* rule names, mirroring `get_rule_id()`.
91+
*
92+
* @param string $rule_name
93+
* @return int|false
94+
*/
95+
public function get_or_cache_rule_id( $rule_name ) {
96+
if ( ! array_key_exists( $rule_name, $this->cached_rule_ids ) ) {
97+
$this->cached_rule_ids[ $rule_name ] = $this->get_rule_id( $rule_name );
98+
}
99+
return $this->cached_rule_ids[ $rule_name ];
100+
}
101+
70102
/**
71103
* Inflate the grammar to an internal representation optimized for parsing.
72104
*
@@ -316,10 +348,20 @@ private function build_branch_selectors() {
316348
foreach ( $selector as $tid => $idx_list ) {
317349
$merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids );
318350
}
319-
$selector = $merged;
320-
$this->nullable_branches[ $rule_id ] = $nullable_branch_ids;
351+
$selector = $merged;
352+
$this->nullable_branches[ $rule_id ] = true;
321353
}
322354
if ( $selector ) {
355+
// Store the candidate branch sequences directly so the parser
356+
// can foreach over them without an extra $branches[$idx]
357+
// indirection on every branch attempt.
358+
foreach ( $selector as $tid => $idx_list ) {
359+
$seqs = array();
360+
foreach ( $idx_list as $idx ) {
361+
$seqs[] = $branches[ $idx ];
362+
}
363+
$selector[ $tid ] = $seqs;
364+
}
323365
$this->branches_for_token[ $rule_id ] = $selector;
324366
}
325367
}

packages/mysql-on-sqlite/src/parser/class-wp-parser.php

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,7 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) {
5858

5959
public function parse() {
6060
// @TODO: Make the starting rule lookup non-grammar-specific.
61-
$query_rule_id = $this->grammar->get_rule_id( 'query' );
62-
$ast = $this->parse_recursive( $query_rule_id );
61+
$ast = $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) );
6362
return false === $ast ? null : $ast;
6463
}
6564

@@ -88,14 +87,12 @@ private function parse_recursive( $rule_id ) {
8887
}
8988

9089
$highest_terminal_id = $this->highest_terminal_id;
91-
$branches = $this->rules[ $rule_id ];
9290
$rule_name = $this->rule_names[ $rule_id ];
9391
$is_fragment = isset( $this->fragment_ids[ $rule_id ] );
9492
$is_select_statement = 'selectStatement' === $rule_name;
9593
$branch_matches = false;
9694
$children = array();
97-
foreach ( $candidate_branches as $idx ) {
98-
$branch = $branches[ $idx ];
95+
foreach ( $candidate_branches as $branch ) {
9996
$this->position = $position;
10097
$children = array();
10198
$branch_matches = true;

0 commit comments

Comments
 (0)