Embed branch symbol sequences directly in the per-token selector

JanJakes · JanJakes · commit f726c2e0275c · 2026-04-28T12:42:48.000+02:00
Previously the per-(rule, token) selector stored a list of branch
indexes that the parser then had to look up in $rules[$rule_id] on
every branch attempt. Store the branch symbol sequences themselves so
the hot loop can iterate candidate branches directly.

PHP arrays are copy-on-write, so sharing the same branch sequence
across selector entries for many tokens costs negligible extra memory.
The nullable_branches map now stores just `true` per rule instead of the
list of nullable branch ids, since the parser only uses it for existence
checks.

Also cache the start rule id on the grammar so parse() performs the
array_search() across rule_names once instead of on every call.

End-to-end parser benchmark:
  Before: ~29,800 QPS (avg)   After: ~31,700 QPS (+6%).
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php
@@ -58,6 +58,13 @@ class WP_Parser_Grammar {
 	public $lowest_non_terminal_id;
 	public $highest_terminal_id;
 
+	/**
+	 * Cached id of the grammar's start rule, populated lazily on first parse.
+	 *
+	 * @var int|null
+	 */
+	public $start_rule_id;
+
 	public function __construct( array $rules ) {
 		$this->inflate( $rules );
 	}
@@ -319,10 +326,20 @@ private function build_branch_selectors() {
 				foreach ( $selector as $tid => $idx_list ) {
 					$merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids );
 				}
-				$selector                             = $merged;
-				$this->nullable_branches[ $rule_id ]  = $nullable_branch_ids;
+				$selector                            = $merged;
+				$this->nullable_branches[ $rule_id ] = true;
 			}
 			if ( $selector ) {
+				// Store the candidate branch sequences directly so the parser
+				// can foreach over them without an extra $branches[$idx]
+				// indirection on every branch attempt.
+				foreach ( $selector as $tid => $idx_list ) {
+					$seqs = array();
+					foreach ( $idx_list as $idx ) {
+						$seqs[] = $branches[ $idx ];
+					}
+					$selector[ $tid ] = $seqs;
+				}
 				$this->branches_for_token[ $rule_id ] = $selector;
 			}
 		}
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php
@@ -44,8 +44,14 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) {
 
 	public function parse() {
 		// @TODO: Make the starting rule lookup non-grammar-specific.
-		$query_rule_id = $this->grammar->get_rule_id( 'query' );
-		$ast           = $this->parse_recursive( $query_rule_id );
+		// Cache the query rule id on the grammar - get_rule_id() does a
+		// linear array_search over all rule names which, on the MySQL
+		// grammar, costs a few microseconds per lookup.
+		$grammar = $this->grammar;
+		if ( null === $grammar->start_rule_id ) {
+			$grammar->start_rule_id = $grammar->get_rule_id( 'query' );
+		}
+		$ast = $this->parse_recursive( $grammar->start_rule_id );
 		return false === $ast ? null : $ast;
 	}
 
@@ -74,14 +80,12 @@ private function parse_recursive( $rule_id ) {
 		}
 
 		$highest_terminal_id = $this->highest_terminal_id;
-		$branches            = $this->rules[ $rule_id ];
 		$rule_name           = $this->rule_names[ $rule_id ];
 		$is_fragment         = isset( $this->fragment_ids[ $rule_id ] );
 		$is_select_statement = 'selectStatement' === $rule_name;
 		$branch_matches      = false;
 		$children            = array();
-		foreach ( $candidate_branches as $idx ) {
-			$branch         = $branches[ $idx ];
+		foreach ( $candidate_branches as $branch ) {
 			$this->position = $position;
 			$children       = array();
 			$branch_matches = true;