Inline terminal matching and defer parse node allocation

JanJakes · JanJakes · commit cb8f8a369033 · 2026-04-28T14:09:43.000+02:00
Hot-path changes in WP_Parser::parse_recursive():

- Inline the terminal match in the branch loop instead of recursing into
  parse_recursive() for every token. Over the full MySQL test suite this
  eliminates ~1.6M function calls.
- Hoist grammar, rules, fragment_ids, rule_names, tokens, and token_count
  into local variables so the inner loops avoid repeated property lookups
  on $this and $this->grammar.
- Cache the token count on the instance to avoid a count() per call.
- Build branch children in a local array and only instantiate the
  WP_Parser_Node once the branch has matched; on the MySQL corpus ~75% of
  speculative nodes were previously created and thrown away.
- Drop the dead, duplicated is_array($subnode) checks that never fire in
  practice (subnodes are false, true, tokens, or nodes - never arrays).
- Inline fragments directly: copy the fragment's children into the local
  children array instead of merging the fragment node into a parent node
  via merge_fragment().

End-to-end parser benchmark on the MySQL server test corpus:
  Before: ~11,500 QPS   After: ~14,900 QPS  (+29%)
diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php
@@ -29,7 +29,7 @@ class WP_MySQL_Parser extends WP_Parser {
 	 * @return bool Whether a query was successfully parsed.
 	 */
 	public function next_query(): bool {
-		if ( $this->position >= count( $this->tokens ) ) {
+		if ( $this->position >= $this->token_count ) {
 			return false;
 		}
 		$this->current_ast = $this->parse();
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php
@@ -26,6 +26,18 @@ public function append_child( $node ) {
 		$this->children[] = $node;
 	}
 
+	/**
+	 * Replace all children with the given array.
+	 *
+	 * This is used by the parser to attach a batch of children built up in a
+	 * local array while trying branches, without allocating a node per attempt.
+	 *
+	 * @param array<WP_Parser_Node|WP_Parser_Token> $children The new children.
+	 */
+	public function set_children( array $children ): void {
+		$this->children = $children;
+	}
+
 	/**
 	 * Flatten the matched rule fragments as if their children were direct
 	 * descendants of the current rule.
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php
@@ -11,12 +11,14 @@
 class WP_Parser {
 	protected $grammar;
 	protected $tokens;
+	protected $token_count;
 	protected $position;
 
 	public function __construct( WP_Parser_Grammar $grammar, array $tokens ) {
-		$this->grammar  = $grammar;
-		$this->tokens   = $tokens;
-		$this->position = 0;
+		$this->grammar     = $grammar;
+		$this->tokens      = $tokens;
+		$this->token_count = count( $tokens );
+		$this->position    = 0;
 	}
 
 	public function parse() {
@@ -27,9 +29,11 @@ public function parse() {
 	}
 
 	private function parse_recursive( $rule_id ) {
-		$is_terminal = $rule_id <= $this->grammar->highest_terminal_id;
-		if ( $is_terminal ) {
-			if ( $this->position >= count( $this->tokens ) ) {
+		$grammar             = $this->grammar;
+		$highest_terminal_id = $grammar->highest_terminal_id;
+
+		if ( $rule_id <= $highest_terminal_id ) {
+			if ( $this->position >= $this->token_count ) {
 				return false;
 			}
 
@@ -38,58 +42,83 @@ private function parse_recursive( $rule_id ) {
 			}
 
 			if ( $this->tokens[ $this->position ]->id === $rule_id ) {
+				$token = $this->tokens[ $this->position ];
 				++$this->position;
-				return $this->tokens[ $this->position - 1 ];
+				return $token;
 			}
 			return false;
 		}
 
-		$branches = $this->grammar->rules[ $rule_id ];
-		if ( ! count( $branches ) ) {
+		$branches = $grammar->rules[ $rule_id ];
+		if ( ! $branches ) {
 			return false;
 		}
 
 		// Bale out from processing the current branch if none of its rules can
 		// possibly match the current token.
-		if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) {
+		$rule_lookahead = $grammar->lookahead_is_match_possible[ $rule_id ] ?? null;
+		if ( null !== $rule_lookahead ) {
 			$token_id = $this->tokens[ $this->position ]->id;
 			if (
-				! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) &&
-				! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ WP_Parser_Grammar::EMPTY_RULE_ID ] )
+				! isset( $rule_lookahead[ $token_id ] ) &&
+				! isset( $rule_lookahead[ WP_Parser_Grammar::EMPTY_RULE_ID ] )
 			) {
 				return false;
 			}
 		}
 
-		$rule_name         = $this->grammar->rule_names[ $rule_id ];
+		$rule_name         = $grammar->rule_names[ $rule_id ];
+		$fragment_ids      = $grammar->fragment_ids;
+		$rules             = $grammar->rules;
+		$tokens            = $this->tokens;
+		$token_count       = $this->token_count;
 		$starting_position = $this->position;
+		$branch_matches    = false;
 		foreach ( $branches as $branch ) {
 			$this->position = $starting_position;
-			$node           = new WP_Parser_Node( $rule_id, $rule_name );
+			$children       = array();
 			$branch_matches = true;
 			foreach ( $branch as $subrule_id ) {
+				// Inline terminal matching to avoid a recursive call per token.
+				if ( $subrule_id <= $highest_terminal_id ) {
+					if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) {
+						// Epsilon rule: matches without consuming input.
+						continue;
+					}
+					if (
+						$this->position < $token_count
+						&& $tokens[ $this->position ]->id === $subrule_id
+					) {
+						$children[]       = $tokens[ $this->position ];
+						++$this->position;
+						continue;
+					}
+					$branch_matches = false;
+					break;
+				}
+
 				$subnode = $this->parse_recursive( $subrule_id );
 				if ( false === $subnode ) {
 					$branch_matches = false;
 					break;
-				} elseif ( true === $subnode ) {
+				}
+				if ( true === $subnode ) {
 					/*
 					 * The subrule was matched without actually matching a token.
 					 * This means a special empty "ε" (epsilon) rule was matched.
 					 * An "ε" rule in a grammar matches an empty input of 0 bytes.
 					 * It is used to represent optional grammar productions.
 					 */
 					continue;
-				} elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) {
-					continue;
-				}
-				if ( is_array( $subnode ) && ! count( $subnode ) ) {
-					continue;
 				}
-				if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) {
-					$node->merge_fragment( $subnode );
+				if ( isset( $fragment_ids[ $subrule_id ] ) ) {
+					// Fragments: inline their children directly to avoid building
+					// a throwaway WP_Parser_Node that would be merged afterwards.
+					foreach ( $subnode->get_children_ref() as $c ) {
+						$children[] = $c;
+					}
 				} else {
-					$node->append_child( $subnode );
+					$children[] = $subnode;
 				}
 			}
 
@@ -100,12 +129,16 @@ private function parse_recursive( $rule_id ) {
 			//        for right-associative rules, which could solve this.
 			//        See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994
 			//        See: https://github.com/antlr/antlr4/issues/488
-			$la = $this->tokens[ $this->position ] ?? null;
-			if ( $la && 'selectStatement' === $rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $la->id ) {
+			if (
+				$branch_matches
+				&& 'selectStatement' === $rule_name
+				&& $this->position < $token_count
+				&& WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id
+			) {
 				$branch_matches = false;
 			}
 
-			if ( true === $branch_matches ) {
+			if ( $branch_matches ) {
 				break;
 			}
 		}
@@ -115,10 +148,12 @@ private function parse_recursive( $rule_id ) {
 			return false;
 		}
 
-		if ( ! $node->has_child() ) {
+		if ( ! $children ) {
 			return true;
 		}
 
+		$node = new WP_Parser_Node( $rule_id, $rule_name );
+		$node->set_children( $children );
 		return $node;
 	}
 }

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ class WP_MySQL_Parser extends WP_Parser {`
`29`	`29`	`* @return bool Whether a query was successfully parsed.`
`30`	`30`	`*/`
`31`	`31`	`public function next_query(): bool {`
`32`		`- if ( $this->position >= count( $this->tokens ) ) {`
	`32`	`+ if ( $this->position >= $this->token_count ) {`
`33`	`33`	`return false;`
`34`	`34`	`}`
`35`	`35`	`$this->current_ast = $this->parse();`