1111class WP_Parser {
1212 protected $ grammar ;
1313 protected $ tokens ;
14+ protected $ token_count ;
1415 protected $ position ;
1516
1617 public function __construct ( WP_Parser_Grammar $ grammar , array $ tokens ) {
17- $ this ->grammar = $ grammar ;
18- $ this ->tokens = $ tokens ;
19- $ this ->position = 0 ;
18+ $ this ->grammar = $ grammar ;
19+ $ this ->tokens = $ tokens ;
20+ $ this ->token_count = count ( $ tokens );
21+ $ this ->position = 0 ;
2022 }
2123
2224 public function parse () {
@@ -27,9 +29,11 @@ public function parse() {
2729 }
2830
2931 private function parse_recursive ( $ rule_id ) {
30- $ is_terminal = $ rule_id <= $ this ->grammar ->highest_terminal_id ;
31- if ( $ is_terminal ) {
32- if ( $ this ->position >= count ( $ this ->tokens ) ) {
32+ $ grammar = $ this ->grammar ;
33+ $ highest_terminal_id = $ grammar ->highest_terminal_id ;
34+
35+ if ( $ rule_id <= $ highest_terminal_id ) {
36+ if ( $ this ->position >= $ this ->token_count ) {
3337 return false ;
3438 }
3539
@@ -38,58 +42,83 @@ private function parse_recursive( $rule_id ) {
3842 }
3943
4044 if ( $ this ->tokens [ $ this ->position ]->id === $ rule_id ) {
45+ $ token = $ this ->tokens [ $ this ->position ];
4146 ++$ this ->position ;
42- return $ this -> tokens [ $ this -> position - 1 ] ;
47+ return $ token ;
4348 }
4449 return false ;
4550 }
4651
47- $ branches = $ this -> grammar ->rules [ $ rule_id ];
48- if ( ! count ( $ branches ) ) {
52+ $ branches = $ grammar ->rules [ $ rule_id ];
53+ if ( ! $ branches ) {
4954 return false ;
5055 }
5156
5257 // Bale out from processing the current branch if none of its rules can
5358 // possibly match the current token.
54- if ( isset ( $ this ->grammar ->lookahead_is_match_possible [ $ rule_id ] ) ) {
59+ $ rule_lookahead = $ grammar ->lookahead_is_match_possible [ $ rule_id ] ?? null ;
60+ if ( null !== $ rule_lookahead ) {
5561 $ token_id = $ this ->tokens [ $ this ->position ]->id ;
5662 if (
57- ! isset ( $ this -> grammar -> lookahead_is_match_possible [ $ rule_id ] [ $ token_id ] ) &&
58- ! isset ( $ this -> grammar -> lookahead_is_match_possible [ $ rule_id ] [ WP_Parser_Grammar::EMPTY_RULE_ID ] )
63+ ! isset ( $ rule_lookahead [ $ token_id ] ) &&
64+ ! isset ( $ rule_lookahead [ WP_Parser_Grammar::EMPTY_RULE_ID ] )
5965 ) {
6066 return false ;
6167 }
6268 }
6369
64- $ rule_name = $ this ->grammar ->rule_names [ $ rule_id ];
70+ $ rule_name = $ grammar ->rule_names [ $ rule_id ];
71+ $ fragment_ids = $ grammar ->fragment_ids ;
72+ $ rules = $ grammar ->rules ;
73+ $ tokens = $ this ->tokens ;
74+ $ token_count = $ this ->token_count ;
6575 $ starting_position = $ this ->position ;
76+ $ branch_matches = false ;
6677 foreach ( $ branches as $ branch ) {
6778 $ this ->position = $ starting_position ;
68- $ node = new WP_Parser_Node ( $ rule_id , $ rule_name );
79+ $ children = array ( );
6980 $ branch_matches = true ;
7081 foreach ( $ branch as $ subrule_id ) {
82+ // Inline terminal matching to avoid a recursive call per token.
83+ if ( $ subrule_id <= $ highest_terminal_id ) {
84+ if ( WP_Parser_Grammar::EMPTY_RULE_ID === $ subrule_id ) {
85+ // Epsilon rule: matches without consuming input.
86+ continue ;
87+ }
88+ if (
89+ $ this ->position < $ token_count
90+ && $ tokens [ $ this ->position ]->id === $ subrule_id
91+ ) {
92+ $ children [] = $ tokens [ $ this ->position ];
93+ ++$ this ->position ;
94+ continue ;
95+ }
96+ $ branch_matches = false ;
97+ break ;
98+ }
99+
71100 $ subnode = $ this ->parse_recursive ( $ subrule_id );
72101 if ( false === $ subnode ) {
73102 $ branch_matches = false ;
74103 break ;
75- } elseif ( true === $ subnode ) {
104+ }
105+ if ( true === $ subnode ) {
76106 /*
77107 * The subrule was matched without actually matching a token.
78108 * This means a special empty "ε" (epsilon) rule was matched.
79109 * An "ε" rule in a grammar matches an empty input of 0 bytes.
80110 * It is used to represent optional grammar productions.
81111 */
82112 continue ;
83- } elseif ( is_array ( $ subnode ) && 0 === count ( $ subnode ) ) {
84- continue ;
85- }
86- if ( is_array ( $ subnode ) && ! count ( $ subnode ) ) {
87- continue ;
88113 }
89- if ( isset ( $ this ->grammar ->fragment_ids [ $ subrule_id ] ) ) {
90- $ node ->merge_fragment ( $ subnode );
114+ if ( isset ( $ fragment_ids [ $ subrule_id ] ) ) {
115+ // Fragments: inline their children directly to avoid building
116+ // a throwaway WP_Parser_Node that would be merged afterwards.
117+ foreach ( $ subnode ->get_children_ref () as $ c ) {
118+ $ children [] = $ c ;
119+ }
91120 } else {
92- $ node -> append_child ( $ subnode ) ;
121+ $ children [] = $ subnode ;
93122 }
94123 }
95124
@@ -100,12 +129,16 @@ private function parse_recursive( $rule_id ) {
100129 // for right-associative rules, which could solve this.
101130 // See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994
102131 // See: https://github.com/antlr/antlr4/issues/488
103- $ la = $ this ->tokens [ $ this ->position ] ?? null ;
104- if ( $ la && 'selectStatement ' === $ rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $ la ->id ) {
132+ if (
133+ $ branch_matches
134+ && 'selectStatement ' === $ rule_name
135+ && $ this ->position < $ token_count
136+ && WP_MySQL_Lexer::INTO_SYMBOL === $ tokens [ $ this ->position ]->id
137+ ) {
105138 $ branch_matches = false ;
106139 }
107140
108- if ( true === $ branch_matches ) {
141+ if ( $ branch_matches ) {
109142 break ;
110143 }
111144 }
@@ -115,10 +148,12 @@ private function parse_recursive( $rule_id ) {
115148 return false ;
116149 }
117150
118- if ( ! $ node -> has_child () ) {
151+ if ( ! $ children ) {
119152 return true ;
120153 }
121154
155+ $ node = new WP_Parser_Node ( $ rule_id , $ rule_name );
156+ $ node ->set_children ( $ children );
122157 return $ node ;
123158 }
124159}
0 commit comments