@@ -8,28 +8,38 @@ class SQLParser{
88
99 public $ tokens = array ();
1010 public $ tables = array ();
11+ public $ source_map = array ();
1112
# Parse a SQL string and return the tables found in it.
#
# $sql - the SQL text to parse.
#
# Lexes $sql, hands the tokens to walk(), stores the 'tables' entry of walk()'s
# result in $this->tables and returns that same value.
#
# Note: this span is a diff rendering. Lines marked `-` are the removed
# version; lines marked `+` are the version that replaces them.
1213 public function parse ($ sql ){
1314
14- $ this ->tokens = $ this ->lex ($ sql );
15- $ ret = $ this ->walk ($ this ->tokens );
15+ // lex() keeps its results on the object: the tokens in $this->tokens and the source map in $this->source_map
16+ $ this ->lex ($ sql );
17+ $ ret = $ this ->walk ($ this ->tokens , $ sql , $ this ->source_map );
1618
1719 $ this ->tables = $ ret ['tables ' ];
1820 return $ this ->tables ;
1921 }
2022
21-
23+ #
24+ # lex and collapse tokens
25+ #
# Tokenizes $sql in two steps:
#   1. _lex() scans the text and produces the source map.
#   2. _extract_tokens() turns that map into the token strings. It receives the
#      map by reference and replaces it with one that lines up with the tokens
#      it returns.
# Both results are kept on the object ($this->source_map, $this->tokens); the
# token array is also returned to the caller.
26+ public function lex ($ sql ) {
27+ $ this ->source_map = $ this ->_lex ($ sql );
28+ $ this ->tokens = $ this ->_extract_tokens ($ sql , $ this ->source_map );
29+ return $ this ->tokens ;
30+ }
2231
2332 #
2433 # simple lexer based on http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
2534 #
35+ # returns an array with one [position, length] tuple per token, in source order
2636
27- public function lex ($ sql ){
37+ private function _lex ($ sql ){
2838
2939 $ pos = 0 ;
3040 $ len = strlen ($ sql );
3141
32- $ tokens = array ();
42+ $ source_map = array ();
3343
3444 while ($ pos < $ len ){
3545
@@ -64,7 +74,7 @@ public function lex($sql){
6474 # <regular identifier>
6575 # <key word>
6676 if (preg_match ('![[:alpha:]][[:alnum:]_]*!A ' , $ sql , $ m , 0 , $ pos )){
67- $ tokens [] = substr ( $ sql , $ pos , strlen ($ m [0 ])) ;
77+ $ source_map [] = [ $ pos , strlen ($ m [0 ])] ;
6878 $ pos += strlen ($ m [0 ]);
6979 continue ;
7080 }
@@ -75,7 +85,7 @@ public function lex($sql){
7585 if ($ p2 === false ){
7686 $ pos = $ len ;
7787 }else {
78- $ tokens [] = substr ( $ sql , $ pos , 1 +$ p2 -$ pos) ;
88+ $ source_map [] = [ $ pos , 1 +$ p2 -$ pos] ;
7989 $ pos = $ p2 +1 ;
8090 }
8191 continue ;
@@ -86,7 +96,7 @@ public function lex($sql){
8696 # <period> <unsigned integer>
8797 # <unsigned integer> ::= <digit>...
8898 if (preg_match ('!(\d+\.?\d*|\.\d+)!A ' , $ sql , $ m , 0 , $ pos )){
89- $ tokens [] = substr ( $ sql , $ pos , strlen ($ m [0 ])) ;
99+ $ source_map [] = [ $ pos , strlen ($ m [0 ])] ;
90100 $ pos += strlen ($ m [0 ]);
91101 continue ;
92102 }
@@ -107,7 +117,7 @@ public function lex($sql){
107117 }
108118 if ($ sql [$ c ] == $ q ){
109119 $ slen = $ c + 1 - $ pos ;
110- $ tokens [] = substr ( $ sql , $ pos , $ slen) ;
120+ $ source_map [] = [ $ pos , $ slen] ;
111121 $ pos += $ slen ;
112122 break ;
113123 }
@@ -129,60 +139,72 @@ public function lex($sql){
129139 # <double period>
130140 # <left bracket>
131141 # <right bracket>
132-
133- $ tokens [] = substr ($ sql , $ pos , 1 );
142+ $ source_map [] = [$ pos , 1 ];
134143 $ pos ++;
135144 }
136145
137- return $ tokens ;
146+ return $ source_map ;
138147 }
139148
140149
141- function walk ($ tokens ){
150+ function walk ($ tokens, $ sql , $ source_map ){
142151
143152
144153 #
145154 # split into statements
146155 #
147156
148- $ tokens = $ this ->collapse_tokens ($ tokens );
149-
150157 $ statements = array ();
151158 $ temp = array ();
152- foreach ($ tokens as $ t ){
159+ $ start = 0 ;
160+ for ($ i = 0 ; $ i < count ($ tokens ); $ i ++) {
161+ $ t = $ tokens [$ i ];
153162 if ($ t == '; ' ){
154- if (count ($ temp )) $ statements [] = $ temp ;
163+ if (count ($ temp )) {
164+ $ statements [] = array (
165+ "tuples " => $ temp ,
166+ "sql " => substr ($ sql , $ source_map [$ start ][0 ], $ source_map [$ i ][0 ] - $ source_map [$ start ][0 ] + $ source_map [$ i ][1 ]),
167+ );
168+ }
155169 $ temp = array ();
170+ $ start = $ i + 1 ;
156171 }else {
157172 $ temp [] = $ t ;
158173 }
159174 }
160- if (count ($ temp )) $ statements [] = $ temp ;
161-
175+ if (count ($ temp )) {
176+ $ statements [] = array (
177+ "tuples " => $ temp ,
178+ "sql " => substr ($ sql , $ source_map [$ start ][0 ], $ source_map [$ i ][0 ] - $ source_map [$ start ][0 ] + $ source_map [$ i ][1 ]),
179+ );
180+ }
162181
163182 #
164183 # find CREATE TABLE statements
165184 #
166185
167186 $ tables = array ();
168187
169- foreach ($ statements as $ s ){
188+ foreach ($ statements as $ stmt ){
189+ $ s = $ stmt ['tuples ' ];
170190
171- if ($ s [0 ] == 'CREATE TABLE ' ){
191+ if (StrToUpper ( $ s [0 ]) == 'CREATE TABLE ' ){
172192
173193 array_shift ($ s );
174194
175195 $ table = $ this ->parse_create_table ($ s );
196+ $ table ['sql ' ] = $ stmt ['sql ' ];
176197 $ tables [$ table ['name ' ]] = $ table ;
177198 }
178199
179- if ($ s [0 ] == 'CREATE TEMPORARY TABLE ' ){
200+ if (StrToUpper ( $ s [0 ]) == 'CREATE TEMPORARY TABLE ' ){
180201
181202 array_shift ($ s );
182203
183204 $ table = $ this ->parse_create_table ($ s );
184205 $ table ['props ' ]['temp ' ] = true ;
185206 $ tables [$ table ['name ' ]] = $ table ;
207+ $ table ['sql ' ] = $ stmt ['sql ' ];
186208 }
187209
188210 if ($ GLOBALS ['_find_single_table ' ] && count ($ tables )) return array (
@@ -240,7 +262,6 @@ function parse_create_table($tokens){
240262
241263 $ props = $ this ->parse_table_props ($ tokens );
242264
243-
244265 $ table = array (
245266 'name ' => $ name ,
246267 'fields ' => $ fields ,
@@ -675,14 +696,13 @@ function parse_table_props(&$tokens){
675696 }
676697
677698
678-
679- # We can simplify parsing by merging certain tokens when
699+ # Given the source map, extract the tokens from the original SQL.
700+ # Along the way, simplify parsing by merging certain tokens when
680701 # they occur next to each other. MySQL treats these productions
681702 # equally: 'UNIQUE|UNIQUE INDEX|UNIQUE KEY', and if they are
682703 # always a single token it makes parsing easier.
683704
684- function collapse_tokens ($ tokens ){
685-
705+ function _extract_tokens ($ sql , &$ source_map ){
686706 $ lists = array (
687707 'FULLTEXT INDEX ' ,
688708 'FULLTEXT KEY ' ,
@@ -725,38 +745,51 @@ function collapse_tokens($tokens){
725745 foreach ($ singles as $ s ) $ smap [$ s ] = 1 ;
726746
727747 $ out = array ();
748+ $ out_map = [];
749+
728750 $ i = 0 ;
729- $ len = count ($ tokens );
751+ $ len = count ($ source_map );
730752 while ($ i < $ len ){
731- $ next = StrToUpper ($ tokens [$ i ]);
732- if (is_array ($ maps [$ next ])){
753+ $ token = substr ($ sql , $ source_map [$ i ][0 ], $ source_map [$ i ][1 ]);
754+ $ tokenUpper = StrToUpper ($ token );
755+ if (is_array ($ maps [$ tokenUpper ])){
733756 $ found = false ;
734- foreach ($ maps [$ next ] as $ list ){
757+ foreach ($ maps [$ tokenUpper ] as $ list ){
735758 $ fail = false ;
736759 foreach ($ list as $ k => $ v ){
737- if ($ v != StrToUpper ($ tokens [$ k +$ i ])){
760+ $ next = StrToUpper (substr ($ sql , $ source_map [$ k +$ i ][0 ], $ source_map [$ k +$ i ][1 ]));
761+ if ($ v != $ next ){
738762 $ fail = true ;
739763 break ;
740764 }
741765 }
742766 if (!$ fail ){
743- $ i += count ($ list );
744767 $ out [] = implode (' ' , $ list );
768+
769+ # Extend the length of the first token to include everything
770+ # up through the last in the sequence.
771+ $ j = $ i + count ($ list ) - 1 ;
772+ $ out_map [] = array ($ source_map [$ i ][0 ], ($ source_map [$ j ][0 ] - $ source_map [$ i ][0 ]) + $ source_map [$ j ][1 ]);
773+
774+ $ i = $ j + 1 ;
745775 $ found = true ;
746776 break ;
747777 }
748778 }
749779 if ($ found ) continue ;
750780 }
751- if ($ smap [$ next ]){
752- $ out [] = $ next ;
781+ if ($ smap [$ tokenUpper ]){
782+ $ out [] = $ tokenUpper ;
783+ $ out_map []= $ source_map [$ i ];
753784 $ i ++;
754785 continue ;
755786 }
756- $ out [] = $ tokens [$ i ];
787+ $ out [] = $ token ;
788+ $ out_map []= $ source_map [$ i ];
757789 $ i ++;
758790 }
759791
792+ $ source_map = $ out_map ;
760793 return $ out ;
761794 }
762795
0 commit comments