Skip to content

Commit 0902af5

Browse files
demmeriamcal
authored and committed
rework the lexer to keep track of source locations for each token
In order to have the original CREATE TABLE text from the schema available when using the parsed structure, rework the lexer so that instead of extracting the substring tokens, it extracts tuples of [position, length] and maintains these associations when collapsing the token sequences. This means we can include the original source text in a `sql` field of the resulting parsed structure.
1 parent b944a2a commit 0902af5

3 files changed

Lines changed: 83 additions & 42 deletions

File tree

lib_sql_parser.php

Lines changed: 69 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,28 +8,38 @@ class SQLParser{
88

99
public $tokens = array();
1010
public $tables = array();
11+
public $source_map = array();
1112

1213
public function parse($sql){
1314

14-
$this->tokens = $this->lex($sql);
15-
$ret = $this->walk($this->tokens);
15+
// stashes tokens and source_map in $this
16+
$this->lex($sql);
17+
$ret = $this->walk($this->tokens, $sql, $this->source_map);
1618

1719
$this->tables = $ret['tables'];
1820
return $this->tables;
1921
}
2022

21-
23+
#
24+
# lex and collapse tokens
25+
#
26+
public function lex($sql) {
27+
$this->source_map = $this->_lex($sql);
28+
$this->tokens = $this->_extract_tokens($sql, $this->source_map);
29+
return $this->tokens;
30+
}
2231

2332
#
2433
# simple lexer based on http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
2534
#
35+
# returns an array of [position, len] tuples for each token
2636

27-
public function lex($sql){
37+
private function _lex($sql){
2838

2939
$pos = 0;
3040
$len = strlen($sql);
3141

32-
$tokens = array();
42+
$source_map = array();
3343

3444
while ($pos < $len){
3545

@@ -64,7 +74,7 @@ public function lex($sql){
6474
# <regular identifier>
6575
# <key word>
6676
if (preg_match('![[:alpha:]][[:alnum:]_]*!A', $sql, $m, 0, $pos)){
67-
$tokens[] = substr($sql, $pos, strlen($m[0]));
77+
$source_map[] = [$pos, strlen($m[0])];
6878
$pos += strlen($m[0]);
6979
continue;
7080
}
@@ -75,7 +85,7 @@ public function lex($sql){
7585
if ($p2 === false){
7686
$pos = $len;
7787
}else{
78-
$tokens[] = substr($sql, $pos, 1+$p2-$pos);
88+
$source_map[] = [$pos, 1+$p2-$pos];
7989
$pos = $p2+1;
8090
}
8191
continue;
@@ -86,7 +96,7 @@ public function lex($sql){
8696
# <period> <unsigned integer>
8797
# <unsigned integer> ::= <digit>...
8898
if (preg_match('!(\d+\.?\d*|\.\d+)!A', $sql, $m, 0, $pos)){
89-
$tokens[] = substr($sql, $pos, strlen($m[0]));
99+
$source_map[] = [$pos, strlen($m[0])];
90100
$pos += strlen($m[0]);
91101
continue;
92102
}
@@ -107,7 +117,7 @@ public function lex($sql){
107117
}
108118
if ($sql[$c] == $q){
109119
$slen = $c + 1 - $pos;
110-
$tokens[] = substr($sql, $pos, $slen);
120+
$source_map[] = [$pos, $slen];
111121
$pos += $slen;
112122
break;
113123
}
@@ -129,60 +139,72 @@ public function lex($sql){
129139
# <double period>
130140
# <left bracket>
131141
# <right bracket>
132-
133-
$tokens[] = substr($sql, $pos, 1);
142+
$source_map[] = [$pos, 1];
134143
$pos++;
135144
}
136145

137-
return $tokens;
146+
return $source_map;
138147
}
139148

140149

141-
function walk($tokens){
150+
function walk($tokens, $sql, $source_map){
142151

143152

144153
#
145154
# split into statements
146155
#
147156

148-
$tokens = $this->collapse_tokens($tokens);
149-
150157
$statements = array();
151158
$temp = array();
152-
foreach ($tokens as $t){
159+
$start = 0;
160+
for ($i = 0; $i < count($tokens); $i++) {
161+
$t = $tokens[$i];
153162
if ($t == ';'){
154-
if (count($temp)) $statements[] = $temp;
163+
if (count($temp)) {
164+
$statements[] = array(
165+
"tuples" => $temp,
166+
"sql" => substr($sql, $source_map[$start][0], $source_map[$i][0] - $source_map[$start][0] + $source_map[$i][1]),
167+
);
168+
}
155169
$temp = array();
170+
$start = $i + 1;
156171
}else{
157172
$temp[] = $t;
158173
}
159174
}
160-
if (count($temp)) $statements[] = $temp;
161-
175+
if (count($temp)) {
176+
$statements[] = array(
177+
"tuples" => $temp,
178+
"sql" => substr($sql, $source_map[$start][0], $source_map[$i][0] - $source_map[$start][0] + $source_map[$i][1]),
179+
);
180+
}
162181

163182
#
164183
# find CREATE TABLE statements
165184
#
166185

167186
$tables = array();
168187

169-
foreach ($statements as $s){
188+
foreach ($statements as $stmt){
189+
$s = $stmt['tuples'];
170190

171-
if ($s[0] == 'CREATE TABLE'){
191+
if (StrToUpper($s[0]) == 'CREATE TABLE'){
172192

173193
array_shift($s);
174194

175195
$table = $this->parse_create_table($s);
196+
$table['sql'] = $stmt['sql'];
176197
$tables[$table['name']] = $table;
177198
}
178199

179-
if ($s[0] == 'CREATE TEMPORARY TABLE'){
200+
if (StrToUpper($s[0]) == 'CREATE TEMPORARY TABLE'){
180201

181202
array_shift($s);
182203

183204
$table = $this->parse_create_table($s);
184205
$table['props']['temp'] = true;
185206
$tables[$table['name']] = $table;
207+
$table['sql'] = $stmt['sql'];
186208
}
187209

188210
if ($GLOBALS['_find_single_table'] && count($tables)) return array(
@@ -240,7 +262,6 @@ function parse_create_table($tokens){
240262

241263
$props = $this->parse_table_props($tokens);
242264

243-
244265
$table = array(
245266
'name' => $name,
246267
'fields' => $fields,
@@ -675,14 +696,13 @@ function parse_table_props(&$tokens){
675696
}
676697

677698

678-
679-
# We can simplify parsing by merging certain tokens when
699+
# Given the source map, extract the tokens from the original sql.
700+
# Along the way, simplify parsing by merging certain tokens when
680701
# they occur next to each other. MySQL treats these productions
681702
# equally: 'UNIQUE|UNIQUE INDEX|UNIQUE KEY' and if they are
682703
# always a single token it makes parsing easier.
683704

684-
function collapse_tokens($tokens){
685-
705+
function _extract_tokens($sql, &$source_map){
686706
$lists = array(
687707
'FULLTEXT INDEX',
688708
'FULLTEXT KEY',
@@ -725,38 +745,51 @@ function collapse_tokens($tokens){
725745
foreach ($singles as $s) $smap[$s] = 1;
726746

727747
$out = array();
748+
$out_map = [];
749+
728750
$i = 0;
729-
$len = count($tokens);
751+
$len = count($source_map);
730752
while ($i < $len){
731-
$next = StrToUpper($tokens[$i]);
732-
if (is_array($maps[$next])){
753+
$token = substr($sql, $source_map[$i][0], $source_map[$i][1]);
754+
$tokenUpper = StrToUpper($token);
755+
if (is_array($maps[$tokenUpper])){
733756
$found = false;
734-
foreach ($maps[$next] as $list){
757+
foreach ($maps[$tokenUpper] as $list){
735758
$fail = false;
736759
foreach ($list as $k => $v){
737-
if ($v != StrToUpper($tokens[$k+$i])){
760+
$next = StrToUpper(substr($sql, $source_map[$k+$i][0], $source_map[$k+$i][1]));
761+
if ($v != $next){
738762
$fail = true;
739763
break;
740764
}
741765
}
742766
if (!$fail){
743-
$i += count($list);
744767
$out[] = implode(' ', $list);
768+
769+
# Extend the length of the first token to include everything
770+
# up through the last in the sequence.
771+
$j = $i + count($list) - 1;
772+
$out_map[] = array($source_map[$i][0], ($source_map[$j][0] - $source_map[$i][0]) + $source_map[$j][1]);
773+
774+
$i = $j + 1;
745775
$found = true;
746776
break;
747777
}
748778
}
749779
if ($found) continue;
750780
}
751-
if ($smap[$next]){
752-
$out[] = $next;
781+
if ($smap[$tokenUpper]){
782+
$out[] = $tokenUpper;
783+
$out_map[]= $source_map[$i];
753784
$i++;
754785
continue;
755786
}
756-
$out[] = $tokens[$i];
787+
$out[] = $token;
788+
$out_map[]= $source_map[$i];
757789
$i++;
758790
}
759791

792+
$source_map = $out_map;
760793
return $out;
761794
}
762795

tests/02_collapse.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
function collapse_test($in, $out){
99
$obj = new SQLParser();
10-
is_deeply($obj->collapse_tokens($in), $out);
10+
is_deeply($obj->lex($in), $out);
1111
}
1212

1313

14-
collapse_test(array('a', 'b'), array('a', 'b'));
15-
collapse_test(array('UNIQUE', 'key'), array('UNIQUE KEY'));
14+
collapse_test('a b', array('a', 'b'));
15+
collapse_test('UNIQUE key', array('UNIQUE KEY'));

tests/10_full.php

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ function full_test($sql, $expected){
1010
$lines = array();
1111
foreach ($obj->tables as $table){
1212
$lines[] = "TABLE:{$table['name']}";
13+
$lines[] = "SQL:{$table['sql']}";
1314
foreach ($table['fields'] as $field){
1415
$lines[] = "-FIELD:{$field['name']}:{$field['type']}";
1516
}
@@ -22,7 +23,14 @@ function full_test($sql, $expected){
2223
plan(1);
2324

2425

25-
full_test("CREATE TABLE table_name (a INT);", array(
26-
"TABLE:table_name",
27-
"-FIELD:a:INT",
26+
full_test("CREATE TABLE table_name (a INT);\n" .
27+
"-- ignored comment\n\n" .
28+
"CREATE TABLE t2 (b VARCHAR)\n\n;\n",
29+
array(
30+
"TABLE:table_name",
31+
"SQL:CREATE TABLE table_name (a INT);",
32+
"-FIELD:a:INT",
33+
"TABLE:t2",
34+
"SQL:CREATE TABLE t2 (b VARCHAR)\n\n;",
35+
"-FIELD:b:VARCHAR",
2836
));

0 commit comments

Comments
 (0)