Skip to content

Commit 916b512

Browse files
committed
Speed up the lexer with cheaper byte checks
Apply lexer optimisations from PR #375:

- Cache `strlen($sql)` once in `$sql_length` instead of recomputing it on each EOF check.
- Replace `strspn($byte, MASK) > 0` with direct byte comparisons (`$byte >= '0' && $byte <= '9'`, `false !== strpos(MASK, $byte)`, and an unrolled whitespace check).
- Use `strpos($sql, '*/', $pos)` instead of a manual scan loop in `read_comment_content()`.
- In `read_quoted_text()`, use `strpos()` to find the next quote, eliminating the separate end-of-input check that follows the `strcspn()` scan.
- Inline `next_token()` + `get_token()` in `remaining_tokens()` so the hot loop builds tokens directly.

Co-authored-by: Adam Zieliński <adam@adamziel.com>

Adapted from #375
1 parent daa4185 commit 916b512

1 file changed

Lines changed: 83 additions & 31 deletions

File tree

packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php

Lines changed: 83 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer {
21112111
*/
21122112
private $sql;
21132113

2114+
/**
2115+
* Byte length of the SQL payload.
2116+
*
2117+
* @var int
2118+
*/
2119+
private $sql_length;
2120+
21142121
/**
21152122
* The version of the MySQL server that the SQL payload is intended for.
21162123
*
@@ -2189,6 +2196,7 @@ public function __construct(
21892196
array $sql_modes = array()
21902197
) {
21912198
$this->sql = $sql;
2199+
$this->sql_length = strlen( $sql );
21922200
$this->mysql_version = $mysql_version;
21932201

21942202
foreach ( $sql_modes as $sql_mode ) {
@@ -2284,10 +2292,46 @@ public function get_token(): ?WP_MySQL_Token {
22842292
* @return WP_MySQL_Token[] An array of token objects representing the remaining tokens.
22852293
*/
22862294
public function remaining_tokens(): array {
2287-
$tokens = array();
2288-
while ( true === $this->next_token() ) {
2289-
$token = $this->get_token();
2290-
$tokens[] = $token;
2295+
$tokens = array();
2296+
$no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active(
2297+
self::SQL_MODE_NO_BACKSLASH_ESCAPES
2298+
);
2299+
2300+
while ( true ) {
2301+
if (
2302+
self::EOF === $this->token_type
2303+
|| ( null === $this->token_type && $this->bytes_already_read > 0 )
2304+
) {
2305+
$this->token_type = null;
2306+
break;
2307+
}
2308+
2309+
do {
2310+
$this->token_starts_at = $this->bytes_already_read;
2311+
$this->token_type = $this->read_next_token();
2312+
} while (
2313+
self::WHITESPACE === $this->token_type
2314+
|| self::COMMENT === $this->token_type
2315+
|| self::MYSQL_COMMENT_START === $this->token_type
2316+
|| self::MYSQL_COMMENT_END === $this->token_type
2317+
);
2318+
2319+
if ( null === $this->token_type ) {
2320+
break;
2321+
}
2322+
2323+
$tokens[] = new WP_MySQL_Token(
2324+
$this->token_type,
2325+
$this->token_starts_at,
2326+
$this->bytes_already_read - $this->token_starts_at,
2327+
$this->sql,
2328+
$no_backslash_escapes_sql_mode_set
2329+
);
2330+
2331+
if ( self::EOF === $this->token_type ) {
2332+
$this->token_type = null;
2333+
break;
2334+
}
22912335
}
22922336
return $tokens;
22932337
}
@@ -2356,10 +2400,10 @@ private function read_next_token(): ?int {
23562400

23572401
if ( "'" === $byte || '"' === $byte || '`' === $byte ) {
23582402
$type = $this->read_quoted_text();
2359-
} elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) {
2403+
} elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) {
23602404
$type = $this->read_number();
23612405
} elseif ( '.' === $byte ) {
2362-
if ( null !== $next_byte && strspn( $next_byte, self::DIGIT_MASK ) > 0 ) {
2406+
if ( null !== $next_byte && $next_byte >= '0' && $next_byte <= '9' ) {
23632407
$type = $this->read_number();
23642408
} else {
23652409
$this->bytes_already_read += 1;
@@ -2420,8 +2464,8 @@ private function read_next_token(): ?int {
24202464
} elseif ( '-' === $byte ) {
24212465
if (
24222466
'-' === $next_byte
2423-
&& $this->bytes_already_read + 2 < strlen( $this->sql )
2424-
&& strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0
2467+
&& $this->bytes_already_read + 2 < $this->sql_length
2468+
&& false !== strpos( self::WHITESPACE_MASK, $this->sql[ $this->bytes_already_read + 2 ] )
24252469
) {
24262470
$type = $this->read_line_comment();
24272471
} elseif ( '>' === $next_byte ) {
@@ -2547,7 +2591,13 @@ private function read_next_token(): ?int {
25472591
}
25482592
} elseif ( '#' === $byte ) {
25492593
$type = $this->read_line_comment();
2550-
} elseif ( null !== $byte && strspn( $byte, self::WHITESPACE_MASK ) > 0 ) {
2594+
} elseif (
2595+
' ' === $byte
2596+
|| "\t" === $byte
2597+
|| "\n" === $byte
2598+
|| "\r" === $byte
2599+
|| "\f" === $byte
2600+
) {
25512601
$this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read );
25522602
$type = self::WHITESPACE;
25532603
} elseif ( ( 'x' === $byte || 'X' === $byte || 'b' === $byte || 'B' === $byte ) && "'" === $next_byte ) {
@@ -2675,7 +2725,7 @@ private function read_number(): ?int {
26752725
'0' === $byte
26762726
&& 'x' === $next_byte
26772727
&& null !== $third_byte
2678-
&& strspn( $third_byte, self::HEX_DIGIT_MASK ) > 0
2728+
&& false !== strpos( self::HEX_DIGIT_MASK, $third_byte )
26792729
)
26802730
// HEX number in the form of x'N' or X'N'.
26812731
|| ( ( 'x' === $byte || 'X' === $byte ) && "'" === $next_byte )
@@ -2685,7 +2735,7 @@ private function read_number(): ?int {
26852735
$this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read );
26862736
if ( $is_quoted ) {
26872737
if (
2688-
$this->bytes_already_read >= strlen( $this->sql )
2738+
$this->bytes_already_read >= $this->sql_length
26892739
|| "'" !== $this->sql[ $this->bytes_already_read ]
26902740
) {
26912741
return null; // Invalid input.
@@ -2708,7 +2758,7 @@ private function read_number(): ?int {
27082758
$this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read );
27092759
if ( $is_quoted ) {
27102760
if (
2711-
$this->bytes_already_read >= strlen( $this->sql )
2761+
$this->bytes_already_read >= $this->sql_length
27122762
|| "'" !== $this->sql[ $this->bytes_already_read ]
27132763
) {
27142764
return null; // Invalid input.
@@ -2737,11 +2787,12 @@ private function read_number(): ?int {
27372787
( 'e' === $byte || 'E' === $byte )
27382788
&& null !== $next_byte
27392789
&& (
2740-
strspn( $next_byte, self::DIGIT_MASK ) > 0
2790+
( $next_byte >= '0' && $next_byte <= '9' )
27412791
|| (
27422792
( '+' === $next_byte || '-' === $next_byte )
2743-
&& $this->bytes_already_read + 2 < strlen( $this->sql )
2744-
&& strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0
2793+
&& $this->bytes_already_read + 2 < $this->sql_length
2794+
&& $this->sql[ $this->bytes_already_read + 2 ] >= '0'
2795+
&& $this->sql[ $this->bytes_already_read + 2 ] <= '9'
27452796
)
27462797
);
27472798
if ( $has_exponent ) {
@@ -2838,12 +2889,11 @@ private function read_quoted_text(): ?int {
28382889
// in which case the escape sequence is consumed and the loop continues.
28392890
$at = $this->bytes_already_read;
28402891
while ( true ) {
2841-
$at += strcspn( $this->sql, $quote, $at );
2842-
2843-
// Unclosed string - unexpected EOF.
2844-
if ( ( $this->sql[ $at ] ?? null ) !== $quote ) {
2892+
$quote_at = strpos( $this->sql, $quote, $at );
2893+
if ( false === $quote_at ) {
28452894
return null; // Invalid input.
28462895
}
2896+
$at = $quote_at;
28472897

28482898
/*
28492899
* By default, quotes can be escaped with a "\".
@@ -2853,9 +2903,17 @@ private function read_quoted_text(): ?int {
28532903
* The quote is escaped only when the number of preceding backslashes
28542904
* is odd - "\" is an escape sequence, "\\" is an escaped backslash,
28552905
* "\\\" is an escaped backslash and an escape sequence, and so on.
2906+
*
2907+
* The `($at - $i - 1) >= 0` guard prevents PHP's negative-string-
2908+
* offset wraparound (PHP 7.1+) when the closing-quote candidate
2909+
* sits at the very start of the input. The `?? null` covers
2910+
* positive out-of-range indexes belt-and-suspenders.
28562911
*/
28572912
if ( ! $no_backslash_escapes ) {
2858-
for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 );
2913+
$i = 0;
2914+
while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? null ) ) {
2915+
$i += 1;
2916+
}
28592917
if ( 1 === $i % 2 ) {
28602918
$at += 1;
28612919
continue;
@@ -2920,17 +2978,11 @@ private function read_mysql_comment(): int {
29202978
}
29212979

29222980
private function read_comment_content(): void {
2923-
while ( true ) {
2924-
$this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read );
2925-
$this->bytes_already_read += 1; // Consume the '*'.
2926-
$byte = $this->sql[ $this->bytes_already_read ] ?? null;
2927-
if ( null === $byte ) {
2928-
break;
2929-
}
2930-
if ( '/' === $byte ) {
2931-
$this->bytes_already_read += 1; // Consume the '/'.
2932-
break;
2933-
}
2981+
$comment_end = strpos( $this->sql, '*/', $this->bytes_already_read );
2982+
if ( false === $comment_end ) {
2983+
$this->bytes_already_read = $this->sql_length;
2984+
} else {
2985+
$this->bytes_already_read = $comment_end + 2;
29342986
}
29352987
}
29362988

0 commit comments

Comments
 (0)