Skip to content

Commit 60e7764

Browse files
committed
Add REGEXP_LIKE() UDF
Implements MySQL REGEXP_LIKE(expr, pattern [, match_type]) via a new variadic UDF. Introduces a shared regexp_compile() helper that translates MySQL match_type flags to PCRE modifiers and always uses UTF-8 mode, plus regexp_run() (suppresses preg_* warnings) and regexp_fail() (translates preg failures into MySQL-style messages). regexp_compile() rejects empty patterns to match MySQL ERROR 3685 and documents two known limitations of the emulation: collation-blind case-sensitivity defaulting and the always-on /u modifier diverging from the legacy REGEXP operator on binary data. The match_type loop accepts MySQL's c/i/m/n/u flags (last of the case flags wins; "u" — Unix-only line endings — is a no-op since PCRE's default already matches that semantics). Unknown flags raise "Invalid match_type flag: X.". Tests cover: data-driven match cases, NULL propagation, invalid patterns, multi-flag combinations, UTF-8 input errors via the PREG_BAD_UTF8_ERROR branch, the backtrack-limit branch, and that the legacy REGEXP operator still works alongside REGEXP_LIKE.
1 parent 3a3baf7 commit 60e7764

2 files changed

Lines changed: 266 additions & 0 deletions

File tree

packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ public static function register_for( $pdo ): self {
7171
'isnull' => 'isnull',
7272
'if' => '_if',
7373
'regexp' => 'regexp',
74+
'regexp_like' => 'regexp_like',
7475
'field' => 'field',
7576
'log' => 'log',
7677
'least' => 'least',
@@ -536,6 +537,32 @@ public function regexp( $pattern, $field ) {
536537
return preg_match( $pattern, $field );
537538
}
538539

540+
/**
 * Emulates the MySQL REGEXP_LIKE( expr, pattern [, match_type] ) function.
 *
 * A NULL in any of the three arguments short-circuits to NULL. Otherwise the
 * pattern is turned into a PCRE pattern by regexp_compile() and matched
 * against the subject with preg_match(); a failed match operation is
 * reported through regexp_fail().
 *
 * @param string|null $expr       The subject string.
 * @param string|null $pattern    The regex pattern.
 * @param string|null $match_type Optional MySQL match_type flags.
 *
 * @throws Exception If the pattern or match_type is rejected by
 *                   regexp_compile(), or if preg_match() fails.
 * @return int|null 1 on match, 0 on no match, NULL if any argument is NULL.
 */
public function regexp_like( $expr, $pattern, $match_type = '' ) {
	// NULL propagation: one NULL argument makes the whole call NULL.
	foreach ( array( $expr, $pattern, $match_type ) as $argument ) {
		if ( null === $argument ) {
			return null;
		}
	}

	$pcre_pattern = $this->regexp_compile( $pattern, $match_type );

	// preg_match() runs inside regexp_run() so that its warnings are
	// suppressed and the false sentinel can be inspected here instead.
	$matched = $this->regexp_run(
		static function () use ( $pcre_pattern, $expr ) {
			return preg_match( $pcre_pattern, $expr );
		}
	);

	if ( false !== $matched ) {
		return $matched;
	}

	// regexp_fail() always throws; the return below is never reached.
	$this->regexp_fail( $pattern );
	return $matched;
}
565+
539566
/**
540567
* Method to emulate MySQL FIELD() function.
541568
*
@@ -896,4 +923,130 @@ public function _helper_like_to_glob_pattern( $pattern ) {
896923

897924
return $pattern;
898925
}
926+
927+
/**
928+
* Compile a MySQL-style regex into a PCRE pattern string.
929+
*
930+
* Translates MySQL match_type flags (c/i/m/n/u) to PCRE modifiers and always
931+
* appends the u (UTF-8) modifier. Case-insensitive is the default, matching
932+
* the existing REGEXP operator.
933+
*
934+
* MySQL's native engine is ICU; we use PHP's PCRE. The two diverge in a
935+
* few corners:
936+
*
937+
* - Some Unicode property shorthands and POSIX class spellings differ.
938+
* - PCRE accepts both `(?<name>...)` and `(?P<name>...)`; MySQL accepts
939+
* only the former and errors on the latter.
940+
* - MySQL's `u` match_type flag ("Unix-only line endings") narrows the
941+
* meaning of `^`, `$`, and `.` to just "\n". PCRE's default line
942+
* handling already behaves this way, so the flag is accepted but has
943+
* no effect; it is MySQL's default mode (without `u`) that is broader
944+
* and cannot be fully emulated through the `m` modifier alone.
945+
*
946+
* Known limitations of this emulation:
947+
*
948+
* - The default (case-insensitive) is correct for the usual
949+
* `utf8mb4_0900_ai_ci` collation; callers that rely on a `_bin` or
950+
* `_cs` collation must pass an explicit `c` match_type because this
951+
* helper has no access to the session collation.
952+
* - The `u` (UTF-8) PCRE modifier is always applied. Binary data with
953+
* invalid UTF-8 bytes that matches fine under the legacy `REGEXP`
954+
* operator raises "Invalid UTF-8 data in regular expression input."
955+
* when routed through REGEXP_LIKE / _REPLACE / _SUBSTR / _INSTR.
956+
*
957+
* @param string $pattern The MySQL regex pattern.
958+
* @param string $match_type MySQL match_type flag string.
959+
*
960+
* @throws Exception If the pattern is empty or the match_type string
961+
* contains an unrecognized flag.
962+
* @return string PCRE-ready pattern with delimiter and modifiers.
963+
*/
964+
private function regexp_compile( $pattern, $match_type ) {
965+
if ( '' === (string) $pattern ) {
966+
throw new Exception( 'Illegal argument to a regular expression.' );
967+
}
968+
$match_type = (string) $match_type;
969+
$case_sensitive = false;
970+
$multiline = false;
971+
$dotall = false;
972+
$len = strlen( $match_type );
973+
for ( $i = 0; $i < $len; $i++ ) {
974+
$flag = $match_type[ $i ];
975+
if ( 'c' === $flag ) {
976+
$case_sensitive = true;
977+
} elseif ( 'i' === $flag ) {
978+
$case_sensitive = false;
979+
} elseif ( 'm' === $flag ) {
980+
$multiline = true;
981+
} elseif ( 'n' === $flag ) {
982+
$dotall = true;
983+
} elseif ( 'u' === $flag ) {
984+
// Unix-only line endings. PCRE's default matches this already; no-op.
985+
continue;
986+
} else {
987+
throw new Exception( "Invalid match_type flag: $flag." );
988+
}
989+
}
990+
991+
$modifiers = 'u';
992+
if ( ! $case_sensitive ) {
993+
$modifiers .= 'i';
994+
}
995+
if ( $multiline ) {
996+
$modifiers .= 'm';
997+
}
998+
if ( $dotall ) {
999+
$modifiers .= 's';
1000+
}
1001+
1002+
return '/' . str_replace( '/', '\\/', $pattern ) . '/' . $modifiers;
1003+
}
1004+
1005+
/**
1006+
* Run a preg_* callable with PHP warnings suppressed.
1007+
*
1008+
* PHPUnit's strict error handler turns preg_* warnings into ErrorExceptions
1009+
* before we can translate them into a MySQL-style error. This wrapper
1010+
* suppresses those warnings so the caller can check the result sentinel
1011+
* (false for preg_match, null for preg_replace / preg_replace_callback)
1012+
* and throw a clean exception.
1013+
*
1014+
* @param callable $op Preg operation. Must be self-contained.
1015+
*
1016+
* @return mixed Return value of the callable.
1017+
*/
1018+
private function regexp_run( $op ) {
1019+
set_error_handler( static function () {} );
1020+
try {
1021+
return $op();
1022+
} finally {
1023+
restore_error_handler();
1024+
}
1025+
}
1026+
1027+
/**
1028+
* Translate a preg_* failure into a caller-friendly exception message.
1029+
*
1030+
* Uses preg_last_error() to distinguish invalid patterns from runtime
1031+
* limit failures and invalid-UTF-8 input.
1032+
*
1033+
* @param string $pattern The original MySQL regex pattern.
1034+
*
1035+
* @throws Exception Always.
1036+
* @return void
1037+
*/
1038+
private function regexp_fail( $pattern ) {
1039+
$err = preg_last_error();
1040+
if (
1041+
PREG_BACKTRACK_LIMIT_ERROR === $err
1042+
|| PREG_RECURSION_LIMIT_ERROR === $err
1043+
|| ( defined( 'PREG_JIT_STACKLIMIT_ERROR' ) && PREG_JIT_STACKLIMIT_ERROR === $err )
1044+
) {
1045+
throw new Exception( 'Regular expression evaluation exceeded internal limits.' );
1046+
}
1047+
if ( PREG_BAD_UTF8_ERROR === $err ) {
1048+
throw new Exception( 'Invalid UTF-8 data in regular expression input.' );
1049+
}
1050+
throw new Exception( 'Invalid regular expression: ' . $pattern . '.' );
1051+
}
8991052
}

packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,119 @@ public static function regexpOperators() {
104104
);
105105
}
106106

107+
/**
 * Runs one REGEXP_LIKE() data set through the driver and checks the result.
 *
 * A null $expr or $pattern is sent as SQL NULL; a null $match_type means
 * the third argument is left out of the call entirely.
 *
 * @dataProvider regexpLikeCases
 */
public function testRegexpLike( $expr, $pattern, $match_type, $expected ) {
	$sql_args = array();
	foreach ( array( $expr, $pattern ) as $value ) {
		$sql_args[] = null === $value ? 'NULL' : "'" . addslashes( $value ) . "'";
	}
	if ( null !== $match_type ) {
		$sql_args[] = "'" . addslashes( $match_type ) . "'";
	}

	$this->assertQuery( 'SELECT REGEXP_LIKE(' . implode( ', ', $sql_args ) . ') AS r' );

	$rows = $this->engine->get_query_results();
	$this->assertSame( $expected, $rows[0]->r );
}
120+
121+
/**
 * Data sets for testRegexpLike().
 *
 * Each entry is array( expr, pattern, match_type, expected ). A null
 * match_type means the argument is omitted; expected is the string result
 * reported by the driver ('1' / '0') or null.
 */
public static function regexpLikeCases() {
	return array(
		// Basic matching.
		'match'              => array( 'abc', 'abc', null, '1' ),
		'no match'           => array( 'xbc', 'abc', null, '0' ),
		'quantifier match'   => array( 'abbbbc', 'ab*bc', null, '1' ),

		// Default is case-insensitive (matches existing REGEXP operator behavior).
		'default i'          => array( 'ABC', 'abc', null, '1' ),

		// Explicit flags.
		'explicit c'         => array( 'ABC', 'abc', 'c', '0' ),
		'explicit i'         => array( 'ABC', 'abc', 'i', '1' ),

		// Later flag wins: the label names the flag that takes effect.
		'ci -> i'            => array( 'ABC', 'abc', 'ci', '1' ),
		'ic -> c'            => array( 'ABC', 'abc', 'ic', '0' ),

		// Multiline.
		'm off: ^ anchored'  => array( "abc\ndef", '^def', null, '0' ),
		'm on: ^ per line'   => array( "abc\ndef", '^def', 'm', '1' ),

		// Dot matches newline.
		"n off: . no \\n"    => array( "a\nb", 'a.b', null, '0' ),
		"n on: . matches \\n" => array( "a\nb", 'a.b', 'n', '1' ),

		// NULL propagation.
		'null expr'          => array( null, 'abc', null, null ),
		'null pattern'       => array( 'abc', null, null, null ),
	);
}
152+
153+
/**
 * An explicit SQL NULL as match_type makes REGEXP_LIKE() return NULL.
 */
public function testRegexpLikeNullMatchType() {
	$sql = "SELECT REGEXP_LIKE('abc', 'abc', NULL) AS r";
	$this->assertQuery( $sql );

	$rows = $this->engine->get_query_results();
	$this->assertNull( $rows[0]->r );
}
157+
158+
/**
 * An unknown match_type character is reported with a dedicated error.
 */
public function testRegexpLikeInvalidFlag() {
	$sql     = "SELECT REGEXP_LIKE('abc', 'a', 'x')";
	$message = 'Invalid match_type flag: x.';

	$this->assertQueryError( $sql, $message );
}
164+
165+
/**
 * A pattern that PCRE cannot compile is reported together with the pattern text.
 */
public function testRegexpLikeInvalidPattern() {
	$sql     = "SELECT REGEXP_LIKE('abc', '(abc')";
	$message = 'Invalid regular expression: (abc.';

	$this->assertQueryError( $sql, $message );
}
171+
172+
/**
 * A four-character match_type is accepted and the last case flag wins.
 */
public function testRegexpMatchTypeMultipleFlags() {
	// In 'cimn' the 'i' comes after the 'c', so matching is
	// case-insensitive; 'm' and 'n' are accepted alongside it.
	$sql = "SELECT REGEXP_LIKE('ABC', 'abc', 'cimn') AS r";
	$this->assertQuery( $sql );

	$rows = $this->engine->get_query_results();
	$this->assertSame( '1', $rows[0]->r );
}
178+
179+
/**
 * The 'u' match_type flag is accepted and does not change the match result.
 */
public function testRegexpMatchTypeUnixFlagNoOp() {
	// 'u' exists for source compatibility only; PCRE's default already
	// matches MySQL's 'u' semantics, so the flag has no effect.
	$sql = "SELECT REGEXP_LIKE('abc', 'abc', 'u') AS r";
	$this->assertQuery( $sql );

	$rows = $this->engine->get_query_results();
	$this->assertSame( '1', $rows[0]->r );
}
185+
186+
/**
 * An empty match_type string falls back to the case-insensitive default.
 */
public function testRegexpMatchTypeEmpty() {
	$sql = "SELECT REGEXP_LIKE('ABC', 'abc', '') AS r";
	$this->assertQuery( $sql );

	$rows = $this->engine->get_query_results();
	$this->assertSame( '1', $rows[0]->r );
}
191+
192+
/**
 * A subject containing invalid UTF-8 produces the dedicated UTF-8 error.
 */
public function testRegexpInvalidUtf8() {
	// A raw 0xFF byte is never valid UTF-8, so the /u modifier rejects it
	// and regexp_fail() translates that into its own message.
	$sql     = "SELECT REGEXP_LIKE(CAST(X'FF' AS CHAR), 'a')";
	$message = 'Invalid UTF-8 data in regular expression input.';

	$this->assertQueryError( $sql, $message );
}
200+
201+
/**
 * A pattern that runs into PCRE's limits is reported as a limit error.
 */
public function testRegexpBacktrackLimit() {
	// Classic exponential-backtracking pattern that exceeds PCRE's default
	// backtrack limit, which exercises the PREG_BACKTRACK_LIMIT_ERROR
	// branch of regexp_fail().
	$subject = str_repeat( 'a', 30 );
	$sql     = sprintf( "SELECT REGEXP_LIKE('%s', '^(a?){30}a{30}\$')", $subject );

	$this->assertQueryError(
		$sql,
		'Regular expression evaluation exceeded internal limits.'
	);
}
211+
212+
/**
 * The legacy REGEXP operator must keep working alongside REGEXP_LIKE().
 */
public function testRegexpLegacyOperatorRegression() {
	// Query => expected value of column r, checked in this order.
	$expectations = array(
		"SELECT 'abc' REGEXP 'ABC' AS r" => '1',
		"SELECT 'abc' REGEXP 'xyz' AS r" => '0',
	);

	foreach ( $expectations as $sql => $expected ) {
		$this->assertQuery( $sql );
		$rows = $this->engine->get_query_results();
		$this->assertSame( $expected, $rows[0]->r );
	}
}
219+
107220
public function testInsertDateNow() {
108221
$this->assertQuery(
109222
"INSERT INTO _dates (option_name, option_value) VALUES ('first', now());"

0 commit comments

Comments
 (0)