Skip to content

Commit 2f619da

Browse files
committed
Add REGEXP_SUBSTR() UDF
Returns the Nth matched substring at or after a given character position, or NULL if no match. Reuses regexp_compile(), regexp_char_to_byte_offset() (with $allow_past_end so pos = char_count + 1 yields NULL), regexp_find_matches(), and regexp_fail() introduced with REGEXP_REPLACE. Negative or zero `occurrence` is clamped to 1, matching MySQL. Tests cover the data-driven happy path, NULL propagation, the occurrence clamp, the pos = char_count + 1 / pos > char_count + 1 boundary, multi-byte matches, invalid patterns, invalid flags, and a lookbehind whose context spans pos.
1 parent 14700b6 commit 2f619da

2 files changed

Lines changed: 133 additions & 0 deletions

File tree

packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ public static function register_for( $pdo ): self {
7373
'regexp' => 'regexp',
7474
'regexp_like' => 'regexp_like',
7575
'regexp_replace' => 'regexp_replace',
76+
'regexp_substr' => 'regexp_substr',
7677
'field' => 'field',
7778
'log' => 'log',
7879
'least' => 'least',
@@ -632,6 +633,45 @@ public function regexp_replace( $expr, $pattern, $replacement, $pos = 1, $occurr
632633
return $out;
633634
}
634635

636+
/**
 * Emulates the MySQL REGEXP_SUBSTR() function.
 *
 * Looks for the requested occurrence of `$pattern` in `$expr`, starting at
 * the 1-based character position `$pos`, and returns the matched text.
 *
 * An `occurrence` below 1 is treated as 1, matching MySQL. A `pos` equal to
 * `char_count + 1` is accepted and simply produces no match (NULL).
 *
 * @param string|null $expr       Subject string.
 * @param string|null $pattern    Regex pattern.
 * @param int|null    $pos        1-based character position to start matching.
 * @param int|null    $occurrence Which match to return (1-based; <= 0 clamps to 1).
 * @param string|null $match_type MySQL match_type flags.
 *
 * @throws Exception If the pattern is not a valid regular expression, or pos is out of range.
 * @return string|null The matched substring, NULL if no match or any argument is NULL.
 */
public function regexp_substr( $expr, $pattern, $pos = 1, $occurrence = 1, $match_type = '' ) {
	// A NULL in any argument makes the whole result NULL.
	foreach ( array( $expr, $pattern, $pos, $occurrence, $match_type ) as $argument ) {
		if ( null === $argument ) {
			return null;
		}
	}

	// MySQL clamps occurrence <= 0 to 1.
	$wanted = (int) $occurrence;
	if ( $wanted < 1 ) {
		$wanted = 1;
	}

	// The pattern is compiled before the position is resolved; keep this
	// order so that an error from either helper surfaces in the same sequence.
	$regex  = $this->regexp_compile( $pattern, $match_type );
	$offset = $this->regexp_char_to_byte_offset( $expr, (int) $pos, true );

	$found = $this->regexp_find_matches( $regex, $expr, $offset, $wanted );
	if ( false === $found ) {
		$this->regexp_fail( $pattern );
	}

	// NOTE(review): each entry presumably holds [text, offset] pairs per
	// capture group, so [0][0] is the whole-match text. Confirm against
	// regexp_find_matches().
	return count( $found ) >= $wanted ? $found[ $wanted - 1 ][0][0] : null;
}
674+
635675
/**
636676
* Method to emulate MySQL FIELD() function.
637677
*

packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,99 @@ public function testRegexpReplaceEmptyReplacement() {
417417
$this->assertSame( 'ac', $this->engine->get_query_results()[0]->r );
418418
}
419419

420+
/**
 * Runs one REGEXP_SUBSTR() expression from the data provider and compares
 * the value of the `r` column in the first result row with the expectation.
 *
 * @dataProvider regexpSubstrCases
 *
 * @param string      $sql      SQL expression to select.
 * @param string|null $expected Expected value of the `r` column.
 */
public function testRegexpSubstr( $sql, $expected ) {
	$query = "SELECT $sql AS r";
	$this->assertQuery( $query );

	$rows = $this->engine->get_query_results();
	$this->assertSame( $expected, $rows[0]->r );
}
427+
428+
/**
 * Data provider for testRegexpSubstr().
 *
 * Each dataset is a pair: the SQL expression to select, and the value the
 * query is expected to return (NULL when no substring is expected).
 *
 * @return array Named datasets of array( sql, expected ).
 */
public static function regexpSubstrCases() {
	$cases = array();

	// Plain matching.
	$cases['basic match'] = array( "REGEXP_SUBSTR('abc123def', '[0-9]+')", '123' );
	$cases['no match']    = array( "REGEXP_SUBSTR('abcdef', '[0-9]+')", null );

	// Start position and occurrence arguments.
	$cases['pos']                   = array( "REGEXP_SUBSTR('abc123def456', '[0-9]+', 5)", '23' );
	$cases['pos with occurrence=2'] = array( "REGEXP_SUBSTR('abc123def456', '[0-9]+', 5, 2)", '456' );
	$cases['occurrence']            = array( "REGEXP_SUBSTR('a1 b2 c3', '[a-z][0-9]', 1, 2)", 'b2' );
	$cases['occurrence too high']   = array( "REGEXP_SUBSTR('a1 b2', '[a-z][0-9]', 1, 5)", null );

	// Match-type flag and multi-byte subject.
	$cases['match_type c']    = array( "REGEXP_SUBSTR('ABC', 'abc', 1, 1, 'c')", null );
	$cases['multibyte match'] = array( "REGEXP_SUBSTR('café', 'é')", 'é' );

	// NULL in the subject or the pattern.
	$cases['null expr']    = array( "REGEXP_SUBSTR(NULL, 'abc')", null );
	$cases['null pattern'] = array( "REGEXP_SUBSTR('abc', NULL)", null );

	return $cases;
}
442+
443+
/**
 * A NULL `pos` argument is expected to give a NULL result.
 */
public function testRegexpSubstrNullPos() {
	$this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', NULL) AS r" );

	$rows = $this->engine->get_query_results();
	$this->assertNull( $rows[0]->r );
}
447+
448+
/**
 * A NULL `occurrence` argument is expected to give a NULL result.
 */
public function testRegexpSubstrNullOccurrence() {
	$this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', 1, NULL) AS r" );

	$rows = $this->engine->get_query_results();
	$this->assertNull( $rows[0]->r );
}
452+
453+
/**
 * A NULL `match_type` argument is expected to give a NULL result.
 */
public function testRegexpSubstrNullMatchType() {
	$this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', 1, 1, NULL) AS r" );

	$rows = $this->engine->get_query_results();
	$this->assertNull( $rows[0]->r );
}
457+
458+
/**
 * Occurrence values of zero and below are expected to behave like 1.
 */
public function testRegexpSubstrOccurrenceClampedToOne() {
	// MySQL clamps occurrence <= 0 to 1, so both queries should return the first match.
	foreach ( array( '0', '-5' ) as $occurrence ) {
		$this->assertQuery( "SELECT REGEXP_SUBSTR('abcabc', 'b', 1, $occurrence) AS r" );

		$rows = $this->engine->get_query_results();
		$this->assertSame( 'b', $rows[0]->r );
	}
}
465+
466+
/**
 * A `pos` far past the end of the subject is expected to fail with the
 * out-of-bounds error.
 */
public function testRegexpSubstrPosOutOfRange() {
	$query   = "SELECT REGEXP_SUBSTR('abc', 'a', 10)";
	$message = 'Index out of bounds in regular expression search.';

	$this->assertQueryError( $query, $message );
}
472+
473+
/**
 * A `pos` of exactly char_count + 1 is expected to succeed and return NULL.
 */
public function testRegexpSubstrPosAtEnd() {
	// 'abc' has three characters, so position 4 is one past the end,
	// which MySQL still allows for REGEXP_SUBSTR().
	$this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', 4) AS r" );

	$rows = $this->engine->get_query_results();
	$this->assertNull( $rows[0]->r );
}
478+
479+
/**
 * A `pos` of char_count + 2 is expected to fail with the out-of-bounds error.
 */
public function testRegexpSubstrPosBeyondEnd() {
	$query   = "SELECT REGEXP_SUBSTR('abc', 'a', 5)";
	$message = 'Index out of bounds in regular expression search.';

	$this->assertQueryError( $query, $message );
}
485+
486+
/**
 * A `pos` of zero is expected to fail with the out-of-bounds error.
 */
public function testRegexpSubstrPosZero() {
	$query   = "SELECT REGEXP_SUBSTR('abc', 'a', 0)";
	$message = 'Index out of bounds in regular expression search.';

	$this->assertQueryError( $query, $message );
}
492+
493+
/**
 * A pattern with an unbalanced parenthesis is expected to fail with the
 * invalid-regular-expression error.
 */
public function testRegexpSubstrInvalidPattern() {
	$query   = "SELECT REGEXP_SUBSTR('abc', '(abc')";
	$message = 'Invalid regular expression: (abc.';

	$this->assertQueryError( $query, $message );
}
499+
500+
/**
 * The match_type flag 'x' is expected to fail with the invalid-flag error.
 */
public function testRegexpSubstrInvalidFlag() {
	$query   = "SELECT REGEXP_SUBSTR('abc', 'a', 1, 1, 'x')";
	$message = 'Invalid match_type flag: x.';

	$this->assertQueryError( $query, $message );
}
506+
507+
/**
 * A lookbehind is expected to match against text located before `pos`.
 */
public function testRegexpSubstrLookbehindAcrossPos() {
	// Matching starts at 'b' (pos 2), yet (?<=a) must still see the 'a'
	// in front of it; this only works when the full subject is kept.
	$this->assertQuery( "SELECT REGEXP_SUBSTR('ab', '(?<=a)b', 2) AS r" );

	$rows = $this->engine->get_query_results();
	$this->assertSame( 'b', $rows[0]->r );
}
512+
420513
public function testInsertDateNow() {
421514
$this->assertQuery(
422515
"INSERT INTO _dates (option_name, option_value) VALUES ('first', now());"

0 commit comments

Comments
 (0)