Skip to content

Commit beb1aa5

Browse files
committed
Add REGEXP_INSTR() UDF
Returns the 1-based character position of the Nth match (or 0 if none), with return_option controlling whether to report the match start or the position one past its end. Adds the small regexp_byte_offset_to_char_index() helper that converts a byte offset returned by PCRE into a UTF-8 character index. `pos` greater than char_count is rejected even when SUBSTR / REPLACE allow it, matching MySQL's stricter validation for INSTR. Negative or zero `occurrence` is clamped to 1, also matching MySQL. return_option must be 0 (start) or 1 (one past end); anything else raises "Incorrect arguments to regexp_instr: return_option must be 1 or 0." (matching MySQL's wording). The check runs before the occurrence clamp so the message is consistent. Tests cover the data-driven happy path, NULL propagation, the occurrence clamp, the straddling-match boundary, multi-byte return_option=1, the return_option validation (including under otherwise no-op occurrences), and a lookbehind whose context spans pos.
1 parent 2f619da commit beb1aa5

2 files changed

Lines changed: 179 additions & 0 deletions

File tree

packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ public static function register_for( $pdo ): self {
7474
'regexp_like' => 'regexp_like',
7575
'regexp_replace' => 'regexp_replace',
7676
'regexp_substr' => 'regexp_substr',
77+
'regexp_instr' => 'regexp_instr',
7778
'field' => 'field',
7879
'log' => 'log',
7980
'least' => 'least',
@@ -672,6 +673,59 @@ public function regexp_substr( $expr, $pattern, $pos = 1, $occurrence = 1, $matc
672673
return $matches[ $n - 1 ][0][0];
673674
}
674675

676+
/**
 * Emulates the MySQL REGEXP_INSTR() function.
 *
 * Finds the requested occurrence of a pattern in the subject, searching
 * from the given character position, and reports either where that match
 * begins or the position immediately after it.
 *
 * An `occurrence` below 1 is treated as 1, matching MySQL. A `pos` greater
 * than the subject's character count is rejected (unlike SUBSTR and REPLACE).
 *
 * @param string|null $expr          Subject string.
 * @param string|null $pattern       Regex pattern.
 * @param int|null    $pos           1-based character position to start matching.
 * @param int|null    $occurrence    Which match to locate (1-based; <= 0 clamps to 1).
 * @param int|null    $return_option 0 = start of match (default), 1 = one past end.
 * @param string|null $match_type    MySQL match_type flags.
 *
 * @throws Exception If the pattern is invalid, pos is out of range, or
 *                   return_option is not 0 or 1.
 * @return int|null 1-based character position, 0 if no match, NULL on NULL input.
 */
public function regexp_instr( $expr, $pattern, $pos = 1, $occurrence = 1, $return_option = 0, $match_type = '' ) {
	// A NULL in any argument makes the whole result NULL.
	foreach ( array( $expr, $pattern, $pos, $occurrence, $return_option, $match_type ) as $argument ) {
		if ( null === $argument ) {
			return null;
		}
	}

	// return_option is checked ahead of the occurrence clamp so the error
	// is raised no matter which occurrence was requested.
	$report_end = (int) $return_option;
	if ( $report_end < 0 || $report_end > 1 ) {
		throw new Exception( 'Incorrect arguments to regexp_instr: return_option must be 1 or 0.' );
	}

	// MySQL clamps occurrence <= 0 to 1.
	$wanted = (int) $occurrence;
	if ( $wanted < 1 ) {
		$wanted = 1;
	}

	$regex        = $this->regexp_compile( $pattern, $match_type );
	$search_start = $this->regexp_char_to_byte_offset( $expr, (int) $pos );
	$found        = $this->regexp_find_matches( $regex, $expr, $search_start, $wanted );

	if ( false === $found ) {
		$this->regexp_fail( $pattern );
	}

	// Fewer matches than requested means the occurrence does not exist.
	if ( count( $found ) < $wanted ) {
		return 0;
	}

	// Each match entry holds the matched text and its byte offset.
	$whole_match   = $found[ $wanted - 1 ][0];
	$hit_text      = $whole_match[0];
	$hit_offset    = $whole_match[1];
	$byte_position = ( 1 === $report_end ) ? $hit_offset + strlen( $hit_text ) : $hit_offset;

	// Convert the byte position to a character index, then make it 1-based.
	return 1 + $this->regexp_byte_offset_to_char_index( $expr, $byte_position );
}
728+
675729
/**
676730
* Method to emulate MySQL FIELD() function.
677731
*
@@ -1170,6 +1224,29 @@ private function regexp_char_to_byte_offset( $s, $char_pos, $allow_past_end = fa
11701224
throw new Exception( 'Index out of bounds in regular expression search.' );
11711225
}
11721226

1227+
/**
 * Translate a byte offset inside a UTF-8 string into a 0-based character index.
 *
 * The result is the number of bytes before the offset that are not UTF-8
 * continuation bytes (bit pattern 10xxxxxx), i.e. the number of code points
 * that start before the offset. The offset is expected to sit on a code
 * point boundary, as offsets returned by PCRE do. An offset larger than the
 * string length is clamped to the string length as a defensive measure.
 *
 * @param string $s           UTF-8 string.
 * @param int    $byte_offset Byte offset on a code point boundary.
 *
 * @return int 0-based character index.
 */
private function regexp_byte_offset_to_char_index( $s, $byte_offset ) {
	$limit = min( $byte_offset, strlen( $s ) );
	$index = 0;

	for ( $at = 0; $at < $limit; ++$at ) {
		// Every byte except a continuation byte opens a new character.
		$top_bits = ord( $s[ $at ] ) & 0xC0;
		$index   += ( 0x80 === $top_bits ) ? 0 : 1;
	}

	return $index;
}
1249+
11731250
/**
11741251
* Expand a MySQL/ICU-style replacement template.
11751252
*

packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,108 @@ public function testRegexpSubstrLookbehindAcrossPos() {
510510
$this->assertSame( 'b', $this->engine->get_query_results()[0]->r );
511511
}
512512

513+
/**
 * Selects each provided REGEXP_INSTR() expression and compares the result.
 *
 * @dataProvider regexpInstrCases
 *
 * @param string $sql      SQL expression to select as column `r`.
 * @param string $expected Expected value of column `r`.
 */
public function testRegexpInstr( $sql, $expected ) {
	$query = "SELECT {$sql} AS r";
	$this->assertQuery( $query );

	$rows = $this->engine->get_query_results();
	$this->assertSame( $expected, $rows[0]->r );
}
520+
521+
/**
 * Data sets for testRegexpInstr().
 *
 * Each labelled entry is a pair: the SQL expression to evaluate and the
 * expected result, written as a string.
 *
 * @return array[] Labelled list of ( SQL expression, expected result ) pairs.
 */
public static function regexpInstrCases() {
	$cases = array();

	$cases['basic']                 = array( "REGEXP_INSTR('dog cat dog', 'dog')", '1' );
	$cases['no match']              = array( "REGEXP_INSTR('abc', 'xyz')", '0' );
	$cases['second match']          = array( "REGEXP_INSTR('dog cat dog', 'dog', 1, 2)", '9' );
	$cases['pos skips first match'] = array( "REGEXP_INSTR('dog cat dog', 'dog', 5)", '9' );
	$cases['return_option=1 (end)'] = array( "REGEXP_INSTR('dog cat dog', 'dog', 1, 1, 1)", '4' );
	$cases['match_type c miss']     = array( "REGEXP_INSTR('DOG', 'dog', 1, 1, 0, 'c')", '0' );
	$cases['multibyte position']    = array( "REGEXP_INSTR('café123', '[0-9]+')", '5' );

	return $cases;
}
532+
533+
/**
 * REGEXP_INSTR() yields NULL when the subject expression is NULL.
 */
public function testRegexpInstrNullExpr() {
	$query = "SELECT REGEXP_INSTR(NULL, 'abc') AS r";
	$this->assertQuery( $query );

	$rows = $this->engine->get_query_results();
	$this->assertNull( $rows[0]->r );
}
537+
538+
/**
 * REGEXP_INSTR() yields NULL when the pattern is NULL.
 */
public function testRegexpInstrNullPattern() {
	$query = "SELECT REGEXP_INSTR('abc', NULL) AS r";
	$this->assertQuery( $query );

	$rows = $this->engine->get_query_results();
	$this->assertNull( $rows[0]->r );
}
542+
543+
/**
 * A start position of 0 is reported as out of bounds.
 */
public function testRegexpInstrPosZero() {
	$query            = "SELECT REGEXP_INSTR('abc', 'a', 0)";
	$expected_message = 'Index out of bounds in regular expression search.';

	$this->assertQueryError( $query, $expected_message );
}
549+
550+
/**
 * A start position past the end of the subject is reported as out of bounds.
 */
public function testRegexpInstrPosOutOfRange() {
	$query            = "SELECT REGEXP_INSTR('abc', 'a', 10)";
	$expected_message = 'Index out of bounds in regular expression search.';

	$this->assertQueryError( $query, $expected_message );
}
556+
557+
/**
 * A return_option other than 0 or 1 raises the argument error.
 */
public function testRegexpInstrInvalidReturnOption() {
	$query            = "SELECT REGEXP_INSTR('abc', 'a', 1, 1, 2)";
	$expected_message = 'Incorrect arguments to regexp_instr: return_option must be 1 or 0.';

	$this->assertQueryError( $query, $expected_message );
}
563+
564+
/**
 * An invalid return_option still errors when occurrence is 0.
 *
 * The return_option check has to run before occurrence is clamped, so the
 * error is raised consistently whatever occurrence value is supplied.
 */
public function testRegexpInstrInvalidReturnOptionWithOccurrenceZero() {
	$query            = "SELECT REGEXP_INSTR('abc', 'a', 1, 0, 99)";
	$expected_message = 'Incorrect arguments to regexp_instr: return_option must be 1 or 0.';

	$this->assertQueryError( $query, $expected_message );
}
572+
573+
/**
 * A pattern with an unbalanced parenthesis is reported as invalid.
 */
public function testRegexpInstrInvalidPattern() {
	$query            = "SELECT REGEXP_INSTR('abc', '(abc')";
	$expected_message = 'Invalid regular expression: (abc.';

	$this->assertQueryError( $query, $expected_message );
}
579+
580+
/**
 * An unsupported match_type flag is reported as invalid.
 */
public function testRegexpInstrInvalidFlag() {
	$query            = "SELECT REGEXP_INSTR('abc', 'a', 1, 1, 0, 'x')";
	$expected_message = 'Invalid match_type flag: x.';

	$this->assertQueryError( $query, $expected_message );
}
586+
587+
/**
 * Zero and negative occurrence values behave like occurrence = 1.
 *
 * MySQL clamps occurrence <= 0 to 1, so both queries find the first 'b'.
 */
public function testRegexpInstrOccurrenceClampedToOne() {
	foreach ( array( '0', '-5' ) as $occurrence ) {
		$this->assertQuery( "SELECT REGEXP_INSTR('abcabc', 'b', 1, {$occurrence}) AS r" );

		$rows = $this->engine->get_query_results();
		$this->assertSame( '2', $rows[0]->r );
	}
}
594+
595+
/**
 * A match that would begin before pos is not reported at its earlier start.
 *
 * In 'abc123def' the digit run begins at character 4, which is before
 * pos = 5. The reported position is 5, the first match at or after pos.
 */
public function testRegexpInstrStraddlingMatch() {
	$query = "SELECT REGEXP_INSTR('abc123def', '[0-9]+', 5) AS r";
	$this->assertQuery( $query );

	$rows = $this->engine->get_query_results();
	$this->assertSame( '5', $rows[0]->r );
}
601+
602+
/**
 * return_option = 1 counts characters, not bytes, for a multibyte match.
 *
 * 'é' occupies 2 bytes. In 'aéb' the characters sit at positions a=1, é=2,
 * b=3, so the position one past the 'é' match is 3.
 */
public function testRegexpInstrMultibyteReturnOptionEnd() {
	$query = "SELECT REGEXP_INSTR('aéb', 'é', 1, 1, 1) AS r";
	$this->assertQuery( $query );

	$rows = $this->engine->get_query_results();
	$this->assertSame( '3', $rows[0]->r );
}
608+
609+
/**
 * A lookbehind can inspect text that lies before pos.
 *
 * The full subject is kept, so '(?<=a)' still sees the 'a' at character 1
 * even though the search starts at character 2.
 */
public function testRegexpInstrLookbehindAcrossPos() {
	$query = "SELECT REGEXP_INSTR('ab', '(?<=a)b', 2) AS r";
	$this->assertQuery( $query );

	$rows = $this->engine->get_query_results();
	$this->assertSame( '2', $rows[0]->r );
}
614+
513615
public function testInsertDateNow() {
514616
$this->assertQuery(
515617
"INSERT INTO _dates (option_name, option_value) VALUES ('first', now());"

0 commit comments

Comments
 (0)