Skip to content

Commit 14700b6

Browse files
committed
Add REGEXP_REPLACE() UDF
Implements MySQL REGEXP_REPLACE(expr, pattern, replacement [, pos [, occurrence [, match_type]]]) with three new private helpers: - regexp_char_to_byte_offset() converts a 1-based character pos into a byte offset, accepting char_count + 1 for the "start at end" case that MySQL allows for REPLACE / SUBSTR. - regexp_find_matches() walks the subject with preg_match and its offset argument so that lookbehind assertions can see bytes before pos. Skips UTF-8 continuation bytes after zero-width matches. - regexp_expand_replacement() implements MySQL/ICU replacement grammar: "$N" backreferences (with "$0" = full match and longest valid digit-prefix wins), "\X" emits X literally, "${N}" is rejected as invalid, and a trailing lone backslash is dropped. Errors mirror MySQL's: "A capture group has an invalid name." (3887) and "Index out of bounds in regular expression search." (3686). REGEXP_REPLACE rebuilds the result by walking collected matches, emitting the in-between bytes verbatim and substituting only the targeted occurrence (or all when occurrence = 0). Negative occurrence is clamped to 1 to match MySQL. Tests cover the data-driven happy path, NULL propagation, every documented backreference form, lookbehind across pos, zero-width matches, the pos = char_count + 1 edge, the negative-occurrence clamp, and the ICU error branches.
1 parent 60e7764 commit 14700b6

2 files changed

Lines changed: 425 additions & 0 deletions

File tree

packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ public static function register_for( $pdo ): self {
7272
'if' => '_if',
7373
'regexp' => 'regexp',
7474
'regexp_like' => 'regexp_like',
75+
'regexp_replace' => 'regexp_replace',
7576
'field' => 'field',
7677
'log' => 'log',
7778
'least' => 'least',
@@ -563,6 +564,74 @@ function () use ( $compiled, $expr ) {
563564
return $result;
564565
}
565566

567+
/**
 * Method to emulate MySQL REGEXP_REPLACE() function.
 *
 * Uses MySQL/ICU replacement grammar: "$N" backreferences ("$0" is the
 * full match), "\X" emits X (drops the backslash), "${N}" is rejected.
 * Negative `occurrence` is clamped to 1; `pos = char_count + 1` is
 * accepted and returns the subject unchanged.
 *
 * Capture groups are handed to the replacement expander strictly by their
 * numeric index. preg_match() reports a named group twice (once under its
 * name, once under its number), so name-keyed entries are skipped to keep
 * "$N" aligned with the Nth opening parenthesis of the pattern.
 *
 * NOTE(review): preg_match() omits trailing groups that did not take part
 * in a match, so "$N" for such a group is currently reported as out of
 * bounds rather than expanding to an empty string. Confirm against MySQL.
 *
 * @param string|null $expr        Subject string.
 * @param string|null $pattern     Regex pattern.
 * @param string|null $replacement Replacement string (supports $N backreferences).
 * @param int|null    $pos         1-based character position to start matching.
 * @param int|null    $occurrence  Nth match to replace; 0 = all matches.
 * @param string|null $match_type  MySQL match_type flags.
 *
 * @throws Exception If the pattern is not a valid regular expression, or pos is out of range.
 * @return string|null The replaced string, or NULL if any argument is NULL.
 */
public function regexp_replace( $expr, $pattern, $replacement, $pos = 1, $occurrence = 0, $match_type = '' ) {
	// Any NULL argument makes the whole result NULL.
	if (
		null === $expr || null === $pattern || null === $replacement
		|| null === $pos || null === $occurrence || null === $match_type
	) {
		return null;
	}

	$compiled   = $this->regexp_compile( $pattern, $match_type );
	$byte_start = $this->regexp_char_to_byte_offset( $expr, (int) $pos, true );
	$n          = (int) $occurrence;

	// 0 means replace all; negative occurrences are clamped to 1 (MySQL behavior).
	if ( $n < 0 ) {
		$n = 1;
	}

	// When a single occurrence is targeted there is no need to search past it.
	$matches = $this->regexp_find_matches( $compiled, $expr, $byte_start, $n > 0 ? $n : -1 );
	if ( false === $matches ) {
		$this->regexp_fail( $pattern );
	}

	// Rebuild: bytes before pos are untouched, then walk the collected
	// matches, substituting only the targeted occurrence (or all when N=0).
	$out = substr( $expr, 0, $byte_start );
	$cur = $byte_start;
	foreach ( $matches as $i => $m ) {
		$match_start  = $m[0][1];
		$match_length = strlen( $m[0][0] );

		// Copy the bytes between the previous match and this one verbatim.
		$out .= substr( $expr, $cur, $match_start - $cur );

		$replace_this = 0 === $n || ( $i + 1 ) === $n;
		if ( $replace_this ) {
			$groups = array();
			foreach ( $m as $key => $g ) {
				// Skip the name-keyed duplicates of named groups; only the
				// numeric entries map one-to-one onto "$N" references.
				if ( is_int( $key ) ) {
					$groups[ $key ] = $g[0];
				}
			}
			$out .= $this->regexp_expand_replacement( $replacement, $groups );
		} else {
			// Not the targeted occurrence: keep the matched text as is.
			$out .= $m[0][0];
		}

		$cur = $match_start + $match_length;
	}
	// Tail after the last match (or the whole remainder when nothing matched).
	$out .= substr( $expr, $cur );

	return $out;
}
634+
566635
/**
567636
* Method to emulate MySQL FIELD() function.
568637
*
@@ -1024,6 +1093,162 @@ private function regexp_run( $op ) {
10241093
}
10251094
}
10261095

1096+
/**
 * Translate a 1-based character position into a byte offset within a UTF-8 string.
 *
 * Characters are counted by their lead bytes: every byte that is not a
 * UTF-8 continuation byte (10xxxxxx) starts a new character.
 *
 * @param string $s              UTF-8 string.
 * @param int    $char_pos       1-based character position.
 * @param bool   $allow_past_end When true, a position of char_count + 1 is
 *                               accepted and maps to strlen( $s ). MySQL
 *                               allows this for REGEXP_REPLACE and
 *                               REGEXP_SUBSTR but not for REGEXP_INSTR.
 *
 * @throws Exception If $char_pos is out of range.
 * @return int Byte offset into $s.
 */
private function regexp_char_to_byte_offset( $s, $char_pos, $allow_past_end = false ) {
	$out_of_bounds = 'Index out of bounds in regular expression search.';

	if ( $char_pos < 1 ) {
		throw new Exception( $out_of_bounds );
	}

	// Position 1 is always byte 0, even for an empty string.
	if ( 1 === $char_pos ) {
		return 0;
	}

	$total  = strlen( $s );
	$seen   = 0; // Number of characters whose lead byte has been visited.
	$offset = 0;
	while ( $offset < $total ) {
		$is_continuation = 0x80 === ( ord( $s[ $offset ] ) & 0xC0 );
		if ( ! $is_continuation ) {
			++$seen;
			if ( $seen === $char_pos ) {
				return $offset;
			}
		}
		++$offset;
	}

	// The position just past the last character is only valid on request.
	if ( $allow_past_end && $seen + 1 === $char_pos ) {
		return $total;
	}

	throw new Exception( $out_of_bounds );
}
1132+
1133+
/**
 * Expand a MySQL/ICU-style replacement template against one match.
 *
 * Grammar handled here:
 *  - "\X" for any X emits X without the backslash (so "\\" yields "\").
 *  - A lone backslash at the very end of the template is dropped.
 *  - "$N" (one or more digits) emits capture group N. The longest run of
 *    digits that still names an existing group is consumed; digits after
 *    that are ordinary literal text.
 *  - "$" not followed by a digit is an error (MySQL ERROR 3887). This
 *    includes the "${N}" form, which is not supported.
 *  - "$N" where even the first digit exceeds the highest group index is an
 *    error (MySQL ERROR 3686).
 *
 * @param string $replacement The replacement template.
 * @param array  $groups      Capture-group texts, with index 0 = full match.
 *
 * @throws Exception On an invalid "$..." reference.
 * @return string The expanded replacement.
 */
private function regexp_expand_replacement( $replacement, $groups ) {
	$highest  = count( $groups ) - 1;
	$length   = strlen( $replacement );
	$expanded = '';
	$at       = 0;

	while ( $at < $length ) {
		$char = $replacement[ $at ];

		// Ordinary byte: copy through.
		if ( '$' !== $char && '\\' !== $char ) {
			$expanded .= $char;
			++$at;
			continue;
		}

		// Backslash escape: emit the following byte, if there is one.
		if ( '\\' === $char ) {
			if ( $at + 1 < $length ) {
				$expanded .= $replacement[ $at + 1 ];
			}
			$at += 2;
			continue;
		}

		// "$" reference: it must be followed by at least one digit.
		$first_digit = $at + 1;
		$run         = $first_digit < $length
			? strspn( $replacement, '0123456789', $first_digit )
			: 0;
		if ( 0 === $run ) {
			throw new Exception( 'A capture group has an invalid name.' );
		}

		// Shrink the digit run until it names an existing group; whatever
		// is cut off stays in the template and is emitted literally later.
		$take = $run;
		while ( $take > 0 && (int) substr( $replacement, $first_digit, $take ) > $highest ) {
			--$take;
		}
		if ( 0 === $take ) {
			throw new Exception( 'Index out of bounds in regular expression search.' );
		}

		$expanded .= $groups[ (int) substr( $replacement, $first_digit, $take ) ];
		$at        = $first_digit + $take;
	}

	return $expanded;
}
1201+
1202+
/**
 * Collect successive matches of a compiled pattern, starting at a byte offset.
 *
 * Each entry of the returned list is a preg_match() result in
 * PREG_OFFSET_CAPTURE format. The full subject is always passed to
 * preg_match() together with an offset (instead of a sliced substring),
 * so lookbehind assertions can still inspect the bytes before $byte_start.
 *
 * @param string $compiled   PCRE-wrapped pattern.
 * @param string $subject    Full subject string.
 * @param int    $byte_start Byte offset at which matching begins.
 * @param int    $limit      Max matches to collect; -1 for unlimited.
 *
 * @return array|false List of match arrays, or false on preg error.
 */
private function regexp_find_matches( $compiled, $subject, $byte_start, $limit ) {
	$scan = function () use ( $compiled, $subject, $byte_start, $limit ) {
		$found     = array();
		$cursor    = $byte_start;
		$end       = strlen( $subject );
		$unlimited = -1 === $limit;

		for ( ;; ) {
			if ( ! $unlimited && count( $found ) >= $limit ) {
				break;
			}

			$status = preg_match( $compiled, $subject, $hit, PREG_OFFSET_CAPTURE, $cursor );
			if ( false === $status ) {
				return false;
			}
			if ( 0 === $status ) {
				break;
			}

			$found[] = $hit;

			list( $text, $start ) = $hit[0];
			$cursor               = $start + strlen( $text );

			if ( '' === $text ) {
				// A zero-width match would be found again at the same offset,
				// so step forward one whole code point: one byte, plus any
				// UTF-8 continuation bytes that follow it.
				do {
					++$cursor;
				} while ( $cursor < $end && 0x80 === ( ord( $subject[ $cursor ] ) & 0xC0 ) );
			}

			// Stepping past a zero-width match at the very end leaves nothing to scan.
			if ( $cursor > $end ) {
				break;
			}
		}

		return $found;
	};

	return $this->regexp_run( $scan );
}
1251+
10271252
/**
10281253
* Translate a preg_* failure into a caller-friendly exception message.
10291254
*

0 commit comments

Comments
 (0)