@@ -72,6 +72,7 @@ public static function register_for( $pdo ): self {
7272 'if ' => '_if ' ,
7373 'regexp ' => 'regexp ' ,
7474 'regexp_like ' => 'regexp_like ' ,
75+ 'regexp_replace ' => 'regexp_replace ' ,
7576 'field ' => 'field ' ,
7677 'log ' => 'log ' ,
7778 'least ' => 'least ' ,
@@ -563,6 +564,74 @@ function () use ( $compiled, $expr ) {
563564 return $ result ;
564565 }
565566
/**
 * Method to emulate MySQL REGEXP_REPLACE() function.
 *
 * Uses MySQL/ICU replacement grammar: "$N" backreferences ("$0" is the
 * full match), "\X" emits X (drops the backslash), "${N}" is rejected.
 * Negative `occurrence` is clamped to 1; `pos = char_count + 1` is
 * accepted and returns the subject unchanged.
 *
 * "$N" always refers to the Nth *numbered* capture group. Named groups
 * are numbered like any other group and are addressed by that number.
 *
 * @param string|null $expr        Subject string.
 * @param string|null $pattern     Regex pattern.
 * @param string|null $replacement Replacement string (supports $N backreferences).
 * @param int|null    $pos         1-based character position to start matching.
 * @param int|null    $occurrence  Nth match to replace; 0 = all matches.
 * @param string|null $match_type  MySQL match_type flags.
 *
 * @throws Exception If the pattern is not a valid regular expression, or pos is out of range.
 * @return string|null The replaced string, or NULL if any argument is NULL.
 */
public function regexp_replace( $expr, $pattern, $replacement, $pos = 1, $occurrence = 0, $match_type = '' ) {
	// Any NULL argument makes the whole result NULL.
	if (
		null === $expr || null === $pattern || null === $replacement
		|| null === $pos || null === $occurrence || null === $match_type
	) {
		return null;
	}

	$compiled   = $this->regexp_compile( $pattern, $match_type );
	$byte_start = $this->regexp_char_to_byte_offset( $expr, (int) $pos, true );
	$n          = (int) $occurrence;

	// 0 means replace all; negative occurrences are clamped to 1 (MySQL behavior).
	if ( $n < 0 ) {
		$n = 1;
	}

	// When a specific occurrence is requested there is no need to search past it.
	$matches = $this->regexp_find_matches( $compiled, $expr, $byte_start, $n > 0 ? $n : -1 );
	if ( false === $matches ) {
		// Presumably raises for the failed pattern; confirm against regexp_fail().
		$this->regexp_fail( $pattern );
	}

	// Rebuild: bytes before pos are untouched, then walk the collected
	// matches, substituting only the targeted occurrence (or all when N=0).
	$out = substr( $expr, 0, $byte_start );
	$cur = $byte_start;
	foreach ( $matches as $i => $m ) {
		$match_start  = $m[0][1];
		$match_length = strlen( $m[0][0] );

		// Copy the unmatched text between the previous match and this one.
		$out .= substr( $expr, $cur, $match_start - $cur );

		$replace_this = 0 === $n || ( $i + 1 ) === $n;
		if ( $replace_this ) {
			// preg_match() reports a named subpattern twice: once under its
			// name and once under its numeric index. Keep only the numeric
			// entries so that "$N" maps onto the Nth group; including the
			// string-keyed duplicates would shift every later group number.
			$groups = array();
			foreach ( $m as $key => $g ) {
				if ( is_int( $key ) ) {
					$groups[ $key ] = $g[0];
				}
			}
			// NOTE(review): preg_match() omits trailing groups that did not
			// participate in the match, so a "$N" naming such a group is
			// reported as out of bounds by regexp_expand_replacement()
			// rather than expanding to an empty string; confirm whether
			// that matches MySQL.
			$out .= $this->regexp_expand_replacement( $replacement, $groups );
		} else {
			// Not the targeted occurrence: keep the matched text as is.
			$out .= $m[0][0];
		}

		$cur = $match_start + $match_length;
	}

	// Append whatever follows the last collected match.
	$out .= substr( $expr, $cur );

	return $out;
}
634+
566635 /**
567636 * Method to emulate MySQL FIELD() function.
568637 *
@@ -1024,6 +1093,162 @@ private function regexp_run( $op ) {
10241093 }
10251094 }
10261095
/**
 * Translate a 1-based character position into the matching byte offset
 * of a UTF-8 encoded string.
 *
 * Characters are counted by their lead bytes: every byte that is not a
 * UTF-8 continuation byte (10xxxxxx) starts a new character.
 *
 * @param string $s              UTF-8 string.
 * @param int    $char_pos       1-based character position.
 * @param bool   $allow_past_end When true, the position just past the last
 *                               character (char_count + 1) is accepted and
 *                               maps to strlen( $s ). MySQL allows this for
 *                               REGEXP_REPLACE and REGEXP_SUBSTR but not for
 *                               REGEXP_INSTR.
 *
 * @throws Exception If $char_pos is out of range.
 * @return int Byte offset into $s.
 */
private function regexp_char_to_byte_offset( $s, $char_pos, $allow_past_end = false ) {
	if ( $char_pos < 1 ) {
		throw new Exception( 'Index out of bounds in regular expression search.' );
	}

	// The first character always begins at byte 0, even for an empty string.
	if ( 1 === $char_pos ) {
		return 0;
	}

	$total_bytes = strlen( $s );
	$next_char   = 1; // 1-based index of the next character start to be seen.
	$offset      = 0;
	while ( $offset < $total_bytes ) {
		$is_continuation = 0x80 === ( ord( $s[ $offset ] ) & 0xC0 );
		if ( ! $is_continuation ) {
			if ( $next_char === $char_pos ) {
				return $offset;
			}
			++$next_char;
		}
		++$offset;
	}

	// Here $next_char equals char_count + 1; only that exact position may
	// be accepted, and only when the caller opted in.
	if ( ! $allow_past_end || $next_char !== $char_pos ) {
		throw new Exception( 'Index out of bounds in regular expression search.' );
	}

	return $total_bytes;
}
1132+
/**
 * Build the replacement text for one match from a MySQL/ICU-style template.
 *
 * Template grammar (ICU rules, as used by MySQL REGEXP_REPLACE):
 * - A backslash escapes the next byte: "\X" emits X, so "\\" emits "\".
 * - A backslash at the very end of the template is dropped.
 * - "$" followed by digits is a group reference. The longest leading run
 *   of digits that names an existing group is used; any digits left over
 *   are emitted as literal text.
 * - "$" with no digit after it is an error (matches MySQL ERROR 3887).
 *   This also covers "${N}", which is NOT supported.
 * - A reference where even the first digit exceeds the highest group
 *   index is an error (ERROR 3686).
 *
 * @param string $replacement The replacement template.
 * @param array  $groups      Capture-group texts, with index 0 = full match.
 *
 * @throws Exception On an invalid "$..." reference.
 * @return string The expanded replacement.
 */
private function regexp_expand_replacement( $replacement, $groups ) {
	$highest_group = count( $groups ) - 1;
	$template_len  = strlen( $replacement );
	$expanded      = '';
	$at            = 0;

	while ( $at < $template_len ) {
		$char = $replacement[ $at ];

		// Ordinary byte: copy it through.
		if ( '\\' !== $char && '$' !== $char ) {
			$expanded .= $char;
			++$at;
			continue;
		}

		// Escape: emit the following byte literally, or nothing when the
		// backslash is the last byte of the template.
		if ( '\\' === $char ) {
			$has_escaped_byte = $at + 1 < $template_len;
			if ( $has_escaped_byte ) {
				$expanded .= $replacement[ $at + 1 ];
			}
			$at += $has_escaped_byte ? 2 : 1;
			continue;
		}

		// Group reference: measure the run of digits after the "$".
		$digits_start = $at + 1;
		$digits_end   = $digits_start;
		while ( $digits_end < $template_len && ctype_digit( $replacement[ $digits_end ] ) ) {
			++$digits_end;
		}
		if ( $digits_end === $digits_start ) {
			throw new Exception( 'A capture group has an invalid name.' );
		}

		// Shrink the run from the right until it names an existing group;
		// whatever is cut off stays in the template as literal digits.
		$digit_run = substr( $replacement, $digits_start, $digits_end - $digits_start );
		$take      = strlen( $digit_run );
		while ( $take > 0 && (int) substr( $digit_run, 0, $take ) > $highest_group ) {
			--$take;
		}
		if ( 0 === $take ) {
			throw new Exception( 'Index out of bounds in regular expression search.' );
		}

		$expanded .= $groups[ (int) substr( $digit_run, 0, $take ) ];
		$at        = $digits_start + $take;
	}

	return $expanded;
}
1201+
/**
 * Collect successive matches of a compiled pattern, scanning from a byte offset.
 *
 * Each collected entry is a preg_match() result in PREG_OFFSET_CAPTURE
 * format. The search passes the offset to preg_match() instead of cutting
 * the subject, so lookbehind assertions can still see the bytes in front
 * of $byte_start.
 *
 * The scan is executed through regexp_run(), and its result is returned
 * as that method hands it back.
 *
 * @param string $compiled   PCRE-wrapped pattern.
 * @param string $subject    Full subject string.
 * @param int    $byte_start Byte offset at which matching begins.
 * @param int    $limit      Max matches to collect; -1 for unlimited.
 *
 * @return array|false List of match arrays, or false on preg error.
 */
private function regexp_find_matches( $compiled, $subject, $byte_start, $limit ) {
	$scan = function () use ( $compiled, $subject, $byte_start, $limit ) {
		$subject_len = strlen( $subject );
		$found       = array();
		$cursor      = $byte_start;

		for ( ;; ) {
			// Stop as soon as the requested number of matches is collected.
			if ( -1 !== $limit && count( $found ) >= $limit ) {
				break;
			}

			$status = preg_match( $compiled, $subject, $hit, PREG_OFFSET_CAPTURE, $cursor );
			if ( false === $status ) {
				return false;
			}
			if ( 0 === $status ) {
				break;
			}
			$found[] = $hit;

			$resume = $hit[0][1] + strlen( $hit[0][0] );
			if ( '' === $hit[0][0] ) {
				// An empty match would be found again at the same offset, so
				// step forward one whole code point: one byte, plus any
				// UTF-8 continuation bytes that follow it.
				++$resume;
				while ( $resume < $subject_len && 0x80 === ( ord( $subject[ $resume ] ) & 0xC0 ) ) {
					++$resume;
				}
			}

			// Stepping past an empty match at the end of the subject leaves
			// nothing more to search.
			if ( $resume > $subject_len ) {
				break;
			}
			$cursor = $resume;
		}

		return $found;
	};

	return $this->regexp_run( $scan );
}
1251+
10271252 /**
10281253 * Translate a preg_* failure into a caller-friendly exception message.
10291254 *
0 commit comments