diff --git a/Cargo.lock b/Cargo.lock index 457c4106..f033ae8f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -182,7 +182,7 @@ version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff" dependencies = [ - "bindgen 0.69.5", + "bindgen", "cc", "cmake", "dunce", @@ -242,30 +242,12 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash 1.1.0", + "rustc-hash", "shlex", "syn 2.0.104", "which", ] -[[package]] -name = "bindgen" -version = "0.71.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" -dependencies = [ - "bitflags 2.9.1", - "cexpr", - "clang-sys", - "itertools 0.13.0", - "proc-macro2", - "quote", - "regex", - "rustc-hash 2.1.1", - "shlex", - "syn 2.0.104", -] - [[package]] name = "bitflags" version = "1.3.2" @@ -1248,15 +1230,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "1.0.15" @@ -1343,7 +1316,6 @@ dependencies = [ "pprof", "pyo3", "recursive", - "regex", "rocksdb", "rust_decimal", "rustyline", @@ -1434,13 +1406,11 @@ version = "0.17.1+9.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b7869a512ae9982f4d46ba482c2a304f1efd80c6412a3d4bf57bb79a619679f" dependencies = [ - "bindgen 0.69.5", + "bindgen", "bzip2-sys", "cc", "libc", "libz-sys", - "lz4-sys", - "zstd-sys", ] [[package]] @@ -1515,16 +1485,6 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" -[[package]] -name = "lz4-sys" -version = "1.11.1+lz4-1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "macros-test" version = "0.4.0" @@ -2326,12 +2286,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - [[package]] name = "rustix" version = "0.38.44" @@ -3511,14 +3465,3 @@ name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" - -[[package]] -name = "zstd-sys" -version = "2.0.15+zstd.1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" -dependencies = [ - "bindgen 0.71.1", - "cc", - "pkg-config", -] diff --git a/Cargo.toml b/Cargo.toml index e19e269c..2697c842 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,6 @@ paste = { version = "1" } parking_lot = { version = "0.12", features = ["arc_lock"] } pyo3 = { version = "0.23", features = ["auto-initialize"], optional = true } recursive = { version = "0.1" } -regex = { version = "1" } rust_decimal = { version = "1" } serde = { version = "1", features = ["derive", "rc"] } kite_sql_serde_macros = { version = "0.2.0", path = "kite_sql_serde_macros" } @@ -91,7 +90,7 @@ tempfile = { version = "3.10" } sqlite = { version = "0.34" } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] -rocksdb = { version = "0.23", optional = true } +rocksdb = { version = "0.23", optional = true, default-features = false, features = ["bindgen-runtime"] } librocksdb-sys = { version = "0.17.1", optional = true } lmdb = { version = "0.8.0", optional = true } lmdb-sys = { version = "0.8.0", optional = true } diff --git a/src/expression/evaluator.rs b/src/expression/evaluator.rs index 69f93d0e..e05e855c 100644 --- a/src/expression/evaluator.rs +++ b/src/expression/evaluator.rs @@ -19,7 +19,6 @@ use crate::types::evaluator::binary_create; use crate::types::tuple::TupleLike; use crate::types::value::{DataValue, Utf8Type}; use crate::types::{CharLengthUnits, LogicalType}; -use regex::Regex; use std::borrow::Cow; use std::cmp; use std::cmp::Ordering; @@ -228,22 +227,7 @@ impl ScalarExpression { .map(String::from) .unwrap_or_default(); } - let trim_regex = match trim_where { - Some(TrimWhereField::Both) | None => Regex::new(&format!( - r"^(?:{0})*([\w\W]*?)(?:{0})*$", - regex::escape(&trim_what) - )) - .unwrap(), - Some(TrimWhereField::Leading) => { - Regex::new(&format!(r"^(?:{0})*([\w\W]*?)", regex::escape(&trim_what))) - .unwrap() - } - Some(TrimWhereField::Trailing) => { - Regex::new(&format!(r"([\w\W]*?)(?:{0})*$", regex::escape(&trim_what))) - .unwrap() - } - }; - let string_trimmed = trim_regex.replace_all(string, "$1").to_string(); + let string_trimmed = trim_string(string, &trim_what, *trim_where); Ok(DataValue::Utf8 { value: string_trimmed, @@ -360,6 +344,31 @@ impl ScalarExpression { } } +fn trim_string(value: &str, trim_what: &str, trim_where: Option) -> String { + if trim_what.is_empty() { + return value.to_string(); + } + + let mut trimmed = value; + if matches!( + trim_where, + Some(TrimWhereField::Leading | TrimWhereField::Both) | None + ) { + while let Some(rest) = trimmed.strip_prefix(trim_what) { + trimmed = rest; + } + } + if matches!( + trim_where, + Some(TrimWhereField::Trailing | TrimWhereField::Both) | None + ) { + while let Some(rest) = trimmed.strip_suffix(trim_what) { + trimmed = rest; + } + } + trimmed.to_string() +} + #[cfg(test)] mod tests { use super::*; @@ -407,4 +416,23 @@ mod tests { assert_eq!(expr.eval::<&[DataValue]>(None)?, DataValue::Boolean(false)); Ok(()) } + + #[test] + fn trim_string_removes_requested_sides() { + assert_eq!(trim_string("xxhelloxx", "x", None), "hello"); + assert_eq!( + trim_string("xxhelloxx", "x", Some(TrimWhereField::Both)), + "hello" + ); + assert_eq!( + trim_string("xxhelloxx", "x", Some(TrimWhereField::Leading)), + "helloxx" + ); + assert_eq!( + trim_string("xxhelloxx", "x", Some(TrimWhereField::Trailing)), + "xxhello" + ); + assert_eq!(trim_string("ababhelloab", "ab", None), "hello"); + assert_eq!(trim_string("hello", "", None), "hello"); + } } diff --git a/src/types/evaluator/utf8.rs b/src/types/evaluator/utf8.rs index 78f4dbd1..7eb4ac85 100644 --- a/src/types/evaluator/utf8.rs +++ b/src/types/evaluator/utf8.rs @@ -21,7 +21,6 @@ use crate::types::CharLengthUnits; use crate::types::LogicalType; use chrono::{DateTime, Datelike, NaiveDate, NaiveDateTime, NaiveTime, Timelike}; use ordered_float::OrderedFloat; -use regex::Regex; use rust_decimal::Decimal; use serde::{Deserialize, Serialize}; use std::hint; @@ -321,22 +320,89 @@ impl BinaryEvaluator for Utf8NotLikeBinaryEvaluator { } fn string_like(value: &str, pattern: &str, escape_char: Option) -> bool { - let mut regex_pattern = String::new(); - let mut chars = pattern.chars().peekable(); - while let Some(c) = chars.next() { - if matches!(escape_char.map(|escape_c| escape_c == c), Some(true)) { - if let Some(next_char) = chars.next() { - regex_pattern.push(next_char); + let mut value_idx = 0; + let mut pattern_idx = 0; + let mut last_many = None; + + while value_idx < value.len() { + match next_like_pattern_token(pattern, pattern_idx, escape_char) { + Some((LikePatternToken::Literal(pattern_ch), next_pattern_idx)) => { + if let Some((value_ch, next_value_idx)) = next_char_at(value, value_idx) { + if value_ch == pattern_ch { + value_idx = next_value_idx; + pattern_idx = next_pattern_idx; + continue; + } + } + } + Some((LikePatternToken::AnyOne, next_pattern_idx)) => { + if let Some((_, next_value_idx)) = next_char_at(value, value_idx) { + value_idx = next_value_idx; + pattern_idx = next_pattern_idx; + continue; + } } - } else if c == '%' { - regex_pattern.push_str(".*"); - } else if c == '_' { - regex_pattern.push('.'); + Some((LikePatternToken::AnyMany, next_pattern_idx)) => { + pattern_idx = next_pattern_idx; + last_many = Some((pattern_idx, value_idx)); + continue; + } + None => {} + } + + let Some((after_many_pattern_idx, many_value_idx)) = last_many else { + return false; + }; + let Some((_, next_many_value_idx)) = next_char_at(value, many_value_idx) else { + return false; + }; + value_idx = next_many_value_idx; + pattern_idx = after_many_pattern_idx; + last_many = Some((after_many_pattern_idx, value_idx)); + } + + while let Some((token, next_pattern_idx)) = + next_like_pattern_token(pattern, pattern_idx, escape_char) + { + if token != LikePatternToken::AnyMany { + return false; + } + pattern_idx = next_pattern_idx; + } + true +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum LikePatternToken { + Literal(char), + AnyOne, + AnyMany, +} + +fn next_like_pattern_token( + pattern: &str, + index: usize, + escape_char: Option, +) -> Option<(LikePatternToken, usize)> { + let (ch, next_index) = next_char_at(pattern, index)?; + if escape_char.is_some_and(|escape_ch| escape_ch == ch) { + if let Some((escaped_ch, escaped_next_index)) = next_char_at(pattern, next_index) { + Some((LikePatternToken::Literal(escaped_ch), escaped_next_index)) } else { - regex_pattern.push(c); + Some((LikePatternToken::Literal(ch), next_index)) } + } else if ch == '%' { + Some((LikePatternToken::AnyMany, next_index)) + } else if ch == '_' { + Some((LikePatternToken::AnyOne, next_index)) + } else { + Some((LikePatternToken::Literal(ch), next_index)) } - Regex::new(®ex_pattern).unwrap().is_match(value) +} + +fn next_char_at(input: &str, index: usize) -> Option<(char, usize)> { + let ch = input.get(index..)?.chars().next()?; + Some((ch, index + ch.len_utf8())) } #[cfg(all(test, not(target_arch = "wasm32")))] @@ -372,6 +438,60 @@ mod test { .unwrap(), DataValue::Boolean(true) ); + assert_eq!( + Utf8NotLikeBinaryEvaluator { escape_char: None } + .binary_eval(&utf8("kite"), &utf8("ki%")) + .unwrap(), + DataValue::Boolean(false) + ); + } + + #[test] + fn test_string_like_patterns() { + let cases = [ + ("", "", None, true), + ("", "%", None, true), + ("", "_", None, false), + ("a", "", None, false), + ("a", "a", None, true), + ("a", "_", None, true), + ("ab", "_", None, false), + ("ab", "__", None, true), + ("abc", "a%", None, true), + ("abc", "%c", None, true), + ("abc", "%b%", None, true), + ("abc", "a%c", None, true), + ("abc", "a%d", None, false), + ("abc", "%a", None, false), + ("abc", "%%", None, true), + ("abc", "a%%c", None, true), + ("abc", "a%c%", None, true), + ("abbbc", "a%bc", None, true), + ("abXc", "a%bc", None, false), + ("skite", "ki%", None, false), + ("ki.e", "ki.e", None, true), + ("kite", "ki.e", None, false), + ("ki*e", "ki*e", None, true), + ("F%ck", "F@%ck", Some('@'), true), + ("Fack", "F@%ck", Some('@'), false), + ("F_ck", "F@_ck", Some('@'), true), + ("Fack", "F@_ck", Some('@'), false), + ("@", "@", Some('@'), true), + ("%", "\\%", Some('\\'), true), + ("好", "_", None, true), + ("好a", "好_", None, true), + ("好a", "__", None, true), + ("好a", "_", None, false), + ("你好", "你%", None, true), + ]; + + for (value, pattern, escape_char, expected) in cases { + assert_eq!( + string_like(value, pattern, escape_char), + expected, + "value={value:?}, pattern={pattern:?}, escape_char={escape_char:?}" + ); + } } #[test]