From 7cac8e9ba4f64a30e8b0bd8c541fd8e6e616bced Mon Sep 17 00:00:00 2001 From: Reiase Date: Sat, 13 Jun 2026 23:50:05 +0800 Subject: [PATCH 1/3] Update dependencies and enhance documentation - Updated `Cargo.lock` to reflect new versions for several dependencies, including `arrow` and `async-trait`. - Removed deprecated `android-tzdata` package from `Cargo.lock`. - Enhanced `Cargo.toml` to include new features for `datafusion` and updated `arrow` version to `58.1.0`. - Added new documentation for the data layer in both English and Chinese, detailing its architecture and design goals. - Introduced a hidden `bench` command in the CLI for stress testing the in-process data layer, with various subcommands for benchmarking different aspects. - Improved the overall structure and clarity of the documentation, including navigation updates in `mkdocs.yml`. --- .github/actions/setup-build-env/action.yml | 2 +- Cargo.lock | 1032 ++++++--- Cargo.toml | 14 +- docs/mkdocs.yml | 2 + docs/src/design/data-layer.md | 293 +++ docs/src/design/data-layer.zh.md | 262 +++ docs/src/design/index.md | 1 + docs/src/design/index.zh.md | 1 + probing/cli/Cargo.toml | 1 + probing/cli/src/cli/bench/args.rs | 232 ++ probing/cli/src/cli/bench/metrics.rs | 311 +++ probing/cli/src/cli/bench/mod.rs | 61 + probing/cli/src/cli/bench/runners/coldscan.rs | 92 + probing/cli/src/cli/bench/runners/common.rs | 167 ++ probing/cli/src/cli/bench/runners/compact.rs | 118 + probing/cli/src/cli/bench/runners/mixed.rs | 184 ++ probing/cli/src/cli/bench/runners/mod.rs | 10 + probing/cli/src/cli/bench/runners/mp.rs | 307 +++ probing/cli/src/cli/bench/runners/scan.rs | 51 + probing/cli/src/cli/bench/runners/write.rs | 198 ++ probing/cli/src/cli/bench/workload.rs | 189 ++ probing/cli/src/cli/commands.rs | 4 + probing/cli/src/cli/mod.rs | 5 + probing/core/Cargo.toml | 6 +- probing/core/src/core/memtable_sql.rs | 2048 +++++++++++++++++ probing/core/src/core/mod.rs | 6 + probing/core/src/core/plugin.rs | 93 +- 
probing/core/src/core/plugin_advanced.rs | 590 +++++ probing/extensions/cc/Cargo.toml | 2 +- probing/extensions/python/Cargo.toml | 2 + .../python/src/extensions/python/exttbls.rs | 595 +++-- .../python/src/extensions/python/tbls.rs | 214 +- probing/memtable/Cargo.toml | 1 + probing/memtable/src/discover.rs | 115 +- probing/memtable/src/layout.rs | 222 +- probing/memtable/src/lib.rs | 3 +- probing/memtable/src/memc/codec.rs | 281 +++ probing/memtable/src/memc/compactor.rs | 421 ++++ probing/memtable/src/memc/layout.rs | 434 ++++ probing/memtable/src/memc/mod.rs | 64 + probing/memtable/src/memc/reader.rs | 255 ++ probing/memtable/src/memc/store.rs | 247 ++ probing/memtable/src/memc/tests.rs | 643 ++++++ probing/memtable/src/memc/writer.rs | 337 +++ probing/memtable/src/memtable.rs | 848 ++++++- probing/memtable/src/raw.rs | 61 +- probing/memtable/src/writer.rs | 16 +- probing/server/Cargo.toml | 2 +- probing/server/src/engine.rs | 5 +- probing/server/src/memtable_ext.rs | 702 +----- 50 files changed, 10162 insertions(+), 1588 deletions(-) create mode 100644 docs/src/design/data-layer.md create mode 100644 docs/src/design/data-layer.zh.md create mode 100644 probing/cli/src/cli/bench/args.rs create mode 100644 probing/cli/src/cli/bench/metrics.rs create mode 100644 probing/cli/src/cli/bench/mod.rs create mode 100644 probing/cli/src/cli/bench/runners/coldscan.rs create mode 100644 probing/cli/src/cli/bench/runners/common.rs create mode 100644 probing/cli/src/cli/bench/runners/compact.rs create mode 100644 probing/cli/src/cli/bench/runners/mixed.rs create mode 100644 probing/cli/src/cli/bench/runners/mod.rs create mode 100644 probing/cli/src/cli/bench/runners/mp.rs create mode 100644 probing/cli/src/cli/bench/runners/scan.rs create mode 100644 probing/cli/src/cli/bench/runners/write.rs create mode 100644 probing/cli/src/cli/bench/workload.rs create mode 100644 probing/core/src/core/memtable_sql.rs create mode 100644 probing/core/src/core/plugin_advanced.rs create mode 
100644 probing/memtable/src/memc/codec.rs create mode 100644 probing/memtable/src/memc/compactor.rs create mode 100644 probing/memtable/src/memc/layout.rs create mode 100644 probing/memtable/src/memc/mod.rs create mode 100644 probing/memtable/src/memc/reader.rs create mode 100644 probing/memtable/src/memc/store.rs create mode 100644 probing/memtable/src/memc/tests.rs create mode 100644 probing/memtable/src/memc/writer.rs diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml index 0b890dfc..6f26a114 100644 --- a/.github/actions/setup-build-env/action.yml +++ b/.github/actions/setup-build-env/action.yml @@ -66,7 +66,7 @@ runs: test -e ~/.cargo/bin/rnr || cargo install rnr test -e ~/.cargo/bin/cargo-nextest || cargo install --locked cargo-nextest test -e ~/.cargo/bin/cargo-binstall || cargo install cargo-binstall - test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.0 -y + test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.6 -y test -e ~/.cargo/bin/trunk || cargo install trunk --locked - name: Install Python Build Dependencies diff --git a/Cargo.lock b/Cargo.lock index 0566fe30..cf4703ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,12 +55,6 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -140,9 +134,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87" +checksum = 
"d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c" dependencies = [ "arrow-arith", "arrow-array", @@ -161,23 +155,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575" +checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f" +checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" dependencies = [ "ahash", "arrow-buffer", @@ -186,30 +180,34 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.2", - "num", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce" +checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff" +checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", "atoi", @@ -218,15 +216,15 @@ dependencies = [ "comfy-table", "half", "lexical-core", - "num", + "num-traits", "ryu", ] [[package]] name = 
"arrow-csv" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a" +checksum = "ca025bd0f38eeecb57c2153c0123b960494138e6a957bbda10da2b25415209fe" dependencies = [ "arrow-array", "arrow-cast", @@ -234,41 +232,42 @@ dependencies = [ "chrono", "csv", "csv-core", - "lazy_static", "regex", ] [[package]] name = "arrow-data" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d" +checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-ipc" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e" +checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", + "arrow-select", "flatbuffers", "lz4_flex", ] [[package]] name = "arrow-json" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033" +checksum = "6ead0914e4861a531be48fe05858265cf854a4880b9ed12618b1d08cba9bebc8" dependencies = [ "arrow-array", "arrow-buffer", @@ -277,20 +276,22 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.9.0", + "indexmap 2.14.0", + "itoa", "lexical-core", "memchr", - "num", - "serde", + "num-traits", + "ryu", + "serde_core", "serde_json", "simdutf8", ] [[package]] name = "arrow-ord" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7" +checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" dependencies = [ "arrow-array", "arrow-buffer", @@ -301,9 +302,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8" +checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e" dependencies = [ "arrow-array", "arrow-buffer", @@ -314,29 +315,33 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" +checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" +dependencies = [ + "serde_core", + "serde_json", +] [[package]] name = "arrow-select" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4" +checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" dependencies = [ "ahash", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "55.1.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd" +checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" dependencies = [ "arrow-array", "arrow-buffer", @@ -344,20 +349,20 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] [[package]] name = "async-trait" -version = "0.1.88" +version = "0.1.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -450,7 +455,7 @@ checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -489,9 +494,9 @@ checksum = "92fde17f91e7ba10b2a07f8dff29530b77144894bc6ae850fbc66e1276af0d28" [[package]] name = "bigdecimal" -version = "0.4.8" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" dependencies = [ "autocfg", "libm", @@ -559,9 +564,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "camino" @@ -624,40 +629,28 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.40" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ - "android-tzdata", "iana-time-zone", "js-sys", "num-traits", "serde", "wasm-bindgen", - "windows-link", + "windows-link 0.2.1", ] [[package]] name = "chrono-tz" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" dependencies = [ "chrono", - "chrono-tz-build", "phf", ] -[[package]] -name = "chrono-tz-build" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402" -dependencies = [ - "parse-zoneinfo", - "phf_codegen", -] - [[package]] name = "ciborium" version = "0.2.2" @@ -735,7 +728,7 @@ dependencies = [ "proc-macro2", "pulldown-cmark", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -809,7 +802,7 @@ dependencies = [ "cookie", "document-features", "idna", - "indexmap 2.9.0", + "indexmap 2.14.0", "log", "serde", "serde_derive", @@ -1027,7 +1020,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -1038,7 +1031,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -1063,12 +1056,11 @@ checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" [[package]] name = "datafusion" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ "arrow", - "arrow-ipc", "arrow-schema", "async-trait", "bytes", @@ -1078,6 +1070,7 @@ dependencies = [ "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", + "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", "datafusion-execution", @@ -1085,11 +1078,12 @@ dependencies = [ "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-functions-nested", 
"datafusion-functions-table", "datafusion-functions-window", - "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", @@ -1100,7 +1094,7 @@ dependencies = [ "log", "object_store", "parking_lot 0.12.3", - "rand 0.8.5", + "rand", "regex", "sqlparser", "tempfile", @@ -1111,9 +1105,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ "arrow", "async-trait", @@ -1126,7 +1120,6 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-session", - "datafusion-sql", "futures", "itertools 0.14.0", "log", @@ -1137,9 +1130,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ "arrow", "async-trait", @@ -1149,28 +1142,29 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-plan", - "datafusion-session", "futures", + "itertools 0.14.0", "log", "object_store", - "tokio", ] [[package]] name = "datafusion-common" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", 
"arrow", "arrow-ipc", - "base64 0.22.1", + "chrono", "half", - "hashbrown 0.14.5", - "indexmap 2.9.0", + "hashbrown 0.16.1", + "indexmap 2.14.0", + "itertools 0.14.0", "libc", "log", "object_store", @@ -1182,9 +1176,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" dependencies = [ "futures", "log", @@ -1193,9 +1187,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", "async-trait", @@ -1206,6 +1200,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", @@ -1214,27 +1209,49 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "rand 0.8.5", + "rand", "tokio", "url", ] +[[package]] +name = "datafusion-datasource-arrow" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "object_store", + "tokio", +] + [[package]] name = "datafusion-datasource-csv" -version = "47.0.0" +version = "53.1.0" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", "datafusion-execution", "datafusion-expr", - "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", @@ -1246,20 +1263,18 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", "datafusion-execution", "datafusion-expr", - "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", @@ -1267,40 +1282,46 @@ dependencies = [ "object_store", "serde_json", "tokio", + "tokio-stream", ] [[package]] name = "datafusion-doc" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" [[package]] name = "datafusion-execution" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ "arrow", + "arrow-buffer", + "async-trait", + "chrono", "dashmap", "datafusion-common", 
"datafusion-expr", + "datafusion-physical-expr-common", "futures", "log", "object_store", "parking_lot 0.12.3", - "rand 0.8.5", + "rand", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ "arrow", + "async-trait", "chrono", "datafusion-common", "datafusion-doc", @@ -1308,7 +1329,8 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", - "indexmap 2.9.0", + "indexmap 2.14.0", + "itertools 0.14.0", "paste", "serde_json", "sqlparser", @@ -1316,27 +1338,28 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.9.0", + "indexmap 2.14.0", "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-functions" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ "arrow", "arrow-buffer", "base64 0.22.1", "chrono", + "chrono-tz", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1346,7 +1369,9 @@ dependencies = [ "hex", "itertools 0.14.0", "log", - "rand 0.8.5", + "memchr", + "num-traits", + "rand", "regex", "unicode-segmentation", "uuid", @@ -1354,9 +1379,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = 
"47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", "arrow", @@ -1370,14 +1395,15 @@ dependencies = [ "datafusion-physical-expr-common", "half", "log", + "num-traits", "paste", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", "arrow", @@ -1386,11 +1412,36 @@ dependencies = [ "datafusion-physical-expr-common", ] +[[package]] +name = "datafusion-functions-nested" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "hashbrown 0.16.1", + "itertools 0.14.0", + "itoa", + "log", + "paste", +] + [[package]] name = "datafusion-functions-table" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ "arrow", "async-trait", @@ -1404,10 +1455,11 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "47.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -1421,9 +1473,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1431,27 +1483,28 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ - "datafusion-expr", + "datafusion-doc", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] name = "datafusion-optimizer" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ "arrow", "chrono", "datafusion-common", "datafusion-expr", + "datafusion-expr-common", "datafusion-physical-expr", - "indexmap 2.9.0", + "indexmap 2.14.0", "itertools 0.14.0", "log", "regex", @@ -1460,9 +1513,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" +checksum = 
"4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", "arrow", @@ -1472,33 +1525,52 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown 0.14.5", - "indexmap 2.9.0", + "hashbrown 0.16.1", + "indexmap 2.14.0", "itertools 0.14.0", - "log", + "parking_lot 0.12.3", "paste", "petgraph", + "tokio", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools 0.14.0", ] [[package]] name = "datafusion-physical-expr-common" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", "arrow", + "chrono", "datafusion-common", "datafusion-expr-common", - "hashbrown 0.14.5", + "hashbrown 0.16.1", + "indexmap 2.14.0", "itertools 0.14.0", + "parking_lot 0.12.3", ] [[package]] name = "datafusion-physical-optimizer" -version = "47.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ "arrow", "datafusion-common", @@ -1508,75 +1580,86 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "datafusion-pruning", "itertools 0.14.0", - "log", ] [[package]] name = "datafusion-physical-plan" -version = "47.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", "arrow", "arrow-ord", "arrow-schema", "async-trait", - "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "futures", "half", - "hashbrown 0.14.5", - "indexmap 2.9.0", + "hashbrown 0.16.1", + "indexmap 2.14.0", "itertools 0.14.0", "log", + "num-traits", "parking_lot 0.12.3", "pin-project-lite", "tokio", ] [[package]] -name = "datafusion-session" -version = "47.0.0" +name = "datafusion-pruning" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ "arrow", - "async-trait", - "dashmap", "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", + "datafusion-datasource", + "datafusion-expr-common", "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", - "datafusion-sql", - "futures", "itertools 0.14.0", "log", - "object_store", +] + +[[package]] +name = "datafusion-session" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" +dependencies = [ + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", "parking_lot 0.12.3", - "tokio", ] [[package]] name = "datafusion-sql" -version = "47.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ "arrow", "bigdecimal", + "chrono", "datafusion-common", "datafusion-expr", - "indexmap 2.9.0", + "datafusion-functions-nested", + "indexmap 2.14.0", "log", "regex", "sqlparser", @@ -1618,7 +1701,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -1628,7 +1711,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -1649,7 +1732,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -1736,7 +1819,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -1827,11 +1910,23 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +checksum = 
"cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" dependencies = [ "percent-encoding", ] @@ -1902,7 +1997,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -1973,10 +2068,23 @@ checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.2.0", "wasi 0.14.2+wasi-0.2.4", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + [[package]] name = "gimli" version = "0.31.1" @@ -1991,13 +2099,14 @@ checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", "num-traits", + "zerocopy 0.8.48", ] [[package]] @@ -2011,16 +2120,32 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ - "ahash", "allocator-api2", + 
"equivalent", + "foldhash 0.2.0", ] [[package]] name = "hashbrown" -version = "0.15.2" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "heck" @@ -2293,9 +2418,15 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "ident_case" version = "1.0.1" @@ -2304,9 +2435,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" dependencies = [ "idna_adapter", "smallvec", @@ -2354,12 +2485,14 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.9.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.15.2", + "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -2375,7 +2508,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" dependencies = [ "ahash", - "indexmap 2.9.0", + "indexmap 2.14.0", "is-terminal", "itoa", "log", @@ -2396,7 +2529,7 @@ dependencies = [ "crossbeam-channel", 
"crossbeam-utils", "dashmap", - "indexmap 2.9.0", + "indexmap 2.14.0", "itoa", "log", "num-format", @@ -2486,7 +2619,7 @@ checksum = "199b7932d97e325aff3a7030e141eafe7f2c6268e1d1b24859b753a627f45254" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -2505,6 +2638,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "lexical-core" version = "1.0.5" @@ -2571,9 +2710,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.176" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libloading" @@ -2643,9 +2782,9 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "lz4_flex" -version = "0.11.3" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" dependencies = [ "twox-hash", ] @@ -2658,9 +2797,9 @@ checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "memchr" -version = "2.7.4" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "memmap2" @@ -2724,14 +2863,14 @@ dependencies = [ [[package]] name = "mio" 
-version = "1.0.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2899,14 +3038,16 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.0" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" +checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49" dependencies = [ "async-trait", "bytes", "chrono", - "futures", + "futures-channel", + "futures-core", + "futures-util", "http", "humantime", "itertools 0.14.0", @@ -2917,6 +3058,8 @@ dependencies = [ "tracing", "url", "walkdir", + "wasm-bindgen-futures", + "web-time", ] [[package]] @@ -2996,15 +3139,6 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "parse-zoneinfo" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" -dependencies = [ - "regex", -] - [[package]] name = "paste" version = "1.0.15" @@ -3025,9 +3159,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "pete" @@ -3043,48 +3177,30 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.7.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = 
"8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", - "indexmap 2.9.0", + "hashbrown 0.15.2", + "indexmap 2.14.0", + "serde", ] [[package]] name = "phf" -version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ "phf_shared", ] -[[package]] -name = "phf_codegen" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" -dependencies = [ - "phf_shared", - "rand 0.8.5", -] - [[package]] name = "phf_shared" -version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" dependencies = [ "siphasher", ] @@ -3106,7 +3222,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -3198,7 +3314,17 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy 0.8.25", + "zerocopy 0.8.48", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + 
"proc-macro2", + "syn 2.0.117", ] [[package]] @@ -3254,6 +3380,7 @@ dependencies = [ "nix 0.30.1", "once_cell", "pete", + "probing-memtable", "probing-proto", "probing-store", "procfs", @@ -3280,10 +3407,12 @@ dependencies = [ "log", "once_cell", "probing-macros", + "probing-memtable", "probing-proto", "serde", "serde_json", "sled", + "tempfile", "thiserror 2.0.12", "tokio", "url", @@ -3296,7 +3425,7 @@ version = "0.2.4" dependencies = [ "probing-core", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -3305,6 +3434,7 @@ version = "0.2.4" dependencies = [ "libc", "memmap2", + "pco", "xxhash-rust", ] @@ -3344,6 +3474,7 @@ dependencies = [ "probing-cc", "probing-cli", "probing-core", + "probing-memtable", "probing-proto", "probing-store", "pyo3", @@ -3351,6 +3482,7 @@ dependencies = [ "regex", "serde_json", "signal-hook-registry", + "tempfile", "tokio", ] @@ -3425,15 +3557,6 @@ dependencies = [ "hex", ] -[[package]] -name = "psm" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" -dependencies = [ - "cc", -] - [[package]] name = "pulldown-cmark" version = "0.13.0" @@ -3491,7 +3614,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -3504,7 +3627,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -3527,9 +3650,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.40" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -3541,15 +3664,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" 
[[package]] -name = "rand" -version = "0.8.5" +name = "r-efi" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "rand" @@ -3557,20 +3675,10 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" dependencies = [ - "rand_chacha 0.9.0", + "rand_chacha", "rand_core 0.9.3", ] -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", -] - [[package]] name = "rand_chacha" version = "0.9.0" @@ -3586,9 +3694,6 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", -] [[package]] name = "rand_core" @@ -3628,26 +3733,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.101", -] - [[package]] name = "redox_syscall" version = "0.2.16" @@ -3688,9 +3773,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.1" +version = 
"1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -3700,9 +3785,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -3711,9 +3796,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.5" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "rgb" @@ -3875,22 +3960,32 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + 
"syn 2.0.117", ] [[package]] @@ -4019,34 +4114,33 @@ checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] name = "socket2" -version = "0.5.9" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] name = "sqlparser" -version = "0.55.0" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" +checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" dependencies = [ "log", - "recursive", "sqlparser_derive", ] [[package]] name = "sqlparser_derive" -version = "0.3.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4055,19 +4149,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" -[[package]] -name = "stacker" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" -dependencies = [ - "cc", - "cfg-if", - "libc", - "psm", - "windows-sys 0.59.0", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -4136,7 +4217,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4181,9 +4262,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.101" +version = "2.0.117" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -4204,7 +4285,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4286,7 +4367,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4297,7 +4378,7 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4364,29 +4445,40 @@ dependencies = [ [[package]] name = "tokio" -version = "1.45.0" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ - "backtrace", "bytes", "libc", "mio", "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", + "tokio-util", ] [[package]] @@ 
-4414,6 +4506,19 @@ dependencies = [ "tungstenite 0.28.0", ] +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + [[package]] name = "tower" version = "0.5.2" @@ -4462,7 +4567,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4491,7 +4596,7 @@ dependencies = [ "http", "httparse", "log", - "rand 0.9.1", + "rand", "sha1", "thiserror 2.0.12", "utf-8", @@ -4508,7 +4613,7 @@ dependencies = [ "http", "httparse", "log", - "rand 0.9.1", + "rand", "sha1", "thiserror 2.0.12", "utf-8", @@ -4516,13 +4621,9 @@ dependencies = [ [[package]] name = "twox-hash" -version = "1.6.3" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" -dependencies = [ - "cfg-if", - "static_assertions", -] +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" [[package]] name = "typenum" @@ -4554,6 +4655,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "unindent" version = "0.2.4" @@ -4596,13 +4703,14 @@ dependencies = [ [[package]] name = "url" -version = "2.5.4" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +checksum = 
"ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -4637,13 +4745,13 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.16.0" +version = "1.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ - "getrandom 0.3.2", + "getrandom 0.4.2", "js-sys", - "serde", + "serde_core", "wasm-bindgen", ] @@ -4723,6 +4831,24 @@ dependencies = [ "wit-bindgen-rt", ] +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -4745,10 +4871,23 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.100" @@ -4767,7 +4906,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", "wasm-bindgen-backend", 
"wasm-bindgen-shared", ] @@ -4781,6 +4920,40 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.14.0", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.9.0", + "hashbrown 0.15.2", + "indexmap 2.14.0", + "semver", +] + [[package]] name = "web-sys" version = "0.3.77" @@ -4840,7 +5013,7 @@ checksum = "4763c1de310c86d75a878046489e2e5ba02c649d185f21c67d4cf8a56d098980" dependencies = [ "windows-implement", "windows-interface", - "windows-link", + "windows-link 0.1.1", "windows-result", "windows-strings", ] @@ -4853,7 +5026,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4864,7 +5037,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -4873,13 +5046,19 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] 
name = "windows-result" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c64fd11a4fd95df68efcfee5f44a294fe71b8bc6a91993e2791938abcc712252" dependencies = [ - "windows-link", + "windows-link 0.1.1", ] [[package]] @@ -4888,7 +5067,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2ba9642430ee452d5a7aa78d72907ebe8cfda358e8cb7918a2050581322f97" dependencies = [ - "windows-link", + "windows-link 0.1.1", ] [[package]] @@ -4909,6 +5088,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link 0.2.1", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -4973,6 +5161,32 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck 0.5.0", + "wit-parser", +] + [[package]] name = "wit-bindgen-rt" version = "0.39.0" @@ -4982,6 +5196,74 @@ dependencies = [ "bitflags 2.9.0", ] +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck 0.5.0", + "indexmap 2.14.0", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.9.0", + "indexmap 2.14.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.14.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "write16" version = "1.0.0" @@ -5020,7 +5302,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", "synstructure", ] @@ -5035,11 +5317,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.25" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" 
dependencies = [ - "zerocopy-derive 0.8.25", + "zerocopy-derive 0.8.48", ] [[package]] @@ -5050,18 +5332,18 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] name = "zerocopy-derive" -version = "0.8.25" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] [[package]] @@ -5081,7 +5363,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", "synstructure", ] @@ -5110,5 +5392,5 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.117", ] diff --git a/Cargo.toml b/Cargo.toml index 75edce8c..56c8210b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,13 @@ description = "Performance and Stability Diagnostic Tool for AI Applications" [workspace.dependencies] anyhow = "1.0" -arrow = { version = "55.1.0", default-features = false, features = ["csv"] } +arrow = { version = "58.1.0", default-features = false, features = ["csv"] } +datafusion = { version = "53.1.0", default-features = false, features = [ + "datetime_expressions", + "sql", + "regex_expressions", + "string_expressions", +] } chrono = { version = "0.4", features = ["serde"] } ctor = { version = "0.4.1", features = [] } env_logger = { version = "0.11.6", default-features = false, features = [ @@ -77,7 +83,9 @@ default = ["extension-module", "use-mimalloc"] [dependencies] probing-core = { path = "probing/core" } probing-server = { path = "probing/server", default-features = false } -probing-python = { path = "probing/extensions/python", 
default-features = false, features=["tracing"] } +probing-python = { path = "probing/extensions/python", default-features = false, features = [ + "tracing", +] } probing-cli = { path = "probing/cli" } anyhow = { workspace = true } @@ -110,6 +118,6 @@ tokio = { workspace = true } [profile.release] opt-level = "z" # Optimize for size. panic = "unwind" -strip = false # Automatically strip symbols from the binary. +strip = false # Automatically strip symbols from the binary. lto = "thin" codegen-units = 16 diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index ecf6d041..5851dd9a 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -89,6 +89,7 @@ plugins: Troubleshooting: 常见问题 Design: 设计文档 Architecture: 系统架构 + Data Layer: 数据层 Profiling: 性能分析 Distributed: 分布式 Cluster with Pulsing: 基于 Pulsing 的集群 @@ -123,6 +124,7 @@ nav: - Design: - design/index.md - Architecture: design/architecture.md + - Data Layer: design/data-layer.md - Profiling: design/profiling.md - Debugging: design/debugging.md - Distributed: design/distributed.md diff --git a/docs/src/design/data-layer.md b/docs/src/design/data-layer.md new file mode 100644 index 00000000..ddcf4156 --- /dev/null +++ b/docs/src/design/data-layer.md @@ -0,0 +1,293 @@ +# Data Layer + +Probing's data layer is a **per-process, crash-resilient, time-retained data plane** for +observability data (metrics, samples, traces). Every producer writes through one in-house +columnar store, [`probing-memtable`](https://github.com/DeepLink-org/probing), and every +consumer queries it through SQL (DataFusion). It is built as **two tiers**: + +- a **hot tier** (`MEMT`): a fixed-capacity ring buffer for the live window — constant memory, + zero-allocation writes; +- a **cold tier** (`MEMC`): immutable, compressed segments for time retention beyond the ring, + with whole-file eviction. + +A single SQL time predicate prunes and queries both tiers at once. 
+ +## Design Goals + +- **Bounded resource use.** The hot ring never grows; the cold store is capped by a byte budget + and TTL. +- **Crash resilience.** A process killed mid-write never surfaces torn rows; cold segments + recover from a torn tail via forward scan. +- **Time retention.** Data that scrolls out of the hot ring survives in cold segments and stays + queryable. +- **One write path, one read path.** Producers (server, Python/Torch extensions) all write + `probing-memtable`; consumers all go through `probing-core::memtable_sql`. +- **Fork safety.** Correct under fork-heavy workloads (PyTorch DataLoader workers). + +## Architecture + +```mermaid +graph LR + APP[push_row / RowWriter] --> MEMT + subgraph HOT[Hot tier · probing-memtable] + MEMT[MEMT ring buffer] --> SEALED[sealed chunk\nmin/max ts + generation] + end + SEALED -->|transpose + Pco| ROLLER[Compactor / Roller] + ROLLER --> MEMC + subgraph COLD[Cold tier · MEMC segments] + MEMC[ColdStore\nimmutable segments] + end + + SQL[SQL query] --> HCT[HotColdTable] + HCT -->|chunk pruning| MEMT + HCT -->|segment + page pruning| MEMC +``` + +The hot tier is mapped read-only at query time; the cold tier is read via `SegmentReader`. The +`HotColdTable` provider unions them into one scan, deduplicating chunks that exist in both tiers. + +## Hot Tier (MEMT) + +### File Layout + +Every MEMT buffer (heap, shared memory, or mmap'd file) begins with a 64-byte header (one cache +line), followed by per-column descriptors, then chunk data. 
+ +**Header v3 (64 bytes):** + +| offset | size | field | notes | +|---|---|---|---| +| 0 | 4 | `magic` | `0x4D454D54` (`"MEMT"`) | +| 4 | 2 | `version` | 3 | +| 6 | 2 | `header_size` | 64 (validation) | +| 8 | 2 | `byte_order` | BOM `[0x01,0x02]` | +| 10 | 2 | `ts_col` | timestamp column index + 1 (0 = none) | +| 12 | 4 | `flags` | feature bits (`FLAG_DEDUP`, …) | +| 16 | 4 | `num_cols` | | +| 20 | 4 | `num_chunks` | ring slot count | +| 24 | 4 | `chunk_size` | bytes per chunk | +| 28 | 4 | `data_offset` | 64-aligned | +| 32 | 4 | `write_chunk` | `AtomicU32` — current ring slot | +| 36 | 4 | `write_lock` | `AtomicU32` — 0 = free, else holder PID | +| 40 | 4 | `refcount` | `AtomicU32` | +| 44 | 4 | `creator_pid` | | +| 48 | 8 | `creator_start_time` | for PID-recycling detection | +| 56 | 8 | `lock_owner_start` | `AtomicU64` — lock holder's start time | + +Bytes 0–31 are the **cold zone** (immutable after init); bytes 32–63 are the **hot zone** +(atomically mutated), split to avoid false sharing. Each chunk starts with a 40-byte +`ChunkHeader` carrying a `generation` counter and per-chunk `min_ts`/`max_ts` (`AtomicI64`). + +### Backends + +The same API backs three storage kinds: + +- **Heap** — a private `Vec`; for in-process use. +- **POSIX shared memory** (`shm_open` + `mmap`) — cross-process, named, unlinked on cleanup. +- **File-backed mmap** — persistent, discoverable files under `<data_dir>/<pid>/`. This is what + the SQL layer reads. + +### Ring Buffer & Generations + +Writes append to the current chunk; when a row does not fit, the writer advances to the next ring +slot (wrapping), sealing the previous chunk. Each slot carries a monotonically increasing +`generation` (incremented every time the ring wraps onto it). Readers materialize chunks in +**logical (oldest → newest) order** and re-check the generation after reading — a chunk recycled +mid-read is discarded rather than surfacing torn rows. 
+ +### Robust Write Lock + +`write_lock` holds **0 (free) or the holder's PID**. A waiter spins; if it spins past +`LOCK_STEAL_TIMEOUT` (500 ms) it enters a steal decision: + +- if the holder process no longer exists (`kill(pid, 0)`), the lock is stolen; +- if the holder exists but its kernel start time differs from `lock_owner_start`, the PID was + recycled by an unrelated process — stolen after a short re-check grace. + +Stealing is data-safe: rows only become visible via the `Release` store of `row_count` at the end +of a write, so a half-written row from a dead holder stays uncommitted and is simply overwritten. + +!!! note "Fork safety" + The holder's start time is read via a per-PID cache, **not** a one-shot cache. A child that + inherited a parent's cached value would record the parent's start time and be mistaken for a + recycled PID by a waiter — exactly the hazard fork-heavy workloads (PyTorch DataLoader) + trigger. Re-reading whenever the live PID changes makes every post-fork caller observe its own + start time. (Start times come from `/proc` on Linux; on platforms without it the steal-on-recycle + path is inert.) + +### Timestamp Metadata + +When the schema has an `I64` column named `timestamp` (or `ts`), `ts_col` records it and the write +path maintains per-chunk `min_ts`/`max_ts`. This is the basis for chunk-level time pruning at query +time, and it is **structurally identical** to the cold tier's page/segment time ranges. + +## Cold Tier (MEMC) + +### Directory & File Naming + +Cold segments live in `<data_dir>/<pid>/cold` — co-located with, and scoped like, the hot ring +files, so cold data never mixes across processes. Each segment is named +`<writer_id>-<seq>.memc`, where `writer_id` is a hash of `(pid, start_time)` and `seq` is a +monotonically increasing sequence the `ColdStore` recovers on open. + +### Segment Format + +A segment is a sequence of 64-aligned blocks. All integrity checks use **xxh3-64 truncated to 32 +bits**. 
+ +**Segment header (64 bytes):** `magic` (`"MEMC"`), `version` (1), BOM, `flags` (bit 0 = sealed), +`writer_pid`, `writer_start`, `created_unix_ms`, `footer_off` (0 until sealed), segment-wide +`ts_min`/`ts_max`, `page_count`, header checksum. + +**Blocks** share a 64-byte header: + +| magic | meaning | +|---|---| +| `MCTB` | table-definition block — declares a `table_id`, name, column dtypes, ts column | +| `MCPG` | page block — one columnar page for a `table_id` | +| `MCFT` | footer — page directory written on seal | + +The page/block header carries `table_id`, `row_count`, `col_count`, `ts_min`/`ts_max`, +`payload_len`, `payload_xxh`, and — crucially for restart dedup — `source_gen` and `source_chunk` +(the hot-ring chunk generation and index this page was drained from; `u32::MAX` = not applicable). +The header is itself checksummed (covering `source_chunk`). + +A single segment holds pages from **multiple tables**, distinguished by `table_id`. This decouples +file/directory count from table count: hundreds of tables share one set of segment files. + +### Column Encodings + +Each column is encoded independently (`ColEncoding`): + +- **`Pco`** — numeric columns (`i32/i64/f32/f64/u32/u64`), compressed with Pco (level 8). Monotonic + timestamp columns compress > 4×. +- **`RawFixed`** — `u8` (Pco offers no benefit for byte columns). +- **`RawVarLen`** — `Str`/`Bytes`, stored as concatenated `[u32 len][bytes]` entries (Pco has no + string support). + +### Crash Recovery + +- A **sealed** segment is read via its footer page directory — O(1) location of every page. +- An **unsealed or torn** segment is recovered by **forward scan**: walk blocks from the start, + verifying each block's header and payload checksum, stopping at the first bad block and dropping + the torn tail. Table-definition blocks are always scanned (cheap, and they precede pages). + +There is no heuristic that tries to repair a half-written record. + +!!! 
warning "Durability" + Pages are not `fsync`'d individually (only `sync_data` on seal). A `SIGKILL` may lose + not-yet-flushed tail pages of the open segment. This is acceptable for observability data but + is an explicit trade-off. + +## Compactor (Roller) + +The `Compactor` drains newly-sealed hot chunks into cold segments. + +- **Drain semantics.** Only `Sealed` chunks are drained (never the currently-writing chunk). Rows + are transposed to columns; the chunk's `generation` is re-checked before and after — if the ring + recycled it, the page is dropped and retried next pass. Draining is **idempotent**: a per-chunk + `drained_gen` high-water mark skips already-compacted chunk generations. +- **Rolling.** The open segment is sealed and a new one started when it reaches + `target_segment_bytes` (default 64 MiB — the main fragmentation knob), or when it exceeds + `max_segment_age` (default 300 s, so low-rate tables still become queryable), or on explicit + flush. +- **Eviction.** `enforce` deletes oldest segments past a byte budget (`max_total_bytes`) or TTL, + always protecting the newest segment. + +### Exactly-Once Across Restarts + +`drained_gen` is in-memory, so a naive restart over a persistent cold dir would re-compact +chunks still resident in the hot ring, producing duplicate rows. `prime_from_cold()` rebuilds the +watermark on startup: it scans existing cold segments and, per `(table, source_chunk)`, takes the +max `source_gen`, merging it into `drained_gen` the first time a table is seen. The result is +**exactly-once** even across restarts. 
+ +## Runtime Owner + +`ColdCompactor` is a process-global singleton (modeled on the task-stats worker) that gives the +compactor a single lifecycle home: + +- a background thread **rediscovers** ring files under `<data_dir>/<pid>/` each pass (tables appear + over time), drains each into the shared `ColdStore`, rolls by age, and enforces the budget; +- on startup it calls `prime_from_cold()`; on stop it flushes (seals the open segment). + +It is **opt-in** (off by default) to avoid spawning a compaction thread in every forked worker. +Configuration is applied via the `MemTableExtension` option surface or environment variables; the +server calls `start_cold_compaction_from_env()` at engine init. + +## SQL Integration + +### Catalog Discovery + +mmap files under `<data_dir>/<pid>/` are exposed as DataFusion tables, with the filename mapping +to `(schema, table)`: + +- first `.` splits schema vs table — `acme.actors` → schema `acme`, table `actors`; +- no `.` → schema `memtable` (e.g. `metrics` → `memtable.metrics`). + +`DynamicMmapCatalog` merges these dynamic schemas with the static `probe` catalog. A query like +`SELECT … FROM probe.memtable.metrics` resolves through `MmapFileSchemaProvider::table()`. + +### Providers + +- **`RingMmapTable`** — lazy provider over a hot ring file. Materializes Arrow batches at `scan()` + time, pruning chunks whose `[min_ts, max_ts]` cannot match the query's time predicate. +- **`HotColdTable`** — unions a hot ring with its cold segments under one logical table (keyed by + on-disk basename, so names never collide across schemas). This is what the catalog returns for + ring tables. + +### Three-Level Time Pruning + +One time predicate prunes both tiers, in increasing granularity: + +1. **Segment level** — skip a sealed cold segment whose header `ts_range` cannot match (no mmap). +2. **Page level** — skip cold pages outside the range via the page directory. +3. **Chunk level** — skip hot chunks outside the range via their `min_ts`/`max_ts`. 
+ +Hot and cold batches are handed to the scan as two partitions, so projection, filter, and limit +pushdown apply uniformly across both. + +### Hot∪Cold Exactly-Once + +A compacted chunk still lives in the hot ring until overwritten, so a naive union would +double-count it. `cold_scan` returns the set of `(source_chunk, source_gen)` the cold pages came +from; the hot side then **excludes** any chunk whose `(index, current generation)` is in that set. +Each row is counted exactly once, and the dedup is immune to ring recycling (the generation check +re-validates). + +## Configuration Reference + +| `SET memtable.*` | env | meaning | default | +|---|---|---|---| +| `cold_compaction` | `PROBING_COLD` | run the background compactor (`on`/`off`) | off | +| `cold_max_total_mb` | `PROBING_COLD_MAX_TOTAL_MB` | cold-store byte budget (MiB) | unlimited | +| `cold_ttl_secs` | `PROBING_COLD_TTL_SECS` | evict cold segments older than this | none | +| — | `PROBING_COLD_TARGET_MB` | segment roll size (MiB) | 64 | +| — | `PROBING_COLD_POLL_MS` | drain-pass interval | 2000 | +| — | `PROBING_COLD_MAX_AGE_SECS` | seal idle open segment after | 300 | + +## Guarantees & Known Limits + +**Guaranteed:** + +- No torn rows on reads (generation re-validation); cold torn-tail recovery. +- Exactly-once across tiers (query dedup) and across restarts (`prime_from_cold`). +- Bounded hot memory; bounded cold bytes/TTL. +- Fork-safe locking. + +**Known trade-offs (P2 backlog):** + +- **Cold dir is per-PID.** Cross-process isolation is clean, but cold data is not persistent across + restarts by default (a new PID is a new cold dir). `prime_from_cold` makes restart dedup correct + whenever a persistent cold dir is configured. +- **No per-page `fsync`** — a `SIGKILL` may lose the open segment's not-yet-flushed tail. +- **No segment-level manifest** — multi-segment queries open each segment header to prune. +- **Pco level is fixed (8)** — not adapted per column. 
+- **Runtime is single-process** — cross-process / cluster aggregation is not yet wired. + +## Testing + +The data layer ships with unit and end-to-end tests: hot-ring lock/recycle/fork tests +(`probing-memtable`), MEMC format/recovery/compactor tests (including restart-dedup with a negative +control), and SQL end-to-end tests that drain through the runtime owner and query the union through +the real catalog path (`probing-core::memtable_sql`). diff --git a/docs/src/design/data-layer.zh.md b/docs/src/design/data-layer.zh.md new file mode 100644 index 00000000..718317fc --- /dev/null +++ b/docs/src/design/data-layer.zh.md @@ -0,0 +1,262 @@ +# 数据层 + +Probing 的数据层是一个面向观测数据(指标、采样、trace)的**单进程、抗崩溃、带时间保留的数据面**。 +所有生产者都通过同一套自研列式存储 [`probing-memtable`](https://github.com/DeepLink-org/probing) +写入,所有消费者都通过 SQL(DataFusion)读取。它由**两层**构成: + +- **热层**(`MEMT`):固定容量的环形缓冲区,承载实时窗口——常量内存、零分配写入; +- **冷层**(`MEMC`):不可变、压缩的段文件,用于超出环形窗口的时间保留,按整文件淘汰。 + +一条 SQL 时间谓词即可同时对两层做剪枝与查询。 + +## 设计目标 + +- **资源有界。** 热层环形缓冲永不增长;冷层受字节预算与 TTL 双重约束。 +- **抗崩溃。** 写入中途被杀的进程不会暴露半行数据;冷段可从尾部撕裂中通过前向扫描恢复。 +- **时间保留。** 滚出热层环形窗口的数据落入冷段,依然可查。 +- **一条写路径、一条读路径。** 生产者(server、Python/Torch 扩展)统一写 `probing-memtable`; + 消费者统一走 `probing-core::memtable_sql`。 +- **fork 安全。** 在大量 fork 的负载(如 PyTorch DataLoader worker)下依然正确。 + +## 总体架构 + +```mermaid +graph LR + APP[push_row / RowWriter] --> MEMT + subgraph HOT[热层 · probing-memtable] + MEMT[MEMT 环形缓冲] --> SEALED[已封存 chunk\nmin/max ts + generation] + end + SEALED -->|转置 + Pco| ROLLER[Compactor / Roller] + ROLLER --> MEMC + subgraph COLD[冷层 · MEMC 段] + MEMC[ColdStore\n不可变段] + end + + SQL[SQL 查询] --> HCT[HotColdTable] + HCT -->|chunk 剪枝| MEMT + HCT -->|段 + page 剪枝| MEMC +``` + +查询时热层以只读方式 mmap,冷层通过 `SegmentReader` 读取。`HotColdTable` provider 将两者合并为 +一次扫描,并对同时存在于两层的 chunk 做去重。 + +## 热层(MEMT) + +### 文件布局 + +每个 MEMT 缓冲区(堆、共享内存或 mmap 文件)都以 64 字节头部(一个 cache line)开始,随后是 +逐列描述符,再是 chunk 数据。 + +**Header v3(64 字节):** + +| 偏移 | 大小 | 字段 | 说明 | +|---|---|---|---| +| 0 | 4 | `magic` | 
`0x4D454D54`(`"MEMT"`) | +| 4 | 2 | `version` | 3 | +| 6 | 2 | `header_size` | 64(仅校验) | +| 8 | 2 | `byte_order` | BOM `[0x01,0x02]` | +| 10 | 2 | `ts_col` | 时间戳列索引 + 1(0 = 无) | +| 12 | 4 | `flags` | 特性位(`FLAG_DEDUP` 等) | +| 16 | 4 | `num_cols` | | +| 20 | 4 | `num_chunks` | 环形槽位数 | +| 24 | 4 | `chunk_size` | 每个 chunk 字节数 | +| 28 | 4 | `data_offset` | 64 对齐 | +| 32 | 4 | `write_chunk` | `AtomicU32`——当前环形槽位 | +| 36 | 4 | `write_lock` | `AtomicU32`——0 = 空闲,否则为持有者 PID | +| 40 | 4 | `refcount` | `AtomicU32` | +| 44 | 4 | `creator_pid` | | +| 48 | 8 | `creator_start_time` | 用于 PID 回收检测 | +| 56 | 8 | `lock_owner_start` | `AtomicU64`——锁持有者的进程启动时间 | + +字节 0–31 是**冷区**(初始化后不可变),字节 32–63 是**热区**(运行时原子修改),二者分离以避免 +伪共享。每个 chunk 以 40 字节的 `ChunkHeader` 开头,携带 `generation` 计数器及逐 chunk 的 +`min_ts`/`max_ts`(`AtomicI64`)。 + +### 三种后端 + +同一套 API 支撑三种存储形态: + +- **堆内存**——私有 `Vec`,用于进程内使用; +- **POSIX 共享内存**(`shm_open` + `mmap`)——跨进程、具名、清理时 unlink; +- **文件 mmap**——持久、可发现的文件,位于 `<data_dir>/<pid>/`。SQL 层读取的正是这种。 + +### 环形缓冲与 generation + +写入追加到当前 chunk;当一行放不下时,写入者推进到下一个环形槽位(绕回),同时封存上一个 chunk。 +每个槽位携带单调递增的 `generation`(每次环形绕回到该槽位即自增)。读取者按**逻辑顺序(旧 → 新)** +物化 chunk,并在读取后复核 generation——若某 chunk 在读取过程中被回收,则丢弃而非暴露半行数据。 + +### Robust 写锁 + +`write_lock` 存放 **0(空闲)或持有者的 PID**。等待者自旋;若自旋超过 `LOCK_STEAL_TIMEOUT` +(500 ms),进入抢占判定: + +- 若持有者进程已不存在(`kill(pid, 0)`),抢占该锁; +- 若持有者存在但其内核启动时间与 `lock_owner_start` 不符,说明 PID 已被无关进程回收——经短暂复核 + 宽限后抢占。 + +抢占是数据安全的:行只有在写入结束时通过 `row_count` 的 `Release` 存储才可见,因此已死持有者写到 +一半的行不会提交,会被直接覆盖。 + +!!! 
note "fork 安全" + 持有者启动时间通过**按 PID 缓存**读取,而非一次性缓存。若子进程继承了父进程的缓存值,就会记录 + 父进程的启动时间,从而被等待者误判为 PID 回收——这正是大量 fork 的负载(PyTorch DataLoader) + 会触发的隐患。每当存活 PID 变化即重新读取,可让每个 fork 后的调用者观察到自己的启动时间。 + (启动时间在 Linux 上来自 `/proc`;在不具备该接口的平台上,回收抢占路径自动失效。) + +### 时间戳元数据 + +当 schema 含有名为 `timestamp`(或 `ts`)的 `I64` 列时,`ts_col` 记录其位置,写入路径维护逐 +chunk 的 `min_ts`/`max_ts`。这是查询时 chunk 级时间剪枝的基础,并且与冷层的 page/段时间范围在 +**结构上完全一致**。 + +## 冷层(MEMC) + +### 目录与文件命名 + +冷段位于 `<data_dir>/<pid>/cold`——与热层环形文件同处一地、同样按进程隔离,因此冷数据绝不会跨进程 +混淆。每个段命名为 `<writer_id>-<seq>.memc`,其中 `writer_id` 是 `(pid, start_time)` 的哈希, +`seq` 是 `ColdStore` 打开时恢复的单调递增序号。 + +### 段格式 + +一个段是一系列 64 对齐的 block。所有完整性校验都使用 **xxh3-64 截断为 32 位**。 + +**段头部(64 字节):** `magic`(`"MEMC"`)、`version`(1)、BOM、`flags`(bit 0 = 已封存)、 +`writer_pid`、`writer_start`、`created_unix_ms`、`footer_off`(封存前为 0)、段级 +`ts_min`/`ts_max`、`page_count`、头部校验和。 + +**Block** 共享 64 字节头部: + +| magic | 含义 | +|---|---| +| `MCTB` | 表定义 block——声明一个 `table_id`、表名、列 dtype、时间戳列 | +| `MCPG` | page(数据)block——某个 `table_id` 的一列页 | +| `MCFT` | footer——封存时写入的 page 目录 | + +page/block 头部携带 `table_id`、`row_count`、`col_count`、`ts_min`/`ts_max`、`payload_len`、 +`payload_xxh`,以及对重启去重至关重要的 `source_gen` 与 `source_chunk`(该 page 从哪个热层 +chunk 的 generation 和索引抽取而来;`u32::MAX` = 不适用)。头部本身也带校验和(覆盖 +`source_chunk`)。 + +单个段可容纳**多张表**的 page,以 `table_id` 区分。这让文件/目录数量与表数量解耦:成百上千张表 +共享同一组段文件。 + +### 列编码 + +每列独立编码(`ColEncoding`): + +- **`Pco`**——数值列(`i32/i64/f32/f64/u32/u64`),用 Pco(level 8)压缩。单调时间戳列压缩比 > 4×; +- **`RawFixed`**——`u8`(Pco 对字节列无收益); +- **`RawVarLen`**——`Str`/`Bytes`,以连续的 `[u32 len][bytes]` 条目存储(Pco 不支持字符串)。 + +### 崩溃恢复 + +- **已封存**段通过 footer 的 page 目录读取——O(1) 定位每个 page; +- **未封存或撕裂**的段通过**前向扫描**恢复:从头遍历 block,校验每个 block 的头部和 payload + 校验和,在第一个坏 block 处停止并丢弃撕裂的尾部。表定义 block 总会被扫描(开销小,且位于 page 之前)。 + +不存在任何试图修复半行记录的启发式逻辑。 + +!!! 
warning "持久性" + page 不会逐个 `fsync`(仅在封存时 `sync_data`)。`SIGKILL` 可能丢失当前打开段尚未刷盘的尾部 + page。对观测数据可接受,但这是一个明确的取舍。 + +## Compactor(Roller) + +`Compactor` 将新封存的热层 chunk 徕出(drain)到冷段。 + +- **徕出语义。** 只徕出 `Sealed` 状态的 chunk(绝不动正在写入的 chunk)。行被转置为列;徕出前后 + 复核该 chunk 的 `generation`——若环形已回收它,丢弃该 page 并在下一轮重试。徕出是**幂等**的: + 逐 chunk 的 `drained_gen` 高水位跳过已压缩的 chunk generation。 +- **滚动。** 当打开的段达到 `target_segment_bytes`(默认 64 MiB——主要的碎片化调节旋钮)、超过 + `max_segment_age`(默认 300 s,让低速率表也能及时可查),或显式 flush 时,封存当前段并新开一个。 +- **淘汰。** `enforce` 在超出字节预算(`max_total_bytes`)或 TTL 时删除最旧的段,并始终保护最新段。 + +### 跨重启的精确一次 + +`drained_gen` 在内存中,因此朴素重启面对持久冷目录时会重新压缩仍驻留在热层环形中的 chunk,产生重复 +行。`prime_from_cold()` 在启动时重建高水位:扫描已有冷段,按 `(表, source_chunk)` 取 `source_gen` +的最大值,在首次见到某表时合并进 `drained_gen`。结果即使跨重启也保证**精确一次**。 + +## 运行时 Owner + +`ColdCompactor` 是进程级全局单例(仿照 task-stats worker),为 compactor 提供唯一的生命周期归宿: + +- 后台线程每轮**重新发现** `<data_dir>/<pid>/` 下的环形文件(表会随时间出现),将每个徕出到共享的 + `ColdStore`,按时长滚动,并执行预算约束; +- 启动时调用 `prime_from_cold()`;停止时 flush(封存打开的段)。 + +它**默认关闭**(opt-in),以避免在每个 fork 出来的 worker 中都启动一个压缩线程。配置通过 +`MemTableExtension` 选项面或环境变量下发;server 在引擎初始化时调用 +`start_cold_compaction_from_env()`。 + +## SQL 集成 + +### Catalog 发现 + +`<data_dir>/<pid>/` 下的 mmap 文件被暴露为 DataFusion 表,文件名映射到 `(schema, table)`: + +- 首个 `.` 分隔 schema 与 table——`acme.actors` → schema `acme`、table `actors`; +- 无 `.` → schema `memtable`(例如 `metrics` → `memtable.metrics`)。 + +`DynamicMmapCatalog` 将这些动态 schema 与静态 `probe` catalog 合并。形如 +`SELECT … FROM probe.memtable.metrics` 的查询经 `MmapFileSchemaProvider::table()` 解析。 + +### Provider + +- **`RingMmapTable`**——热层环形文件之上的惰性 provider。在 `scan()` 时才物化 Arrow batch,并剪掉 + `[min_ts, max_ts]` 无法匹配查询时间谓词的 chunk。 +- **`HotColdTable`**——将一个热层环形与其冷段合并为同一张逻辑表(以磁盘 basename 为键,使表名跨 + schema 永不冲突)。这是 catalog 为环形表返回的 provider。 + +### 三级时间剪枝 + +一条时间谓词以递增粒度剪枝两层: + +1. **段级**——跳过头部 `ts_range` 无法匹配的已封存冷段(无需 mmap); +2. **page 级**——通过 page 目录跳过范围外的冷 page; +3. 
**chunk 级**——通过 `min_ts`/`max_ts` 跳过范围外的热 chunk。 + +热、冷 batch 作为两个分区交给扫描,因此投影、过滤、limit 下推对两层一致生效。 + +### 热∪冷的精确一次 + +被压缩的 chunk 在被覆盖前仍存活于热层环形中,朴素的合并会重复计数。`cold_scan` 返回冷 page 来源的 +`(source_chunk, source_gen)` 集合;热侧据此**排除**任何 `(索引, 当前 generation)` 落在该集合中的 +chunk。每行恰好计数一次,且去重对环形回收免疫(generation 复核会重新验证)。 + +## 配置参考 + +| `SET memtable.*` | 环境变量 | 含义 | 默认 | +|---|---|---|---| +| `cold_compaction` | `PROBING_COLD` | 运行后台 compactor(`on`/`off`) | 关闭 | +| `cold_max_total_mb` | `PROBING_COLD_MAX_TOTAL_MB` | 冷层字节预算(MiB) | 无限 | +| `cold_ttl_secs` | `PROBING_COLD_TTL_SECS` | 淘汰早于此时长的冷段 | 无 | +| — | `PROBING_COLD_TARGET_MB` | 段滚动大小(MiB) | 64 | +| — | `PROBING_COLD_POLL_MS` | 徕出轮询间隔 | 2000 | +| — | `PROBING_COLD_MAX_AGE_SECS` | 空闲打开段多久后封存 | 300 | + +## 保证与已知边界 + +**已保证:** + +- 读取无半行数据(generation 复核);冷层尾部撕裂可恢复; +- 跨层精确一次(查询去重)与跨重启精确一次(`prime_from_cold`); +- 热层内存有界;冷层字节/TTL 有界; +- fork 安全的锁。 + +**已知取舍(P2 待办):** + +- **冷目录按 PID 隔离。** 跨进程隔离干净,但默认不跨重启持久化(新 PID = 新冷目录)。在配置了持久 + 冷目录时,`prime_from_cold` 保证重启去重正确。 +- **无逐 page `fsync`**——`SIGKILL` 可能丢失打开段尚未刷盘的尾部。 +- **无段级 manifest**——多段查询需打开每个段头部做剪枝。 +- **Pco level 固定(8)**——未按列自适应。 +- **运行时为单进程**——跨进程 / 集群聚合尚未打通。 + +## 测试 + +数据层附带单元与端到端测试:热层环形的锁/回收/fork 测试(`probing-memtable`)、MEMC 的 +格式/恢复/compactor 测试(含带反例的重启去重),以及经运行时 owner 徕出、再通过真实 catalog 路径 +查询合并结果的 SQL 端到端测试(`probing-core::memtable_sql`)。 diff --git a/docs/src/design/index.md b/docs/src/design/index.md index 409aba92..54fd89bf 100644 --- a/docs/src/design/index.md +++ b/docs/src/design/index.md @@ -42,6 +42,7 @@ Probing's core mission is simple: **make distributed systems feel Pythonic again | Document | Description | |----------|-------------| | [Architecture](architecture.md) | System structure and components | +| [Data Layer](data-layer.md) | Hot/cold columnar store and SQL integration | | [Profiling](profiling.md) | Performance data collection | | [Debugging](debugging.md) | Debugging capabilities | | [Distributed](distributed.md) | Multi-node support | diff --git 
a/docs/src/design/index.zh.md b/docs/src/design/index.zh.md index 6ab7c14a..e80bac50 100644 --- a/docs/src/design/index.zh.md +++ b/docs/src/design/index.zh.md @@ -42,6 +42,7 @@ Probing 的核心使命很简单:**让分布式系统重新变得 Pythonic** | 文档 | 描述 | |------|------| | [系统架构](architecture.md) | 系统结构和组件 | +| [数据层](data-layer.md) | 冷热分层列式存储与 SQL 集成 | | [性能分析](profiling.md) | 性能数据收集 | | [调试](debugging.md) | 调试能力 | | [分布式](distributed.md) | 多节点支持 | diff --git a/probing/cli/Cargo.toml b/probing/cli/Cargo.toml index 352e9500..5377752a 100644 --- a/probing/cli/Cargo.toml +++ b/probing/cli/Cargo.toml @@ -20,6 +20,7 @@ path = "src/lib.rs" probing-proto = { path = "../proto", default-features = false, features = [] } probing-store = { path = "../crates/store", default-features = false, features = [ ] } +probing-memtable = { path = "../memtable" } anyhow = { workspace = true } log = { workspace = true } diff --git a/probing/cli/src/cli/bench/args.rs b/probing/cli/src/cli/bench/args.rs new file mode 100644 index 00000000..ad9b7cff --- /dev/null +++ b/probing/cli/src/cli/bench/args.rs @@ -0,0 +1,232 @@ +//! Argument structs for the `bench` subcommands (clap derive). + +use std::path::PathBuf; + +use clap::{Args, ValueEnum}; + +use super::workload::{SchemaKind, WorkloadSpec}; + +/// Storage backend under test. +#[derive(ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] +pub enum Backend { + /// Process-private heap buffer (no cross-handle sharing). + Heap, + /// POSIX shared memory (`shm_open`). + Shm, + /// mmap'd regular file at an explicit path. + File, + /// Discoverable mmap'd file under the data dir (SQL-visible). + Shared, +} + +/// Streaming row writer vs. value-vector `push_row`. +#[derive(ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] +pub enum WriterMode { + /// `push_row` — concurrency-safe auto-advance; allocates a value row. + Push, + /// `RowWriter` streaming fast path (single-threaded only). + Streaming, +} + +/// Schema / row-shape options shared by every subcommand. 
+#[derive(Args, Debug, Clone)] +pub struct SchemaArgs { + /// Built-in column layout. + #[arg(long, value_enum, default_value = "metrics")] + pub schema: SchemaKind, + + /// Number of f64 columns for `--schema wide`. + #[arg(long, default_value_t = 16)] + pub wide_cols: usize, + + /// Byte length of the `msg` payload for `--schema logs`. + #[arg(long, default_value_t = 32)] + pub str_len: usize, +} + +impl SchemaArgs { + pub fn spec(&self) -> WorkloadSpec { + WorkloadSpec { + kind: self.schema, + wide_cols: self.wide_cols.max(1), + str_len: self.str_len, + } + } +} + +/// Ring geometry shared by subcommands that build a hot table. +#[derive(Args, Debug, Clone)] +pub struct RingArgs { + /// Bytes per ring chunk. + #[arg(long, default_value_t = 256 * 1024)] + pub chunk_size: u32, + + /// Number of ring chunks (slots). + #[arg(long, default_value_t = 64)] + pub chunks: u32, +} + +#[derive(Args, Debug, Clone)] +pub struct WriteArgs { + #[command(flatten)] + pub schema: SchemaArgs, + #[command(flatten)] + pub ring: RingArgs, + + /// Storage backend. + #[arg(long, value_enum, default_value = "heap")] + pub backend: Backend, + + /// Total rows to write (across all threads). + #[arg(long, default_value_t = 1_000_000)] + pub rows: u64, + + /// Concurrent writer threads. >1 requires a shared backend + /// (shm/file/shared) to exercise the cross-handle write lock. + #[arg(long, default_value_t = 1)] + pub threads: usize, + + /// Writer API to exercise. + #[arg(long, value_enum, default_value = "push")] + pub writer: WriterMode, + + /// File path for `--backend file` (defaults to a temp file). + #[arg(long)] + pub path: Option, + + /// Record a per-row latency histogram (adds measurable overhead). + #[arg(long)] + pub latency: bool, + + /// Warm-up rows per thread, excluded from measurement. 
+ #[arg(long, default_value_t = 0)] + pub warmup: u64, +} + +#[derive(Args, Debug, Clone)] +pub struct ScanArgs { + #[command(flatten)] + pub schema: SchemaArgs, + #[command(flatten)] + pub ring: RingArgs, + + /// Rows to pre-populate before scanning. + #[arg(long, default_value_t = 1_000_000)] + pub rows: u64, + + /// Number of full scan passes to time. + #[arg(long, default_value_t = 5)] + pub iters: usize, +} + +#[derive(Args, Debug, Clone)] +pub struct CompactArgs { + #[command(flatten)] + pub schema: SchemaArgs, + #[command(flatten)] + pub ring: RingArgs, + + /// Rows to ingest and compact. + #[arg(long, default_value_t = 2_000_000)] + pub rows: u64, + + /// Segment roll size in MiB (`target_segment_bytes`). + #[arg(long, default_value_t = 8)] + pub target_mb: u64, + + /// Cold directory (defaults to a temp dir; removed on exit unless --keep). + #[arg(long)] + pub dir: Option, + + /// Keep the cold directory after the run. + #[arg(long)] + pub keep: bool, +} + +#[derive(Args, Debug, Clone)] +pub struct ColdscanArgs { + #[command(flatten)] + pub schema: SchemaArgs, + #[command(flatten)] + pub ring: RingArgs, + + /// Read an existing cold directory instead of building one. + #[arg(long)] + pub dir: Option, + + /// Rows to ingest when building a cold store (ignored with --dir). + #[arg(long, default_value_t = 2_000_000)] + pub rows: u64, + + /// Segment roll size in MiB when building (ignored with --dir). + #[arg(long, default_value_t = 8)] + pub target_mb: u64, + + /// Number of full read passes to time. + #[arg(long, default_value_t = 3)] + pub iters: usize, +} + +#[derive(Args, Debug, Clone)] +pub struct MixedArgs { + #[command(flatten)] + pub schema: SchemaArgs, + #[command(flatten)] + pub ring: RingArgs, + + /// Shared backend for the pipeline. + #[arg(long, value_enum, default_value = "shared")] + pub backend: Backend, + + /// Concurrent writer threads. + #[arg(long, default_value_t = 2)] + pub writers: usize, + + /// Concurrent reader (scan) threads. 
+ #[arg(long, default_value_t = 1)] + pub readers: usize, + + /// Run duration in seconds. + #[arg(long, default_value_t = 10)] + pub duration: u64, + + /// Disable the background compactor (hot-only pipeline). + #[arg(long)] + pub no_compact: bool, + + /// Segment roll size in MiB. + #[arg(long, default_value_t = 8)] + pub target_mb: u64, + + /// Cold-store byte budget in MiB (eviction trigger). + #[arg(long)] + pub max_total_mb: Option, + + /// Cold-store TTL in seconds. + #[arg(long)] + pub ttl_secs: Option, +} + +#[derive(Args, Debug, Clone)] +pub struct MpArgs { + #[command(flatten)] + pub schema: SchemaArgs, + #[command(flatten)] + pub ring: RingArgs, + + /// Shared backend (must be cross-process: shm/file/shared). + #[arg(long, value_enum, default_value = "shared")] + pub backend: Backend, + + /// Number of writer processes. + #[arg(long, default_value_t = 1)] + pub writers: usize, + + /// Number of reader processes. + #[arg(long, default_value_t = 2)] + pub readers: usize, + + /// Measurement window in seconds (the soak is time-driven, not row-driven). + #[arg(long, default_value_t = 10)] + pub duration: u64, +} diff --git a/probing/cli/src/cli/bench/metrics.rs b/probing/cli/src/cli/bench/metrics.rs new file mode 100644 index 00000000..59d0d11b --- /dev/null +++ b/probing/cli/src/cli/bench/metrics.rs @@ -0,0 +1,311 @@ +//! Measurement primitives: a bounded latency reservoir and a small +//! report builder that renders either as an aligned table or as JSON. + +use std::time::Duration; + +/// Reservoir-sampled latency recorder (nanoseconds). +/// +/// Per-operation timing on a hot write path is itself measurable overhead, +/// so latency capture is opt-in. When enabled we keep an unbiased uniform +/// sample of at most `cap` observations (reservoir sampling) plus exact +/// `min`/`max`/`sum`/`count`, which is enough for stable tail-quantile +/// estimates without unbounded memory. 
pub struct Latency {
    /// Reservoir of at most `cap` sampled observations (nanoseconds).
    samples: Vec<u64>,
    /// Maximum reservoir size; clamped to at least 1 by `new`.
    cap: usize,
    /// Exact number of observations recorded or merged in.
    seen: u64,
    /// Exact minimum; stays `u64::MAX` until the first observation.
    min: u64,
    /// Exact maximum; stays 0 until the first observation.
    max: u64,
    /// Exact sum of every observation, widened to `u128`.
    sum: u128,
    /// xorshift64* generator state used to pick reservoir slots.
    rng: u64,
}

impl Latency {
    /// Creates an empty recorder that keeps at most `cap` samples.
    ///
    /// A `cap` of 0 is treated as 1.
    pub fn new(cap: usize) -> Self {
        Self {
            // Bound the up-front allocation; the Vec still grows to `cap` on demand.
            samples: Vec::with_capacity(cap.min(1 << 16)),
            cap: cap.max(1),
            seen: 0,
            min: u64::MAX,
            max: 0,
            sum: 0,
            rng: 0x9E37_79B9_7F4A_7C15,
        }
    }

    /// Advances the internal generator and returns the next pseudo-random word.
    #[inline]
    fn next_rng(&mut self) -> u64 {
        // xorshift64*
        let mut x = self.rng;
        x ^= x >> 12;
        x ^= x << 25;
        x ^= x >> 27;
        self.rng = x;
        x.wrapping_mul(0x2545_F491_4F6C_DD1D)
    }

    /// Records one observation of `ns` nanoseconds.
    ///
    /// The exact aggregates are always updated. The reservoir takes the
    /// value unconditionally while it has room; afterwards the value
    /// replaces a random slot with probability `cap / seen`.
    #[inline]
    pub fn record(&mut self, ns: u64) {
        self.seen += 1;
        self.min = self.min.min(ns);
        self.max = self.max.max(ns);
        self.sum += ns as u128;
        if self.samples.len() < self.cap {
            self.samples.push(ns);
        } else {
            let j = (self.next_rng() % self.seen) as usize;
            if j < self.cap {
                self.samples[j] = ns;
            }
        }
    }

    /// Folds `other` into `self`.
    ///
    /// `other`'s reservoir samples are replayed through [`Latency::record`]
    /// so they can enter this reservoir. The exact aggregates (`count`,
    /// `mean`, `min`, `max`) are then taken from `other`'s exact tallies,
    /// not from its sample, so they stay correct even when `other` saw
    /// more observations than its reservoir could hold.
    pub fn merge(&mut self, other: &Latency) {
        let seen_before = self.seen;
        let sum_before = self.sum;
        for &s in &other.samples {
            self.record(s);
        }
        // record() above counted only the sampled values; replace the
        // running tallies with the exact totals from both recorders.
        self.seen = seen_before + other.seen;
        self.sum = sum_before + other.sum;
        if other.seen > 0 {
            self.min = self.min.min(other.min);
            self.max = self.max.max(other.max);
        }
    }

    /// Exact number of observations recorded (including merged ones).
    pub fn count(&self) -> u64 {
        self.seen
    }

    /// Exact arithmetic mean in nanoseconds, or `0.0` when empty.
    pub fn mean_ns(&self) -> f64 {
        if self.seen == 0 {
            0.0
        } else {
            self.sum as f64 / self.seen as f64
        }
    }

    /// Exact minimum in nanoseconds, or `0` when empty.
    pub fn min_ns(&self) -> u64 {
        if self.seen == 0 {
            0
        } else {
            self.min
        }
    }

    /// Exact maximum in nanoseconds (`0` when empty).
    pub fn max_ns(&self) -> u64 {
        self.max
    }

    /// Estimated quantile (`q` in `[0,1]`) from the reservoir sample.
    ///
    /// `q` is clamped to `[0,1]`; returns `0` when nothing was recorded.
    pub fn quantile_ns(&self, q: f64) -> u64 {
        if self.samples.is_empty() {
            return 0;
        }
        let mut s = self.samples.clone();
        s.sort_unstable();
        let q = q.clamp(0.0, 1.0);
        // Nearest-rank index into the sorted sample.
        let idx = ((s.len() as f64 - 1.0) * q).round() as usize;
        s[idx]
    }
}

// ── Report ───────────────────────────────────────────────────────────

// One labelled measurement: a human string plus a machine-readable value.
+struct Entry { + key: String, + display: String, + json: serde_json::Value, +} + +/// Accumulates labelled results and renders them as an aligned table or +/// a JSON object. Construction order is preserved. +pub struct Report { + title: String, + entries: Vec, +} + +impl Report { + pub fn new(title: impl Into) -> Self { + Self { + title: title.into(), + entries: Vec::new(), + } + } + + fn push(&mut self, key: &str, display: String, json: serde_json::Value) -> &mut Self { + self.entries.push(Entry { + key: key.to_string(), + display, + json, + }); + self + } + + pub fn text(&mut self, key: &str, value: impl Into) -> &mut Self { + let v = value.into(); + let json = serde_json::Value::String(v.clone()); + self.push(key, v, json) + } + + pub fn count(&mut self, key: &str, n: u64) -> &mut Self { + self.push( + key, + group_thousands(n), + serde_json::Value::from(n), + ) + } + + pub fn float(&mut self, key: &str, v: f64, suffix: &str) -> &mut Self { + let disp = if suffix.is_empty() { + format!("{v:.3}") + } else { + format!("{v:.3} {suffix}") + }; + self.push(key, disp, json_f64(v)) + } + + pub fn ratio(&mut self, key: &str, v: f64) -> &mut Self { + self.push(key, format!("{v:.2}x"), json_f64(v)) + } + + pub fn bytes(&mut self, key: &str, n: u64) -> &mut Self { + self.push(key, human_bytes(n), serde_json::Value::from(n)) + } + + pub fn duration(&mut self, key: &str, d: Duration) -> &mut Self { + self.push( + key, + format!("{:.3} s", d.as_secs_f64()), + json_f64(d.as_secs_f64()), + ) + } + + /// Throughput in ops/second, displayed with an SI suffix. + pub fn rate(&mut self, key: &str, ops: u64, elapsed: Duration, unit: &str) -> &mut Self { + let per_sec = rate_per_sec(ops, elapsed); + self.push( + key, + format!("{} {unit}/s", si(per_sec)), + json_f64(per_sec), + ) + } + + /// Throughput in bytes/second, displayed as MiB/s. 
+ pub fn byte_rate(&mut self, key: &str, bytes: u64, elapsed: Duration) -> &mut Self { + let per_sec = rate_per_sec(bytes, elapsed); + self.push( + key, + format!("{:.2} MiB/s", per_sec / (1024.0 * 1024.0)), + json_f64(per_sec), + ) + } + + /// Append the standard quantile rows for a latency recorder. + pub fn latency(&mut self, prefix: &str, lat: &Latency) -> &mut Self { + if lat.count() == 0 { + return self; + } + self.float(&format!("{prefix} min"), lat.min_ns() as f64, "ns"); + self.float(&format!("{prefix} mean"), lat.mean_ns(), "ns"); + self.float(&format!("{prefix} p50"), lat.quantile_ns(0.50) as f64, "ns"); + self.float(&format!("{prefix} p99"), lat.quantile_ns(0.99) as f64, "ns"); + self.float(&format!("{prefix} p999"), lat.quantile_ns(0.999) as f64, "ns"); + self.float(&format!("{prefix} max"), lat.max_ns() as f64, "ns"); + self + } + + pub fn print_table(&self) { + let width = self + .entries + .iter() + .map(|e| e.key.len()) + .max() + .unwrap_or(0); + println!("\n {}", self.title); + println!(" {}", "─".repeat(self.title.len().max(20))); + for e in &self.entries { + println!(" {: serde_json::Value { + let mut map = serde_json::Map::new(); + for e in &self.entries { + map.insert(e.key.clone(), e.json.clone()); + } + serde_json::json!({ "benchmark": self.title, "metrics": map }) + } + + pub fn emit(&self, json: bool) { + if json { + println!( + "{}", + serde_json::to_string_pretty(&self.to_json()) + .unwrap_or_else(|_| "{}".to_string()) + ); + } else { + self.print_table(); + } + } +} + +// ── formatting helpers ───────────────────────────────────────────────── + +fn json_f64(v: f64) -> serde_json::Value { + serde_json::Number::from_f64(v) + .map(serde_json::Value::Number) + .unwrap_or(serde_json::Value::Null) +} + +pub fn rate_per_sec(n: u64, elapsed: Duration) -> f64 { + let s = elapsed.as_secs_f64(); + if s <= 0.0 { + 0.0 + } else { + n as f64 / s + } +} + +/// SI-suffixed magnitude (K/M/G) for human-readable rates. 
+pub fn si(v: f64) -> String { + let a = v.abs(); + if a >= 1e9 { + format!("{:.2}G", v / 1e9) + } else if a >= 1e6 { + format!("{:.2}M", v / 1e6) + } else if a >= 1e3 { + format!("{:.2}K", v / 1e3) + } else { + format!("{v:.0}") + } +} + +pub fn human_bytes(n: u64) -> String { + const UNITS: [&str; 5] = ["B", "KiB", "MiB", "GiB", "TiB"]; + let mut v = n as f64; + let mut i = 0; + while v >= 1024.0 && i < UNITS.len() - 1 { + v /= 1024.0; + i += 1; + } + if i == 0 { + format!("{n} B") + } else { + format!("{v:.2} {}", UNITS[i]) + } +} + +fn group_thousands(n: u64) -> String { + let s = n.to_string(); + let len = s.len(); + let mut out = String::with_capacity(len + len / 3); + for (i, c) in s.chars().enumerate() { + if i > 0 && (len - i).is_multiple_of(3) { + out.push(','); + } + out.push(c); + } + out +} diff --git a/probing/cli/src/cli/bench/mod.rs b/probing/cli/src/cli/bench/mod.rs new file mode 100644 index 00000000..263de0bf --- /dev/null +++ b/probing/cli/src/cli/bench/mod.rs @@ -0,0 +1,61 @@ +//! Hidden `bench` command: a load generator and stress harness for the +//! probing data layer (hot MEMT ring + cold MEMC segments). +//! +//! This is an internal/diagnostic command (hidden from `--help`). Run +//! `probing bench --help` for per-workload options. + +pub mod args; +pub mod metrics; +pub mod runners; +pub mod workload; + +use anyhow::Result; +use clap::{Args, Subcommand}; + +use args::{ColdscanArgs, CompactArgs, MixedArgs, MpArgs, ScanArgs, WriteArgs}; + +/// Stress and benchmark the in-process data layer. +#[derive(Args, Debug)] +pub struct BenchCommand { + /// Emit machine-readable JSON instead of a formatted table. + #[arg(long, global = true)] + pub json: bool, + + /// PRNG seed for reproducible synthetic data. 
+ #[arg(long, global = true, default_value_t = 0x00C0_FFEE)] + pub seed: u64, + + #[command(subcommand)] + pub command: BenchSub, +} + +#[derive(Subcommand, Debug)] +pub enum BenchSub { + /// Write throughput across storage backends and writer counts. + Write(WriteArgs), + /// Sequential scan throughput over a freshly populated hot ring. + Scan(ScanArgs), + /// Cold-tier compaction throughput and hot→cold compression ratio. + Compact(CompactArgs), + /// Cold-segment read + decode throughput. + Coldscan(ColdscanArgs), + /// End-to-end pipeline: writers + background compactor + readers. + Mixed(MixedArgs), + /// Multi-process, time-driven soak: writer + reader processes share a table. + Mp(MpArgs), +} + +impl BenchCommand { + pub fn run(&self) -> Result<()> { + let seed = self.seed; + let json = self.json; + match &self.command { + BenchSub::Write(a) => runners::write::run(a, json, seed), + BenchSub::Scan(a) => runners::scan::run(a, json, seed), + BenchSub::Compact(a) => runners::compact::run(a, json, seed), + BenchSub::Coldscan(a) => runners::coldscan::run(a, json, seed), + BenchSub::Mixed(a) => runners::mixed::run(a, json, seed), + BenchSub::Mp(a) => runners::mp::run(a, json, seed), + } + } +} diff --git a/probing/cli/src/cli/bench/runners/coldscan.rs b/probing/cli/src/cli/bench/runners/coldscan.rs new file mode 100644 index 00000000..19ffbe44 --- /dev/null +++ b/probing/cli/src/cli/bench/runners/coldscan.rs @@ -0,0 +1,92 @@ +//! `coldscan` — read + decode throughput over MEMC cold segments. +//! +//! Opens every `.memc` segment in a cold directory and decodes every page +//! (Pco / raw), folding row counts into a sink. Reports both the logical +//! (decoded) throughput and the on-disk (compressed) read rate. 
+ +use std::time::Instant; + +use anyhow::{bail, Result}; +use probing_memtable::memc::{ColdStore, SegmentReader}; + +use crate::cli::bench::args::ColdscanArgs; +use crate::cli::bench::metrics::Report; + +pub fn run(args: &ColdscanArgs, json: bool, seed: u64) -> Result<()> { + let spec = args.schema.spec(); + let row_bytes = spec.approx_row_bytes() as u64; + + let (dir, built, temp) = match &args.dir { + Some(d) => (d.clone(), 0u64, false), + None => { + let dir = super::common::temp_dir("coldscan")?; + let drained = super::common::build_cold( + &dir, + &spec, + &args.ring, + args.rows, + args.target_mb, + seed, + )?; + (dir, drained, true) + } + }; + + let store = ColdStore::open(&dir)?; + let segments = store.segment_paths(); + if segments.is_empty() { + bail!("no .memc segments found under {}", dir.display()); + } + + let iters = args.iters.max(1); + let mut rows_per_pass = 0u64; + let mut disk_per_pass = 0u64; + let mut sink = 0u64; + + let start = Instant::now(); + for _ in 0..iters { + let mut rows = 0u64; + let mut disk = 0u64; + for path in &segments { + let reader = SegmentReader::open(path) + .map_err(|e| anyhow::anyhow!("open {}: {e}", path.display()))?; + for (i, page) in reader.pages().iter().enumerate() { + disk += page.block_len as u64; + let cols = reader + .read_page(i) + .map_err(|e| anyhow::anyhow!("decode page {i}: {e}"))?; + let n = cols.first().map(|c| c.len()).unwrap_or(0) as u64; + rows += n; + sink = sink.wrapping_add(n); + } + } + rows_per_pass = rows; + disk_per_pass = disk; + } + let elapsed = start.elapsed(); + std::hint::black_box(sink); + + let rows_total = rows_per_pass * iters as u64; + let disk_total = disk_per_pass * iters as u64; + let logical_total = rows_total * row_bytes; + + let mut report = Report::new(format!("coldscan · {:?}", args.schema.schema)); + report.text("schema", format!("{:?}", args.schema.schema)); + if built > 0 { + report.count("rows built", built); + } + report + .count("segments", segments.len() as u64) + 
.count("rows/pass", rows_per_pass) + .count("read passes", iters as u64) + .duration("elapsed", elapsed) + .rate("decode rate", rows_total, elapsed, "rows") + .byte_rate("logical rate", logical_total, elapsed) + .byte_rate("on-disk read", disk_total, elapsed); + report.emit(json); + + if temp { + let _ = std::fs::remove_dir_all(&dir); + } + Ok(()) +} diff --git a/probing/cli/src/cli/bench/runners/common.rs b/probing/cli/src/cli/bench/runners/common.rs new file mode 100644 index 00000000..cdb5f846 --- /dev/null +++ b/probing/cli/src/cli/bench/runners/common.rs @@ -0,0 +1,167 @@ +//! Shared helpers: unique names/paths, temp dirs, and hot-ring population. + +use std::io; +use std::path::PathBuf; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use anyhow::Result; +use probing_memtable::memc::{ColdStore, Compactor, CompactorConfig}; +use probing_memtable::{DType, MemTable}; + +use crate::cli::bench::args::RingArgs; +use crate::cli::bench::workload::{RowGen, WorkloadSpec}; + +/// How to attach to an already-created shared table (used by multi-handle +/// runners). Heap is excluded because it cannot be shared. +#[derive(Clone)] +pub enum Attach { + Shm(String), + File(PathBuf), +} + +impl Attach { + pub fn open(&self) -> io::Result { + match self { + Attach::Shm(name) => MemTable::open_shm(name), + Attach::File(path) => MemTable::open_file(path), + } + } + + /// Serialize for passing to a child process (`shm:` / `file:`). + pub fn encode(&self) -> String { + match self { + Attach::Shm(name) => format!("shm:{name}"), + Attach::File(path) => format!("file:{}", path.display()), + } + } + + pub fn parse(s: &str) -> Result { + if let Some(name) = s.strip_prefix("shm:") { + Ok(Attach::Shm(name.to_string())) + } else if let Some(path) = s.strip_prefix("file:") { + Ok(Attach::File(PathBuf::from(path))) + } else { + anyhow::bail!("invalid attach descriptor: {s}") + } + } +} + +/// Scan all resident rows of `table` once through the cursor, folding values +/// into a sink. 
Returns `(value_sink, row_count)`. +pub fn scan_all(table: &MemTable, dtypes: &[DType]) -> (u64, u64) { + let mut sink = 0u64; + let mut rows = 0u64; + for chunk in table.chunks_logical() { + for row in table.rows(chunk) { + let mut c = row.cursor(); + for dt in dtypes { + match dt { + DType::U8 => sink = sink.wrapping_add(c.next_u8() as u64), + DType::U32 => sink = sink.wrapping_add(c.next_u32() as u64), + DType::I32 => sink = sink.wrapping_add(c.next_i32() as u64), + DType::I64 => sink = sink.wrapping_add(c.next_i64() as u64), + DType::U64 => sink = sink.wrapping_add(c.next_u64()), + DType::F32 => sink = sink.wrapping_add(c.next_f32().to_bits() as u64), + DType::F64 => sink = sink.wrapping_add(c.next_f64().to_bits()), + DType::Str => sink = sink.wrapping_add(c.next_str().len() as u64), + DType::Bytes => sink = sink.wrapping_add(c.next_bytes().len() as u64), + } + } + rows += 1; + } + } + (sink, rows) +} + +/// A process-and-time unique token for naming temp files / shm objects. +pub fn unique_token() -> String { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + format!("{}-{}", std::process::id(), nanos % 1_000_000) +} + +/// A temp file path (not created). +pub fn temp_path(label: &str) -> PathBuf { + std::env::temp_dir().join(format!("probing-bench-{label}-{}.memt", unique_token())) +} + +/// A temp directory path (created). +pub fn temp_dir(label: &str) -> Result { + let dir = std::env::temp_dir().join(format!("probing-bench-{label}-{}", unique_token())); + std::fs::create_dir_all(&dir)?; + Ok(dir) +} + +/// POSIX shm name (short enough for macOS' 31-byte cap). +pub fn shm_name() -> String { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + format!("/pb{}", nanos % 1_000_000_000) +} + +/// Fill a hot table with `rows` rows via the concurrency-safe `push_row` +/// path, returning the number of rows written. 
Used by scan/compact/cold +/// builders where ingest speed is not the measured quantity. +pub fn populate(table: &mut MemTable, spec: &WorkloadSpec, rows: u64, seed: u64) -> u64 { + let mut gen = RowGen::new(spec.clone(), seed, 0); + let mut scratch: Vec = Vec::new(); + for _ in 0..rows { + let values = gen.values(&mut scratch); + table.push_row_unchecked(&values); + } + rows +} + +/// Ingest `rows` rows and compact them into MEMC segments under `dir`, +/// interleaving drains so the ring never overwrites undrained chunks. +/// Returns rows actually drained to cold. +pub fn build_cold( + dir: &std::path::Path, + spec: &WorkloadSpec, + ring: &RingArgs, + rows: u64, + target_mb: u64, + seed: u64, +) -> Result { + let row_bytes = spec.approx_row_bytes() as u64; + let mut table = MemTable::new(&spec.schema(), ring.chunk_size, ring.chunks); + let store = ColdStore::open(dir)?; + let config = CompactorConfig { + target_segment_bytes: target_mb * 1024 * 1024, + max_segment_age: Duration::from_secs(3600), + poll_interval: Duration::from_millis(1), + max_total_bytes: None, + ttl: None, + }; + let mut compactor = Compactor::new(store, config); + + let rows_per_chunk = ((ring.chunk_size as u64).saturating_sub(40)) / (row_bytes + 4).max(1); + let batch = (rows_per_chunk * (ring.chunks as u64 / 2).max(1)).max(1); + + let mut gen = RowGen::new(spec.clone(), seed, 0); + let mut scratch: Vec = Vec::new(); + let mut ingested = 0u64; + let mut drained = 0u64; + while ingested < rows { + let n = batch.min(rows - ingested); + for _ in 0..n { + let values = gen.values(&mut scratch); + table.push_row_unchecked(&values); + } + ingested += n; + drained += compactor.drain_view("bench", &table.view())? as u64; + } + loop { + let n = compactor.drain_view("bench", &table.view())? 
as u64; + drained += n; + if n == 0 { + break; + } + } + compactor.flush()?; + Ok(drained) +} diff --git a/probing/cli/src/cli/bench/runners/compact.rs b/probing/cli/src/cli/bench/runners/compact.rs new file mode 100644 index 00000000..56d5a6d7 --- /dev/null +++ b/probing/cli/src/cli/bench/runners/compact.rs @@ -0,0 +1,118 @@ +//! `compact` — cold-tier roller throughput and hot→cold compression ratio. +//! +//! Ingest is interleaved with drain passes so sealed chunks are compacted +//! before the ring can recycle them; we time the drain work separately from +//! the end-to-end wall clock. + +use std::time::{Duration, Instant}; + +use anyhow::Result; +use probing_memtable::memc::{ColdStore, Compactor, CompactorConfig}; +use probing_memtable::MemTable; + +use crate::cli::bench::args::CompactArgs; +use crate::cli::bench::metrics::Report; +use crate::cli::bench::workload::RowGen; + +pub fn run(args: &CompactArgs, json: bool, seed: u64) -> Result<()> { + let spec = args.schema.spec(); + let row_bytes = spec.approx_row_bytes() as u64; + + let dir = match &args.dir { + Some(d) => { + std::fs::create_dir_all(d)?; + d.clone() + } + None => super::common::temp_dir("compact")?, + }; + + let mut table = MemTable::new(&spec.schema(), args.ring.chunk_size, args.ring.chunks); + let store = ColdStore::open(&dir)?; + let config = CompactorConfig { + target_segment_bytes: args.target_mb * 1024 * 1024, + max_segment_age: Duration::from_secs(3600), + poll_interval: Duration::from_millis(1), + max_total_bytes: None, + ttl: None, + }; + let mut compactor = Compactor::new(store, config); + + // Drain every ~half-ring worth of rows so undrained sealed chunks never + // exceed ring capacity. 
+ let rows_per_chunk = + ((args.ring.chunk_size as u64).saturating_sub(40)) / (row_bytes + 4).max(1); + let batch = (rows_per_chunk * (args.ring.chunks as u64 / 2).max(1)).max(1); + + let mut gen = RowGen::new(spec.clone(), seed, 0); + let mut scratch: Vec = Vec::new(); + let name = "bench"; + + let mut ingested = 0u64; + let mut drained = 0u64; + let mut drain_time = Duration::ZERO; + let wall = Instant::now(); + + while ingested < args.rows { + let n = batch.min(args.rows - ingested); + for _ in 0..n { + let values = gen.values(&mut scratch); + table.push_row_unchecked(&values); + } + ingested += n; + + let t = Instant::now(); + drained += compactor.drain_view(name, &table.view())? as u64; + drain_time += t.elapsed(); + } + + // Final drains (sealed-but-not-yet-drained chunks) + seal the tail. + loop { + let t = Instant::now(); + let n = compactor.drain_view(name, &table.view())? as u64; + drain_time += t.elapsed(); + drained += n; + if n == 0 { + break; + } + } + let t = Instant::now(); + compactor.flush()?; + drain_time += t.elapsed(); + let wall = wall.elapsed(); + + let stats = compactor.stats(); + let logical = drained * row_bytes; + let ratio = if stats.total_bytes > 0 { + logical as f64 / stats.total_bytes as f64 + } else { + 0.0 + }; + + let mut report = Report::new(format!("compact · {:?}", args.schema.schema)); + report + .text("schema", format!("{:?}", args.schema.schema)) + .count("rows ingested", ingested) + .count("rows drained", drained) + .count("cold segments", stats.segment_count as u64) + .bytes("hot logical", logical) + .bytes("cold on-disk", stats.total_bytes) + .ratio("compression", ratio) + .duration("drain time", drain_time) + .duration("wall time", wall) + .rate("compact rate", drained, drain_time, "rows") + .byte_rate("cold write rate", stats.total_bytes, drain_time); + report.emit(json); + + if args.dir.is_none() && !args.keep { + let _ = std::fs::remove_dir_all(&dir); + } else { + report_dir(&dir, json); + } + Ok(()) +} + +fn 
report_dir(dir: &std::path::Path, json: bool) { + if !json { + println!(" cold dir: {}", dir.display()); + } +} diff --git a/probing/cli/src/cli/bench/runners/mixed.rs b/probing/cli/src/cli/bench/runners/mixed.rs new file mode 100644 index 00000000..4b92d666 --- /dev/null +++ b/probing/cli/src/cli/bench/runners/mixed.rs @@ -0,0 +1,184 @@ +//! `mixed` — end-to-end pipeline / soak: concurrent writers, optional +//! background compactor, and concurrent readers over one shared table for a +//! fixed duration. Reports per-role throughput plus the resulting cold-tier +//! footprint. + +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use anyhow::{bail, Result}; +use probing_memtable::memc::{ColdStore, Compactor, CompactorConfig}; +use probing_memtable::{DType, MemTable}; + +use crate::cli::bench::args::{Backend, MixedArgs}; +use crate::cli::bench::metrics::Report; +use super::common::{scan_all, shm_name, temp_dir, temp_path, unique_token, Attach}; +use crate::cli::bench::workload::RowGen; + +pub fn run(args: &MixedArgs, json: bool, seed: u64) -> Result<()> { + let spec = args.schema.spec(); + let row_bytes = spec.approx_row_bytes() as u64; + let writers = args.writers.max(1); + let readers = args.readers; + + // Create the shared backing; keep the creator alive for the whole run. 
+ let mut cleanup_file: Option = None; + let (attach, _creator) = match args.backend { + Backend::Heap => bail!("mixed requires a shared backend (shm/file/shared), not heap"), + Backend::Shm => { + let name = shm_name(); + let creator = + MemTable::shm(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + (Attach::Shm(name), creator) + } + Backend::File => { + let path = temp_path("mixed"); + cleanup_file = Some(path.clone()); + let creator = + MemTable::file_at(&path, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + (Attach::File(path), creator) + } + Backend::Shared => { + let name = format!("bench-{}", unique_token()); + let creator = + MemTable::shared(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + let path = creator.path().expect("shared path").to_path_buf(); + (Attach::File(path), creator) + } + }; + + let dtypes: Vec = (0..spec.schema().cols.len()) + .map(|i| spec.schema().cols[i].dtype) + .collect(); + + let stop = Arc::new(AtomicBool::new(false)); + let write_rows = Arc::new(AtomicU64::new(0)); + let read_rows = Arc::new(AtomicU64::new(0)); + let read_passes = Arc::new(AtomicU64::new(0)); + + // Background compactor (own read handle to the shared mapping). 
+ let cold_dir = temp_dir("mixed-cold")?; + let compactor_handle = if args.no_compact { + None + } else { + let store = ColdStore::open(&cold_dir)?; + let config = CompactorConfig { + target_segment_bytes: args.target_mb * 1024 * 1024, + max_segment_age: Duration::from_secs(args.duration.max(1)), + poll_interval: Duration::from_millis(50), + max_total_bytes: args.max_total_mb.map(|m| m * 1024 * 1024), + ttl: args.ttl_secs.map(Duration::from_secs), + }; + let handle = attach.open()?; + Some(Compactor::new(store, config).spawn(vec![("bench".to_string(), handle)])) + }; + + let mut threads = Vec::new(); + + for tid in 0..writers { + let attach = attach.clone(); + let spec = spec.clone(); + let stop = stop.clone(); + let write_rows = write_rows.clone(); + let seed = seed ^ (0x9E37_79B9_u64.wrapping_mul(tid as u64 + 1)); + threads.push(std::thread::spawn(move || -> Result<()> { + let mut table = attach.open()?; + let mut gen = RowGen::new(spec.clone(), seed, (tid as i64) * 1_000_000_000); + let mut scratch: Vec = Vec::new(); + let mut local = 0u64; + while !stop.load(Ordering::Relaxed) { + for _ in 0..256 { + let values = gen.values(&mut scratch); + table.push_row_unchecked(&values); + } + local += 256; + } + write_rows.fetch_add(local, Ordering::Relaxed); + Ok(()) + })); + } + + for _ in 0..readers { + let attach = attach.clone(); + let stop = stop.clone(); + let read_rows = read_rows.clone(); + let read_passes = read_passes.clone(); + let dtypes = dtypes.clone(); + threads.push(std::thread::spawn(move || -> Result<()> { + let table = attach.open()?; + let mut rows = 0u64; + let mut passes = 0u64; + let mut sink = 0u64; + while !stop.load(Ordering::Relaxed) { + let (s, n) = scan_all(&table, &dtypes); + sink = sink.wrapping_add(s); + rows += n; + passes += 1; + } + std::hint::black_box(sink); + read_rows.fetch_add(rows, Ordering::Relaxed); + read_passes.fetch_add(passes, Ordering::Relaxed); + Ok(()) + })); + } + + let start = Instant::now(); + 
std::thread::sleep(Duration::from_secs(args.duration.max(1))); + stop.store(true, Ordering::Relaxed); + for t in threads { + t.join().unwrap()?; + } + let elapsed = start.elapsed(); + + if let Some(h) = compactor_handle { + h.stop(); + } + + let total_writes = write_rows.load(Ordering::Relaxed); + let total_reads = read_rows.load(Ordering::Relaxed); + let passes = read_passes.load(Ordering::Relaxed); + + let cold = if args.no_compact { + None + } else { + ColdStore::open(&cold_dir).ok().map(|s| s.stats()) + }; + + let mut report = Report::new(format!("mixed · {:?} · {:?}", args.backend, args.schema.schema)); + report + .text("backend", format!("{:?}", args.backend)) + .text("schema", format!("{:?}", args.schema.schema)) + .count("writers", writers as u64) + .count("readers", readers as u64) + .text("compactor", if args.no_compact { "off" } else { "on" }) + .duration("duration", elapsed) + .count("rows written", total_writes) + .rate("write rate", total_writes, elapsed, "rows") + .byte_rate("write bw", total_writes * row_bytes, elapsed); + if readers > 0 { + report + .count("scan passes", passes) + .count("rows scanned", total_reads) + .rate("read rate", total_reads, elapsed, "rows"); + } + if let Some(c) = cold { + let logical = total_writes * row_bytes; + let ratio = if c.total_bytes > 0 { + logical as f64 / c.total_bytes as f64 + } else { + 0.0 + }; + report + .count("cold segments", c.segment_count as u64) + .bytes("cold on-disk", c.total_bytes) + .ratio("compression*", ratio); + } + report.emit(json); + + if let Some(p) = cleanup_file { + let _ = std::fs::remove_file(p); + } + let _ = std::fs::remove_dir_all(&cold_dir); + Ok(()) +} diff --git a/probing/cli/src/cli/bench/runners/mod.rs b/probing/cli/src/cli/bench/runners/mod.rs new file mode 100644 index 00000000..0f9b0a37 --- /dev/null +++ b/probing/cli/src/cli/bench/runners/mod.rs @@ -0,0 +1,10 @@ +//! Workload runners. Each `run` function executes one `bench` subcommand +//! 
and prints a [`Report`](super::metrics::Report). + +pub mod coldscan; +pub mod common; +pub mod compact; +pub mod mixed; +pub mod mp; +pub mod scan; +pub mod write; diff --git a/probing/cli/src/cli/bench/runners/mp.rs b/probing/cli/src/cli/bench/runners/mp.rs new file mode 100644 index 00000000..b5ba246e --- /dev/null +++ b/probing/cli/src/cli/bench/runners/mp.rs @@ -0,0 +1,307 @@ +//! `mp` — fully multi-process, time-driven soak. +//! +//! The orchestrator process creates a shared table, then re-execs itself to +//! spawn one (or more) writer processes and several reader processes, each +//! attaching to the same mapping by name/path. Every worker runs for a fixed +//! wall-clock window (synchronised by a shared start instant) and prints a +//! one-line JSON result; the orchestrator aggregates them. +//! +//! This is the scenario the data layer is built for: independent OS processes +//! contending on the in-buffer robust write lock (writers) while others read +//! lock-free (readers) — the cross-process path threads cannot exercise. +//! +//! Worker vs. orchestrator is selected by the `PROBING_BENCH_MP_ROLE` +//! environment variable, so the public surface stays a single `mp` command. 
+ +use std::process::{Child, Command, Stdio}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +use anyhow::{bail, Context, Result}; +use clap::ValueEnum; +use probing_memtable::{DType, MemTable}; + +use super::common::{scan_all, shm_name, temp_path, unique_token, Attach}; +use crate::cli::bench::args::{Backend, MpArgs}; +use crate::cli::bench::metrics::Report; +use crate::cli::bench::workload::RowGen; + +const ENV_ROLE: &str = "PROBING_BENCH_MP_ROLE"; +const ENV_ATTACH: &str = "PROBING_BENCH_MP_ATTACH"; +const ENV_START_MS: &str = "PROBING_BENCH_MP_START_MS"; + +pub fn run(args: &MpArgs, json: bool, seed: u64) -> Result<()> { + match std::env::var(ENV_ROLE) { + Ok(role) => run_worker(args, &role, seed), + Err(_) => orchestrate(args, json, seed), + } +} + +// ── orchestrator ─────────────────────────────────────────────────────── + +fn orchestrate(args: &MpArgs, json: bool, seed: u64) -> Result<()> { + let spec = args.schema.spec(); + let row_bytes = spec.approx_row_bytes() as u64; + let writers = args.writers.max(1); + let readers = args.readers; + if writers + readers == 0 { + bail!("need at least one worker (--writers/--readers)"); + } + + // Create the shared backing and keep it alive for the whole run. 
+ let (attach, _creator) = match args.backend { + Backend::Heap => bail!("mp requires a shared backend (shm/file/shared), not heap"), + Backend::Shm => { + let name = shm_name(); + let creator = + MemTable::shm(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + (Attach::Shm(name), creator) + } + Backend::File => { + let path = temp_path("mp"); + let creator = + MemTable::file_at(&path, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + (Attach::File(path), creator) + } + Backend::Shared => { + let name = format!("mp-{}", unique_token()); + let creator = + MemTable::shared(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + let path = creator.path().expect("shared path").to_path_buf(); + (Attach::File(path), creator) + } + }; + + let exe = std::env::current_exe().context("resolve current executable")?; + let passthrough = passthrough_args(args); + // Give every child time to launch and attach before the measured window. + let start_ms = now_ms() + 1_000; + + let mut children: Vec<(String, Child)> = Vec::with_capacity(writers + readers); + for i in 0..writers { + children.push(( + "writer".into(), + spawn_worker(&exe, &passthrough, "writer", &attach, start_ms, seed ^ (i as u64 + 1))?, + )); + } + for i in 0..readers { + children.push(( + "reader".into(), + spawn_worker(&exe, &passthrough, "reader", &attach, start_ms, seed ^ (0x100 + i as u64))?, + )); + } + + // Collect results (each worker self-terminates after the window). 
+ let mut write_rows = 0u64; + let mut read_rows = 0u64; + let mut read_passes = 0u64; + let mut worker_pids: Vec = Vec::new(); + let mut max_elapsed = 0.0f64; + let mut failures = 0usize; + + for (role, child) in children { + let out = child.wait_with_output().context("await worker")?; + if !out.status.success() { + failures += 1; + eprintln!("worker {role} exited with {:?}", out.status.code()); + continue; + } + let stdout = String::from_utf8_lossy(&out.stdout); + let line = stdout.lines().rev().find(|l| !l.trim().is_empty()); + let Some(line) = line else { + failures += 1; + continue; + }; + let v: serde_json::Value = + serde_json::from_str(line.trim()).with_context(|| format!("parse worker output: {line}"))?; + let rows = v.get("rows").and_then(|x| x.as_u64()).unwrap_or(0); + let passes = v.get("passes").and_then(|x| x.as_u64()).unwrap_or(0); + let elapsed = v.get("elapsed_s").and_then(|x| x.as_f64()).unwrap_or(0.0); + if let Some(pid) = v.get("pid").and_then(|x| x.as_u64()) { + worker_pids.push(pid); + } + max_elapsed = max_elapsed.max(elapsed); + match role.as_str() { + "writer" => write_rows += rows, + "reader" => { + read_rows += rows; + read_passes += passes; + } + _ => {} + } + } + + let window = Duration::from_secs_f64(max_elapsed.max(1e-9)); + let mut report = Report::new(format!("mp · {:?} · {:?}", args.backend, args.schema.schema)); + report + .text("backend", format!("{:?}", args.backend)) + .text("schema", format!("{:?}", args.schema.schema)) + .count("writer procs", writers as u64) + .count("reader procs", readers as u64) + .duration("window", window) + .count("rows written", write_rows) + .rate("write rate", write_rows, window, "rows") + .byte_rate("write bw", write_rows * row_bytes, window); + if readers > 0 { + report + .count("scan passes", read_passes) + .count("rows scanned", read_rows) + .rate("read rate", read_rows, window, "rows") + .byte_rate("read bw", read_rows * row_bytes, window); + } + if failures > 0 { + report.count("failed 
workers", failures as u64); + } + report.emit(json); + + if let Attach::File(p) = &attach { + if matches!(args.backend, Backend::File) { + let _ = std::fs::remove_file(p); + } + } + if failures > 0 { + bail!("{failures} worker(s) failed"); + } + Ok(()) +} + +/// Flags that reproduce the table geometry in a child (role/attach/start go +/// through the environment). +fn passthrough_args(args: &MpArgs) -> Vec { + let kind = args + .schema + .schema + .to_possible_value() + .map(|p| p.get_name().to_string()) + .unwrap_or_else(|| "metrics".into()); + let backend = args + .backend + .to_possible_value() + .map(|p| p.get_name().to_string()) + .unwrap_or_else(|| "shared".into()); + vec![ + "bench".into(), + "mp".into(), + "--schema".into(), + kind, + "--wide-cols".into(), + args.schema.wide_cols.to_string(), + "--str-len".into(), + args.schema.str_len.to_string(), + "--chunk-size".into(), + args.ring.chunk_size.to_string(), + "--chunks".into(), + args.ring.chunks.to_string(), + "--backend".into(), + backend, + "--duration".into(), + args.duration.to_string(), + ] +} + +fn spawn_worker( + exe: &std::path::Path, + passthrough: &[String], + role: &str, + attach: &Attach, + start_ms: u128, + seed: u64, +) -> Result { + Command::new(exe) + .args(passthrough) + .args(["--seed", &seed.to_string()]) + .env(ENV_ROLE, role) + .env(ENV_ATTACH, attach.encode()) + .env(ENV_START_MS, start_ms.to_string()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .with_context(|| format!("spawn {role} worker")) +} + +// ── worker ─────────────────────────────────────────────────────────────── + +fn run_worker(args: &MpArgs, role: &str, seed: u64) -> Result<()> { + let attach = Attach::parse(&std::env::var(ENV_ATTACH).context("missing attach env")?)?; + let start_ms: u128 = std::env::var(ENV_START_MS) + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or_else(now_ms); + let duration = Duration::from_secs(args.duration.max(1)); + let spec = args.schema.spec(); + + // Attach to the 
shared table (retry briefly in case of a startup race). + let mut table = open_with_retry(&attach)?; + + spin_until(start_ms); + let t0 = Instant::now(); + let (rows, passes) = match role { + "writer" => { + let mut gen = RowGen::new(spec.clone(), seed, (std::process::id() as i64) << 20); + let mut scratch: Vec = Vec::new(); + let mut rows = 0u64; + while t0.elapsed() < duration { + for _ in 0..256 { + let values = gen.values(&mut scratch); + table.push_row_unchecked(&values); + } + rows += 256; + } + (rows, 0u64) + } + "reader" => { + let dtypes: Vec = (0..spec.schema().cols.len()) + .map(|i| spec.schema().cols[i].dtype) + .collect(); + let mut rows = 0u64; + let mut passes = 0u64; + let mut sink = 0u64; + while t0.elapsed() < duration { + let (s, n) = scan_all(&table, &dtypes); + sink = sink.wrapping_add(s); + rows += n; + passes += 1; + } + std::hint::black_box(sink); + (rows, passes) + } + other => bail!("unknown worker role: {other}"), + }; + let elapsed = t0.elapsed().as_secs_f64(); + + let out = serde_json::json!({ + "role": role, + "pid": std::process::id(), + "rows": rows, + "passes": passes, + "elapsed_s": elapsed, + }); + println!("{out}"); + Ok(()) +} + +fn open_with_retry(attach: &Attach) -> Result { + let deadline = Instant::now() + Duration::from_secs(5); + loop { + match attach.open() { + Ok(t) => return Ok(t), + Err(e) if Instant::now() < deadline => { + let _ = e; + std::thread::sleep(Duration::from_millis(10)); + } + Err(e) => return Err(e).context("attach to shared table"), + } + } +} + +fn now_ms() -> u128 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0) +} + +fn spin_until(start_ms: u128) { + while now_ms() < start_ms { + std::thread::sleep(Duration::from_millis(1)); + } +} diff --git a/probing/cli/src/cli/bench/runners/scan.rs b/probing/cli/src/cli/bench/runners/scan.rs new file mode 100644 index 00000000..01ebdc2c --- /dev/null +++ b/probing/cli/src/cli/bench/runners/scan.rs @@ -0,0 +1,51 @@ +//! 
`scan` — sequential read throughput over a populated hot ring. +//! +//! Reads every committed row in logical (oldest→newest) order through the +//! O(1)-per-column cursor, folding values into a sink so the work is not +//! optimised away. + +use std::time::Instant; + +use anyhow::Result; +use probing_memtable::{DType, MemTable}; + +use crate::cli::bench::args::ScanArgs; +use crate::cli::bench::metrics::Report; +use super::common::{populate, scan_all}; + +pub fn run(args: &ScanArgs, json: bool, seed: u64) -> Result<()> { + let spec = args.schema.spec(); + let mut table = MemTable::new(&spec.schema(), args.ring.chunk_size, args.ring.chunks); + populate(&mut table, &spec, args.rows, seed); + + let dtypes: Vec = (0..table.num_cols()) + .map(|i| table.col_dtype(i).expect("known dtype")) + .collect(); + + // Warm pass (also tells us how many rows survived the ring). + let resident = scan_all(&table, &dtypes); + + let iters = args.iters.max(1); + let start = Instant::now(); + let mut sink = 0u64; + for _ in 0..iters { + sink = sink.wrapping_add(scan_all(&table, &dtypes).0); + } + let elapsed = start.elapsed(); + std::hint::black_box(sink); + + let rows_total = resident.1 * iters as u64; + let bytes_total = rows_total * spec.approx_row_bytes() as u64; + + let mut report = Report::new(format!("scan · {:?}", args.schema.schema)); + report + .text("schema", format!("{:?}", args.schema.schema)) + .count("rows ingested", args.rows) + .count("rows resident", resident.1) + .count("scan passes", iters as u64) + .duration("elapsed", elapsed) + .rate("throughput", rows_total, elapsed, "rows") + .byte_rate("bandwidth", bytes_total, elapsed); + report.emit(json); + Ok(()) +} diff --git a/probing/cli/src/cli/bench/runners/write.rs b/probing/cli/src/cli/bench/runners/write.rs new file mode 100644 index 00000000..16fd4aa4 --- /dev/null +++ b/probing/cli/src/cli/bench/runners/write.rs @@ -0,0 +1,198 @@ +//! `write` — write throughput across backends, writer counts and APIs. +//! +//! 
With `--threads > 1` on a shared backend (`shm`/`file`/`shared`) every +//! thread opens its own handle to the same mapping, so the run genuinely +//! contends on the in-buffer robust write lock. The `heap` backend cannot be +//! shared, so multi-threaded heap runs use independent per-thread tables +//! (parallel throughput, no lock contention). + +use std::sync::Barrier; +use std::time::Instant; + +use anyhow::{bail, Result}; +use probing_memtable::MemTable; + +use super::common; +use crate::cli::bench::args::{Backend, RingArgs, WriteArgs, WriterMode}; +use crate::cli::bench::metrics::{Latency, Report}; +use crate::cli::bench::workload::{RowGen, WorkloadSpec}; + +/// How a worker thread obtains its table handle. +enum Source { + Heap, + Shm(String), + File(std::path::PathBuf), +} + +struct WorkerOut { + rows: u64, + bytes: u64, + latency: Option, +} + +pub fn run(args: &WriteArgs, json: bool, seed: u64) -> Result<()> { + let spec = args.schema.spec(); + let threads = args.threads.max(1); + + if args.writer == WriterMode::Streaming && threads > 1 { + bail!("--writer streaming requires --threads 1 (advance-on-overflow is not concurrency-safe)"); + } + if threads > 1 && args.backend == Backend::Heap { + eprintln!( + "note: heap backend cannot be shared; --threads {threads} uses independent \ + per-thread tables (no lock contention)" + ); + } + + // Set up the backing for shared backends; keep the creator handle alive + // for the whole run so attached worker handles stay valid. 
+ let mut cleanup_file: Option = None; + let (source, _creator) = match args.backend { + Backend::Heap => (Source::Heap, None), + Backend::Shm => { + let name = common::shm_name(); + let creator = MemTable::shm(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + (Source::Shm(name), Some(creator)) + } + Backend::File => { + let path = args + .path + .clone() + .unwrap_or_else(|| common::temp_path("write")); + if args.path.is_none() { + cleanup_file = Some(path.clone()); + } + let creator = + MemTable::file_at(&path, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + (Source::File(path), Some(creator)) + } + Backend::Shared => { + let name = format!("bench-{}", common::unique_token()); + let creator = + MemTable::shared(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?; + let path = creator + .path() + .expect("shared table has a path") + .to_path_buf(); + (Source::File(path), Some(creator)) + } + }; + + let per_thread = args.rows / threads as u64; + let remainder = args.rows % threads as u64; + let barrier = Barrier::new(threads + 1); + let lat_cap = if args.latency { 1 << 16 } else { 0 }; + + let (outs, elapsed) = std::thread::scope(|scope| { + let mut handles = Vec::with_capacity(threads); + for tid in 0..threads { + let rows = per_thread + if (tid as u64) < remainder { 1 } else { 0 }; + let spec = spec.clone(); + let source = &source; + let barrier = &barrier; + let ring = args.ring.clone(); + let writer = args.writer; + let warmup = args.warmup; + handles.push(scope.spawn(move || -> Result { + let mut table = open_handle(source, &spec, &ring)?; + let seed = seed ^ (0x9E37_79B9_u64.wrapping_mul(tid as u64 + 1)); + // Distinct time windows per writer. 
+ let start_ts = (tid as i64) * 1_000_000_000; + let mut gen = RowGen::new(spec.clone(), seed, start_ts); + + run_rows(&mut table, &mut gen, writer, warmup, &mut None); + + barrier.wait(); + let mut lat = (lat_cap > 0).then(|| Latency::new(lat_cap)); + let written = run_rows(&mut table, &mut gen, writer, rows, &mut lat); + Ok(WorkerOut { + rows: written, + bytes: written * spec.approx_row_bytes() as u64, + latency: lat, + }) + })); + } + + // Release the workers together, then time the full write window. + barrier.wait(); + let start = Instant::now(); + let outs: Vec> = handles.into_iter().map(|h| h.join().unwrap()).collect(); + (outs, start.elapsed()) + }); + + let mut total_rows = 0u64; + let mut total_bytes = 0u64; + let mut merged = Latency::new(lat_cap.max(1)); + for o in outs { + let o = o?; + total_rows += o.rows; + total_bytes += o.bytes; + if let Some(l) = o.latency { + merged.merge(&l); + } + } + + if let Some(p) = cleanup_file { + let _ = std::fs::remove_file(p); + } + + let mut report = Report::new(format!("write · {:?} · {:?}", args.backend, args.schema.schema)); + report + .text("backend", format!("{:?}", args.backend)) + .text("schema", format!("{:?}", args.schema.schema)) + .text("writer", format!("{:?}", args.writer)) + .count("threads", threads as u64) + .count("rows", total_rows) + .duration("elapsed", elapsed) + .rate("throughput", total_rows, elapsed, "rows") + .byte_rate("bandwidth", total_bytes, elapsed) + .rate("per-thread", total_rows / threads as u64, elapsed, "rows"); + if args.latency { + report.latency("latency", &merged); + } + report.emit(json); + Ok(()) +} + +fn open_handle(source: &Source, spec: &WorkloadSpec, ring: &RingArgs) -> Result { + Ok(match source { + Source::Heap => MemTable::new(&spec.schema(), ring.chunk_size, ring.chunks), + Source::Shm(name) => MemTable::open_shm(name)?, + Source::File(path) => MemTable::open_file(path)?, + }) +} + +/// Write `rows` rows, optionally recording per-row latency. Returns rows written. 
+fn run_rows( + table: &mut MemTable, + gen: &mut RowGen, + mode: WriterMode, + rows: u64, + lat: &mut Option, +) -> u64 { + let mut scratch: Vec = Vec::new(); + for _ in 0..rows { + let t = lat.as_ref().map(|_| Instant::now()); + match mode { + WriterMode::Push => { + let values = gen.values(&mut scratch); + table.push_row_unchecked(&values); + } + WriterMode::Streaming => { + let ok = { + let mut w = table.row_writer(); + gen.write_into(&mut w) + }; + if !ok { + table.advance_chunk(); + let mut w = table.row_writer(); + let _ = gen.write_into(&mut w); + } + } + } + if let (Some(l), Some(t)) = (lat.as_mut(), t) { + l.record(t.elapsed().as_nanos() as u64); + } + } + rows +} diff --git a/probing/cli/src/cli/bench/workload.rs b/probing/cli/src/cli/bench/workload.rs new file mode 100644 index 00000000..b073f9ea --- /dev/null +++ b/probing/cli/src/cli/bench/workload.rs @@ -0,0 +1,189 @@ +//! Synthetic schemas and deterministic row generation. +//! +//! Generators are driven by a seedable xorshift PRNG so a run is fully +//! reproducible given `--seed`. The timestamp column is named `timestamp` +//! (recognised by the memtable as the designated time column) and is kept +//! monotonically increasing, which is both realistic for observability data +//! and the case Pco compresses best. + +use std::str::FromStr; + +use probing_memtable::{DType, RowWriter, Schema, Value}; + +/// Built-in column layouts covering the main compression / width regimes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)] +pub enum SchemaKind { + /// `timestamp:i64, value:f64, tag:u32` — narrow numeric, the common + /// metrics shape; compresses very well in the cold tier. + Metrics, + /// `timestamp:i64` + N `f64` columns — wide numeric rows. + Wide, + /// `timestamp:i64, level:u32, msg:str` — variable-length string payload + /// (no Pco, exercises the raw var-len path). 
+ Logs, +} + +impl FromStr for SchemaKind { + type Err = String; + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "metrics" => Ok(Self::Metrics), + "wide" => Ok(Self::Wide), + "logs" => Ok(Self::Logs), + other => Err(format!( + "unknown schema '{other}' (expected metrics|wide|logs)" + )), + } + } +} + +/// Parameters that shape a generated workload. +#[derive(Debug, Clone)] +pub struct WorkloadSpec { + pub kind: SchemaKind, + /// Number of `f64` columns for [`SchemaKind::Wide`]. + pub wide_cols: usize, + /// Length in bytes of the `msg` payload for [`SchemaKind::Logs`]. + pub str_len: usize, +} + +impl WorkloadSpec { + pub fn schema(&self) -> Schema { + match self.kind { + SchemaKind::Metrics => Schema::new() + .col("timestamp", DType::I64) + .col("value", DType::F64) + .col("tag", DType::U32), + SchemaKind::Wide => { + let mut s = Schema::new().col("timestamp", DType::I64); + for i in 0..self.wide_cols { + s = s.col(&format!("f{i}"), DType::F64); + } + s + } + SchemaKind::Logs => Schema::new() + .col("timestamp", DType::I64) + .col("level", DType::U32) + .col("msg", DType::Str), + } + } + + /// Approximate encoded bytes of one row (excludes the 4-byte row-length + /// prefix); used to translate row counts into a logical byte rate. + pub fn approx_row_bytes(&self) -> usize { + match self.kind { + SchemaKind::Metrics => 8 + 8 + 4, + SchemaKind::Wide => 8 + self.wide_cols * 8, + SchemaKind::Logs => 8 + 4 + 4 + self.str_len, + } + } +} + +/// Deterministic per-thread row generator. +pub struct RowGen { + spec: WorkloadSpec, + rng: u64, + ts: i64, + msg: String, +} + +impl RowGen { + /// `seed` should differ per thread for independent streams; `start_ts` + /// offsets the monotonic timestamp so concurrent streams don't fully + /// overlap in time. 
+ pub fn new(spec: WorkloadSpec, seed: u64, start_ts: i64) -> Self { + let str_len = spec.str_len; + Self { + spec, + rng: seed | 1, + ts: start_ts, + msg: String::with_capacity(str_len), + } + } + + #[inline] + fn next(&mut self) -> u64 { + let mut x = self.rng; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + self.rng = x; + x.wrapping_mul(0x2545_F491_4F6C_DD1D) + } + + /// Write one row through the streaming [`RowWriter`] fast path. + /// + /// Returns the value of [`RowWriter::finish`] — `false` means the row + /// did not fit the current chunk (the caller should advance and retry). + #[inline] + pub fn write_into(&mut self, w: &mut RowWriter) -> bool { + // Timestamp advances by a small positive jitter (1..=4): monotone, + // realistic, Pco-friendly. + self.ts += 1 + (self.next() & 0x3) as i64; + let ts = self.ts; + match self.spec.kind { + SchemaKind::Metrics => { + let v = (self.next() % 1_000_000) as f64 * 0.001; + w.put_i64(ts) + .put_f64(v) + .put_u32((self.next() % 1024) as u32) + .finish() + } + SchemaKind::Wide => { + let mut wr = w.put_i64(ts); + for _ in 0..self.spec.wide_cols { + let v = (self.next() % 1_000_000) as f64 * 0.001; + wr = wr.put_f64(v); + } + wr.finish() + } + SchemaKind::Logs => { + self.fill_msg(); + w.put_i64(ts) + .put_u32((self.next() % 5) as u32) + .put_str(&self.msg) + .finish() + } + } + } + + fn fill_msg(&mut self) { + self.msg.clear(); + const ALPHABET: &[u8] = b"abcdefghijklmnopqrstuvwxyz0123456789 "; + for _ in 0..self.spec.str_len { + let c = ALPHABET[(self.next() as usize) % ALPHABET.len()]; + self.msg.push(c as char); + } + } + + /// Build a borrowed [`Value`] row for the `push_row` path. The returned + /// vector borrows `self.msg` for the logs schema, so it must be consumed + /// before the next call. 
+ pub fn values<'a>(&'a mut self, scratch: &'a mut Vec) -> Vec> { + self.ts += 1 + (self.next() & 0x3) as i64; + let ts = self.ts; + match self.spec.kind { + SchemaKind::Metrics => { + let v = (self.next() % 1_000_000) as f64 * 0.001; + vec![Value::I64(ts), Value::F64(v), Value::U32((self.next() % 1024) as u32)] + } + SchemaKind::Wide => { + scratch.clear(); + for _ in 0..self.spec.wide_cols { + scratch.push((self.next() % 1_000_000) as f64 * 0.001); + } + let mut row = Vec::with_capacity(1 + scratch.len()); + row.push(Value::I64(ts)); + for v in scratch.iter() { + row.push(Value::F64(*v)); + } + row + } + SchemaKind::Logs => { + let level = (self.next() % 5) as u32; + self.fill_msg(); + vec![Value::I64(ts), Value::U32(level), Value::Str(&self.msg)] + } + } + } +} diff --git a/probing/cli/src/cli/commands.rs b/probing/cli/src/cli/commands.rs index 520593f0..b28f26c9 100644 --- a/probing/cli/src/cli/commands.rs +++ b/probing/cli/src/cli/commands.rs @@ -170,4 +170,8 @@ pub enum Commands { /// Access various storage backends #[command(subcommand = false, hide = true)] Store(StoreCommand), + + /// Stress and benchmark the in-process data layer + #[command(hide = true)] + Bench(super::bench::BenchCommand), } diff --git a/probing/cli/src/cli/mod.rs b/probing/cli/src/cli/mod.rs index 26da682c..b900e672 100644 --- a/probing/cli/src/cli/mod.rs +++ b/probing/cli/src/cli/mod.rs @@ -2,6 +2,7 @@ use anyhow::Result; use clap::Parser; use probing_proto::prelude::Query; +pub mod bench; pub mod commands; pub mod ctrl; pub mod repl; @@ -75,6 +76,9 @@ impl Cli { Some(Commands::Store(cmd)) => { return cmd.run().await; } + Some(Commands::Bench(cmd)) => { + return cmd.run(); + } _ => {} } @@ -170,6 +174,7 @@ impl Cli { Commands::Launch { .. } | Commands::List { .. } | Commands::Store(..) + | Commands::Bench(..) | Commands::External(..) 
=> { unreachable!("These commands should be handled in run() method") } diff --git a/probing/core/Cargo.toml b/probing/core/Cargo.toml index 8acd1bc8..dcf48226 100644 --- a/probing/core/Cargo.toml +++ b/probing/core/Cargo.toml @@ -12,6 +12,7 @@ crate-type = ["rlib"] [dependencies] probing-proto = { path = "../proto" } probing-macros = { path = "../macros" } +probing-memtable = { path = "../memtable" } anyhow = { workspace = true } arrow = { workspace = true } @@ -24,10 +25,13 @@ serde_json = { workspace = true } thiserror = { workspace = true } async-trait = "0.1.83" -datafusion = { version = "47.0.0", default-features = false, features = [] } +datafusion = { workspace = true } futures = "0.3.31" sled = "0.34.7" bincode = "1.3.3" uuid = { version = "1.0", features = ["v4", "serde"] } url = "2.5" libc = "0.2" + +[dev-dependencies] +tempfile = "3.8" diff --git a/probing/core/src/core/memtable_sql.rs b/probing/core/src/core/memtable_sql.rs new file mode 100644 index 00000000..99d381ba --- /dev/null +++ b/probing/core/src/core/memtable_sql.rs @@ -0,0 +1,2048 @@ +//! Mmap memtable ↔ SQL catalog integration. +//! +//! Exposes mmap'd memtable files (MEMT rings / MEMH hash tables) under +//! `//` as DataFusion tables. Shared by the server and the +//! language extensions so that every data producer writes through +//! `probing-memtable` and every consumer queries through this module. +//! +//! ## File → SQL mapping (no hard-coded product prefix) +//! +//! - **First `.` splits schema vs table** — `acme.actors` → schema `acme`, table `actors`; +//! `foo.bar.baz` → schema `foo`, table `bar.baz` (on-disk name is the full filename). +//! - **No `.`** — exposed as `memtable.` (e.g. `metrics` → `memtable.metrics`). +//! +//! Schema head and table tail must be non-empty; only ASCII letters, digits, `_`, and +//! `.` inside the table tail are allowed (no `/`, `\\`). Leading-dot names are ignored. +//! +//! ## Read semantics (ring tables) +//! +//! 
- Files are **mmap'd read-only** (no full-file heap copy); only touched +//! pages are faulted in. +//! - Chunks are materialised in **logical (oldest → newest) write order** +//! via [`MemTableView::chunks_logical`], one Arrow `RecordBatch` per chunk. +//! - Each chunk's `generation` is re-checked after reading: a chunk recycled +//! by the writer mid-read is **discarded** instead of surfacing torn rows. +//! - When the table has a designated timestamp column, chunks whose +//! `[min_ts, max_ts]` range cannot satisfy the query's time predicates are +//! **pruned** before materialisation ([`RingMmapTable`]). + +use std::any::Any; +use std::collections::{BTreeSet, HashSet}; +use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread::JoinHandle; +use std::time::Duration; + +use async_trait::async_trait; +use once_cell::sync::Lazy; +use datafusion::arrow::array::{ + ArrayRef, BinaryArray, BinaryBuilder, Float32Array, Float32Builder, Float64Array, + Float64Builder, GenericStringBuilder, Int32Array, Int32Builder, Int64Array, Int64Builder, + RecordBatch, StringArray, UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, UInt8Array, + UInt8Builder, +}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::catalog::CatalogProvider; +use datafusion::catalog::SchemaProvider; +use datafusion::catalog::Session; +use datafusion::datasource::{TableProvider, TableType}; +use datafusion::error::DataFusionError; +use datafusion::error::Result as DfResult; +use datafusion::logical_expr::{Expr, Operator, TableProviderFilterPushDown}; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::scalar::ScalarValue; + +use probing_memtable::discover::{default_dir, MappedFile}; +use probing_memtable::memc::{ColdStats, ColdStore, ColumnData, Compactor, CompactorConfig, SegmentReader}; +use probing_memtable::{detect_table, DType, MemTableView, MemhView, TableKind, TypedValue}; + 
+use super::plugin_advanced::{scan_memory_partitions, supports_filters_pushdown_for_schema}; +use super::{ + EngineCall, EngineDatasource, EngineError, EngineExtension, EngineExtensionOption, Maybe, + Plugin, PluginAdvancedTable, PluginType, +}; +use probing_macros::EngineExtension as EngineExtensionDerive; + +/// SQL schema used for mmap files whose basename contains no `.`. +pub const DEFAULT_UNDOTTED_SCHEMA: &str = "memtable"; + +fn self_dir() -> std::path::PathBuf { + default_dir().join(std::process::id().to_string()) +} + +/// Cold-segment directory for this process: `//cold`. +/// +/// Co-located with (and scoped like) the hot ring files so cold data never +/// mixes across processes, and the compactor writer and this read path agree +/// on one location without extra configuration. +pub fn cold_dir() -> std::path::PathBuf { + self_dir().join("cold") +} + +#[inline] +fn valid_schema_head(s: &str) -> bool { + !s.is_empty() && s.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'_') +} + +#[inline] +fn valid_table_tail(s: &str) -> bool { + !s.is_empty() + && !s.contains('/') + && !s.contains('\\') + && s.bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.') +} + +/// Map basename `filename` → `(schema, table)` for routing; [`None`] if skipped. +pub fn classify_mmap_basename(filename: &str) -> Option<(String, String)> { + if filename.starts_with('.') { + return None; + } + if let Some((head, tail)) = filename.split_once('.') { + if valid_schema_head(head) && valid_table_tail(tail) { + return Some((head.to_string(), tail.to_string())); + } + return None; + } + if valid_schema_head(filename) { + Some((DEFAULT_UNDOTTED_SCHEMA.to_string(), filename.to_string())) + } else { + None + } +} + +/// On-disk filename for a `(schema, table)` pair. 
+pub fn mmap_filename_for(schema: &str, table: &str) -> String { + if schema == DEFAULT_UNDOTTED_SCHEMA { + table.to_string() + } else { + format!("{schema}.{table}") + } +} + +fn tables_in_schema(target_schema: &str) -> Vec { + let dir = self_dir(); + let Ok(entries) = std::fs::read_dir(&dir) else { + return vec![]; + }; + let mut out = Vec::new(); + for e in entries.flatten() { + if !e.path().is_file() { + continue; + } + let n = e.file_name().to_string_lossy().to_string(); + if let Some((sch, tbl)) = classify_mmap_basename(&n) { + if sch == target_schema { + out.push(tbl); + } + } + } + out.sort(); + out.dedup(); + out +} + +fn discover_all_schemas() -> BTreeSet { + let mut out = BTreeSet::new(); + let dir = self_dir(); + if let Ok(entries) = std::fs::read_dir(&dir) { + for e in entries.flatten() { + if !e.path().is_file() { + continue; + } + let n = e.file_name().to_string_lossy().to_string(); + if let Some((sch, _)) = classify_mmap_basename(&n) { + out.insert(sch); + } + } + } + out.insert(DEFAULT_UNDOTTED_SCHEMA.to_string()); + out +} + +/// Whether an mmap file backs `schema.table` right now (validates the table +/// name first so user-supplied SQL identifiers can never escape the data dir). +fn mmap_table_exists(schema: &str, table: &str) -> bool { + if !valid_table_tail(table) { + return false; + } + self_dir().join(mmap_filename_for(schema, table)).is_file() +} + +/// Mmap ring / MEMH → Arrow batches, then a [`PluginAdvancedTable`] so DataFusion can push +/// filters and limits into the scan path. 
+pub fn bytes_to_pushdown_table(data: &[u8], logical_name: &str) -> Arc { + match detect_table(data) { + Some(TableKind::Ring) => { + let view = match MemTableView::new(data) { + Ok(v) => v, + Err(_) => return Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)), + }; + let schema = view_to_arrow_schema(&view); + let batches = view_to_recordbatches(&view); + match PluginAdvancedTable::try_new(logical_name, schema, batches) { + Ok(t) => Arc::new(t), + Err(e) => { + log::error!("memtable PluginAdvancedTable (ring): {e}"); + Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)) + } + } + } + Some(TableKind::Hash) => { + let view = match MemhView::new(data) { + Ok(v) => v, + Err(_) => return Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)), + }; + let schema = memh_kv_schema(); + let batches = memh_view_to_recordbatch(&view); + if batches.is_empty() { + return Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)); + } + match PluginAdvancedTable::try_new(logical_name, schema, batches) { + Ok(t) => Arc::new(t), + Err(e) => { + log::error!("memtable PluginAdvancedTable (memh): {e}"); + Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)) + } + } + } + None => Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)), + } +} + +fn dtype_to_arrow(dt: DType) -> DataType { + match dt { + DType::U8 => DataType::UInt8, + DType::U32 => DataType::UInt32, + DType::I32 => DataType::Int32, + DType::I64 => DataType::Int64, + DType::F32 => DataType::Float32, + DType::F64 => DataType::Float64, + DType::U64 => DataType::UInt64, + DType::Str => DataType::Utf8, + DType::Bytes => DataType::Binary, + } +} + +/// Arrow schema mirroring a ring table's column layout. 
+pub fn view_to_arrow_schema(view: &MemTableView) -> SchemaRef { + let s = view.schema(); + let fields: Vec = s + .cols + .iter() + .map(|c| Field::new(&c.name, dtype_to_arrow(c.dtype), true)) + .collect(); + SchemaRef::new(Schema::new(fields)) +} + +enum ColBuilder { + U8(UInt8Builder), + U32(UInt32Builder), + I32(Int32Builder), + I64(Int64Builder), + F32(Float32Builder), + F64(Float64Builder), + U64(UInt64Builder), + Str(GenericStringBuilder), + Bytes(BinaryBuilder), +} + +fn make_builders(view: &MemTableView) -> Vec { + view.schema() + .cols + .iter() + .map(|c| match c.dtype { + DType::U8 => ColBuilder::U8(UInt8Builder::new()), + DType::U32 => ColBuilder::U32(UInt32Builder::new()), + DType::I32 => ColBuilder::I32(Int32Builder::new()), + DType::I64 => ColBuilder::I64(Int64Builder::new()), + DType::F32 => ColBuilder::F32(Float32Builder::new()), + DType::F64 => ColBuilder::F64(Float64Builder::new()), + DType::U64 => ColBuilder::U64(UInt64Builder::new()), + DType::Str => ColBuilder::Str(GenericStringBuilder::new()), + DType::Bytes => ColBuilder::Bytes(BinaryBuilder::new()), + }) + .collect() +} + +/// Materialise one chunk into a `RecordBatch`. +/// +/// Returns [`None`] when the chunk was recycled while being read (its +/// generation moved), or when reading panicked on a torn ref — both mean +/// the bytes can no longer be trusted, so the whole chunk is dropped +/// rather than surfacing corrupt rows to SQL. +fn chunk_to_recordbatch( + view: &MemTableView, + chunk: usize, + arrow_schema: &SchemaRef, +) -> Option { + let generation_before = view.chunk_generation(chunk); + + let arrays = std::panic::catch_unwind(AssertUnwindSafe(|| { + let mut builders = make_builders(view); + // RowIter itself stops yielding once it observes a generation change; + // rows read before that may still be torn, hence the re-check below. 
+ for row in view.rows(chunk) { + let mut cursor = row.cursor(); + for builder in builders.iter_mut() { + match builder { + ColBuilder::U8(b) => b.append_value(cursor.next_u8()), + ColBuilder::U32(b) => b.append_value(cursor.next_u32()), + ColBuilder::I32(b) => b.append_value(cursor.next_i32()), + ColBuilder::I64(b) => b.append_value(cursor.next_i64()), + ColBuilder::F32(b) => b.append_value(cursor.next_f32()), + ColBuilder::F64(b) => b.append_value(cursor.next_f64()), + ColBuilder::U64(b) => b.append_value(cursor.next_u64()), + ColBuilder::Str(b) => b.append_value(cursor.next_str()), + ColBuilder::Bytes(b) => b.append_value(cursor.next_bytes()), + } + } + } + builders + .into_iter() + .map(|b| -> ArrayRef { + match b { + ColBuilder::U8(mut b) => Arc::new(b.finish()), + ColBuilder::U32(mut b) => Arc::new(b.finish()), + ColBuilder::I32(mut b) => Arc::new(b.finish()), + ColBuilder::I64(mut b) => Arc::new(b.finish()), + ColBuilder::F32(mut b) => Arc::new(b.finish()), + ColBuilder::F64(mut b) => Arc::new(b.finish()), + ColBuilder::U64(mut b) => Arc::new(b.finish()), + ColBuilder::Str(mut b) => Arc::new(b.finish()), + ColBuilder::Bytes(mut b) => Arc::new(b.finish()), + } + }) + .collect::>() + })) + .map_err(|_| { + log::debug!("memtable chunk {chunk} recycled mid-read; dropping"); + }) + .ok()?; + + if view.chunk_generation(chunk) != generation_before { + log::debug!("memtable chunk {chunk} recycled during materialisation; dropping"); + return None; + } + + match RecordBatch::try_new(arrow_schema.clone(), arrays) { + Ok(batch) if batch.num_rows() > 0 => Some(batch), + Ok(_) => None, + Err(e) => { + log::error!("memtable chunk {chunk} → RecordBatch failed: {e}"); + None + } + } +} + +/// Materialise a ring view as record batches in **logical (oldest → newest) +/// order**, one batch per surviving chunk. +/// +/// Always returns at least one (possibly empty) batch so the table keeps its +/// real schema even when no rows are visible. 
+pub fn view_to_recordbatches(view: &MemTableView) -> Vec { + let arrow_schema = view_to_arrow_schema(view); + let mut batches: Vec = view + .chunks_logical() + .into_iter() + .filter_map(|chunk| chunk_to_recordbatch(view, chunk, &arrow_schema)) + .collect(); + if batches.is_empty() { + batches.push(RecordBatch::new_empty(arrow_schema)); + } + batches +} + +// ── Time-range pruning (chunk level) ────────────────────────────────── + +/// Inclusive time window extracted from query predicates on the designated +/// timestamp column. `None` on either side = unbounded. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct TsBounds { + pub lower: Option, + pub upper: Option, +} + +impl TsBounds { + fn is_unbounded(&self) -> bool { + self.lower.is_none() && self.upper.is_none() + } + + fn tighten_lower(&mut self, v: i64) { + self.lower = Some(self.lower.map_or(v, |cur| cur.max(v))); + } + + fn tighten_upper(&mut self, v: i64) { + self.upper = Some(self.upper.map_or(v, |cur| cur.min(v))); + } +} + +/// Integer value of a literal usable as a timestamp bound. +fn literal_as_i64(expr: &Expr) -> Option { + let Expr::Literal(scalar, _) = expr else { + return None; + }; + match scalar { + ScalarValue::Int64(Some(v)) => Some(*v), + ScalarValue::Int32(Some(v)) => Some(*v as i64), + ScalarValue::UInt32(Some(v)) => Some(*v as i64), + ScalarValue::UInt64(Some(v)) => i64::try_from(*v).ok(), + ScalarValue::TimestampMicrosecond(Some(v), _) => Some(*v), + _ => None, + } +} + +fn is_ts_column(expr: &Expr, ts_name: &str) -> bool { + matches!(expr, Expr::Column(c) if c.name == ts_name) +} + +/// Fold one predicate into `bounds`. Conservative: comparisons are widened +/// to inclusive bounds (`>` treated as `>=`), unrecognised shapes are +/// ignored — pruning may keep too much, never too little. 
+fn fold_ts_predicate(expr: &Expr, ts_name: &str, bounds: &mut TsBounds) { + match expr { + Expr::BinaryExpr(be) if be.op == Operator::And => { + fold_ts_predicate(&be.left, ts_name, bounds); + fold_ts_predicate(&be.right, ts_name, bounds); + } + Expr::BinaryExpr(be) => { + let (op, lit) = if is_ts_column(&be.left, ts_name) { + let Some(v) = literal_as_i64(&be.right) else { + return; + }; + (be.op, v) + } else if is_ts_column(&be.right, ts_name) { + // `lit op ts` — mirror the comparison. + let Some(v) = literal_as_i64(&be.left) else { + return; + }; + let mirrored = match be.op { + Operator::Gt => Operator::Lt, + Operator::GtEq => Operator::LtEq, + Operator::Lt => Operator::Gt, + Operator::LtEq => Operator::GtEq, + other => other, + }; + (mirrored, v) + } else { + return; + }; + match op { + Operator::Gt | Operator::GtEq => bounds.tighten_lower(lit), + Operator::Lt | Operator::LtEq => bounds.tighten_upper(lit), + Operator::Eq => { + bounds.tighten_lower(lit); + bounds.tighten_upper(lit); + } + _ => {} + } + } + Expr::Between(b) if !b.negated && is_ts_column(&b.expr, ts_name) => { + if let Some(lo) = literal_as_i64(&b.low) { + bounds.tighten_lower(lo); + } + if let Some(hi) = literal_as_i64(&b.high) { + bounds.tighten_upper(hi); + } + } + _ => {} + } +} + +/// Extract the time window implied by `filters` (each entry is ANDed by +/// DataFusion) on the column named `ts_name`. +pub fn ts_bounds_from_filters(filters: &[Expr], ts_name: &str) -> TsBounds { + let mut bounds = TsBounds::default(); + for f in filters { + fold_ts_predicate(f, ts_name, &mut bounds); + } + bounds +} + +/// `false` only when the chunk's committed `[min_ts, max_ts]` provably lies +/// outside `bounds`. Races with the writer resolve to `true` (keep the +/// chunk) — materialisation re-validates the generation anyway. 
+fn chunk_may_match(view: &MemTableView, chunk: usize, bounds: &TsBounds) -> bool { + if bounds.is_unbounded() { + return true; + } + let generation_before = view.chunk_generation(chunk); + let Some((min_ts, max_ts)) = view.chunk_ts_range(chunk) else { + return true; + }; + if view.chunk_generation(chunk) != generation_before { + return true; // recycled mid-read: range untrustworthy, do not prune + } + !(bounds.lower.is_some_and(|lo| max_ts < lo) || bounds.upper.is_some_and(|hi| min_ts > hi)) +} + +/// Like [`view_to_recordbatches`], skipping chunks outside `bounds`. +pub fn view_to_recordbatches_pruned(view: &MemTableView, bounds: &TsBounds) -> Vec { + let arrow_schema = view_to_arrow_schema(view); + let mut batches: Vec = view + .chunks_logical() + .into_iter() + .filter(|&chunk| chunk_may_match(view, chunk, bounds)) + .filter_map(|chunk| chunk_to_recordbatch(view, chunk, &arrow_schema)) + .collect(); + if batches.is_empty() { + batches.push(RecordBatch::new_empty(arrow_schema)); + } + batches +} + +/// Like [`view_to_recordbatches_pruned`], additionally skipping any chunk +/// whose `(index, current generation)` is in `excluded` — used to drop hot +/// chunks already materialised from the cold tier, so a hot∪cold union counts +/// each row exactly once even while a compacted chunk still lives in the ring. 
+fn view_to_recordbatches_pruned_excluding(
+    view: &MemTableView,
+    bounds: &TsBounds,
+    excluded: &HashSet<(usize, u64)>,
+) -> Vec<RecordBatch> {
+    let arrow_schema = view_to_arrow_schema(view);
+    let mut batches: Vec<RecordBatch> = view
+        .chunks_logical()
+        .into_iter()
+        // Time pruning first: it needs only the chunk's ts range.
+        .filter(|&chunk| chunk_may_match(view, chunk, bounds))
+        // The generation is read at filter time, so a chunk recycled since
+        // the exclusion set was built no longer matches and is kept.
+        .filter(|&chunk| !excluded.contains(&(chunk, view.chunk_generation(chunk))))
+        .filter_map(|chunk| chunk_to_recordbatch(view, chunk, &arrow_schema))
+        .collect();
+    // Never return an empty list: hand back one zero-row batch with the schema.
+    if batches.is_empty() {
+        batches.push(RecordBatch::new_empty(arrow_schema));
+    }
+    batches
+}
+
+// ── Lazy ring TableProvider (prunes + materialises at scan time) ──────
+
+/// [`TableProvider`] over an mmap'd MEMT ring file that defers Arrow
+/// materialisation to `scan()`, where the query's filters are known:
+/// chunks whose `[min_ts, max_ts]` cannot match the time predicates are
+/// skipped without faulting in their pages.
+#[derive(Debug)]
+pub struct RingMmapTable {
+    /// The mapped ring file; a fresh [`MemTableView`] is built over it on
+    /// every call instead of being cached.
+    mapped: Arc<MappedFile>,
+    /// Arrow schema derived once from the ring header in [`Self::try_new`].
+    schema: SchemaRef,
+}
+
+impl RingMmapTable {
+    /// Validate `mapped` as a MEMT ring and capture its Arrow schema.
+    ///
+    /// Fails when [`MemTableView::new`] rejects the mapped bytes.
+    // NOTE(review): the generic arguments of this `Result` were lost when the
+    // patch was extracted (presumably `Result<Self, _>` with the error type of
+    // `MemTableView::new`) — restore from upstream.
+    pub fn try_new(mapped: MappedFile) -> Result {
+        let view = MemTableView::new(mapped.as_bytes())?;
+        let schema = view_to_arrow_schema(&view);
+        Ok(Self {
+            mapped: Arc::new(mapped),
+            schema,
+        })
+    }
+
+    /// Time window implied by `filters` on this ring's designated timestamp
+    /// column (unbounded when there is no ts column or the file is torn).
+    pub fn bounds_for(&self, filters: &[Expr]) -> TsBounds {
+        match MemTableView::new(self.mapped.as_bytes()) {
+            Ok(view) => view
+                .ts_col()
+                .map(|i| ts_bounds_from_filters(filters, view.col_name(i)))
+                .unwrap_or_default(),
+            Err(_) => TsBounds::default(),
+        }
+    }
+
+    /// Materialise surviving chunks within `bounds`, one batch per chunk.
+    ///
+    /// If the file no longer parses as a ring, a single empty batch with the
+    /// schema captured at construction is returned instead.
+    pub fn pruned_batches(&self, bounds: &TsBounds) -> Vec<RecordBatch> {
+        match MemTableView::new(self.mapped.as_bytes()) {
+            Ok(view) => view_to_recordbatches_pruned(&view, bounds),
+            Err(_) => vec![RecordBatch::new_empty(Arc::clone(&self.schema))],
+        }
+    }
+
+    /// Like [`pruned_batches`](Self::pruned_batches), skipping chunks whose
+    /// `(index, generation)` is already represented in the cold tier.
+    fn pruned_batches_excluding(
+        &self,
+        bounds: &TsBounds,
+        excluded: &HashSet<(usize, u64)>,
+    ) -> Vec<RecordBatch> {
+        match MemTableView::new(self.mapped.as_bytes()) {
+            Ok(view) => view_to_recordbatches_pruned_excluding(&view, bounds, excluded),
+            Err(_) => vec![RecordBatch::new_empty(Arc::clone(&self.schema))],
+        }
+    }
+}
+
+#[async_trait]
+impl TableProvider for RingMmapTable {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> DfResult<Vec<TableProviderFilterPushDown>> {
+        supports_filters_pushdown_for_schema(&self.schema, filters)
+    }
+
+    /// Derive the time window from `filters`, materialise only the chunks
+    /// that may match, and hand them to the shared in-memory scan as a
+    /// single partition.
+    async fn scan(
+        &self,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> DfResult<Arc<dyn ExecutionPlan>> {
+        let bounds = self.bounds_for(filters);
+        let batches = self.pruned_batches(&bounds);
+        scan_memory_partitions(
+            state,
+            Arc::clone(&self.schema),
+            &[batches],
+            projection,
+            filters,
+            limit,
+        )
+        .await
+    }
+}
+
+// ── Cold segments (MEMC) → Arrow, with two-level time pruning ─────────
+
+/// `.memc` segment paths in `dir`, or empty if the dir does not exist.
+/// Read-only: never creates the directory (unlike `ColdStore::open`).
+fn cold_segment_paths(dir: &std::path::Path) -> Vec<std::path::PathBuf> {
+    // A missing/unreadable directory and unreadable entries are both treated
+    // as "nothing there": the outer flatten drops a failed `read_dir`, the
+    // inner one drops failed entries.
+    let mut paths: Vec<std::path::PathBuf> = std::fs::read_dir(dir)
+        .into_iter()
+        .flatten()
+        .flatten()
+        .map(|entry| entry.path())
+        .filter(|p| p.extension().and_then(|s| s.to_str()) == Some("memc"))
+        .collect();
+    // `read_dir` yields entries in a platform-dependent order; sort so the
+    // cold scan visits segments in the same order on every run.
+    paths.sort();
+    paths
+}
+
+/// One decoded cold column → an Arrow array (schema order is preserved).
+///
+/// Each [`ColumnData`] variant maps to the Arrow array of the matching
+/// primitive type; strings become `Utf8` and byte strings `Binary`.
+fn cold_column_to_array(col: ColumnData) -> ArrayRef {
+    match col {
+        ColumnData::U8(v) => Arc::new(UInt8Array::from(v)),
+        ColumnData::U32(v) => Arc::new(UInt32Array::from(v)),
+        ColumnData::I32(v) => Arc::new(Int32Array::from(v)),
+        ColumnData::I64(v) => Arc::new(Int64Array::from(v)),
+        ColumnData::F32(v) => Arc::new(Float32Array::from(v)),
+        ColumnData::F64(v) => Arc::new(Float64Array::from(v)),
+        ColumnData::U64(v) => Arc::new(UInt64Array::from(v)),
+        ColumnData::Str(v) => Arc::new(StringArray::from_iter_values(v)),
+        ColumnData::Bytes(v) => Arc::new(BinaryArray::from_iter_values(v)),
+    }
+}
+
+/// Decode the cold pages of `table` within `bounds`, returning the batches and
+/// the set of hot-ring `(chunk index, generation)` those pages came from.
+///
+/// Two-level pruning mirrors the hot ring: sealed segments whose header
+/// `ts_range` cannot match are skipped without reading pages, then each
+/// segment's page directory is pruned per-page before decode. The returned
+/// `covered` set lets the caller drop the corresponding still-resident hot
+/// chunks so a hot∪cold union never double-counts a compacted chunk.
+fn cold_scan( + dir: &std::path::Path, + table: &str, + schema: &SchemaRef, + bounds: &TsBounds, +) -> (Vec, HashSet<(usize, u64)>) { + let mut out = Vec::new(); + let mut covered: HashSet<(usize, u64)> = HashSet::new(); + for path in cold_segment_paths(dir) { + let Ok(reader) = SegmentReader::open(&path) else { + continue; // unreadable/foreign file: skip rather than fail the scan + }; + if let Some((smin, smax)) = reader.ts_range() { + if bounds.lower.is_some_and(|lo| smax < lo) || bounds.upper.is_some_and(|hi| smin > hi) + { + continue; // segment-level prune: whole file out of range + } + } + let Some(tid) = reader.table_id_by_name(table) else { + continue; // this segment holds no pages for the queried table + }; + let pages = reader.pages(); + for idx in reader.pages_in_range(tid, bounds.lower, bounds.upper) { + if let Some(p) = pages.get(idx) { + if p.source_chunk != probing_memtable::memc::SOURCE_CHUNK_NONE { + covered.insert((p.source_chunk as usize, p.source_gen)); + } + } + match reader.read_page(idx) { + Ok(cols) => { + let arrays: Vec = + cols.into_iter().map(cold_column_to_array).collect(); + match RecordBatch::try_new(Arc::clone(schema), arrays) { + Ok(b) if b.num_rows() > 0 => out.push(b), + Ok(_) => {} + Err(e) => log::error!("cold page {idx} → RecordBatch failed: {e}"), + } + } + Err(e) => log::debug!("cold page {idx} decode skipped: {e}"), + } + } + } + (out, covered) +} + +/// [`TableProvider`] unioning a hot ring with its cold MEMC segments under one +/// logical table. A single time predicate prunes both tiers: hot chunks by +/// `[min_ts, max_ts]`, cold segments/pages by their recorded ranges. Hot and +/// cold batches are handed to the scan as two partitions, so projection, +/// filter, and limit pushdown apply uniformly across both. 
+#[derive(Debug)] +pub struct HotColdTable { + hot: RingMmapTable, + cold_dir: std::path::PathBuf, + table: String, + schema: SchemaRef, +} + +impl HotColdTable { + pub fn new(hot: RingMmapTable, cold_dir: std::path::PathBuf, table: impl Into) -> Self { + let schema = hot.schema(); + Self { + hot, + cold_dir, + table: table.into(), + schema, + } + } +} + +#[async_trait] +impl TableProvider for HotColdTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> DfResult> { + supports_filters_pushdown_for_schema(&self.schema, filters) + } + + async fn scan( + &self, + state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + ) -> DfResult> { + let bounds = self.hot.bounds_for(filters); + let (cold, covered) = cold_scan(&self.cold_dir, &self.table, &self.schema, &bounds); + // Drop hot chunks already in cold so each row is counted once. + let hot = self.hot.pruned_batches_excluding(&bounds, &covered); + + let partitions: Vec> = if cold.is_empty() { + vec![hot] + } else { + vec![hot, cold] + }; + scan_memory_partitions( + state, + Arc::clone(&self.schema), + &partitions, + projection, + filters, + limit, + ) + .await + } +} + +/// Route an mmap'd file to its [`TableProvider`]: MEMT rings get the lazy +/// pruning provider; MEMH (and anything else) keeps the eager path. 
+pub fn mapped_file_to_table(mapped: MappedFile, logical_name: &str) -> Arc { + match detect_table(mapped.as_bytes()) { + Some(TableKind::Ring) => match RingMmapTable::try_new(mapped) { + Ok(t) => Arc::new(t), + Err(_) => Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)), + }, + _ => bytes_to_pushdown_table(mapped.as_bytes(), logical_name), + } +} + +// ── MEMH: key-value table → two-column RecordBatch ──────────────────── + +/// Fixed Arrow schema for MEMH tables: `key` (Utf8) + `value` (Utf8). +/// +/// All MEMH values are serialised to strings so that heterogeneous value types +/// (scalars, strings, bytes) can be represented in a single column and queried +/// with SQL string predicates. +fn memh_kv_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ])) +} + +fn typed_value_to_str(v: &TypedValue<'_>) -> String { + match v { + TypedValue::U8(n) => n.to_string(), + TypedValue::I32(n) => n.to_string(), + TypedValue::I64(n) => n.to_string(), + TypedValue::F32(n) => n.to_string(), + TypedValue::F64(n) => n.to_string(), + TypedValue::U64(n) => n.to_string(), + TypedValue::U32(n) => n.to_string(), + TypedValue::Str(s) => s.to_string(), + TypedValue::Bytes(b) => { + // Hex-encode without adding a dep; e.g. 
"0xdeadbeef" + let mut out = String::with_capacity(2 + b.len() * 2); + out.push_str("0x"); + for byte in *b { + use std::fmt::Write; + let _ = write!(out, "{byte:02x}"); + } + out + } + } +} + +fn memh_view_to_recordbatch(view: &MemhView<'_>) -> Vec { + let schema = memh_kv_schema(); + let mut keys: GenericStringBuilder = GenericStringBuilder::new(); + let mut values: GenericStringBuilder = GenericStringBuilder::new(); + + for (k, v) in view.iter() { + keys.append_value(k); + values.append_value(typed_value_to_str(&v)); + } + + match RecordBatch::try_new( + schema, + vec![Arc::new(keys.finish()), Arc::new(values.finish())], + ) { + Ok(batch) => vec![batch], + Err(e) => { + log::error!("memh → RecordBatch failed: {e}"); + vec![] + } + } +} + +// ── Dynamic schemas from mmap filenames ─────────────────────────────── + +/// One DataFusion schema combining mmap-backed tables with an optional inner +/// (static) provider. +/// +/// Lookup order: mmap file first, then `inner`. Mmap files only exist when a +/// producer explicitly created them, so they take precedence over static +/// providers — some of which (e.g. lazy namespaces) claim every name exists. +#[derive(Debug)] +pub struct MmapFileSchemaProvider { + schema: String, + inner: Option>, +} + +impl MmapFileSchemaProvider { + pub fn new(schema: impl Into) -> Self { + Self { + schema: schema.into(), + inner: None, + } + } + + /// Merge with a static provider: mmap tables shadow `inner` only on + /// exact-name collision; everything else falls through. 
+ pub fn with_inner(schema: impl Into, inner: Option>) -> Self { + Self { + schema: schema.into(), + inner, + } + } +} + +#[async_trait] +impl SchemaProvider for MmapFileSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + let mut names = tables_in_schema(&self.schema); + if let Some(inner) = &self.inner { + names.extend(inner.table_names()); + } + names.sort(); + names.dedup(); + names + } + + async fn table(&self, name: &str) -> DfResult>> { + if mmap_table_exists(&self.schema, name) { + let basename = mmap_filename_for(&self.schema, name); + let path = self_dir().join(&basename); + // Zero-copy read: map the file instead of copying it to the heap. + // Ring files materialise lazily at scan() time with chunk-level + // time pruning; only surviving chunk bytes get faulted in. A ring + // is unioned with its cold MEMC segments (keyed by the unique + // on-disk basename) so one query spans both tiers. + if let Ok(mapped) = MappedFile::open(&path) { + if let Some(TableKind::Ring) = detect_table(mapped.as_bytes()) { + return Ok(Some(match RingMmapTable::try_new(mapped) { + Ok(ring) => Arc::new(HotColdTable::new(ring, cold_dir(), basename)), + Err(_) => Arc::new(PluginAdvancedTable::empty_sentinel(name)), + })); + } + return Ok(Some(mapped_file_to_table(mapped, name))); + } + } + match &self.inner { + Some(inner) => inner.table(name).await, + None => Ok(None), + } + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> DfResult>> { + match &self.inner { + Some(inner) => inner.register_table(name, table), + None => Err(DataFusionError::NotImplemented( + "unable to create tables".to_string(), + )), + } + } + + fn deregister_table(&self, name: &str) -> DfResult>> { + match &self.inner { + Some(inner) => inner.deregister_table(name), + None => Err(DataFusionError::NotImplemented( + "unable to drop tables".to_string(), + )), + } + } + + fn table_exist(&self, name: &str) -> bool { + mmap_table_exists(&self.schema, 
name) + || self + .inner + .as_ref() + .map(|inner| inner.table_exist(name)) + .unwrap_or(false) + } +} + +/// Wraps the `probe` catalog: static schemas (python, cluster, …) keep +/// working, mmap-backed schemas are discovered at query time, and when both +/// exist for the same name they are **merged** (mmap tables first) instead of +/// the mmap side shadowing the static provider. +#[derive(Debug)] +struct DynamicMmapCatalog { + inner: Arc, +} + +impl CatalogProvider for DynamicMmapCatalog { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + let mut names: BTreeSet = self.inner.schema_names().into_iter().collect(); + for sch in discover_all_schemas() { + names.insert(sch); + } + names.into_iter().collect() + } + + fn schema(&self, name: &str) -> Option> { + let inner = self.inner.schema(name); + let has_mmap = name == DEFAULT_UNDOTTED_SCHEMA || !tables_in_schema(name).is_empty(); + match (has_mmap, inner) { + (true, inner) => Some(Arc::new(MmapFileSchemaProvider::with_inner(name, inner))), + (false, Some(inner)) => Some(inner), + (false, None) => None, + } + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> DfResult>> { + self.inner.register_schema(name, schema) + } +} + +/// Namespace plugin that wraps the `probe` catalog with [`DynamicMmapCatalog`] +/// for dynamic schema discovery from mmap files at query time. +#[derive(Debug, Default)] +pub struct UnifiedMemtablePlugin; + +impl Plugin for UnifiedMemtablePlugin { + fn name(&self) -> String { + "mmap_memtables".into() + } + fn kind(&self) -> PluginType { + PluginType::Namespace + } + fn namespace(&self) -> String { + "memtable".into() + } + + fn provide_catalog(&self, inner: Arc) -> Option> { + Some(Arc::new(DynamicMmapCatalog { inner })) + } +} + +// ── Cold compaction runtime owner ───────────────────────────────────── + +/// Tunables for the background hot→cold compactor. 
+#[derive(Clone, Debug)]
+pub struct ColdRuntimeConfig {
+    /// Whether the background compactor thread runs.
+    pub enabled: bool,
+    /// Sleep between drain passes.
+    pub poll: Duration,
+    /// Seal + roll a segment once it reaches this size (fragmentation knob).
+    pub target_segment_bytes: u64,
+    /// Seal an idle open segment after this long so it becomes queryable.
+    pub max_segment_age: Duration,
+    /// Cold-store byte budget; oldest segments evicted past it.
+    /// `None` means no budget.
+    pub max_total_bytes: Option<u64>,
+    /// Drop cold segments older than this. `None` means no TTL.
+    pub ttl: Option<Duration>,
+}
+
+impl Default for ColdRuntimeConfig {
+    /// Compaction off, 2 s poll, 64 MiB segments, 5 min idle seal, and no
+    /// byte budget or TTL.
+    fn default() -> Self {
+        Self {
+            enabled: false,
+            poll: Duration::from_secs(2),
+            target_segment_bytes: 64 * 1024 * 1024,
+            max_segment_age: Duration::from_secs(300),
+            max_total_bytes: None,
+            ttl: None,
+        }
+    }
+}
+
+impl ColdRuntimeConfig {
+    /// Project the runtime knobs onto the compactor's own config type
+    /// (`enabled` has no counterpart there).
+    fn to_compactor(&self) -> CompactorConfig {
+        CompactorConfig {
+            target_segment_bytes: self.target_segment_bytes,
+            max_segment_age: self.max_segment_age,
+            poll_interval: self.poll,
+            max_total_bytes: self.max_total_bytes,
+            ttl: self.ttl,
+        }
+    }
+
+    /// Build a config from `PROBING_COLD*` environment variables, used to
+    /// auto-start compaction at engine init (opt-in, off by default).
+    ///
+    /// Unset or unparsable variables keep the [`Default`] value. A budget or
+    /// TTL of `0` means "no limit", matching how the `SET memtable.cold_*`
+    /// options treat non-positive values.
+    pub fn from_env() -> Self {
+        /// Read `k` as a trimmed unsigned integer; `None` if unset/invalid.
+        fn env_u64(k: &str) -> Option<u64> {
+            std::env::var(k).ok().and_then(|v| v.trim().parse().ok())
+        }
+        const MIB: u64 = 1024 * 1024;
+
+        let mut c = Self::default();
+        if let Ok(v) = std::env::var("PROBING_COLD") {
+            c.enabled = matches!(v.trim(), "1" | "on" | "true" | "yes");
+        }
+        if let Some(mb) = env_u64("PROBING_COLD_TARGET_MB") {
+            c.target_segment_bytes = mb.saturating_mul(MIB);
+        }
+        if let Some(mb) = env_u64("PROBING_COLD_MAX_TOTAL_MB") {
+            // 0 disables the budget rather than requesting a zero-byte store.
+            c.max_total_bytes = (mb > 0).then(|| mb.saturating_mul(MIB));
+        }
+        if let Some(s) = env_u64("PROBING_COLD_TTL_SECS") {
+            // 0 disables the TTL rather than expiring every segment at once.
+            c.ttl = (s > 0).then(|| Duration::from_secs(s));
+        }
+        if let Some(ms) = env_u64("PROBING_COLD_POLL_MS") {
+            // Clamp to 50 ms so a tiny value cannot turn the loop into a spin.
+            c.poll = Duration::from_millis(ms.max(50));
+        }
+        if let Some(s) = env_u64("PROBING_COLD_MAX_AGE_SECS") {
+            c.max_segment_age = Duration::from_secs(s);
+        }
+        c
+    }
+}
+
+/// Ring files under `self_dir()` that are candidate compaction sources,
+/// returned as `(on-disk basename, path)`. The basename is the cold table
+/// identity (matching the SQL read path), so names never collide across
+/// schemas. The `cold/` subdir is skipped (it is a directory, not a file).
+fn cold_source_candidates() -> Vec<(String, std::path::PathBuf)> {
+    let mut out = Vec::new();
+    if let Ok(entries) = std::fs::read_dir(self_dir()) {
+        for e in entries.flatten() {
+            let p = e.path();
+            if !p.is_file() {
+                continue;
+            }
+            let name = e.file_name().to_string_lossy().into_owned();
+            // Only basenames that parse as an mmap table name qualify.
+            if classify_mmap_basename(&name).is_some() {
+                out.push((name, p));
+            }
+        }
+    }
+    out
+}
+
+/// Process-global owner of the background hot→cold compactor thread.
+///
+/// Modeled on the task-stats worker: a lazy singleton with start/stop, so the
+/// compactor has a single lifecycle home regardless of how many producers
+/// create hot tables. The loop rediscovers ring files each pass (tables appear
+/// over time), drains newly-sealed chunks into the shared cold store, rolls
+/// segments by age, and enforces the byte/TTL budget.
+pub struct ColdCompactor { + running: Arc, + handle: Mutex>>, +} + +impl ColdCompactor { + pub fn instance() -> &'static Self { + static INSTANCE: Lazy = Lazy::new(|| ColdCompactor { + running: Arc::new(AtomicBool::new(false)), + handle: Mutex::new(None), + }); + &INSTANCE + } + + pub fn is_running(&self) -> bool { + self.running.load(Ordering::Acquire) + } + + /// (Re)apply `cfg`: stop any running thread, then start a fresh one when + /// `cfg.enabled`. Idempotent and the single entry point for the config + /// surface, so changing a knob simply restarts with the new settings. + pub fn apply(&self, cfg: ColdRuntimeConfig) { + self.stop(); + if cfg.enabled { + self.start(cfg); + } + } + + fn start(&self, cfg: ColdRuntimeConfig) { + if self.running.swap(true, Ordering::SeqCst) { + return; // already running + } + let dir = cold_dir(); + let store = match ColdStore::open(&dir) { + Ok(s) => s, + Err(e) => { + log::error!("cold compactor: cannot open {}: {e}", dir.display()); + self.running.store(false, Ordering::SeqCst); + return; + } + }; + let mut compactor = Compactor::new(store, cfg.to_compactor()); + // Exactly-once across restarts: recover per-chunk watermarks from any + // segments already on disk before draining. 
+ if let Err(e) = compactor.prime_from_cold() { + log::warn!("cold compactor: prime_from_cold failed: {e}"); + } + + let running = self.running.clone(); + let poll = cfg.poll; + let handle = std::thread::Builder::new() + .name("memc-compactor".into()) + .spawn(move || { + while running.load(Ordering::SeqCst) { + for (name, path) in cold_source_candidates() { + let Ok(mapped) = MappedFile::open(&path) else { + continue; + }; + if !matches!(detect_table(mapped.as_bytes()), Some(TableKind::Ring)) { + continue; // only ring tables tier to cold + } + if let Ok(view) = MemTableView::new(mapped.as_bytes()) { + if let Err(e) = compactor.drain_view(&name, &view) { + log::debug!("cold compactor: drain {name}: {e}"); + } + } + } + let _ = compactor.maybe_roll_on_age(); + let _ = compactor.enforce(); + sleep_interruptible(&running, poll); + } + // Final flush so the last open segment is sealed on shutdown. + if let Err(e) = compactor.flush() { + log::debug!("cold compactor: final flush: {e}"); + } + }) + .expect("spawn memc-compactor thread"); + *self.handle.lock().unwrap() = Some(handle); + } + + /// Signal the thread to flush and exit, then join it. + pub fn stop(&self) { + if !self.running.swap(false, Ordering::SeqCst) { + return; + } + if let Some(h) = self.handle.lock().unwrap().take() { + let _ = h.join(); + } + } + + pub fn stats(&self) -> Option { + ColdStore::open(cold_dir()).ok().map(|s| s.stats()) + } +} + +/// Sleep up to `total`, waking early (within ~200ms) if `running` is cleared. +fn sleep_interruptible(running: &AtomicBool, total: Duration) { + let step = Duration::from_millis(200); + let mut left = total; + while left > Duration::ZERO && running.load(Ordering::SeqCst) { + let nap = left.min(step); + std::thread::sleep(nap); + left = left.saturating_sub(nap); + } +} + +/// Start (or stop) background compaction from `PROBING_COLD*` env vars. +/// Call once after the engine is built; off by default. 
+pub fn start_cold_compaction_from_env() { + ColdCompactor::instance().apply(ColdRuntimeConfig::from_env()); +} + +// ── EngineExtension ──────────────────────────────────────────────────── + +/// Exposes mmap memtables to SQL and owns the cold-compaction config surface. +/// +/// Config knobs (also settable via `SET memtable. = ...`): +/// - `cold_compaction` (`on`/`off`) — run the background compactor. +/// - `cold_max_total_mb` — cold-store byte budget in MiB. +/// - `cold_ttl_secs` — evict cold segments older than this. +#[derive(Debug, Default, EngineExtensionDerive)] +pub struct MemTableExtension { + /// Background hot→cold compaction switch: "on" or "off". + #[option(aliases = ["cold.compaction"])] + cold_compaction: Maybe, + /// Cold-store byte budget in MiB (oldest segments evicted past it). + #[option(aliases = ["cold.max_total_mb"])] + cold_max_total_mb: Maybe, + /// Evict cold segments older than this many seconds. + #[option(aliases = ["cold.ttl_secs"])] + cold_ttl_secs: Maybe, +} + +impl MemTableExtension { + fn cold_enabled(&self) -> bool { + matches!( + self.cold_compaction, + Maybe::Just(ref s) if matches!(s.trim(), "1" | "on" | "true" | "yes") + ) + } + + /// Merge the current option fields over the env-derived defaults. 
+ fn cold_config(&self) -> ColdRuntimeConfig { + let mut cfg = ColdRuntimeConfig::from_env(); + cfg.enabled = self.cold_enabled(); + if let Maybe::Just(mb) = self.cold_max_total_mb { + cfg.max_total_bytes = (mb > 0).then(|| (mb as u64).saturating_mul(1024 * 1024)); + } + if let Maybe::Just(s) = self.cold_ttl_secs { + cfg.ttl = (s > 0).then(|| Duration::from_secs(s as u64)); + } + cfg + } + + fn apply_cold(&self) { + ColdCompactor::instance().apply(self.cold_config()); + } + + fn set_cold_compaction(&mut self, v: Maybe) -> Result<(), EngineError> { + self.cold_compaction = v; + self.apply_cold(); + Ok(()) + } + + fn set_cold_max_total_mb(&mut self, v: Maybe) -> Result<(), EngineError> { + self.cold_max_total_mb = v; + self.apply_cold(); + Ok(()) + } + + fn set_cold_ttl_secs(&mut self, v: Maybe) -> Result<(), EngineError> { + self.cold_ttl_secs = v; + self.apply_cold(); + Ok(()) + } +} + +impl EngineCall for MemTableExtension {} + +impl EngineDatasource for MemTableExtension { + fn datasrc( + &self, + _namespace: &str, + _name: Option<&str>, + ) -> Option> { + Some(Arc::new(UnifiedMemtablePlugin)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::arrow::array::{AsArray, Float64Array, Int32Array, Int64Array, UInt8Array}; + use probing_memtable::{MemTable, Schema as MtSchema, Value}; + use std::sync::Mutex; + + /// `PROBING_DATA_DIR` is process-global; serialize tests that mutate it. 
+ static PROBING_DATA_DIR_LOCK: Mutex<()> = Mutex::new(()); + + fn concat_i64(batches: &[RecordBatch], col: usize) -> Vec { + batches + .iter() + .flat_map(|b| { + let a = b.column(col).as_any().downcast_ref::().unwrap(); + (0..a.len()).map(|i| a.value(i)).collect::>() + }) + .collect() + } + + fn collect_i32(batches: &[RecordBatch]) -> Vec { + batches + .iter() + .flat_map(|b| { + let a = b.column(0).as_any().downcast_ref::().unwrap(); + (0..a.len()).map(|i| a.value(i)).collect::>() + }) + .collect() + } + + #[test] + fn dtype_mapping_covers_all_variants() { + assert_eq!(dtype_to_arrow(DType::U8), DataType::UInt8); + assert_eq!(dtype_to_arrow(DType::U32), DataType::UInt32); + assert_eq!(dtype_to_arrow(DType::I32), DataType::Int32); + assert_eq!(dtype_to_arrow(DType::I64), DataType::Int64); + assert_eq!(dtype_to_arrow(DType::F32), DataType::Float32); + assert_eq!(dtype_to_arrow(DType::F64), DataType::Float64); + assert_eq!(dtype_to_arrow(DType::U64), DataType::UInt64); + assert_eq!(dtype_to_arrow(DType::Str), DataType::Utf8); + assert_eq!(dtype_to_arrow(DType::Bytes), DataType::Binary); + } + + #[test] + fn recordbatch_from_mixed_types() { + let schema = MtSchema::new() + .col("id", DType::I32) + .col("value", DType::F64) + .col("tag", DType::Str); + let mut t = MemTable::new(&schema, 4096, 2); + t.push_row(&[Value::I32(1), Value::F64(3.14), Value::Str("hello")]); + t.push_row(&[Value::I32(2), Value::F64(2.72), Value::Str("world")]); + + let view = t.view(); + let batches = view_to_recordbatches(&view); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + + let ids = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ids.value(0), 1); + assert_eq!(ids.value(1), 2); + + let vals = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!((vals.value(0) - 3.14).abs() < 1e-10); + assert!((vals.value(1) - 2.72).abs() < 1e-10); + + let tags: 
&datafusion::arrow::array::StringArray = batch.column(2).as_string(); + assert_eq!(tags.value(0), "hello"); + assert_eq!(tags.value(1), "world"); + } + + #[test] + fn recordbatches_multiple_chunks_in_logical_order() { + let schema = MtSchema::new().col("v", DType::I64); + // Small chunk so rows spill across chunks + let mut t = MemTable::new(&schema, 128, 4); + for i in 0..20 { + t.push_row(&[Value::I64(i)]); + } + + let view = t.view(); + let batches = view_to_recordbatches(&view); + assert!(!batches.is_empty()); + + // Concatenated in logical order, surviving values must be strictly + // increasing — even though the ring may have wrapped. + let values = concat_i64(&batches, 0); + assert!(!values.is_empty()); + for w in values.windows(2) { + assert!(w[1] > w[0], "values not in logical order: {values:?}"); + } + // The most recent row always survives. + assert_eq!(*values.last().unwrap(), 19); + } + + #[test] + fn recordbatches_logical_order_after_wrap() { + let schema = MtSchema::new().col("v", DType::I64); + let mut t = MemTable::new(&schema, 80, 2); + t.push_row(&[Value::I64(10)]); // chunk 0, gen 1 + t.advance_chunk(); + t.push_row(&[Value::I64(20)]); // chunk 1, gen 1 + t.advance_chunk(); // wrap: chunk 0 → gen 2 + t.push_row(&[Value::I64(30)]); // chunk 0, gen 2 + + let view = t.view(); + let batches = view_to_recordbatches(&view); + // chunk 1 (older) first, then recycled chunk 0 + assert_eq!(concat_i64(&batches, 0), vec![20, 30]); + } + + #[test] + fn recordbatch_empty_table_keeps_schema() { + let schema = MtSchema::new().col("x", DType::U8); + let t = MemTable::new(&schema, 1024, 1); + let view = t.view(); + let batches = view_to_recordbatches(&view); + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 0); + assert_eq!(batches[0].schema().field(0).name(), "x"); + } + + #[test] + fn arrow_schema_matches_memtable_schema() { + let schema = MtSchema::new() + .col("ts", DType::I64) + .col("cpu", DType::F64) + .col("name", DType::Str); + let t = 
MemTable::new(&schema, 1024, 1); + let view = t.view(); + let arrow = view_to_arrow_schema(&view); + + assert_eq!(arrow.fields().len(), 3); + assert_eq!(arrow.field(0).name(), "ts"); + assert_eq!(*arrow.field(0).data_type(), DataType::Int64); + assert_eq!(arrow.field(1).name(), "cpu"); + assert_eq!(*arrow.field(1).data_type(), DataType::Float64); + assert_eq!(arrow.field(2).name(), "name"); + assert_eq!(*arrow.field(2).data_type(), DataType::Utf8); + } + + #[test] + fn recordbatch_u8_column() { + let schema = MtSchema::new().col("flag", DType::U8); + let mut t = MemTable::new(&schema, 1024, 1); + t.push_row(&[Value::U8(0)]); + t.push_row(&[Value::U8(255)]); + + let view = t.view(); + let batches = view_to_recordbatches(&view); + let col = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.value(0), 0); + assert_eq!(col.value(1), 255); + } + + // ── time-range pruning ───────────────────────────────────────────── + + #[test] + fn ts_bounds_extraction_from_filters() { + use datafusion::prelude::{col, lit}; + + // Conjunction across filter entries + let b = ts_bounds_from_filters( + &[col("ts").gt_eq(lit(100i64)), col("ts").lt(lit(200i64))], + "ts", + ); + assert_eq!( + b, + TsBounds { + lower: Some(100), + upper: Some(200) + } + ); + + // AND inside one entry + tightening + let f = col("ts").gt(lit(10i64)).and(col("ts").gt(lit(50i64))); + assert_eq!(ts_bounds_from_filters(&[f], "ts").lower, Some(50)); + + // Literal on the left mirrors the comparison: 300 <= ts + let f = lit(300i64).lt_eq(col("ts")); + assert_eq!(ts_bounds_from_filters(&[f], "ts").lower, Some(300)); + + // BETWEEN + let f = col("ts").between(lit(10i64), lit(20i64)); + let b = ts_bounds_from_filters(&[f], "ts"); + assert_eq!((b.lower, b.upper), (Some(10), Some(20))); + + // Equality pins both sides + let f = col("ts").eq(lit(42i64)); + let b = ts_bounds_from_filters(&[f], "ts"); + assert_eq!((b.lower, b.upper), (Some(42), Some(42))); + + // OR cannot be folded → 
unbounded (conservative) + let f = col("ts").gt(lit(5i64)).or(col("v").eq(lit(1i64))); + let b = ts_bounds_from_filters(&[f], "ts"); + assert_eq!((b.lower, b.upper), (None, None)); + + // Predicates on other columns are ignored + let b = ts_bounds_from_filters(&[col("v").gt(lit(5i64))], "ts"); + assert_eq!((b.lower, b.upper), (None, None)); + } + + #[test] + fn pruned_batches_skip_out_of_range_chunks() { + let schema = MtSchema::new().col("ts", DType::I64); + // ChunkHeader=40, I64 row=12 → 64-40=24 → 2 rows per chunk + let mut t = MemTable::new(&schema, 64, 4); + for ts in [10i64, 20, 30, 40, 50, 60] { + t.push_row(&[Value::I64(ts)]); + } + let view = t.view(); + assert_eq!(view_to_recordbatches(&view).len(), 3); + + // lower bound falls inside chunk 1: chunk 0 (max 20) pruned + let pruned = view_to_recordbatches_pruned( + &view, + &TsBounds { + lower: Some(35), + upper: None, + }, + ); + assert_eq!(concat_i64(&pruned, 0), vec![30, 40, 50, 60]); + + // tight window: only the chunk containing [50, 60] survives + let pruned = view_to_recordbatches_pruned( + &view, + &TsBounds { + lower: Some(55), + upper: Some(58), + }, + ); + assert_eq!(concat_i64(&pruned, 0), vec![50, 60]); + + // window past all data: everything pruned, schema kept + let pruned = view_to_recordbatches_pruned( + &view, + &TsBounds { + lower: Some(1000), + upper: None, + }, + ); + assert_eq!(pruned.len(), 1); + assert_eq!(pruned[0].num_rows(), 0); + assert_eq!(pruned[0].schema().field(0).name(), "ts"); + + // unbounded: identical to the unpruned materialisation + let unpruned = view_to_recordbatches_pruned(&view, &TsBounds::default()); + assert_eq!(concat_i64(&unpruned, 0), vec![10, 20, 30, 40, 50, 60]); + } + + #[test] + fn tables_without_ts_col_are_never_pruned() { + let schema = MtSchema::new().col("v", DType::I64); // not a ts name + let mut t = MemTable::new(&schema, 64, 4); + for v in [1i64, 2, 3, 4] { + t.push_row(&[Value::I64(v)]); + } + let view = t.view(); + assert_eq!(view.ts_col(), 
None); + // Even with bounds set, chunks without ts metadata must survive. + let batches = view_to_recordbatches_pruned( + &view, + &TsBounds { + lower: Some(100), + upper: None, + }, + ); + assert_eq!(concat_i64(&batches, 0), vec![1, 2, 3, 4]); + } + + #[tokio::test] + async fn ring_mmap_table_sql_end_to_end() { + let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); + use datafusion::prelude::SessionContext; + use probing_memtable::discover::ExposedTable; + + let tmp = tempfile::tempdir().unwrap(); + let orig = std::env::var("PROBING_DATA_DIR").ok(); + std::env::set_var("PROBING_DATA_DIR", tmp.path()); + + let schema = MtSchema::new() + .col("timestamp", DType::I64) + .col("v", DType::I32); + // 2 rows per chunk → 12 rows spread over 8 chunks + let mut table = ExposedTable::create("prune_demo", &schema, 80, 8).unwrap(); + for i in 1i64..=12 { + table.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]); + } + + let path = self_dir().join("prune_demo"); + let mapped = MappedFile::open(&path).unwrap(); + let provider = mapped_file_to_table(mapped, "prune_demo"); + assert!( + provider.as_any().downcast_ref::().is_some(), + "ring files must get the lazy pruning provider" + ); + + let ctx = SessionContext::new(); + ctx.register_table("prune_demo", provider).unwrap(); + let batches = ctx + .sql("SELECT v FROM prune_demo WHERE timestamp >= 700 AND timestamp < 1100 ORDER BY v") + .await + .unwrap() + .collect() + .await + .unwrap(); + let got: Vec = batches + .iter() + .flat_map(|b| { + let a = b + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + (0..a.len()).map(|i| a.value(i)).collect::>() + }) + .collect(); + assert_eq!(got, vec![7, 8, 9, 10]); + + drop(table); + match orig { + Some(v) => std::env::set_var("PROBING_DATA_DIR", v), + None => std::env::remove_var("PROBING_DATA_DIR"), + } + } + + #[tokio::test] + async fn hot_cold_union_dedups_and_spans_time() { + use datafusion::prelude::SessionContext; + use probing_memtable::memc::{ColdStore, Compactor, 
CompactorConfig}; + + let tmp = tempfile::tempdir().unwrap(); + let hot_path = tmp.path().join("hc_demo"); + let cold = tmp.path().join("cold"); + + let schema = MtSchema::new() + .col("timestamp", DType::I64) + .col("v", DType::I32); + // 2 rows per chunk, 4 chunks. + let mut t = MemTable::file_at(&hot_path, &schema, 80, 4).unwrap(); + for i in 1i64..=6 { + t.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]); + } + // chunks 0,1 sealed (ts 100,200 / 300,400); chunk 2 full-but-writing (500,600). + + { + let store = ColdStore::open(&cold).unwrap(); + let mut c = Compactor::new( + store, + CompactorConfig { + target_segment_bytes: 1 << 30, + ..Default::default() + }, + ); + let drained = c.drain_view("hc_demo", &t.view()).unwrap(); + assert_eq!(drained, 4, "two sealed chunks → 4 rows compacted"); + c.flush().unwrap(); + } + + let mapped = MappedFile::open(&hot_path).unwrap(); + let ring = RingMmapTable::try_new(mapped).unwrap(); + let provider: Arc = + Arc::new(HotColdTable::new(ring, cold.clone(), "hc_demo")); + + let ctx = SessionContext::new(); + ctx.register_table("hc_demo", provider).unwrap(); + + // Full scan: cold (4) + hot tail (2), with the still-resident compacted + // chunks deduped out of hot — exactly-once across tiers. + let all = ctx + .sql("SELECT v FROM hc_demo ORDER BY v") + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(collect_i32(&all), vec![1, 2, 3, 4, 5, 6]); + + // One time predicate prunes both tiers and selects across the boundary. 
+ let span = ctx + .sql("SELECT v FROM hc_demo WHERE timestamp >= 200 AND timestamp <= 500 ORDER BY v") + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(collect_i32(&span), vec![2, 3, 4, 5]); + + drop(t); + } + + #[tokio::test] + async fn cold_compactor_runtime_drains_and_is_queryable() { + let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); + use datafusion::prelude::SessionContext; + use probing_memtable::discover::ExposedTable; + + let tmp = tempfile::tempdir().unwrap(); + let orig = std::env::var("PROBING_DATA_DIR").ok(); + std::env::set_var("PROBING_DATA_DIR", tmp.path()); + + let schema = MtSchema::new() + .col("timestamp", DType::I64) + .col("v", DType::I32); + let mut table = ExposedTable::create("rt_demo", &schema, 80, 8).unwrap(); + for i in 1i64..=6 { + table.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]); + } + // chunks 0,1 sealed; chunk 2 full-but-writing (stays hot-only). + + // The runtime owner discovers the ring on its own and drains it. + ColdCompactor::instance().apply(ColdRuntimeConfig { + enabled: true, + poll: Duration::from_millis(50), + ..Default::default() + }); + + let mut waited = 0; + while ColdCompactor::instance() + .stats() + .map(|s| s.segment_count) + .unwrap_or(0) + == 0 + && waited < 5000 + { + std::thread::sleep(Duration::from_millis(50)); + waited += 50; + } + ColdCompactor::instance().stop(); // final flush seals the open segment + assert!( + ColdCompactor::instance() + .stats() + .map(|s| s.segment_count) + .unwrap_or(0) + >= 1, + "compactor should have produced a cold segment" + ); + + // Query through the same hot∪cold provider the catalog builds. 
+ let path = self_dir().join("rt_demo"); + let mapped = MappedFile::open(&path).unwrap(); + let ring = RingMmapTable::try_new(mapped).unwrap(); + let provider: Arc = + Arc::new(HotColdTable::new(ring, cold_dir(), "rt_demo")); + let ctx = SessionContext::new(); + ctx.register_table("rt_demo", provider).unwrap(); + let all = ctx + .sql("SELECT v FROM rt_demo ORDER BY v") + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(collect_i32(&all), vec![1, 2, 3, 4, 5, 6]); + + drop(table); + ColdCompactor::instance().stop(); + match orig { + Some(v) => std::env::set_var("PROBING_DATA_DIR", v), + None => std::env::remove_var("PROBING_DATA_DIR"), + } + } + + #[tokio::test] + async fn engine_catalog_query_unions_cold_tier() { + let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); + use datafusion::catalog::MemoryCatalogProvider; + use datafusion::prelude::SessionContext; + use probing_memtable::discover::ExposedTable; + + let tmp = tempfile::tempdir().unwrap(); + let orig = std::env::var("PROBING_DATA_DIR").ok(); + std::env::set_var("PROBING_DATA_DIR", tmp.path()); + + let schema = MtSchema::new() + .col("timestamp", DType::I64) + .col("v", DType::I32); + let mut table = ExposedTable::create("metrics", &schema, 80, 8).unwrap(); + for i in 1i64..=6 { + table.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]); + } + + // Drain the sealed chunks to cold via the runtime owner. + ColdCompactor::instance().apply(ColdRuntimeConfig { + enabled: true, + poll: Duration::from_millis(50), + ..Default::default() + }); + let mut waited = 0; + while ColdCompactor::instance() + .stats() + .map(|s| s.segment_count) + .unwrap_or(0) + == 0 + && waited < 5000 + { + std::thread::sleep(Duration::from_millis(50)); + waited += 50; + } + ColdCompactor::instance().stop(); + + // Real query path: register the dynamic catalog and resolve the table + // purely by name — DynamicMmapCatalog → MmapFileSchemaProvider → + // HotColdTable, exactly as the engine does. 
+ let ctx = SessionContext::new(); + let catalog = Arc::new(DynamicMmapCatalog { + inner: Arc::new(MemoryCatalogProvider::new()), + }); + ctx.register_catalog("probe", catalog); + + let all = ctx + .sql("SELECT v FROM probe.memtable.metrics ORDER BY v") + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(collect_i32(&all), vec![1, 2, 3, 4, 5, 6], "hot∪cold once"); + + // One time predicate prunes across both tiers through the catalog. + let span = ctx + .sql( + "SELECT v FROM probe.memtable.metrics \ + WHERE timestamp >= 200 AND timestamp <= 500 ORDER BY v", + ) + .await + .unwrap() + .collect() + .await + .unwrap(); + assert_eq!(collect_i32(&span), vec![2, 3, 4, 5]); + + drop(table); + ColdCompactor::instance().stop(); + match orig { + Some(v) => std::env::set_var("PROBING_DATA_DIR", v), + None => std::env::remove_var("PROBING_DATA_DIR"), + } + } + + #[test] + fn classify_and_mmap_roundtrip() { + assert_eq!( + classify_mmap_basename("pulsing.actors"), + Some(("pulsing".into(), "actors".into())) + ); + assert_eq!( + classify_mmap_basename("foo.bar.baz"), + Some(("foo".into(), "bar.baz".into())) + ); + assert_eq!( + classify_mmap_basename("metrics"), + Some((DEFAULT_UNDOTTED_SCHEMA.into(), "metrics".into())) + ); + assert_eq!( + mmap_filename_for(DEFAULT_UNDOTTED_SCHEMA, "metrics"), + "metrics" + ); + assert_eq!(mmap_filename_for("pulsing", "actors"), "pulsing.actors"); + assert_eq!(mmap_filename_for("foo", "bar.baz"), "foo.bar.baz"); + } + + #[test] + fn mmap_table_exists_rejects_path_traversal() { + assert!(!mmap_table_exists("memtable", "../../etc/passwd")); + assert!(!mmap_table_exists("memtable", "a/b")); + assert!(!mmap_table_exists("memtable", "")); + } + + fn read_pushdown_from_mmap(schema: &str, table: &str) -> Arc { + let path = self_dir().join(mmap_filename_for(schema, table)); + let mapped = MappedFile::open(path).unwrap(); + bytes_to_pushdown_table(mapped.as_bytes(), table) + } + + #[test] + fn 
namespace_list_and_mmap_read_via_exposed_table() { + let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); + use probing_memtable::discover::ExposedTable; + + let tmp = tempfile::tempdir().unwrap(); + let orig = std::env::var("PROBING_DATA_DIR").ok(); + std::env::set_var("PROBING_DATA_DIR", tmp.path()); + + let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str); + let mut table = ExposedTable::create("test_metrics", &schema, 4096, 2).unwrap(); + { + let mut w = table.writer(); + w.push_row(&[Value::I64(100), Value::Str("alpha")]); + w.push_row(&[Value::I64(200), Value::Str("beta")]); + } + + let names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA); + assert!( + names.contains(&"test_metrics".to_string()), + "got: {names:?}" + ); + assert!(mmap_table_exists(DEFAULT_UNDOTTED_SCHEMA, "test_metrics")); + + let provider = read_pushdown_from_mmap(DEFAULT_UNDOTTED_SCHEMA, "test_metrics"); + assert!(provider + .as_any() + .downcast_ref::() + .is_some()); + + let path = self_dir().join(mmap_filename_for(DEFAULT_UNDOTTED_SCHEMA, "test_metrics")); + let mapped = MappedFile::open(&path).unwrap(); + let view = MemTableView::new(mapped.as_bytes()).unwrap(); + let batches = view_to_recordbatches(&view); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 2); + + let ts = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ts.value(0), 100); + assert_eq!(ts.value(1), 200); + + let msgs: &datafusion::arrow::array::StringArray = batch.column(1).as_string(); + assert_eq!(msgs.value(0), "alpha"); + assert_eq!(msgs.value(1), "beta"); + + drop(table); + match orig { + Some(v) => std::env::set_var("PROBING_DATA_DIR", v), + None => std::env::remove_var("PROBING_DATA_DIR"), + } + } + + #[test] + fn dotted_schema_isolated_from_memtable_list() { + let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); + use probing_memtable::discover::ExposedTable; + + let tmp = tempfile::tempdir().unwrap(); + let orig = 
std::env::var("PROBING_DATA_DIR").ok(); + std::env::set_var("PROBING_DATA_DIR", tmp.path()); + + let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str); + let dotted = mmap_filename_for("acme", "metrics_demo"); + let mut ring = ExposedTable::create(&dotted, &schema, 4096, 2).unwrap(); + { + let mut w = ring.writer(); + w.push_row(&[Value::I64(1), Value::Str("x")]); + } + + let mem_names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA); + assert!( + !mem_names.contains(&"metrics_demo".to_string()), + "dotted file must not appear as memtable table: {mem_names:?}" + ); + + let acme_names = tables_in_schema("acme"); + assert!( + acme_names.contains(&"metrics_demo".to_string()), + "got: {acme_names:?}" + ); + + let provider = read_pushdown_from_mmap("acme", "metrics_demo"); + assert!(provider + .as_any() + .downcast_ref::() + .is_some()); + + drop(ring); + match orig { + Some(v) => std::env::set_var("PROBING_DATA_DIR", v), + None => std::env::remove_var("PROBING_DATA_DIR"), + } + } + + #[tokio::test] + async fn merged_schema_provider_does_not_shadow_inner() { + let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); + use datafusion::catalog::MemorySchemaProvider; + use datafusion::datasource::MemTable as DfMemTable; + use probing_memtable::discover::ExposedTable; + + let tmp = tempfile::tempdir().unwrap(); + let orig = std::env::var("PROBING_DATA_DIR").ok(); + std::env::set_var("PROBING_DATA_DIR", tmp.path()); + + // Static (inner) provider with one table + let inner = Arc::new(MemorySchemaProvider::new()); + let static_schema = Arc::new(Schema::new(vec![Field::new( + "x", + DataType::Int64, + false, + )])); + let static_batch = RecordBatch::try_new( + static_schema.clone(), + vec![Arc::new(Int64Array::from(vec![42i64]))], + ) + .unwrap(); + inner + .register_table( + "static_tbl".to_string(), + Arc::new(DfMemTable::try_new(static_schema, vec![vec![static_batch]]).unwrap()), + ) + .unwrap(); + + // Mmap table in schema "python" + let mt_schema = 
MtSchema::new().col("v", DType::I64); + let mut ring = + ExposedTable::create(&mmap_filename_for("python", "extern_tbl"), &mt_schema, 4096, 2) + .unwrap(); + ring.push_row(&[Value::I64(7)]); + + let merged = MmapFileSchemaProvider::with_inner("python", Some(inner.clone() as _)); + + // Both tables visible + let names = merged.table_names(); + assert!(names.contains(&"extern_tbl".to_string()), "got {names:?}"); + assert!(names.contains(&"static_tbl".to_string()), "got {names:?}"); + + // Static table still resolvable through the merged provider + assert!(merged.table("static_tbl").await.unwrap().is_some()); + // Mmap table resolvable too + assert!(merged.table("extern_tbl").await.unwrap().is_some()); + assert!(merged.table_exist("static_tbl")); + assert!(merged.table_exist("extern_tbl")); + + drop(ring); + match orig { + Some(v) => std::env::set_var("PROBING_DATA_DIR", v), + None => std::env::remove_var("PROBING_DATA_DIR"), + } + } +} diff --git a/probing/core/src/core/mod.rs b/probing/core/src/core/mod.rs index d8c34a42..4cbe12c3 100644 --- a/probing/core/src/core/mod.rs +++ b/probing/core/src/core/mod.rs @@ -4,6 +4,8 @@ pub mod cluster_model; mod engine; mod error; pub mod extension; +pub mod memtable_sql; +mod plugin_advanced; mod plugin; pub use engine::Engine; @@ -19,8 +21,12 @@ pub use plugin::CustomNamespaceDataSource; pub use plugin::CustomTable; pub use plugin::LazyTableSource; pub use plugin::NamespacePluginHelper; +pub use plugin_advanced::PluginAdvancedTable; pub use plugin::TablePluginHelper; +pub use memtable_sql::MemTableExtension; +pub use memtable_sql::UnifiedMemtablePlugin; + pub use extension::EngineCall; pub use extension::EngineDatasource; pub use extension::EngineExtension; diff --git a/probing/core/src/core/plugin.rs b/probing/core/src/core/plugin.rs index 8adf70f9..e7efdef5 100644 --- a/probing/core/src/core/plugin.rs +++ b/probing/core/src/core/plugin.rs @@ -6,19 +6,26 @@ use async_trait::async_trait; use 
datafusion::arrow::array::RecordBatch; use datafusion::arrow::datatypes::SchemaRef; use datafusion::catalog::{CatalogProvider, SchemaProvider, Session, TableProvider}; -use datafusion::datasource::memory::DataSourceExec; -use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::common::Statistics; use datafusion::datasource::TableType; use datafusion::error::{DataFusionError, Result}; use datafusion::execution::SessionState; +use datafusion::logical_expr::TableProviderFilterPushDown; +use datafusion::physical_plan::common::compute_record_batch_statistics; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::Expr; +use super::plugin_advanced::{scan_memory_partitions, supports_filters_pushdown_for_schema}; + /// Trait defining a custom table with static/dynamic schema and data /// /// Implement this to create tables that: /// - Have a fixed name /// - Use a predefined schema +/// +/// The default [`TableDataSource`] integration applies **conservative** `WHERE` / `LIMIT` +/// pushdown (same rules as [`super::plugin_advanced`](super::plugin_advanced)): simple predicates +/// whose columns all exist on the table may run inside the scan; others stay in a planner `Filter`. 
pub trait CustomTable { /// Returns the table name (must be constant) fn name() -> &'static str; @@ -120,21 +127,47 @@ impl TableProvider TableType::Base } + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + supports_filters_pushdown_for_schema(&T::schema(), filters) + } + + fn statistics(&self) -> Option { + let partitions = vec![T::data()]; + Some(compute_record_batch_statistics( + &partitions, + T::schema().as_ref(), + None, + )) + } + async fn scan( &self, - _state: &dyn Session, + state: &dyn Session, projection: Option<&Vec>, - // filters and limit can be used here to inject some push-down operations if needed - _filters: &[Expr], - _limit: Option, + filters: &[Expr], + limit: Option, ) -> Result> { - let data = T::data(); - let srccfg = MemorySourceConfig::try_new(&[data], T::schema(), projection.cloned())?; - let exec = DataSourceExec::new(Arc::new(srccfg)); - Ok(Arc::new(exec)) + let batches = T::data(); + let partitions = vec![batches]; + scan_memory_partitions( + state, + T::schema(), + &partitions, + projection, + filters, + limit, + ) + .await } } +/// Eager in-memory table built from pre-materialized [`RecordBatch`]es (e.g. mmap → Arrow). +/// +/// Supports the same **conservative** `WHERE` / `LIMIT` pushdown as [`TableDataSource`] via +/// [`super::plugin_advanced::scan_memory_partitions`](super::plugin_advanced::scan_memory_partitions). 
#[derive(Default, Debug)] pub struct LazyTableSource { pub name: String, @@ -163,13 +196,31 @@ impl TableProvider for LazyTableSource { TableType::Base } + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + supports_filters_pushdown_for_schema(&self.schema(), filters) + } + + fn statistics(&self) -> Option { + if self.data.is_empty() { + return None; + } + let partitions = vec![self.data.clone()]; + Some(compute_record_batch_statistics( + &partitions, + self.schema().as_ref(), + None, + )) + } + async fn scan( &self, - _state: &dyn Session, + state: &dyn Session, projection: Option<&Vec>, - // filters and limit can be used here to inject some push-down operations if needed - _filters: &[Expr], - _limit: Option, + filters: &[Expr], + limit: Option, ) -> Result> { let data = &self.data; if data.is_empty() { @@ -178,10 +229,16 @@ impl TableProvider for LazyTableSource { )); } let schema = data[0].schema(); - let srccfg = - MemorySourceConfig::try_new(std::slice::from_ref(data), schema, projection.cloned())?; - let exec = DataSourceExec::new(Arc::new(srccfg)); - Ok(Arc::new(exec)) + let partitions = vec![self.data.clone()]; + scan_memory_partitions( + state, + schema, + &partitions, + projection, + filters, + limit, + ) + .await } } diff --git a/probing/core/src/core/plugin_advanced.rs b/probing/core/src/core/plugin_advanced.rs new file mode 100644 index 00000000..9f1fa941 --- /dev/null +++ b/probing/core/src/core/plugin_advanced.rs @@ -0,0 +1,590 @@ +//! Advanced [`TableProvider`] path and **shared pushdown helpers** for in-memory Arrow batches. +//! +//! [`PluginAdvancedTable`] is aimed at internal callers (e.g. mmap memtables). The same filter / +//! limit / stats behaviour is reused by [`super::plugin::TableDataSource`](super::plugin::TableDataSource) +//! and [`super::plugin::LazyTableSource`](super::plugin::LazyTableSource) via [`scan_memory_partitions`] +//! and [`supports_filters_pushdown_for_schema`]. 
+ +use std::any::Any; +use std::collections::HashSet; +use std::fmt::Debug; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::arrow::array::Int64Array; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::record_batch::{RecordBatch, RecordBatchOptions}; +use datafusion::catalog::Session; +use datafusion::common::tree_node::TreeNode; +use datafusion::common::DFSchema; +use datafusion::common::Statistics; +use datafusion::datasource::memory::{DataSourceExec, MemorySourceConfig}; +use datafusion::datasource::{TableProvider, TableType}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; +use datafusion::physical_expr::utils::conjunction; +use datafusion::physical_plan::common::compute_record_batch_statistics; +use datafusion::physical_plan::filter::FilterExecBuilder; +use datafusion::physical_plan::ExecutionPlan; + +/// In-memory table: one or more partitions of [`RecordBatch`]es sharing `schema`. +/// +/// - Declares **filter push-down** for predicates that pass a conservative structural check +/// (no subqueries, all referenced columns exist on the table schema). +/// - Applies pushed filters in `scan` via [`FilterExec`] on top of [`MemorySourceConfig`]. +/// - Applies **`LIMIT` / fetch** on the memory source when there are no pushed filters, and on +/// [`FilterExec`] when filters are present (so limit still applies with pushdown). +/// - Exposes **row / null-count style statistics** via [`TableProvider::statistics`]. +#[derive(Debug)] +pub struct PluginAdvancedTable { + /// Logical table name (for `Debug` / tracing only). + label: String, + schema: SchemaRef, + /// Partition layout expected by [`MemorySourceConfig`]. + partitions: Vec>, +} + +impl PluginAdvancedTable { + pub fn label(&self) -> &str { + &self.label + } + + /// Build from a single partition list; validates each batch against `schema`. 
+ pub fn try_new( + label: impl Into, + schema: SchemaRef, + batches: Vec, + ) -> Result { + let label = label.into(); + for b in &batches { + Self::check_batch_schema(&label, &schema, b)?; + } + Ok(Self { + label, + schema, + partitions: vec![batches], + }) + } + + /// Multi-partition layout (advanced; most callers use [`Self::try_new`]). + pub fn try_new_partitions( + label: impl Into, + schema: SchemaRef, + partitions: Vec>, + ) -> Result { + let label = label.into(); + for part in &partitions { + for b in part { + Self::check_batch_schema(&label, &schema, b)?; + } + } + Ok(Self { + label, + schema, + partitions, + }) + } + + /// Sentinel for invalid mmap / empty inputs (zero-row, minimal schema). + pub fn empty_sentinel(label: impl Into) -> Self { + let label = label.into(); + let schema = Arc::new(Schema::new(vec![Field::new( + "_empty", + DataType::Int64, + true, + )])); + let empty = RecordBatch::try_new_with_options( + Arc::clone(&schema), + vec![Arc::new(Int64Array::from(Vec::::new()))], + &RecordBatchOptions::new().with_row_count(Some(0)), + ) + .expect("empty batch"); + Self { + label, + schema, + partitions: vec![vec![empty]], + } + } + + fn check_batch_schema(label: &str, expected: &SchemaRef, batch: &RecordBatch) -> Result<()> { + let got = batch.schema(); + if got.as_ref() != expected.as_ref() { + return Err(DataFusionError::Plan(format!( + "PluginAdvancedTable {label}: batch schema mismatch (expected {expected}, got {got})" + ))); + } + Ok(()) + } +} + +/// `true` if `expr` contains constructs we cannot evaluate inside a plain memory scan. +pub(crate) fn has_unsupported_pushdown_subexpr(expr: &Expr) -> bool { + use datafusion::logical_expr::Expr as E; + expr.exists(|e| { + Ok(matches!( + e, + E::ScalarSubquery(_) + | E::Exists { .. 
} + | E::InSubquery(_) + | E::Placeholder(_) + | E::GroupingSet(_) + | E::OuterReferenceColumn(_, _) + )) + }) + .unwrap_or(true) +} + +/// Structural gate for [`TableProvider::supports_filters_pushdown`] without a [`Session`]. +pub(crate) fn can_push_filter_exact_for_schema(schema: &SchemaRef, expr: &Expr) -> bool { + if has_unsupported_pushdown_subexpr(expr) { + return false; + } + let names: HashSet = schema + .fields() + .iter() + .map(|f| f.name().clone()) + .collect(); + for c in expr.column_refs() { + if !names.contains(c.name()) { + return false; + } + } + true +} + +pub(crate) fn supports_filters_pushdown_for_schema( + schema: &SchemaRef, + filters: &[&Expr], +) -> Result> { + Ok(filters + .iter() + .map(|f| { + if can_push_filter_exact_for_schema(schema, f) { + TableProviderFilterPushDown::Exact + } else { + TableProviderFilterPushDown::Unsupported + } + }) + .collect()) +} + +/// Build a scan plan over in-memory partitions with optional filter + limit pushdown. +pub(crate) async fn scan_memory_partitions( + state: &dyn Session, + schema: SchemaRef, + partitions: &[Vec], + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, +) -> Result> { + let show_sizes = state.config_options().explain.show_sizes; + + let plan: Arc = if filters.is_empty() { + let mem = MemorySourceConfig::try_new(partitions, schema.clone(), projection.cloned())? + .with_show_sizes(show_sizes) + .with_limit(limit); + DataSourceExec::from_data_source(mem) + } else { + // Predicates are compiled against the FULL table schema, so the + // source must scan unprojected; otherwise column indices inside the + // physical predicate would resolve against the projected batch + // (e.g. `a > 1` silently evaluating on column `b`). The requested + // projection is applied by FilterExec on the way out. 
+ let df_schema = DFSchema::try_from(Arc::clone(&schema))?; + let mut phys = Vec::new(); + for f in filters { + phys.push(state.create_physical_expr(f.clone(), &df_schema)?); + } + let predicate = conjunction(phys); + + let mem = MemorySourceConfig::try_new(partitions, schema.clone(), None)? + .with_show_sizes(show_sizes); + let input: Arc = DataSourceExec::from_data_source(mem); + let filt = FilterExecBuilder::new(predicate, input) + .apply_projection(projection.cloned())? + .with_fetch(limit) + .build()?; + Arc::new(filt) + }; + + Ok(plan) +} + +#[async_trait] +impl TableProvider for PluginAdvancedTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + supports_filters_pushdown_for_schema(&self.schema, filters) + } + + fn statistics(&self) -> Option { + Some(compute_record_batch_statistics( + &self.partitions, + self.schema.as_ref(), + None, + )) + } + + async fn scan( + &self, + state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + ) -> Result> { + scan_memory_partitions( + state, + self.schema(), + &self.partitions, + projection, + filters, + limit, + ) + .await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::arrow::array::Int32Array; + use datafusion::common::stats::Precision; + use datafusion::datasource::TableProvider; + use datafusion::execution::context::TaskContext; + use datafusion::logical_expr::expr_fn::{out_ref_col, placeholder}; + use datafusion::logical_expr::TableProviderFilterPushDown; + use datafusion::physical_plan::collect; + use datafusion::prelude::{col, lit, SessionContext}; + use std::sync::Arc; + + fn test_schema_id() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])) + } + + fn batch_ids(schema: &SchemaRef, values: Vec) -> Result { + 
RecordBatch::try_new( + Arc::clone(schema), + vec![Arc::new(Int32Array::from(values))], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + + // --- construction --- + + #[test] + fn try_new_accepts_matching_schema() -> Result<()> { + let schema = test_schema_id(); + let b = batch_ids(&schema, vec![1, 2])?; + let t = PluginAdvancedTable::try_new("x", Arc::clone(&schema), vec![b])?; + assert_eq!(t.label(), "x"); + assert_eq!(t.schema().fields().len(), 1); + Ok(()) + } + + #[test] + fn try_new_rejects_schema_mismatch() { + let expected = test_schema_id(); + let wrong = Arc::new(Schema::new(vec![Field::new( + "other", + DataType::Int32, + false, + )])); + let batch = batch_ids(&wrong, vec![1]).unwrap(); + let err = PluginAdvancedTable::try_new("bad", expected, vec![batch]).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("batch schema mismatch"), + "unexpected error: {msg}" + ); + } + + #[test] + fn try_new_partitions_validates_all_batches() { + let schema = test_schema_id(); + let wrong = Arc::new(Schema::new(vec![Field::new( + "x", + DataType::Int32, + false, + )])); + let good = batch_ids(&schema, vec![1]).unwrap(); + let bad = batch_ids(&wrong, vec![2]).unwrap(); + let err = PluginAdvancedTable::try_new_partitions( + "p", + Arc::clone(&schema), + vec![vec![good], vec![bad]], + ) + .unwrap_err(); + assert!(err.to_string().contains("batch schema mismatch")); + } + + #[test] + fn try_new_partitions_succeeds() -> Result<()> { + let schema = test_schema_id(); + let p0 = batch_ids(&schema, vec![1, 2])?; + let p1 = batch_ids(&schema, vec![3])?; + let t = PluginAdvancedTable::try_new_partitions("m", schema.clone(), vec![vec![p0], vec![p1]])?; + let s = t.statistics().expect("stats"); + assert_eq!(s.num_rows, Precision::Exact(3)); + Ok(()) + } + + #[test] + fn empty_sentinel_zero_rows_and_schema() { + let t = PluginAdvancedTable::empty_sentinel("mmap-empty"); + assert_eq!(t.label(), "mmap-empty"); + 
assert_eq!(t.schema().fields().len(), 1); + assert_eq!(t.schema().field(0).name(), "_empty"); + let s = t.statistics().expect("stats"); + assert_eq!(s.num_rows, Precision::Exact(0)); + } + + // --- pushdown helpers --- + + #[test] + fn has_unsupported_detects_placeholder_outer_ref() { + assert!(has_unsupported_pushdown_subexpr(&placeholder("$1"))); + assert!(has_unsupported_pushdown_subexpr(&out_ref_col( + DataType::Int32, + "c" + ))); + assert!(!has_unsupported_pushdown_subexpr(&col("id"))); + assert!(!has_unsupported_pushdown_subexpr(&col("id").gt(lit(0i32)))); + } + + #[test] + fn can_push_filter_exact_for_schema_gate() { + let schema = test_schema_id(); + assert!(can_push_filter_exact_for_schema( + &schema, + &col("id").gt(lit(1i32)) + )); + assert!(!can_push_filter_exact_for_schema( + &schema, + &col("missing").gt(lit(1i32)) + )); + assert!(!can_push_filter_exact_for_schema( + &schema, + &placeholder("$1") + )); + } + + #[test] + fn supports_filters_pushdown_for_schema_mixed() -> Result<()> { + let schema = test_schema_id(); + let f1 = col("id").gt(lit(0i32)); + let f2 = col("nope").eq(lit(1i32)); + let v = supports_filters_pushdown_for_schema(&schema, &[&f1, &f2])?; + assert_eq!(v.len(), 2); + assert_eq!(v[0], TableProviderFilterPushDown::Exact); + assert_eq!(v[1], TableProviderFilterPushDown::Unsupported); + Ok(()) + } + + // --- scan_memory_partitions --- + + #[tokio::test] + async fn scan_memory_partitions_limit_without_filter() -> Result<()> { + let schema = test_schema_id(); + let batch = batch_ids(&schema, vec![10, 20, 30, 40])?; + let ctx = SessionContext::new(); + let state = ctx.state(); + let plan = scan_memory_partitions( + &state, + Arc::clone(&schema), + &[vec![batch]], + None, + &[], + Some(2), + ) + .await?; + let batches = collect(plan, Arc::new(TaskContext::default())).await?; + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 2); + let arr = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + 
assert_eq!(arr.value(0), 10); + assert_eq!(arr.value(1), 20); + Ok(()) + } + + #[tokio::test] + async fn scan_memory_partitions_filter_and_limit() -> Result<()> { + let schema = test_schema_id(); + let batch = batch_ids(&schema, vec![1, 2, 3, 4, 5])?; + let filter = col("id").gt(lit(2i32)); + let ctx = SessionContext::new(); + let state = ctx.state(); + let plan = scan_memory_partitions( + &state, + Arc::clone(&schema), + &[vec![batch]], + None, + std::slice::from_ref(&filter), + Some(2), + ) + .await?; + let batches = collect(plan, Arc::new(TaskContext::default())).await?; + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 2); + let arr = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(arr.value(0), 3); + assert_eq!(arr.value(1), 4); + Ok(()) + } + + #[tokio::test] + async fn scan_memory_partitions_filter_with_projection_uses_full_schema() -> Result<()> { + // Regression: predicate column (`a`) is NOT part of the projection. + // The filter must still evaluate against the full schema instead of + // resolving indices on the projected batch. 
+ let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Int64, false), + ])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int64Array::from(vec![1i64, 2, 3])), + Arc::new(Int64Array::from(vec![10i64, 20, 30])), + ], + )?; + let filter = col("a").gt(lit(1i64)); + let ctx = SessionContext::new(); + let state = ctx.state(); + let plan = scan_memory_partitions( + &state, + Arc::clone(&schema), + &[vec![batch]], + Some(&vec![1usize]), // project only `b` + std::slice::from_ref(&filter), + None, + ) + .await?; + let batches = collect(plan, Arc::new(TaskContext::default())).await?; + assert_eq!(batches.len(), 1); + let out = &batches[0]; + assert_eq!(out.num_columns(), 1); + assert_eq!(out.schema().field(0).name(), "b"); + let arr = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr.value(0), 20); + assert_eq!(arr.value(1), 30); + Ok(()) + } + + #[tokio::test] + async fn scan_memory_partitions_invalid_column_in_filter_errors() -> Result<()> { + let schema = test_schema_id(); + let batch = batch_ids(&schema, vec![1])?; + let bad_filter = col("unknown").eq(lit(1i32)); + let ctx = SessionContext::new(); + let state = ctx.state(); + let err = scan_memory_partitions( + &state, + schema, + &[vec![batch]], + None, + std::slice::from_ref(&bad_filter), + None, + ) + .await + .unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("unknown") || msg.contains("column"), + "unexpected: {msg}" + ); + Ok(()) + } + + // --- TableProvider --- + + #[test] + fn table_provider_as_any_and_table_type() -> Result<()> { + let schema = test_schema_id(); + let t = PluginAdvancedTable::try_new("t", schema, vec![batch_ids(&test_schema_id(), vec![1])?])?; + assert!(t.as_any().downcast_ref::().is_some()); + assert_eq!(t.table_type(), TableType::Base); + Ok(()) + } + + #[tokio::test] + async fn 
table_provider_supports_filters_pushdown_delegates() -> Result<()> { + let schema = test_schema_id(); + let t = PluginAdvancedTable::try_new("t", schema, vec![batch_ids(&test_schema_id(), vec![1])?])?; + let f = col("id").gt(lit(0i32)); + let v = t.supports_filters_pushdown(&[&f])?; + assert_eq!(v, vec![TableProviderFilterPushDown::Exact]); + Ok(()) + } + + #[tokio::test] + async fn filter_and_limit_pushdown_scan() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + let table = Arc::new(PluginAdvancedTable::try_new("t", Arc::clone(&schema), vec![batch])?); + let ctx = SessionContext::new(); + ctx.register_table("t", table)?; + let df = ctx.sql("SELECT id FROM t WHERE id > 2 LIMIT 2").await?; + let batches = df.collect().await?; + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 2); + let arr = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(arr.value(0), 3); + assert_eq!(arr.value(1), 4); + Ok(()) + } + + #[tokio::test] + async fn statistics_reports_row_count() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![10, 20]))], + )?; + let table = PluginAdvancedTable::try_new("t", schema, vec![batch])?; + let s = table.statistics().expect("stats"); + assert_eq!(s.num_rows, Precision::Exact(2)); + Ok(()) + } +} diff --git a/probing/extensions/cc/Cargo.toml b/probing/extensions/cc/Cargo.toml index 5081cd0d..06721d59 100644 --- a/probing/extensions/cc/Cargo.toml +++ b/probing/extensions/cc/Cargo.toml @@ -22,7 +22,7 @@ thiserror = { workspace = true } async-trait = "0.1.83" rmesg = { version = "1.0.21", optional = true } -datafusion = { version = "47.0.0", default-features = false, 
features = [] } +datafusion = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] procfs = { version = "0.17.0", default-features = false, features = ["chrono"] } diff --git a/probing/extensions/python/Cargo.toml b/probing/extensions/python/Cargo.toml index 887bc8cc..f06f9c32 100644 --- a/probing/extensions/python/Cargo.toml +++ b/probing/extensions/python/Cargo.toml @@ -14,6 +14,7 @@ default = ["extension-module", "tracing"] [dependencies] probing-cc = { path = "../cc" } probing-core = { path = "../../core" } +probing-memtable = { path = "../../memtable" } probing-proto = { path = "../../proto" } probing-store = { path = "../../crates/store" } probing-cli = { path = "../../cli" } @@ -53,6 +54,7 @@ regex = ">=1.6.0" [dev-dependencies] tokio = { workspace = true } +tempfile = "3.8" [build-dependencies] pyo3-build-config = "0.25.1" diff --git a/probing/extensions/python/src/extensions/python/exttbls.rs b/probing/extensions/python/src/extensions/python/exttbls.rs index 8c5132eb..f4571d21 100644 --- a/probing/extensions/python/src/extensions/python/exttbls.rs +++ b/probing/extensions/python/src/extensions/python/exttbls.rs @@ -1,19 +1,50 @@ -use std::sync::Arc; -use std::{collections::HashMap, sync::Mutex}; +//! Python-facing `ExternalTable`, backed by **mmap memtables**. +//! +//! Each table is an [`ExposedTable`] (MEMT ring buffer) under +//! `//python.`, so: +//! +//! - data **survives a crash** of the producing process (postmortem-readable), +//! - any process can query it via the mmap SQL catalog +//! (`probing_core::core::memtable_sql`) as `python.`, +//! - the training process only ever pays the cost of an mmap row write — +//! query-side materialisation happens in whoever runs the SQL. +//! +//! The first appended row fixes the column dtypes (the Python API only +//! declares column names). A leading `timestamp` column (microseconds since +//! epoch, `I64`) is always present, matching the previous TimeSeries layout. 
+ +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; use once_cell::sync::Lazy; -use probing_proto::prelude::{Ele, TimeSeries}; -use probing_proto::types::series::DiscardStrategy; +use probing_memtable::discover::ExposedTable; +use probing_memtable::{DType, Schema as MtSchema, Value}; +use probing_proto::prelude::Ele; use pyo3::prelude::*; use pyo3::types::{PyDict, PyType}; use pyo3::{pyclass, pymethods, Bound, PyObject, PyResult, Python}; use crate::features::convert::{ele_to_python, python_to_ele}; -fn value_to_object(py: Python, v: &probing_proto::prelude::Ele) -> PyObject { +/// SQL schema (and filename prefix) for Python extern tables. +pub const EXTERN_TABLE_SCHEMA: &str = "python"; + +/// Ring layout: fixed chunk count; chunk byte size derives from capacity. +const NUM_CHUNKS: u32 = 8; +const MIN_CHUNK_BYTES: usize = 4 * 1024; +const MAX_CHUNK_BYTES: usize = 8 * 1024 * 1024; + +fn value_to_object(py: Python, v: &Ele) -> PyObject { ele_to_python(py, v).unwrap_or_else(|_| py.None()) } +fn now_micros() -> i64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_micros() as i64) + .unwrap_or(0) +} + #[pyclass] pub struct PyExternalTableConfig { #[pyo3(get)] @@ -49,22 +80,6 @@ impl FromPyObject<'_> for PyExternalTableConfig { } } -impl From for DiscardStrategy { - fn from(py_config: PyExternalTableConfig) -> Self { - match py_config.discard_strategy.as_str() { - "BaseElementCount" => DiscardStrategy::BaseElementCount { - discard_threshold: py_config.discard_threshold, - chunk_size: py_config.chunk_size, - }, - "BaseMemorySize" => DiscardStrategy::BaseMemorySize { - discard_threshold: py_config.discard_threshold, - chunk_size: py_config.chunk_size, - }, - _ => DiscardStrategy::None, - } - } -} - #[pymethods] impl PyExternalTableConfig { #[new] @@ -76,9 +91,10 @@ impl PyExternalTableConfig { } } + #[allow(clippy::wrong_self_convention)] // Python-facing method name, kept for API compat fn into_py(&self, py: 
Python<'_>) -> PyObject { let dict = PyDict::new(py); - dict.set_item("chunk_size", &self.chunk_size).unwrap(); + dict.set_item("chunk_size", self.chunk_size).unwrap(); dict.set_item("discard_threshold", self.discard_threshold) .unwrap(); dict.set_item("discard_strategy", &self.discard_strategy) @@ -87,12 +103,231 @@ impl PyExternalTableConfig { } } -pub static EXTERN_TABLES: Lazy>>>> = +/// Total ring capacity in bytes derived from the (legacy) discard config. +/// +/// - `BaseMemorySize`: `discard_threshold` *is* a byte budget. +/// - `BaseElementCount`: estimate 64 bytes/row. +/// - anything else: 16 MiB default. +fn ring_capacity_bytes(discard_threshold: usize, strategy: &str) -> usize { + let raw = match strategy { + "BaseMemorySize" => discard_threshold, + "BaseElementCount" => discard_threshold.saturating_mul(64), + _ => 16 * 1024 * 1024, + }; + raw.clamp(MIN_CHUNK_BYTES * NUM_CHUNKS as usize, 1 << 30) +} + +fn ring_chunk_bytes(capacity: usize) -> u32 { + (capacity / NUM_CHUNKS as usize).clamp(MIN_CHUNK_BYTES, MAX_CHUNK_BYTES) as u32 +} + +/// Column dtype inferred from the first appended value. +fn ele_dtype(e: &Ele) -> DType { + match e { + Ele::I32(_) => DType::I32, + Ele::I64(_) => DType::I64, + Ele::F32(_) => DType::F32, + Ele::F64(_) => DType::F64, + Ele::BOOL(_) => DType::U8, + Ele::DataTime(_) => DType::U64, + Ele::Text(_) | Ele::Url(_) | Ele::Nil => DType::Str, + } +} + +/// Owned cell value: coerced from an [`Ele`] to match the column dtype, so a +/// `Vec` row can borrow from it. 
+enum OwnedVal { + U8(u8), + I32(i32), + I64(i64), + F32(f32), + F64(f64), + U64(u64), + S(String), +} + +fn ele_to_owned(e: &Ele, dt: DType) -> OwnedVal { + let as_f64 = |e: &Ele| match e { + Ele::I32(v) => *v as f64, + Ele::I64(v) => *v as f64, + Ele::F32(v) => *v as f64, + Ele::F64(v) => *v, + Ele::BOOL(v) => *v as u8 as f64, + Ele::DataTime(v) => *v as f64, + _ => 0.0, + }; + match dt { + DType::U8 => OwnedVal::U8(match e { + Ele::BOOL(v) => *v as u8, + other => as_f64(other) as u8, + }), + DType::I32 => OwnedVal::I32(as_f64(e) as i32), + DType::I64 => OwnedVal::I64(as_f64(e) as i64), + DType::F32 => OwnedVal::F32(as_f64(e) as f32), + DType::F64 => OwnedVal::F64(as_f64(e)), + DType::U64 => OwnedVal::U64(as_f64(e) as u64), + DType::U32 => OwnedVal::U64(as_f64(e) as u64), + DType::Str | DType::Bytes => OwnedVal::S(match e { + Ele::Text(s) | Ele::Url(s) => s.clone(), + Ele::Nil => String::new(), + other => other.to_string(), + }), + } +} + +fn owned_to_value(o: &OwnedVal) -> Value<'_> { + match o { + OwnedVal::U8(v) => Value::U8(*v), + OwnedVal::I32(v) => Value::I32(*v), + OwnedVal::I64(v) => Value::I64(*v), + OwnedVal::F32(v) => Value::F32(*v), + OwnedVal::F64(v) => Value::F64(*v), + OwnedVal::U64(v) => Value::U64(*v), + OwnedVal::S(s) => Value::Str(s), + } +} + +/// State behind one extern table. The mmap ring is created lazily on the +/// first append because the Python API declares names but not types. 
+pub struct ExternBacking { + name: String, + columns: Vec, + capacity_bytes: usize, + dtypes: Vec, + table: Option, +} + +impl ExternBacking { + fn new(name: &str, columns: Vec, capacity_bytes: usize) -> Self { + Self { + name: name.to_string(), + columns, + capacity_bytes, + dtypes: vec![], + table: None, + } + } + + fn ensure_table(&mut self, first_row: &[Ele]) -> Result<(), String> { + if self.table.is_some() { + return Ok(()); + } + let dtypes: Vec = first_row.iter().map(ele_dtype).collect(); + let mut schema = MtSchema::new().col("timestamp", DType::I64); + for (name, dt) in self.columns.iter().zip(dtypes.iter()) { + schema = schema.col(name, *dt); + } + let chunk_bytes = ring_chunk_bytes(self.capacity_bytes); + let filename = format!("{EXTERN_TABLE_SCHEMA}.{}", self.name); + let table = ExposedTable::create(&filename, &schema, chunk_bytes, NUM_CHUNKS) + .map_err(|e| format!("failed to create mmap table {filename}: {e}"))?; + self.dtypes = dtypes; + self.table = Some(table); + Ok(()) + } + + fn append(&mut self, timestamp: i64, values: &[Ele]) -> Result<(), String> { + if values.len() != self.columns.len() { + return Err("column count mismatch".to_string()); + } + self.ensure_table(values)?; + + let owned: Vec = values + .iter() + .zip(self.dtypes.iter()) + .map(|(e, dt)| ele_to_owned(e, *dt)) + .collect(); + let mut row: Vec = Vec::with_capacity(owned.len() + 1); + row.push(Value::I64(timestamp)); + row.extend(owned.iter().map(owned_to_value)); + + // ExposedTable::push_row validates schema and auto-advances chunks. + self.table + .as_mut() + .expect("ensured above") + .push_row(&row); + Ok(()) + } + + /// Rows in chronological order; when `limit` is set, only the most + /// recent `limit` rows are returned (still oldest → newest). 
+ fn take(&self, limit: Option) -> Vec<(Ele, Vec)> { + let Some(table) = &self.table else { + return vec![]; + }; + let view = table.view(); + let mut out: Vec<(Ele, Vec)> = Vec::new(); + for chunk in view.chunks_logical() { + for row in view.rows(chunk) { + let mut cursor = row.cursor(); + let ts = Ele::I64(cursor.next_i64()); + let vals: Vec = self + .dtypes + .iter() + .map(|dt| match dt { + DType::U8 => Ele::BOOL(cursor.next_u8() != 0), + DType::I32 => Ele::I32(cursor.next_i32()), + DType::I64 => Ele::I64(cursor.next_i64()), + DType::F32 => Ele::F32(cursor.next_f32()), + DType::F64 => Ele::F64(cursor.next_f64()), + DType::U64 => Ele::DataTime(cursor.next_u64()), + DType::U32 => Ele::I64(cursor.next_u32() as i64), + DType::Str => Ele::Text(cursor.next_str().to_string()), + DType::Bytes => Ele::Text(String::from_utf8_lossy(cursor.next_bytes()).to_string()), + }) + .collect(); + out.push((ts, vals)); + } + } + if let Some(limit) = limit { + if out.len() > limit { + out.drain(..out.len() - limit); + } + } + out + } +} + +impl std::fmt::Debug for ExternBacking { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ExternBacking") + .field("name", &self.name) + .field("columns", &self.columns) + .field("created", &self.table.is_some()) + .finish() + } +} + +pub static EXTERN_TABLES: Lazy>>>> = Lazy::new(|| Mutex::new(Default::default())); #[pyclass] #[derive(Clone, Debug)] -pub struct ExternalTable(Arc>, usize); +pub struct ExternalTable(Arc>, usize); + +impl ExternalTable { + fn extract_eles(values: Vec) -> Vec { + Python::with_gil(|py| { + values + .into_iter() + .map(|v| { + let bound = v.bind(py); + python_to_ele(bound).unwrap_or(Ele::Nil) + }) + .collect() + }) + } + + fn create_backing( + name: &str, + columns: Vec, + discard_threshold: usize, + discard_strategy: &str, + ) -> Arc> { + let capacity = ring_capacity_bytes(discard_threshold, discard_strategy); + Arc::new(Mutex::new(ExternBacking::new(name, columns, capacity))) + 
} +} #[pymethods] impl ExternalTable { @@ -105,32 +340,22 @@ impl ExternalTable { discard_threshold: usize, discard_strategy: String, ) -> Self { + let _ = chunk_size; // ring chunking is byte-based; kept for API compat let ncolumn = columns.len(); - let config = PyExternalTableConfig { - chunk_size, - discard_threshold, - discard_strategy, - }; - let config: DiscardStrategy = config.into(); - let ts = Arc::new(Mutex::new( - TimeSeries::builder_with_config(config) - .with_columns(columns) - .build(), - )); + let backing = Self::create_backing(name, columns, discard_threshold, &discard_strategy); EXTERN_TABLES .lock() .unwrap() - .insert(name.to_string(), ts.clone()); - ExternalTable(ts, ncolumn) + .insert(name.to_string(), backing.clone()); + ExternalTable(backing, ncolumn) } #[classmethod] fn get(_cls: &Bound<'_, PyType>, name: &str) -> PyResult { let binding = EXTERN_TABLES.lock().unwrap(); - let ts = binding.get(name); - if let Some(ts) = ts { - let ncolumn = ts.lock().unwrap().cols.len(); - Ok(ExternalTable(ts.clone(), ncolumn)) + if let Some(backing) = binding.get(name) { + let ncolumn = backing.lock().unwrap().columns.len(); + Ok(ExternalTable(backing.clone(), ncolumn)) } else { Err(pyo3::exceptions::PyValueError::new_err(format!( "table {name} not found" @@ -148,37 +373,30 @@ impl ExternalTable { discard_threshold: usize, discard_strategy: String, ) -> PyResult { + let _ = chunk_size; let mut binding = EXTERN_TABLES.lock().unwrap(); - let ts = binding.get(name); - if let Some(ts) = ts { - let ncolumn = ts.lock().unwrap().cols.len(); - Ok(ExternalTable(ts.clone(), ncolumn)) + if let Some(backing) = binding.get(name) { + let ncolumn = backing.lock().unwrap().columns.len(); + Ok(ExternalTable(backing.clone(), ncolumn)) } else { let ncolumn = columns.len(); - let config = PyExternalTableConfig { - chunk_size, - discard_threshold, - discard_strategy, - }; - let config: DiscardStrategy = config.into(); - let ts = Arc::new(Mutex::new( - 
TimeSeries::builder_with_config(config) - .with_columns(columns) - .build(), - )); - binding.insert(name.to_string(), ts.clone()); - Ok(ExternalTable(ts, ncolumn)) + let backing = + Self::create_backing(name, columns, discard_threshold, &discard_strategy); + binding.insert(name.to_string(), backing.clone()); + Ok(ExternalTable(backing, ncolumn)) } } #[classmethod] fn drop(_cls: &Bound<'_, PyType>, name: &str) -> PyResult<()> { + // Dropping the backing drops the ExposedTable, which unlinks the + // mmap file and removes the table from SQL. let _ = EXTERN_TABLES.lock().unwrap().remove(name); Ok(()) } fn names(&self) -> Vec { - self.0.lock().unwrap().names.clone() + self.0.lock().unwrap().columns.clone() } fn append(&mut self, values: Vec) -> PyResult<()> { @@ -187,23 +405,12 @@ impl ExternalTable { "column count mismatch", )); } - let t = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) + let eles = Self::extract_eles(values); + self.0 + .lock() .unwrap() - .as_micros() as i64; - let values: Vec = Python::with_gil(|py| { - values - .into_iter() - .map(|v| { - let bound = v.bind(py); - python_to_ele(&bound).unwrap_or(Ele::Nil) - }) - .collect() - }); - match self.0.lock().unwrap().append(t.into(), values) { - Ok(_) => Ok(()), - Err(e) => Err(pyo3::exceptions::PyValueError::new_err(e.to_string())), - } + .append(now_micros(), &eles) + .map_err(pyo3::exceptions::PyValueError::new_err) } fn append_ts(&mut self, t: i64, values: Vec) -> PyResult<()> { @@ -212,26 +419,25 @@ impl ExternalTable { "column count mismatch", )); } - let values: Vec = Python::with_gil(|py| { - values - .into_iter() - .map(|v| { - let bound = v.bind(py); - python_to_ele(&bound).unwrap_or(Ele::Nil) - }) - .collect() - }); - let _ = self.0.lock().unwrap().append(t.into(), values); + let eles = Self::extract_eles(values); + self.0 + .lock() + .unwrap() + .append(t, &eles) + .map_err(pyo3::exceptions::PyValueError::new_err) + } + + fn append_many(&mut self, rows: Vec>) -> 
PyResult<()> { + for row in rows { + self.append(row)?; + } Ok(()) } #[pyo3(signature = (limit=None))] fn take(&self, limit: Option) -> PyResult)>> { - let result: Vec<(PyObject, Vec)> = self - .0 - .lock() - .unwrap() - .take(limit) + let rows = self.0.lock().unwrap().take(limit); + let result = rows .iter() .map(|(t, vals)| { Python::with_gil(|py| { @@ -252,21 +458,23 @@ impl ExternalTable { mod tests { use super::*; use crate::extensions::python::PythonPlugin; - use probing_cc::extensions::envs::EnvPlugin; - use probing_cc::extensions::files::FilesPlugin; - use probing_core::core::Engine; + use probing_core::core::{Engine, UnifiedMemtablePlugin}; use pyo3::ffi::c_str; + /// Route all mmap files of this test process into one tempdir. + static TEST_DATA_DIR: Lazy = Lazy::new(|| { + let dir = tempfile::tempdir().unwrap(); + std::env::set_var("PROBING_DATA_DIR", dir.path()); + dir + }); + fn setup() { - // Module registration is now handled automatically via _core module - // In test environment, we need to manually set up the probing module - // since _core may not be importable as a Python module + let _ = &*TEST_DATA_DIR; pyo3::prepare_freethreaded_python(); Python::with_gil(|py| { use pyo3::types::PyModule; use pyo3::PyTypeInfo; - // Get or create probing module let sys = PyModule::import(py, "sys").unwrap(); let modules = sys.getattr("modules").unwrap(); @@ -278,8 +486,6 @@ mod tests { m }; - // Manually add ExternalTable to probing module for tests - // This mimics what _core module does if !probing.hasattr("ExternalTable").unwrap_or(false) { probing .setattr("ExternalTable", ExternalTable::type_object(py)) @@ -288,19 +494,23 @@ mod tests { }); } - fn setup_table3() { + /// Create a table with a unique name and three rows; idempotent per name. 
+ fn setup_table(name: &str) { setup(); Python::with_gil(|py| { py.run( - c_str!( + &std::ffi::CString::new(format!( r#" import probing -table3 = probing.ExternalTable.get_or_create("table3", ["a", "b"]) -table3.append([1, 2]) -table3.append([3, 4]) -table3.append([5, 6]) - "# - ), +if not hasattr(probing, "_made_{name}"): + t = probing.ExternalTable.get_or_create("{name}", ["a", "b"]) + t.append([1, 2]) + t.append([3, 4]) + t.append([5, 6]) + probing._made_{name} = True +"# + )) + .unwrap(), None, None, ) @@ -308,6 +518,16 @@ table3.append([5, 6]) }); } + async fn engine_with_python() -> Engine { + Engine::builder() + .with_default_namespace("probe") + .with_plugin(PythonPlugin::create("python")) + .with_plugin(Arc::new(UnifiedMemtablePlugin)) + .build() + .await + .unwrap() + } + #[test] fn test_create_new_table() { setup(); @@ -337,8 +557,7 @@ table = probing.ExternalTable.get_or_create("table2", ["a", "b"]) ) .unwrap(); let binding = EXTERN_TABLES.lock().unwrap(); - let table1 = binding.get("table2"); - assert!(table1.is_some()); + assert!(binding.contains_key("table2")); }); } @@ -346,25 +565,12 @@ table = probing.ExternalTable.get_or_create("table2", ["a", "b"]) fn test_drop_table_in_python() { setup(); Python::with_gil(|py| { - // Create the table first - py.run( - c_str!( - r#" -import probing -probing.ExternalTable.get_or_create("table2", ["a", "b"]) - "# - ), - None, - None, - ) - .unwrap(); - - // Now drop it py.run( c_str!( r#" import probing -probing.ExternalTable.drop("table2") +probing.ExternalTable.get_or_create("table_to_drop", ["a", "b"]) +probing.ExternalTable.drop("table_to_drop") "# ), None, @@ -372,28 +578,68 @@ probing.ExternalTable.drop("table2") ) .unwrap(); let binding = EXTERN_TABLES.lock().unwrap(); - let table1 = binding.get("table2"); - assert!(table1.is_none()); + assert!(!binding.contains_key("table_to_drop")); + }); + } + + #[test] + fn test_append_take_roundtrip_and_mmap_file() { + setup(); + let mut table = ExternalTable::new( + 
"roundtrip", + vec!["x".to_string(), "msg".to_string()], + 10000, + 1_000_000, + "BaseMemorySize".to_string(), + ); + Python::with_gil(|py| { + let vals: Vec = vec![ + 1i64.into_pyobject(py).unwrap().into_any().unbind(), + "hello".into_pyobject(py).unwrap().into_any().unbind(), + ]; + table.append(vals).unwrap(); + let vals: Vec = vec![ + 2i64.into_pyobject(py).unwrap().into_any().unbind(), + "world".into_pyobject(py).unwrap().into_any().unbind(), + ]; + table.append(vals).unwrap(); + }); + + // mmap file exists on disk under //python.roundtrip + let path = probing_memtable::discover::default_dir() + .join(std::process::id().to_string()) + .join("python.roundtrip"); + assert!(path.is_file(), "mmap file missing: {path:?}"); + + // take() returns rows oldest → newest, with coerced values + let rows = table.take(None).unwrap(); + assert_eq!(rows.len(), 2); + Python::with_gil(|py| { + let (_, vals) = &rows[0]; + assert_eq!(vals[0].extract::(py).unwrap(), 1); + assert_eq!(vals[1].extract::(py).unwrap(), "hello"); + let (_, vals) = &rows[1]; + assert_eq!(vals[0].extract::(py).unwrap(), 2); + assert_eq!(vals[1].extract::(py).unwrap(), "world"); + }); + + // take(limit) keeps the most recent rows + let rows = table.take(Some(1)).unwrap(); + assert_eq!(rows.len(), 1); + Python::with_gil(|py| { + assert_eq!(rows[0].1[1].extract::(py).unwrap(), "world"); }); } #[test] fn test_see_py_table_in_engine() { - setup_table3(); + setup_table("table3"); let rt = tokio::runtime::Builder::new_multi_thread() .worker_threads(4) .enable_all() .build() .unwrap(); - let engine = rt - .block_on(async { - Engine::builder() - .with_default_namespace("probe") - .with_plugin(PythonPlugin::create("python")) - .build() - .await - }) - .unwrap(); + let engine = rt.block_on(engine_with_python()); let tables = rt.block_on(async { engine .async_query( @@ -402,10 +648,8 @@ probing.ExternalTable.drop("table2") .await .unwrap() }); - // Query may return None if no tables found let df = 
tables.expect("Table 'table3' should be found in information_schema.tables"); assert!(!df.cols.is_empty(), "Should have at least one column"); - // Check if we have any rows - DataFrame.len() returns number of rows assert!( df.len() > 0, "Table 'table3' should be found in information_schema.tables" @@ -414,85 +658,86 @@ probing.ExternalTable.drop("table2") #[test] fn test_see_py_table_data_in_engine() { - setup_table3(); + setup_table("table4"); let rt = tokio::runtime::Builder::new_multi_thread() .worker_threads(4) .enable_all() .build() .unwrap(); - let engine = rt - .block_on(async { - Engine::builder() - .with_default_namespace("probe") - .with_plugin(PythonPlugin::create("python")) - .build() - .await - }) - .unwrap(); + let engine = rt.block_on(engine_with_python()); let tables = rt.block_on(async { engine - .async_query("select * from python.table3 ") + .async_query("select * from python.table4 ") .await .unwrap() }); - let df = tables.expect("Table 'table3' should be queryable"); - // DataFrame.len() returns number of rows + let df = tables.expect("Table 'table4' should be queryable"); assert_eq!(df.len(), 3, "Should have 3 rows"); + // timestamp + a + b + assert_eq!(df.names.len(), 3, "Should have 3 columns: {:?}", df.names); + assert_eq!(df.names[0], "timestamp"); } #[test] fn test_calculate_in_sql_with_filter() { - setup_table3(); + setup_table("table5"); let rt = tokio::runtime::Builder::new_multi_thread() .worker_threads(4) .enable_all() .build() .unwrap(); - let engine = rt - .block_on(async { - Engine::builder() - .with_default_namespace("probe") - .with_plugin(PythonPlugin::create("python")) - .build() - .await - }) - .unwrap(); + let engine = rt.block_on(engine_with_python()); let tables = rt.block_on(async { engine - .async_query("select a + b as c from python.table3 where a > 1") + .async_query("select a + b as c from python.table5 where a > 1") .await .unwrap() }); let df = tables.expect("Query should return results"); - // DataFrame.len() 
returns number of rows assert_eq!(df.len(), 2, "Should have 2 rows where a > 1"); } #[test] fn test_aggregate_in_sql() { - setup_table3(); + setup_table("table6"); let rt = tokio::runtime::Builder::new_multi_thread() .worker_threads(4) .enable_all() .build() .unwrap(); - let engine = rt - .block_on(async { - Engine::builder() - .with_default_namespace("probe") - .with_plugin(PythonPlugin::create("python")) - .build() - .await - }) - .unwrap(); + let engine = rt.block_on(engine_with_python()); let tables = rt.block_on(async { engine - .async_query("select sum(a), sum(b) from python.table3") + .async_query("select sum(a), sum(b) from python.table6") .await .unwrap() }); let df = tables.expect("Aggregation query should return results"); - println!("{df:?}"); assert!(!df.cols.is_empty(), "Should have aggregation results"); } + + #[test] + fn test_static_python_tables_not_shadowed() { + // Extern mmap tables under schema `python` must not hide the static + // namespace (backtrace, expression tables) — the merged catalog + // resolves mmap first, then falls through to the inner provider. + setup_table("table7"); + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap(); + let engine = rt.block_on(engine_with_python()); + // `python.\`time.time()\`` is served by the static namespace's + // expression path; it must still resolve with extern tables present. 
+ let result = rt.block_on(async { + engine + .async_query("select * from python.`time.time()`") + .await + }); + assert!( + result.is_ok(), + "static python namespace shadowed: {result:?}" + ); + } } diff --git a/probing/extensions/python/src/extensions/python/tbls.rs b/probing/extensions/python/src/extensions/python/tbls.rs index 7dfa7d1a..a16a20b6 100644 --- a/probing/extensions/python/src/extensions/python/tbls.rs +++ b/probing/extensions/python/src/extensions/python/tbls.rs @@ -5,13 +5,12 @@ use std::sync::Arc; use anyhow::Result; use log::error; +use probing_core::core::LazyTableSource; use probing_core::core::{ ArrayRef, CustomNamespace, DataType, Field, Float64Array, Int64Array, NamespacePluginHelper, RecordBatch, Schema, SchemaRef, StringArray, }; -use probing_core::core::{Float32Array, Int32Array, LazyTableSource}; -use probing_proto::prelude::{CallFrame, Ele, TimeSeries}; -use probing_proto::types; +use probing_proto::prelude::CallFrame; use pyo3::types::PyAnyMethods; use pyo3::types::PyDict; use pyo3::types::PyDictMethods; @@ -161,27 +160,6 @@ impl PythonNamespace { }) } - fn data_from_extern(expr: &str) -> Result> { - let binding = super::exttbls::EXTERN_TABLES - .lock() - .map_err(|e| anyhow::anyhow!("Failed to lock EXTERN_TABLES: {:?}", e))?; - - let table = binding - .get(expr) - .ok_or_else(|| anyhow::anyhow!("Table '{}' not found", expr))?; - - let names = table - .lock() - .map_err(|e| anyhow::anyhow!("Failed to lock table: {:?}", e))? 
- .names - .clone(); - - let ts = table - .lock() - .map_err(|e| anyhow::anyhow!("Failed to lock table: {:?}", e))?; - - Self::time_series_to_recordbatch(names, &ts) - } } impl CustomNamespace for PythonNamespace { @@ -190,15 +168,10 @@ impl CustomNamespace for PythonNamespace { } fn list() -> Vec { - let mut tables = super::exttbls::EXTERN_TABLES.lock().map_or_else( - |e| { - log::error!("Failed to lock EXTERN_TABLES: {e:?}"); - vec![] - }, - |binding| binding.keys().cloned().collect(), - ); - tables.push("backtrace".to_string()); // Add backtrace to the list - tables + // Extern tables (`probing.ExternalTable`) are mmap-backed and served + // by the mmap SQL catalog (`probing_core::core::memtable_sql`), not + // by this namespace. + vec!["backtrace".to_string()] } fn data(expr: &str) -> Vec { @@ -210,14 +183,6 @@ impl CustomNamespace for PythonNamespace { vec![] } } - } else if Self::list().contains(&expr.to_string()) { - match Self::data_from_extern(expr) { - Ok(batches) => batches, - Err(e) => { - error!("Error getting data from extern: {e:?}"); - vec![] - } - } } else { match Self::data_from_python(expr) { Ok(batches) => batches, @@ -244,168 +209,21 @@ impl CustomNamespace for PythonNamespace { }); } - let binding = super::exttbls::EXTERN_TABLES.lock().map_or_else( - |e| { - log::error!("Failed to lock EXTERN_TABLES: {e:?}"); - Default::default() - }, - |binding| binding.clone(), - ); - - if binding.contains_key(expr) { - let table = binding.get(expr).unwrap(); - let names = table.lock().unwrap().names.clone(); - let dtypes = table - .lock() - .unwrap() - .cols - .iter() - .map(|x| x.dtype()) - .collect::>(); - let mut fields = Vec::new(); - - // Check if table already has a timestamp column - let has_timestamp = names.iter().any(|n| n == "timestamp"); - - // Only add timestamp if it doesn't already exist - if !has_timestamp { - fields.push(Field::new("timestamp", DataType::Int64, true)); - } - - for (name, dtype) in names.iter().zip(dtypes.iter()) { - 
fields.push(Field::new( - name, - match dtype { - types::EleType::I64 => DataType::Int64, - types::EleType::F64 => DataType::Float64, - types::EleType::I32 => DataType::Int32, - types::EleType::F32 => DataType::Float32, - _ => DataType::Utf8, - }, - false, - )); - } - - let schema = Some(SchemaRef::new(Schema::new(fields))); - - Arc::new(LazyTableSource { - name: expr.to_string(), - schema, - data: Self::data_from_extern(expr).unwrap_or_default(), - }) + let data: Vec = Self::data_from_python(expr).unwrap_or_default(); + let schema = if data.is_empty() { + None } else { - let data: Vec = Self::data_from_python(expr).unwrap_or_default(); - let schema = if data.is_empty() { - None - } else { - Some(data[0].schema().clone()) - }; - Arc::new(LazyTableSource { - name: expr.to_string(), - schema, - data, - }) - } + Some(data[0].schema().clone()) + }; + Arc::new(LazyTableSource { + name: expr.to_string(), + schema, + data, + }) } } impl PythonNamespace { - pub fn time_series_to_recordbatch( - names: Vec, - ts: &TimeSeries, - ) -> Result> { - let mut fields: Vec = vec![]; - let mut columns: Vec = vec![]; - - fields.push(Field::new("timestamp", DataType::Int64, true)); - names.iter().zip(ts.cols.iter()).for_each(|(name, col)| { - let data_type = match col.dtype() { - types::EleType::I64 => DataType::Int64, - types::EleType::F64 => DataType::Float64, - types::EleType::I32 => DataType::Int32, - types::EleType::F32 => DataType::Float32, - _ => DataType::Utf8, - }; - fields.push(Field::new(name, data_type, false)); - }); - - let length = ts.len(); - - let timeseries = ts - .timestamp - .iter() - .take(length) - .map(|x| match x { - Ele::I64(x) => x, - _ => 0, - }) - .collect::>(); - columns.push(Arc::new(Int64Array::from(timeseries))); - - for col in ts.cols.iter() { - let col = match col.dtype() { - types::EleType::I64 => Arc::new(Int64Array::from( - col.iter() - .take(length) - .map(|x| match x { - Ele::I64(x) => x, - _ => 0, - }) - .collect::>(), - )) as ArrayRef, - 
types::EleType::F64 => Arc::new(Float64Array::from( - col.iter() - .take(length) - .map(|x| match x { - Ele::F64(x) => x, - _ => 0.0, - }) - .collect::>(), - )) as ArrayRef, - types::EleType::I32 => Arc::new(Int32Array::from( - col.iter() - .take(length) - .map(|x| match x { - Ele::I32(x) => x, - _ => 0, - }) - .collect::>(), - )) as ArrayRef, - types::EleType::F32 => Arc::new(Float32Array::from( - col.iter() - .take(length) - .map(|x| match x { - Ele::F32(x) => x, - _ => 0.0, - }) - .collect::>(), - )) as ArrayRef, - types::EleType::Text => Arc::new(StringArray::from( - col.iter() - .take(length) - .map(|x| match x { - Ele::Text(x) => x, - _ => x.to_string(), - }) - .collect::>(), - )) as ArrayRef, - _ => Arc::new(StringArray::from( - col.iter() - .take(length) - .map(|x| x.to_string()) - .collect::>(), - )) as ArrayRef, - }; - - columns.push(col); - } - - Ok(vec![RecordBatch::try_new( - SchemaRef::new(Schema::new(fields)), - columns, - )?]) - } - pub fn object_to_recordbatch(obj: Bound<'_, PyAny>) -> Result> { let mut fields: Vec = vec![]; let mut columns: Vec = vec![]; diff --git a/probing/memtable/Cargo.toml b/probing/memtable/Cargo.toml index c6a99f39..2c5652e3 100644 --- a/probing/memtable/Cargo.toml +++ b/probing/memtable/Cargo.toml @@ -10,6 +10,7 @@ description = "Self-describing columnar memory table with ring buffer" xxhash-rust = { version = "0.8", features = ["xxh3"] } memmap2 = "0.9" libc = "0.2" +pco = "0.4" [dev-dependencies] diff --git a/probing/memtable/src/discover.rs b/probing/memtable/src/discover.rs index 43e06680..eec04e56 100644 --- a/probing/memtable/src/discover.rs +++ b/probing/memtable/src/discover.rs @@ -45,7 +45,7 @@ use crate::memh::layout::required_total_size as memh_required_size; use crate::memh::table::init_buf as memh_init_buf; use crate::memh::{MemhView, MemhWriter}; use crate::memtable::{MemTable, MemTableView, MemTableWriter}; -use crate::raw::{init_buf, process_start_time, validate_buf}; +use crate::raw::{process_start_time, 
validate_buf}; use crate::schema::{Schema, Value}; use memmap2::{Mmap, MmapMut}; @@ -102,12 +102,12 @@ pub fn is_creator_alive(pid: u32, expected_start_time: u64) -> bool { /// A memtable backed by an mmap'd file, exposed for cross-process discovery. /// -/// On [`Drop`], the file is removed. If the parent `/` directory is -/// empty afterward, it is removed too. +/// Thin wrapper around a **shared-memory** [`MemTable`] (see +/// [`MemTable::shared`]); kept for API stability. On [`Drop`], the file is +/// removed. If the parent `/` directory is empty afterward, it is +/// removed too. pub struct ExposedTable { - mmap: MmapMut, - path: PathBuf, - dir: PathBuf, + inner: MemTable, } impl ExposedTable { @@ -131,37 +131,22 @@ impl ExposedTable { chunk_size: u32, num_chunks: u32, ) -> io::Result { - let dir = base_dir.join(std::process::id().to_string()); - fs::create_dir_all(&dir)?; - - let path = dir.join(name); - let size = MemTable::required_size(schema, chunk_size as usize, num_chunks as usize); - - let file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) - .open(&path)?; - file.set_len(size as u64)?; - - let mut mmap = unsafe { MmapMut::map_mut(&file)? }; - init_buf(&mut mmap, schema, chunk_size, num_chunks); - - Ok(Self { mmap, path, dir }) + Ok(Self { + inner: MemTable::shared_in(base_dir, name, schema, chunk_size, num_chunks)?, + }) } pub fn as_bytes(&self) -> &[u8] { - &self.mmap + self.inner.as_bytes() } pub fn as_bytes_mut(&mut self) -> &mut [u8] { - &mut self.mmap + self.inner.as_bytes_mut() } /// File path of this table. pub fn path(&self) -> &Path { - &self.path + self.inner.path().expect("ExposedTable is always shared") } /// Create a [`MemTableWriter`] backed by the mmap'd region. @@ -169,51 +154,22 @@ impl ExposedTable { /// **Note**: this re-validates the entire buffer on every call. /// Prefer [`push_row`](Self::push_row) for hot-path writes. 
pub fn writer(&mut self) -> MemTableWriter<'_> { - MemTableWriter::new(&mut self.mmap).expect("mmap buffer validated at creation") + MemTableWriter::new(self.inner.as_bytes_mut()).expect("mmap buffer validated at creation") } /// Append a row without re-validating the buffer. /// /// This is the fast path for high-frequency writes — it skips the /// O(rows × chunks) `validate_buf` that `writer()` performs on every call. - /// Safe because the buffer was validated at `create()` time and only - /// mutated through well-formed write operations. - /// - /// # Panic safety - /// - /// The spinlock is released even if the write panics (e.g. row exceeds - /// chunk capacity), preventing a deadlocked mmap file. + /// The spinlock is released even if the write panics, preventing a + /// deadlocked mmap file (see [`MemTable::push_row`]). pub fn push_row(&mut self, values: &[Value]) { - use crate::layout::{acquire_write_lock, release_write_lock}; - use crate::memtable::push_plain_row; - use crate::raw::validate_row_schema; - - debug_assert!( - validate_row_schema(&self.mmap, values), - "value types do not match schema" - ); - - acquire_write_lock(&mut self.mmap); - let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - push_plain_row(&mut self.mmap, values); - })); - release_write_lock(&mut self.mmap); - - if let Err(payload) = result { - std::panic::resume_unwind(payload); - } + self.inner.push_row(values) } /// Create a read-only [`MemTableView`]. 
pub fn view(&self) -> MemTableView<'_> { - MemTableView::new(&self.mmap).expect("mmap buffer validated at creation") - } -} - -impl Drop for ExposedTable { - fn drop(&mut self) { - let _ = fs::remove_file(&self.path); - let _ = fs::remove_dir(&self.dir); // succeeds only if empty + self.inner.view() } } @@ -296,6 +252,42 @@ impl Drop for ExposedHashTable { } } +// ── MappedFile ──────────────────────────────────────────────────────── + +/// Read-only mmap of a memtable file (MEMT ring or MEMH hash), without +/// format validation. +/// +/// This is the zero-copy read path for SQL/catalog integration: pages are +/// faulted in on demand instead of copying the whole file to the heap +/// (rings are sized for capacity, so most chunks may be untouched). +/// Callers inspect the bytes with [`crate::detect_table`] and construct +/// the appropriate view, which performs its own validation. +/// +/// The mapping stays valid even if the creating process unlinks the file +/// (e.g. [`ExposedTable`] drop) while this handle is alive. +#[derive(Debug)] +pub struct MappedFile { + mmap: Mmap, + path: PathBuf, +} + +impl MappedFile { + pub fn open(path: impl AsRef) -> io::Result { + let path = path.as_ref().to_path_buf(); + let file = File::open(&path)?; + let mmap = unsafe { Mmap::map(&file)? }; + Ok(Self { mmap, path }) + } + + pub fn path(&self) -> &Path { + &self.path + } + + pub fn as_bytes(&self) -> &[u8] { + &self.mmap + } +} + // ── DiscoveredTable ─────────────────────────────────────────────────── /// A memtable discovered on the filesystem (read-only mmap). 
@@ -469,6 +461,7 @@ fn read_any_start_time(dir: &Path) -> u64 { #[cfg(test)] mod tests { use super::*; + use crate::raw::init_buf; use crate::schema::{DType, Value}; use std::sync::atomic::{AtomicU32, Ordering as AtOrd}; diff --git a/probing/memtable/src/layout.rs b/probing/memtable/src/layout.rs index 536f29ee..f737468d 100644 --- a/probing/memtable/src/layout.rs +++ b/probing/memtable/src/layout.rs @@ -1,15 +1,15 @@ //! Low-level layout: header, column descriptors, chunk headers, byte helpers. //! -//! ## Header v2 binary layout (64 bytes, 1 cache line) +//! ## Header v3 binary layout (64 bytes, 1 cache line) //! //! ```text //! offset size field notes //! ────────────────────────────────────────────────────────── //! 0 4 magic 0x4D454D54 ("MEMT" in LE) -//! 4 2 version 2 +//! 4 2 version 3 //! 6 2 header_size 64 (validation only) //! 8 2 byte_order BOM: written as [0x01, 0x02] -//! 10 2 _pad0 0 +//! 10 2 ts_col timestamp column index + 1 (0 = none) //! 12 4 flags feature bits (see FLAG_*) //! 16 4 num_cols //! 20 4 num_chunks @@ -17,11 +17,11 @@ //! 28 4 data_offset (64-aligned) //! ─── 32 byte boundary (cold/hot split) ───────────────── //! 32 4 write_chunk AtomicU32 -//! 36 4 write_lock AtomicU32 +//! 36 4 write_lock AtomicU32: 0 = unlocked, else holder PID //! 40 4 refcount AtomicU32 //! 44 4 creator_pid PID of creating process //! 48 8 creator_start_time process start time (platform-specific) -//! 56 8 _reserved 0 +//! 56 8 lock_owner_start AtomicU64: lock holder's start time //! ────────────────────────────────────────────────────────── //! ``` //! @@ -29,7 +29,8 @@ //! allows readers to detect endianness mismatch without guessing. 
use std::mem; -use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicI64, AtomicU32, AtomicU64, Ordering}; +use std::time::{Duration, Instant}; // ── C-style layout structs ────────────────────────────────────────── @@ -38,7 +39,11 @@ pub const MAGIC_MEMT: u32 = 0x4D45_4D54; pub(crate) const MAGIC: u32 = MAGIC_MEMT; /// Header format version for MEMT. -pub(crate) const VERSION: u16 = 2; +/// +/// v3: `_pad0` became `ts_col`, `_reserved` became `lock_owner_start`, +/// `write_lock` stores the holder PID (was 0/1), and `ChunkHeader` grew +/// `min_ts`/`max_ts` (24 → 40 bytes). +pub(crate) const VERSION: u16 = 3; /// Byte-order mark: written as raw bytes `[0x01, 0x02]`. /// On a LE host, `u16::from_ne_bytes([0x01, 0x02])` == `0x0201`. @@ -80,7 +85,12 @@ pub(crate) struct Header { pub header_size: u16, /// Byte-order mark, written as `BYTE_ORDER_MARK`. pub byte_order: u16, - pub _pad0: u16, + /// Designated timestamp column **index + 1** (0 = no timestamp column). + /// + /// Set at init when the schema contains an `I64` column named + /// `"timestamp"`. The writer maintains per-chunk `min_ts`/`max_ts` + /// from this column so readers can prune chunks by time range. + pub ts_col: u16, /// Feature flags (see `FLAG_*` constants). pub flags: u32, pub num_cols: u32, @@ -92,7 +102,10 @@ pub(crate) struct Header { // ── hot zone (atomically mutated) ──────────────────── /// Ring buffer: index of the chunk currently being written. pub write_chunk: AtomicU32, - /// Spinlock for writer serialization: 0 = unlocked, 1 = locked. + /// Robust writer spinlock: 0 = unlocked, otherwise the **PID** of the + /// holding process. A waiter that has spun past + /// [`LOCK_STEAL_TIMEOUT`] checks the holder's liveness and steals the + /// lock from a dead process (see [`acquire_write_lock`]). pub write_lock: AtomicU32, /// Reference count for shared lifetime management. 
pub refcount: AtomicU32, @@ -103,7 +116,10 @@ pub(crate) struct Header { /// macOS: microseconds since epoch (via `sysctl`). /// Other: 0 (falls back to PID-only liveness check). pub creator_start_time: u64, - pub _reserved: [u32; 2], + /// Start time of the current lock holder (0 = unknown / not written + /// yet). Written by the holder right after acquiring; lets waiters + /// detect PID recycling before stealing. Advisory only. + pub lock_owner_start: AtomicU64, } /// Per-column descriptor, immediately following the Header. @@ -135,7 +151,12 @@ impl ColumnDesc { } } -/// Per-chunk metadata, at the start of every chunk's byte region. +/// Sentinel for `ChunkHeader.min_ts` when the chunk holds no rows. +pub(crate) const TS_MIN_INIT: i64 = i64::MAX; +/// Sentinel for `ChunkHeader.max_ts` when the chunk holds no rows. +pub(crate) const TS_MAX_INIT: i64 = i64::MIN; + +/// Per-chunk metadata, at the start of every chunk's byte region (40 bytes). #[repr(C)] pub(crate) struct ChunkHeader { /// Incremented each time the chunk is recycled (ring wrap). @@ -148,6 +169,12 @@ pub(crate) struct ChunkHeader { /// Chunk lifecycle state (see `ChunkState`). pub state: AtomicU32, pub _reserved: u32, + /// Smallest value of the designated timestamp column in this chunk + /// ([`TS_MIN_INIT`] when empty or no `Header::ts_col`). Maintained by + /// the writer; readers must validate against `generation` snapshots. + pub min_ts: AtomicI64, + /// Largest timestamp in this chunk ([`TS_MAX_INIT`] when empty). + pub max_ts: AtomicI64, } /// Chunk lifecycle state. @@ -164,7 +191,7 @@ pub(crate) const CHUNK_HEADER_SIZE: usize = mem::size_of::(); const _: () = { assert!(mem::size_of::
() == 64); assert!(mem::size_of::() == 64); - assert!(mem::size_of::() == 24); + assert!(mem::size_of::() == 40); }; // ── struct accessors ──────────────────────────────────────────────── @@ -197,10 +224,101 @@ pub(crate) fn chunk_header(buf: &[u8], cs: usize) -> &ChunkHeader { unsafe { &*(buf[cs..].as_ptr() as *const ChunkHeader) } } -/// Acquire the writer spinlock with exponential back-off. +/// How long a waiter spins before checking whether the lock holder is +/// still alive (and stealing the lock from a dead process). +/// +/// Writers hold the lock for nanoseconds–microseconds; even a descheduled +/// holder resumes within milliseconds. Reaching this timeout in practice +/// means the holder crashed while holding the lock. +pub(crate) const LOCK_STEAL_TIMEOUT: Duration = Duration::from_millis(500); + +/// `true` when a process with `pid` exists (it may belong to another user). +fn process_alive(pid: u32) -> bool { + if pid == std::process::id() { + return true; + } + if unsafe { libc::kill(pid as libc::pid_t, 0) } == 0 { + return true; + } + // EPERM: the process exists but we may not signal it. + std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM) +} + +/// This process's kernel start time, cached per PID (reads `/proc` on Linux). +/// +/// **Fork safety:** the cache is keyed on the live PID, not a one-shot +/// `OnceLock`. A child inheriting a parent's cached value would otherwise +/// record the *parent's* start time in `lock_owner_start`, and a waiter +/// comparing against the child's real start time would mistake the live child +/// for a recycled PID and steal its lock — exactly the hazard fork-heavy +/// workloads (e.g. PyTorch DataLoader) trigger. Re-reading whenever the PID +/// changes makes every post-fork caller observe its own start time. 
+fn my_start_time() -> u64 { + static MY_PID: AtomicU32 = AtomicU32::new(0); + static MY_START: AtomicU64 = AtomicU64::new(0); + + let pid = std::process::id(); + if MY_PID.load(Ordering::Acquire) == pid { + let cached = MY_START.load(Ordering::Acquire); + if cached != 0 { + return cached; + } + } + let start = crate::raw::process_start_time(pid); + // Publish start before PID: a reader that observes the matching PID is then + // guaranteed to also observe the start written for it. + MY_START.store(start, Ordering::Release); + MY_PID.store(pid, Ordering::Release); + start +} + +/// Decide whether the lock can be stolen from `holder`, and try to. +/// +/// Steal conditions (either): +/// - `holder` no longer exists (crashed / killed while holding the lock); +/// - `holder` exists but its kernel start time does not match the one the +/// real holder recorded in `lock_owner_start` — the PID was recycled by +/// an unrelated process. Re-checked after a grace period to rule out +/// the transient window where a fresh holder has not yet recorded its +/// start time. +/// +/// Stealing is safe with respect to data: rows only become visible via the +/// `used`/`row_count` Release stores at the end of a write, so a row half +/// written by the dead holder stays uncommitted and is simply overwritten. 
+#[cold] +#[inline(never)] +fn try_steal_lock(h: &Header, holder: u32, me: u32) -> bool { + if process_alive(holder) { + let owner_start = h.lock_owner_start.load(Ordering::Relaxed); + let actual_start = crate::raw::process_start_time(holder); + if owner_start == 0 || actual_start == 0 || actual_start == owner_start { + return false; // genuinely alive (or cannot tell) — keep waiting + } + std::thread::sleep(Duration::from_millis(10)); + if h.write_lock.load(Ordering::Relaxed) != holder + || h.lock_owner_start.load(Ordering::Relaxed) != owner_start + { + return false; // lock changed hands meanwhile — not stale + } + } + if h.write_lock + .compare_exchange(holder, me, Ordering::Acquire, Ordering::Relaxed) + .is_ok() + { + h.lock_owner_start.store(my_start_time(), Ordering::Relaxed); + return true; + } + false +} + +/// Acquire the **robust** writer spinlock with exponential back-off. /// -/// First few failures use `spin_loop()` (pause instruction), then -/// escalate to `yield_now()` to avoid burning CPU under contention. +/// The lock word holds the owner's PID (0 = unlocked). First few failures +/// use `spin_loop()` (pause instruction), then escalate to `yield_now()`. +/// A waiter stuck past [`LOCK_STEAL_TIMEOUT`] verifies the holder's +/// liveness and steals the lock from a dead process (see +/// [`try_steal_lock`]), so a writer crashing inside the critical section +/// cannot deadlock other writer processes forever. /// /// SAFETY NOTE: the buffer parameter is `&mut [u8]` (not `&[u8]`) so that /// LLVM does **not** mark the pointer `readonly`. With `&[u8]` LLVM may @@ -208,12 +326,30 @@ pub(crate) fn chunk_header(buf: &[u8], cs: usize) -> &ChunkHeader { /// the spin loop into an infinite loop in optimised (release) builds. 
pub(crate) fn acquire_write_lock(buf: &mut [u8]) { let ptr = buf.as_mut_ptr() as *const Header; - let lock = unsafe { &(*ptr).write_lock }; + let h = unsafe { &*ptr }; + let me = std::process::id(); let mut spins = 0u32; - while lock - .compare_exchange_weak(0, 1, Ordering::Acquire, Ordering::Relaxed) - .is_err() - { + let mut waiting_since: Option = None; + loop { + match h + .write_lock + .compare_exchange_weak(0, me, Ordering::Acquire, Ordering::Relaxed) + { + Ok(_) => { + h.lock_owner_start.store(my_start_time(), Ordering::Relaxed); + return; + } + Err(holder) if holder != 0 => { + let since = *waiting_since.get_or_insert_with(Instant::now); + if spins >= 16 && since.elapsed() >= LOCK_STEAL_TIMEOUT { + if try_steal_lock(h, holder, me) { + return; + } + waiting_since = Some(Instant::now()); + } + } + Err(_) => {} // spurious failure with lock free — retry CAS + } if spins < 16 { for _ in 0..1 << spins.min(4) { std::hint::spin_loop(); @@ -226,9 +362,15 @@ pub(crate) fn acquire_write_lock(buf: &mut [u8]) { } /// Release the writer spinlock. See [`acquire_write_lock`] for why `&mut`. +/// +/// Clears `lock_owner_start` *before* the lock word so that waiters never +/// pair the next holder's PID with this holder's start time. pub(crate) fn release_write_lock(buf: &mut [u8]) { let ptr = buf.as_mut_ptr() as *const Header; - unsafe { (*ptr).write_lock.store(0, Ordering::Release) }; + unsafe { + (*ptr).lock_owner_start.store(0, Ordering::Relaxed); + (*ptr).write_lock.store(0, Ordering::Release); + } } pub(crate) fn r32(buf: &[u8], off: usize) -> u32 { u32::from_le_bytes(buf[off..off + 4].try_into().unwrap()) @@ -262,7 +404,7 @@ mod tests { fn struct_sizes() { assert_eq!(mem::size_of::
(), 64); assert_eq!(mem::size_of::(), 64); - assert_eq!(mem::size_of::(), 24); + assert_eq!(mem::size_of::(), 40); } #[test] @@ -271,4 +413,40 @@ mod tests { let expected_le = u16::from_le_bytes(BYTE_ORDER_MARK); assert_eq!(bom, expected_le); } + + /// Fork safety: after `fork()`, `my_start_time()` must return the *child's* + /// own kernel start time, not a value cached for the parent before the + /// fork. With the old `OnceLock` cache the child returned the parent's + /// start time; a waiter then compared it against the child's real start + /// time and stole the lock from a live holder. The test process has run + /// long enough that its start tick differs from a freshly-forked child's, + /// so the stale value would be observably wrong. + /// + /// Linux-only: kernel start times come from `/proc`. On platforms without + /// it `process_start_time` returns 0, the PID-recycle steal path is inert, + /// and there is no fork hazard to guard against. + #[cfg(target_os = "linux")] + #[test] + fn my_start_time_refreshes_after_fork() { + // Warm the per-PID cache for the parent (mimics the leaked OnceLock). + let parent = my_start_time(); + assert_ne!(parent, 0, "parent start time should be readable"); + + unsafe { + let pid = libc::fork(); + assert!(pid >= 0, "fork failed"); + if pid == 0 { + // Child: the cached value must equal a fresh read for THIS pid. 
+ let cached = my_start_time(); + let fresh = crate::raw::process_start_time(std::process::id()); + libc::_exit(if cached == fresh && cached != 0 { 0 } else { 1 }); + } + let mut status = 0; + libc::waitpid(pid, &mut status, 0); + assert!( + libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0, + "child my_start_time() must reflect its own process, not the parent's cache", + ); + } + } } diff --git a/probing/memtable/src/lib.rs b/probing/memtable/src/lib.rs index 3d16ff9e..2cbd5c04 100644 --- a/probing/memtable/src/lib.rs +++ b/probing/memtable/src/lib.rs @@ -93,6 +93,7 @@ mod cache; mod dedup; pub mod discover; mod layout; +pub mod memc; pub mod memh; mod memtable; mod raw; @@ -108,7 +109,7 @@ pub use memh::{ MemhValidateError, MemhView, MemhWriter, SharedMemhWriter, TypedValue, MAGIC_MEMH, VERSION_MEMH, }; -pub use memtable::{MemTable, MemTableView, MemTableWriter}; +pub use memtable::{BackingKind, MemTable, MemTableView, MemTableWriter}; pub use raw::validate_buf; pub use refcount::{acquire_ref, refcount, release_ref}; pub use row::{Row, RowCursor, RowIter}; diff --git a/probing/memtable/src/memc/codec.rs b/probing/memtable/src/memc/codec.rs new file mode 100644 index 00000000..bdeb1e6c --- /dev/null +++ b/probing/memtable/src/memc/codec.rs @@ -0,0 +1,281 @@ +//! Columnar encode/decode for MEMC page payloads. +//! +//! A page payload is the concatenation of per-column sub-blocks, each: +//! +//! ```text +//! [u8 encoding][u8 dtype][u16 _pad][u32 byte_len][payload bytes] +//! ``` +//! +//! Numeric columns use Pco (`simpler_compress`); `U8` and variable-length +//! `Str`/`Bytes` columns are stored raw (Pco has no `u8`/string support). + +use pco::standalone::{simple_decompress, simpler_compress}; + +use super::layout::{get_u32, ColEncoding, PCO_LEVEL}; +use crate::schema::{DType, Value}; + +/// One column's worth of values, type-tagged. 
+#[derive(Debug, Clone, PartialEq)] +pub enum ColumnData { + U8(Vec), + U32(Vec), + I32(Vec), + I64(Vec), + F32(Vec), + F64(Vec), + U64(Vec), + Str(Vec), + Bytes(Vec>), +} + +impl ColumnData { + pub fn dtype(&self) -> DType { + match self { + ColumnData::U8(_) => DType::U8, + ColumnData::U32(_) => DType::U32, + ColumnData::I32(_) => DType::I32, + ColumnData::I64(_) => DType::I64, + ColumnData::F32(_) => DType::F32, + ColumnData::F64(_) => DType::F64, + ColumnData::U64(_) => DType::U64, + ColumnData::Str(_) => DType::Str, + ColumnData::Bytes(_) => DType::Bytes, + } + } + + pub fn len(&self) -> usize { + match self { + ColumnData::U8(v) => v.len(), + ColumnData::U32(v) => v.len(), + ColumnData::I32(v) => v.len(), + ColumnData::I64(v) => v.len(), + ColumnData::F32(v) => v.len(), + ColumnData::F64(v) => v.len(), + ColumnData::U64(v) => v.len(), + ColumnData::Str(v) => v.len(), + ColumnData::Bytes(v) => v.len(), + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// Builds one [`ColumnData`] of a fixed [`DType`] by pushing [`Value`]s. +pub struct ColumnBuilder { + data: ColumnData, +} + +impl ColumnBuilder { + pub fn new(dtype: DType) -> Self { + let data = match dtype { + DType::U8 => ColumnData::U8(Vec::new()), + DType::U32 => ColumnData::U32(Vec::new()), + DType::I32 => ColumnData::I32(Vec::new()), + DType::I64 => ColumnData::I64(Vec::new()), + DType::F32 => ColumnData::F32(Vec::new()), + DType::F64 => ColumnData::F64(Vec::new()), + DType::U64 => ColumnData::U64(Vec::new()), + DType::Str => ColumnData::Str(Vec::new()), + DType::Bytes => ColumnData::Bytes(Vec::new()), + }; + Self { data } + } + + /// Append a value. Mismatched types are coerced where lossless and + /// otherwise dropped as a zero/empty default — callers validate the + /// row schema up front, so this only guards against logic errors. 
+ pub fn push(&mut self, v: &Value) { + match (&mut self.data, v) { + (ColumnData::U8(d), Value::U8(x)) => d.push(*x), + (ColumnData::U32(d), Value::U32(x)) => d.push(*x), + (ColumnData::I32(d), Value::I32(x)) => d.push(*x), + (ColumnData::I64(d), Value::I64(x)) => d.push(*x), + (ColumnData::F32(d), Value::F32(x)) => d.push(*x), + (ColumnData::F64(d), Value::F64(x)) => d.push(*x), + (ColumnData::U64(d), Value::U64(x)) => d.push(*x), + (ColumnData::Str(d), Value::Str(x)) => d.push((*x).to_string()), + (ColumnData::Bytes(d), Value::Bytes(x)) => d.push(x.to_vec()), + _ => debug_assert!(false, "ColumnBuilder type mismatch"), + } + } + + pub fn finish(self) -> ColumnData { + self.data + } +} + +fn pco_compress(nums: &[T]) -> Result, String> { + simpler_compress(nums, PCO_LEVEL).map_err(|e| e.to_string()) +} + +fn pco_decompress(data: &[u8]) -> Result, String> { + simple_decompress::(data).map_err(|e| e.to_string()) +} + +fn encode_varlen(entries: impl Iterator)>, total: usize) -> Vec { + let mut out = Vec::with_capacity(total); + for (len, bytes) in entries { + out.extend_from_slice(&(len as u32).to_le_bytes()); + out.extend_from_slice(&bytes); + } + out +} + +/// Encode one column into its sub-block (header + payload). 
+pub fn encode_column(col: &ColumnData) -> Result, String> { + let (encoding, payload): (ColEncoding, Vec) = match col { + ColumnData::U8(v) => (ColEncoding::RawFixed, v.clone()), + ColumnData::I32(v) => (ColEncoding::Pco, pco_compress(v)?), + ColumnData::I64(v) => (ColEncoding::Pco, pco_compress(v)?), + ColumnData::F32(v) => (ColEncoding::Pco, pco_compress(v)?), + ColumnData::F64(v) => (ColEncoding::Pco, pco_compress(v)?), + ColumnData::U32(v) => (ColEncoding::Pco, pco_compress(v)?), + ColumnData::U64(v) => (ColEncoding::Pco, pco_compress(v)?), + ColumnData::Str(v) => { + let total: usize = v.iter().map(|s| 4 + s.len()).sum(); + let payload = encode_varlen(v.iter().map(|s| (s.len(), s.as_bytes().to_vec())), total); + (ColEncoding::RawVarLen, payload) + } + ColumnData::Bytes(v) => { + let total: usize = v.iter().map(|b| 4 + b.len()).sum(); + let payload = encode_varlen(v.iter().map(|b| (b.len(), b.clone())), total); + (ColEncoding::RawVarLen, payload) + } + }; + + let mut out = Vec::with_capacity(8 + payload.len()); + out.push(encoding as u8); + out.push(col.dtype() as u32 as u8); + out.extend_from_slice(&[0u8, 0u8]); + out.extend_from_slice(&(payload.len() as u32).to_le_bytes()); + out.extend_from_slice(&payload); + Ok(out) +} + +/// Decode one column sub-block, returning the column and bytes consumed. 
+pub fn decode_column(buf: &[u8], row_count: usize) -> Result<(ColumnData, usize), String> { + if buf.len() < 8 { + return Err("column sub-block too small".into()); + } + let encoding = ColEncoding::from_u8(buf[0]).ok_or("invalid column encoding")?; + let dtype = DType::from_u32(buf[1] as u32).ok_or("invalid column dtype")?; + let payload_len = get_u32(buf, 4) as usize; + let start = 8; + let end = start + payload_len; + if buf.len() < end { + return Err("column payload out of bounds".into()); + } + let payload = &buf[start..end]; + + let col = match (encoding, dtype) { + (ColEncoding::RawFixed, DType::U8) => ColumnData::U8(payload.to_vec()), + (ColEncoding::Pco, DType::I32) => ColumnData::I32(pco_decompress(payload)?), + (ColEncoding::Pco, DType::I64) => ColumnData::I64(pco_decompress(payload)?), + (ColEncoding::Pco, DType::F32) => ColumnData::F32(pco_decompress(payload)?), + (ColEncoding::Pco, DType::F64) => ColumnData::F64(pco_decompress(payload)?), + (ColEncoding::Pco, DType::U32) => ColumnData::U32(pco_decompress(payload)?), + (ColEncoding::Pco, DType::U64) => ColumnData::U64(pco_decompress(payload)?), + (ColEncoding::RawVarLen, DType::Str) => { + ColumnData::Str(decode_varlen_str(payload, row_count)?) + } + (ColEncoding::RawVarLen, DType::Bytes) => { + ColumnData::Bytes(decode_varlen_bytes(payload, row_count)?) + } + _ => return Err("encoding/dtype mismatch".into()), + }; + Ok((col, end)) +} + +fn decode_varlen_entries(payload: &[u8], row_count: usize) -> Result>, String> { + let mut out = Vec::with_capacity(row_count); + let mut off = 0usize; + while off + 4 <= payload.len() { + let len = get_u32(payload, off) as usize; + off += 4; + if off + len > payload.len() { + return Err("varlen entry out of bounds".into()); + } + out.push(payload[off..off + len].to_vec()); + off += len; + } + Ok(out) +} + +fn decode_varlen_str(payload: &[u8], row_count: usize) -> Result, String> { + decode_varlen_entries(payload, row_count)? 
+ .into_iter() + .map(|b| String::from_utf8(b).map_err(|_| "varlen str not utf-8".to_string())) + .collect() +} + +fn decode_varlen_bytes(payload: &[u8], row_count: usize) -> Result>, String> { + decode_varlen_entries(payload, row_count) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn roundtrip(col: ColumnData) { + let rc = col.len(); + let encoded = encode_column(&col).unwrap(); + let (decoded, consumed) = decode_column(&encoded, rc).unwrap(); + assert_eq!(consumed, encoded.len()); + assert_eq!(decoded, col); + } + + #[test] + fn numeric_columns_roundtrip() { + roundtrip(ColumnData::I64((0..1000).map(|i| i * 7 - 3).collect())); + roundtrip(ColumnData::I32(vec![-5, 0, 5, 100, -100])); + roundtrip(ColumnData::F64(vec![1.5, 2.5, 3.14, -9.0])); + roundtrip(ColumnData::F32(vec![0.1, 0.2, 0.3])); + roundtrip(ColumnData::U32(vec![1, 2, 3, u32::MAX])); + roundtrip(ColumnData::U64(vec![1, 2, 3, u64::MAX])); + roundtrip(ColumnData::U8(vec![0, 1, 2, 255])); + } + + #[test] + fn varlen_columns_roundtrip() { + roundtrip(ColumnData::Str(vec![ + "alpha".into(), + "".into(), + "δοκιμή".into(), + ])); + roundtrip(ColumnData::Bytes(vec![vec![1, 2, 3], vec![], vec![0xFF; 10]])); + } + + #[test] + fn pco_actually_compresses_monotonic_i64() { + // A monotonic timestamp column should shrink dramatically under Pco. 
+ let col = ColumnData::I64((0..10_000).map(|i| 1_700_000_000_000 + i * 1000).collect()); + let encoded = encode_column(&col).unwrap(); + let raw = 10_000 * 8; + assert!( + encoded.len() < raw / 4, + "expected >4x compression, got {} vs {raw}", + encoded.len() + ); + } + + #[test] + fn column_builder_from_values() { + let mut b = ColumnBuilder::new(DType::I64); + for v in [Value::I64(10), Value::I64(20), Value::I64(30)] { + b.push(&v); + } + assert_eq!(b.finish(), ColumnData::I64(vec![10, 20, 30])); + } + + #[test] + fn corrupt_payload_len_is_rejected() { + let col = ColumnData::I64(vec![1, 2, 3]); + let mut encoded = encode_column(&col).unwrap(); + // Overstate payload_len → decode must refuse rather than panic. + let bad = (encoded.len() as u32 + 100).to_le_bytes(); + encoded[4..8].copy_from_slice(&bad); + assert!(decode_column(&encoded, 3).is_err()); + } +} diff --git a/probing/memtable/src/memc/compactor.rs b/probing/memtable/src/memc/compactor.rs new file mode 100644 index 00000000..d93e4ad7 --- /dev/null +++ b/probing/memtable/src/memc/compactor.rs @@ -0,0 +1,421 @@ +//! [`Compactor`]: the **roller** that drains sealed hot-ring chunks into +//! cold MEMC segments, bounding segment size to prevent fragmentation. +//! +//! ## Why a roller +//! +//! The MEMC format and [`ColdStore`] give us immutable segments and +//! oldest-first eviction, but *nothing decides when to seal a segment and +//! start a fresh one*. Without that policy you either seal every page +//! (a blizzard of tiny files) or never seal (one unbounded file). The +//! compactor closes that gap with a size-or-time roll policy: +//! +//! ```text +//! after each appended page: +//! size_bytes() >= target_segment_bytes → seal + roll +//! on every poll tick: +//! open segment older than max_segment_age → seal + roll (low-rate tables) +//! on shutdown / flush: +//! seal the open segment unconditionally → bounded tail file +//! ``` +//! +//! 
A busy process emits a steady stream of ~`target`-sized files; an idle +//! one keeps appending to a single open segment until the age window or +//! shutdown, so neither extreme fragments the directory. +//! +//! ## Multi-table +//! +//! One [`Compactor`] feeds **one** [`ColdStore`] from **many** hot tables. +//! Pages from every table share the same segment files (each carries its +//! `table_id`), so adding tables grows pages, not files or directories. +//! +//! ## Concurrency +//! +//! The hot table is written by the application; the compactor only ever +//! *reads* it. For shared/file-backed tables the compactor opens its own +//! read handle to the same mapping and relies on the ring's lock-free +//! `Acquire`/`Release` chunk protocol: it drains only `Sealed` chunks and +//! re-checks the chunk generation after transposing, discarding a page if +//! the ring recycled the chunk mid-read. The still-open `Writing` chunk is +//! left to the hot tier until it seals. + +use std::collections::HashMap; +use std::io; +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::thread::JoinHandle; +use std::time::{Duration, Instant}; + +use super::codec::{ColumnBuilder, ColumnData}; +use super::layout::SOURCE_CHUNK_NONE; +use super::reader::SegmentReader; +use super::store::ColdStore; +use super::writer::SegmentWriter; +use crate::layout::ChunkState; +use crate::memtable::{MemTable, MemTableView}; +use crate::schema::{DType, Value}; + +/// Roll/retention policy for a [`Compactor`]. +#[derive(Debug, Clone)] +pub struct CompactorConfig { + /// Seal the open segment and start a new one once it reaches this many + /// bytes. Bounds individual file size; the main fragmentation knob. + pub target_segment_bytes: u64, + /// Also seal an open segment this old, so low-rate tables don't sit + /// unsealed (and unqueryable through the cold footer) indefinitely. 
+ pub max_segment_age: Duration, + /// How long the background thread sleeps between drain passes. + pub poll_interval: Duration, + /// Cold-store byte budget; oldest segments are evicted past it. + pub max_total_bytes: Option, + /// Cold-store TTL; segments older than this are evicted. + pub ttl: Option, +} + +impl Default for CompactorConfig { + fn default() -> Self { + Self { + target_segment_bytes: 64 * 1024 * 1024, + max_segment_age: Duration::from_secs(300), + poll_interval: Duration::from_millis(500), + max_total_bytes: None, + ttl: None, + } + } +} + +/// Per-table draining bookkeeping. +struct TableProgress { + /// Last drained generation per chunk index (parallel to the hot ring). + /// A chunk is re-drained only when its generation advances past this. + drained_gen: Vec, + /// This table's id inside the *current* open segment, if registered. + /// Reset to `None` on every roll (table ids are segment-local). + seg_table_id: Option, +} + +/// Drains sealed hot chunks into size-bounded cold segments. +/// +/// Usable synchronously (call [`drain_view`](Self::drain_view) yourself) or +/// as a background thread via [`spawn`](Self::spawn). +pub struct Compactor { + store: ColdStore, + config: CompactorConfig, + current: Option, + opened_at: Instant, + tables: HashMap, + /// Per-(table, chunk) drain watermark recovered from existing cold + /// segments by [`prime_from_cold`](Self::prime_from_cold); merged into a + /// table's `drained_gen` the first time it is seen, so a restart over a + /// persistent cold dir does not re-compact already-persisted chunks. 
+ seed: HashMap>, +} + +impl Compactor { + pub fn new(store: ColdStore, config: CompactorConfig) -> Self { + Self { + store, + config, + current: None, + opened_at: Instant::now(), + tables: HashMap::new(), + seed: HashMap::new(), + } + } + + /// Rebuild per-(table, chunk) drain watermarks from the cold segments + /// already on disk, so draining is **exactly-once across restarts** when + /// the cold dir persists. Call once after [`new`](Self::new), before any + /// `drain_view`. Each cold page records the hot-ring `(source_gen, + /// source_chunk)` it came from; we keep the max generation per chunk. + pub fn prime_from_cold(&mut self) -> io::Result<()> { + for path in self.store.segment_paths() { + let Ok(reader) = SegmentReader::open(&path) else { + continue; // unreadable/foreign file: skip, never fail priming + }; + for page in reader.pages() { + if page.source_chunk == SOURCE_CHUNK_NONE { + continue; + } + let Some(def) = reader.table_def(page.table_id) else { + continue; + }; + let slot = self + .seed + .entry(def.name.clone()) + .or_default() + .entry(page.source_chunk as usize) + .or_insert(0); + *slot = (*slot).max(page.source_gen); + } + } + Ok(()) + } + + pub fn config(&self) -> &CompactorConfig { + &self.config + } + + /// Bytes written to the currently open segment (0 if none). + pub fn current_segment_bytes(&self) -> u64 { + self.current.as_ref().map(|w| w.size_bytes()).unwrap_or(0) + } + + /// Cold-store capacity snapshot. + pub fn stats(&self) -> super::store::ColdStats { + self.store.stats() + } + + /// Drain every newly-sealed chunk of `view` (a read handle to a hot + /// table named `name`) into cold pages, rolling segments by size as it + /// goes. Returns the number of rows compacted this call. 
+ pub fn drain_view(&mut self, name: &str, view: &MemTableView) -> io::Result { + let cols: Vec<(String, DType)> = view + .schema() + .cols + .iter() + .map(|c| (c.name.clone(), c.dtype)) + .collect(); + let num_chunks = view.num_chunks(); + + if !self.tables.contains_key(name) { + let mut drained_gen = vec![0u64; num_chunks]; + if let Some(seeds) = self.seed.get(name) { + for (&chunk, &gen) in seeds { + if chunk < drained_gen.len() { + drained_gen[chunk] = drained_gen[chunk].max(gen); + } + } + } + self.tables.insert( + name.to_string(), + TableProgress { + drained_gen, + seg_table_id: None, + }, + ); + } + let prog = self.tables.get_mut(name).unwrap(); + if prog.drained_gen.len() != num_chunks { + prog.drained_gen.resize(num_chunks, 0); + } + + let sealed = ChunkState::Sealed as u32; + let mut total_rows = 0usize; + + for chunk in view.chunks_logical() { + if view.chunk_state(chunk) != sealed { + continue; + } + let gen = view.chunk_generation(chunk); + let already = self.tables[name].drained_gen[chunk]; + if gen == 0 || gen <= already { + continue; + } + + let (gen_read, columns) = match transpose_chunk(view, chunk, &cols) { + Some(x) => x, + None => continue, // recycled mid-read; try again next pass + }; + let rows = columns.first().map(|c| c.len()).unwrap_or(0); + if rows == 0 { + self.tables.get_mut(name).unwrap().drained_gen[chunk] = gen_read; + continue; + } + + self.ensure_segment()?; + let table_id = self.register_if_needed(name, &cols)?; + self.current + .as_mut() + .expect("segment open") + .append_page(table_id, &columns, gen_read, chunk as u32)?; + self.tables.get_mut(name).unwrap().drained_gen[chunk] = gen_read; + total_rows += rows; + + self.maybe_roll_on_size()?; + } + Ok(total_rows) + } + + /// Seal the open segment if it has grown past `target_segment_bytes`. 
+ fn maybe_roll_on_size(&mut self) -> io::Result> { + let over = self + .current + .as_ref() + .is_some_and(|w| w.size_bytes() >= self.config.target_segment_bytes); + if over { + self.roll() + } else { + Ok(None) + } + } + + /// Seal the open segment if it is older than `max_segment_age` and holds + /// at least one page. Call this periodically (the background loop does). + pub fn maybe_roll_on_age(&mut self) -> io::Result> { + let aged = self.current.as_ref().is_some_and(|w| w.page_count() > 0) + && self.opened_at.elapsed() >= self.config.max_segment_age; + if aged { + self.roll() + } else { + Ok(None) + } + } + + /// Seal the current segment and clear the open slot. An open segment + /// with no pages is removed instead of sealed, so an age-triggered roll + /// on an empty writer never leaves a stub file. Returns the sealed path. + pub fn roll(&mut self) -> io::Result> { + let Some(w) = self.current.take() else { + return Ok(None); + }; + for p in self.tables.values_mut() { + p.seg_table_id = None; + } + if w.page_count() == 0 { + let path = w.path().to_path_buf(); + drop(w); + let _ = std::fs::remove_file(&path); + return Ok(None); + } + Ok(Some(w.seal()?)) + } + + /// Seal whatever is open (shutdown / explicit checkpoint). + pub fn flush(&mut self) -> io::Result> { + self.roll() + } + + /// Apply the cold-store byte/TTL budget, deleting oldest segments. 
+ pub fn enforce(&self) -> Vec { + self.store + .enforce_limits(self.config.max_total_bytes, self.config.ttl) + } + + fn ensure_segment(&mut self) -> io::Result<()> { + if self.current.is_none() { + self.current = Some(self.store.create_segment()?); + self.opened_at = Instant::now(); + for p in self.tables.values_mut() { + p.seg_table_id = None; + } + } + Ok(()) + } + + fn register_if_needed(&mut self, name: &str, cols: &[(String, DType)]) -> io::Result { + if let Some(id) = self.tables[name].seg_table_id { + return Ok(id); + } + let id = self + .current + .as_mut() + .expect("segment open") + .register_table(name, cols)?; + self.tables.get_mut(name).unwrap().seg_table_id = Some(id); + Ok(id) + } + + /// Move this compactor onto a background thread that drains `sources` + /// every `poll_interval`, rolls by size/age, and enforces the budget. + /// Each source is `(table_name, read_handle)`; the handle must be a + /// shared/file-backed [`MemTable`] the application is writing elsewhere. + /// Dropping (or [`stop`](CompactorHandle::stop)ping) the returned handle + /// does a final drain + flush so no sealed chunk is left behind. 
+ pub fn spawn(mut self, sources: Vec<(String, MemTable)>) -> CompactorHandle { + let stop = Arc::new(AtomicBool::new(false)); + let stop_thread = stop.clone(); + let poll = self.config.poll_interval; + let thread = std::thread::Builder::new() + .name("memc-compactor".into()) + .spawn(move || { + while !stop_thread.load(Ordering::Relaxed) { + for (name, table) in &sources { + let view = table.view(); + let _ = self.drain_view(name, &view); + } + let _ = self.maybe_roll_on_age(); + let _ = self.enforce(); + std::thread::park_timeout(poll); + } + for (name, table) in &sources { + let view = table.view(); + let _ = self.drain_view(name, &view); + } + let _ = self.flush(); + let _ = self.enforce(); + }) + .expect("spawn memc-compactor thread"); + CompactorHandle { + stop, + thread: Some(thread), + } + } +} + +/// Handle to a background [`Compactor`] thread. Stops and joins on drop. +pub struct CompactorHandle { + stop: Arc, + thread: Option>, +} + +impl CompactorHandle { + /// Signal the thread to do a final drain + flush, then join it. + pub fn stop(mut self) { + self.shutdown(); + } + + fn shutdown(&mut self) { + self.stop.store(true, Ordering::Relaxed); + if let Some(t) = self.thread.take() { + t.thread().unpark(); + let _ = t.join(); + } + } +} + +impl Drop for CompactorHandle { + fn drop(&mut self) { + self.shutdown(); + } +} + +/// Transpose one chunk's rows into per-column [`ColumnData`]. +/// +/// Returns `None` if the chunk was empty, never written, or recycled by the +/// ring while we read it (detected by a generation change), so the caller +/// can skip and retry on the next pass without persisting torn data. 
+fn transpose_chunk( + view: &MemTableView, + chunk: usize, + cols: &[(String, DType)], +) -> Option<(u64, Vec)> { + let gen_before = view.chunk_generation(chunk); + if gen_before == 0 { + return None; + } + let mut builders: Vec = + cols.iter().map(|(_, dt)| ColumnBuilder::new(*dt)).collect(); + + for row in view.rows(chunk) { + let mut cur = row.cursor(); + for (ci, (_, dt)) in cols.iter().enumerate() { + match dt { + DType::U8 => builders[ci].push(&Value::U8(cur.next_u8())), + DType::U32 => builders[ci].push(&Value::U32(cur.next_u32())), + DType::I32 => builders[ci].push(&Value::I32(cur.next_i32())), + DType::I64 => builders[ci].push(&Value::I64(cur.next_i64())), + DType::F32 => builders[ci].push(&Value::F32(cur.next_f32())), + DType::F64 => builders[ci].push(&Value::F64(cur.next_f64())), + DType::U64 => builders[ci].push(&Value::U64(cur.next_u64())), + DType::Str => builders[ci].push(&Value::Str(cur.next_str())), + DType::Bytes => builders[ci].push(&Value::Bytes(cur.next_bytes())), + } + } + } + + if view.chunk_generation(chunk) != gen_before { + return None; // ring overwrote the chunk mid-transpose + } + Some((gen_before, builders.into_iter().map(|b| b.finish()).collect())) +} diff --git a/probing/memtable/src/memc/layout.rs b/probing/memtable/src/memc/layout.rs new file mode 100644 index 00000000..ddb3eaf1 --- /dev/null +++ b/probing/memtable/src/memc/layout.rs @@ -0,0 +1,434 @@ +//! MEMC v1 binary layout: segment header, block headers, footer. +//! +//! All multi-byte fields are little-endian. See [`super`] (module docs) +//! for the full format walkthrough. + +use crate::schema::DType; +use xxhash_rust::xxh3::xxh3_64; + +/// Segment file magic: ASCII bytes `M E M C` in little-endian order. +pub const MAGIC_MEMC: u32 = u32::from_le_bytes(*b"MEMC"); +/// Table-definition block magic. +pub const MAGIC_TABLE_BLOCK: u32 = u32::from_le_bytes(*b"MCTB"); +/// Page (data) block magic. 
+pub const MAGIC_PAGE_BLOCK: u32 = u32::from_le_bytes(*b"MCPG"); +/// Footer magic. +pub const MAGIC_FOOTER: u32 = u32::from_le_bytes(*b"MCFT"); + +/// MEMC format version. +pub const VERSION_MEMC: u16 = 1; + +/// Segment header size (one cache line, mirrors MEMT/MEMH). +pub const SEGMENT_HEADER_SIZE: usize = 64; +/// Block header size; blocks start 64-aligned. +pub const BLOCK_HEADER_SIZE: usize = 64; +/// Fixed size of one page-directory entry in the footer. +pub const PAGE_DIR_ENTRY_SIZE: usize = 56; + +/// `flags` bit: segment is sealed (footer present, file immutable). +pub const FLAG_SEALED: u16 = 1 << 0; + +/// Sentinels for "no timestamp column / no rows yet" (match the hot ring). +pub const TS_MIN_INIT: i64 = i64::MAX; +pub const TS_MAX_INIT: i64 = i64::MIN; + +/// Pco compression level for numeric columns (pco default). +pub const PCO_LEVEL: usize = 8; + +/// Column encoding inside a page payload. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(u8)] +pub enum ColEncoding { + /// Plain little-endian array of the fixed-size type. + RawFixed = 0, + /// Pco-compressed numeric column. + Pco = 1, + /// Concatenated `[u32 len][bytes]` entries (Str / Bytes). + RawVarLen = 2, +} + +impl ColEncoding { + pub fn from_u8(v: u8) -> Option { + match v { + 0 => Some(Self::RawFixed), + 1 => Some(Self::Pco), + 2 => Some(Self::RawVarLen), + _ => None, + } + } +} + +/// Low 32 bits of xxh3-64 — the integrity check used throughout MEMC. 
+#[inline] +pub fn xxh32(bytes: &[u8]) -> u32 { + xxh3_64(bytes) as u32 +} + +#[inline] +pub fn align64(n: usize) -> usize { + (n + 63) & !63 +} + +// ── byte helpers (encode into Vec / decode from slice) ─────────────── + +#[inline] +pub fn get_u16(buf: &[u8], off: usize) -> u16 { + u16::from_le_bytes(buf[off..off + 2].try_into().unwrap()) +} +#[inline] +pub fn get_u32(buf: &[u8], off: usize) -> u32 { + u32::from_le_bytes(buf[off..off + 4].try_into().unwrap()) +} +#[inline] +pub fn get_u64(buf: &[u8], off: usize) -> u64 { + u64::from_le_bytes(buf[off..off + 8].try_into().unwrap()) +} +#[inline] +pub fn get_i64(buf: &[u8], off: usize) -> i64 { + i64::from_le_bytes(buf[off..off + 8].try_into().unwrap()) +} +#[inline] +pub fn put_u16(buf: &mut [u8], off: usize, v: u16) { + buf[off..off + 2].copy_from_slice(&v.to_le_bytes()); +} +#[inline] +pub fn put_u32(buf: &mut [u8], off: usize, v: u32) { + buf[off..off + 4].copy_from_slice(&v.to_le_bytes()); +} +#[inline] +pub fn put_u64(buf: &mut [u8], off: usize, v: u64) { + buf[off..off + 8].copy_from_slice(&v.to_le_bytes()); +} +#[inline] +pub fn put_i64(buf: &mut [u8], off: usize, v: i64) { + buf[off..off + 8].copy_from_slice(&v.to_le_bytes()); +} + +// ── segment header ──────────────────────────────────────────────────── + +/// Parsed segment header. 
+/// +/// ```text +/// offset size field +/// 0 4 magic "MEMC" +/// 4 2 version 1 +/// 6 2 header_size 64 +/// 8 2 byte_order BOM [0x01, 0x02] +/// 10 2 flags bit0 = SEALED +/// 12 4 writer_pid +/// 16 8 writer_start creator process start time +/// 24 8 created_unix_ms +/// 32 8 footer_off 0 until sealed +/// 40 8 ts_min segment-wide (valid when sealed) +/// 48 8 ts_max +/// 56 4 page_count valid when sealed +/// 60 4 header_xxh xxh32 of bytes 0..60 +/// ``` +#[derive(Debug, Clone)] +pub struct SegmentHeader { + pub flags: u16, + pub writer_pid: u32, + pub writer_start: u64, + pub created_unix_ms: u64, + pub footer_off: u64, + pub ts_min: i64, + pub ts_max: i64, + pub page_count: u32, +} + +impl SegmentHeader { + pub fn is_sealed(&self) -> bool { + self.flags & FLAG_SEALED != 0 + } + + pub fn encode(&self) -> [u8; SEGMENT_HEADER_SIZE] { + let mut b = [0u8; SEGMENT_HEADER_SIZE]; + put_u32(&mut b, 0, MAGIC_MEMC); + put_u16(&mut b, 4, VERSION_MEMC); + put_u16(&mut b, 6, SEGMENT_HEADER_SIZE as u16); + b[8..10].copy_from_slice(&[0x01, 0x02]); + put_u16(&mut b, 10, self.flags); + put_u32(&mut b, 12, self.writer_pid); + put_u64(&mut b, 16, self.writer_start); + put_u64(&mut b, 24, self.created_unix_ms); + put_u64(&mut b, 32, self.footer_off); + put_i64(&mut b, 40, self.ts_min); + put_i64(&mut b, 48, self.ts_max); + put_u32(&mut b, 56, self.page_count); + let h = xxh32(&b[..60]); + put_u32(&mut b, 60, h); + b + } + + pub fn decode(buf: &[u8]) -> Result { + if buf.len() < SEGMENT_HEADER_SIZE { + return Err("buffer too small for MEMC header"); + } + if get_u32(buf, 0) != MAGIC_MEMC { + return Err("invalid MEMC magic"); + } + if get_u16(buf, 4) != VERSION_MEMC { + return Err("unsupported MEMC version"); + } + if get_u16(buf, 6) as usize != SEGMENT_HEADER_SIZE { + return Err("invalid MEMC header size"); + } + if buf[8..10] != [0x01, 0x02] { + return Err("byte order mismatch"); + } + if get_u32(buf, 60) != xxh32(&buf[..60]) { + return Err("MEMC header checksum mismatch"); + } + 
Ok(Self { + flags: get_u16(buf, 10), + writer_pid: get_u32(buf, 12), + writer_start: get_u64(buf, 16), + created_unix_ms: get_u64(buf, 24), + footer_off: get_u64(buf, 32), + ts_min: get_i64(buf, 40), + ts_max: get_i64(buf, 48), + page_count: get_u32(buf, 56), + }) + } +} + +// ── block header (table-definition and page blocks) ────────────────── + +/// Header shared by `MCTB` (table definition) and `MCPG` (page) blocks. +/// +/// ```text +/// offset size field +/// 0 4 block magic "MCTB" / "MCPG" +/// 4 4 table_id +/// 8 4 row_count (MCTB: 0) +/// 12 4 col_count +/// 16 8 ts_min (MCTB: TS_MIN_INIT) +/// 24 8 ts_max (MCTB: TS_MAX_INIT) +/// 32 8 source_gen hot-ring chunk generation this page drained (0 = n/a) +/// 40 4 payload_len +/// 44 4 payload_xxh xxh32 of payload bytes +/// 48 4 source_chunk hot-ring chunk index this page drained (u32::MAX = n/a) +/// 52 4 header_xxh xxh32 of bytes 0..52 +/// 56 8 reserved (zero) +/// ``` +/// +/// `source_gen` + `source_chunk` together identify the hot-ring chunk a page +/// was compacted from, letting a restarting compactor rebuild its per-chunk +/// drain watermark from existing cold pages (exactly-once across restarts). +/// +/// The payload follows the header and is padded to the next 64-byte +/// boundary; the padding is excluded from `payload_xxh`. +#[derive(Debug, Clone)] +pub struct BlockHeader { + pub magic: u32, + pub table_id: u32, + pub row_count: u32, + pub col_count: u32, + pub ts_min: i64, + pub ts_max: i64, + pub source_gen: u64, + pub payload_len: u32, + pub payload_xxh: u32, + pub source_chunk: u32, +} + +/// Sentinel for "this page did not originate from a specific hot-ring chunk". 
+pub const SOURCE_CHUNK_NONE: u32 = u32::MAX; + +impl BlockHeader { + pub fn encode(&self) -> [u8; BLOCK_HEADER_SIZE] { + let mut b = [0u8; BLOCK_HEADER_SIZE]; + put_u32(&mut b, 0, self.magic); + put_u32(&mut b, 4, self.table_id); + put_u32(&mut b, 8, self.row_count); + put_u32(&mut b, 12, self.col_count); + put_i64(&mut b, 16, self.ts_min); + put_i64(&mut b, 24, self.ts_max); + put_u64(&mut b, 32, self.source_gen); + put_u32(&mut b, 40, self.payload_len); + put_u32(&mut b, 44, self.payload_xxh); + put_u32(&mut b, 48, self.source_chunk); + let h = xxh32(&b[..52]); + put_u32(&mut b, 52, h); + b + } + + /// Decode and verify the header checksum. The payload checksum is + /// verified separately, against the actual payload bytes. + pub fn decode(buf: &[u8]) -> Option { + if buf.len() < BLOCK_HEADER_SIZE { + return None; + } + let magic = get_u32(buf, 0); + if magic != MAGIC_TABLE_BLOCK && magic != MAGIC_PAGE_BLOCK { + return None; + } + if get_u32(buf, 52) != xxh32(&buf[..52]) { + return None; + } + Some(Self { + magic, + table_id: get_u32(buf, 4), + row_count: get_u32(buf, 8), + col_count: get_u32(buf, 12), + ts_min: get_i64(buf, 16), + ts_max: get_i64(buf, 24), + source_gen: get_u64(buf, 32), + payload_len: get_u32(buf, 40), + payload_xxh: get_u32(buf, 44), + source_chunk: get_u32(buf, 48), + }) + } +} + +// ── table-definition payload ────────────────────────────────────────── + +/// In-memory table definition (parsed from an `MCTB` payload). +#[derive(Debug, Clone)] +pub struct TableDef { + pub id: u32, + pub name: String, + pub cols: Vec<(String, DType)>, + /// Index of the designated timestamp column, per the hot-ring + /// convention (`I64` column named `timestamp` / `ts`). + pub ts_col: Option, +} + +/// Encode a table definition payload: +/// `[u16 name_len][u16 col_count][name]` then per column +/// `[u8 dtype][u8 0][u16 name_len][name]`. 
+pub fn encode_table_payload(name: &str, cols: &[(String, DType)]) -> Vec { + let mut out = Vec::with_capacity(8 + name.len() + cols.len() * 16); + out.extend_from_slice(&(name.len() as u16).to_le_bytes()); + out.extend_from_slice(&(cols.len() as u16).to_le_bytes()); + out.extend_from_slice(name.as_bytes()); + for (cname, dtype) in cols { + out.push(*dtype as u32 as u8); + out.push(0); + out.extend_from_slice(&(cname.len() as u16).to_le_bytes()); + out.extend_from_slice(cname.as_bytes()); + } + out +} + +pub fn decode_table_payload(id: u32, payload: &[u8]) -> Result { + if payload.len() < 4 { + return Err("table payload too small"); + } + let name_len = get_u16(payload, 0) as usize; + let col_count = get_u16(payload, 2) as usize; + let mut off = 4; + if payload.len() < off + name_len { + return Err("table name out of bounds"); + } + let name = std::str::from_utf8(&payload[off..off + name_len]) + .map_err(|_| "table name not utf-8")? + .to_string(); + off += name_len; + + let mut cols = Vec::with_capacity(col_count); + for _ in 0..col_count { + if payload.len() < off + 4 { + return Err("column entry out of bounds"); + } + let dtype = DType::from_u32(payload[off] as u32).ok_or("invalid column dtype")?; + let cname_len = get_u16(payload, off + 2) as usize; + off += 4; + if payload.len() < off + cname_len { + return Err("column name out of bounds"); + } + let cname = std::str::from_utf8(&payload[off..off + cname_len]) + .map_err(|_| "column name not utf-8")? 
+ .to_string(); + off += cname_len; + cols.push((cname, dtype)); + } + + let ts_col = cols + .iter() + .position(|(n, dt)| *dt == DType::I64 && crate::raw::TS_COL_NAMES.contains(&n.as_str())); + Ok(TableDef { + id, + name, + cols, + ts_col, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn magics_are_distinct_from_hot_formats() { + assert_ne!(MAGIC_MEMC, crate::MAGIC_MEMT); + assert_ne!(MAGIC_MEMC, crate::MAGIC_MEMH); + assert_ne!(MAGIC_TABLE_BLOCK, MAGIC_PAGE_BLOCK); + } + + #[test] + fn segment_header_roundtrip() { + let h = SegmentHeader { + flags: FLAG_SEALED, + writer_pid: 1234, + writer_start: 99, + created_unix_ms: 1_700_000_000_000, + footer_off: 4096, + ts_min: -5, + ts_max: 500, + page_count: 7, + }; + let bytes = h.encode(); + let d = SegmentHeader::decode(&bytes).unwrap(); + assert!(d.is_sealed()); + assert_eq!(d.writer_pid, 1234); + assert_eq!(d.footer_off, 4096); + assert_eq!((d.ts_min, d.ts_max), (-5, 500)); + assert_eq!(d.page_count, 7); + + // Corruption is detected + let mut bad = bytes; + bad[12] ^= 0xFF; + assert!(SegmentHeader::decode(&bad).is_err()); + } + + #[test] + fn block_header_roundtrip_and_corruption() { + let h = BlockHeader { + magic: MAGIC_PAGE_BLOCK, + table_id: 3, + row_count: 100, + col_count: 2, + ts_min: 10, + ts_max: 20, + source_gen: 42, + payload_len: 512, + payload_xxh: 0xDEAD, + source_chunk: 6, + }; + let bytes = h.encode(); + let d = BlockHeader::decode(&bytes).unwrap(); + assert_eq!(d.table_id, 3); + assert_eq!(d.source_gen, 42); + assert_eq!(d.source_chunk, 6); + + let mut bad = bytes; + bad[8] ^= 1; + assert!(BlockHeader::decode(&bad).is_none()); + } + + #[test] + fn table_payload_roundtrip() { + let cols = vec![ + ("timestamp".to_string(), DType::I64), + ("value".to_string(), DType::F64), + ("tag".to_string(), DType::Str), + ]; + let payload = encode_table_payload("metrics", &cols); + let def = decode_table_payload(5, &payload).unwrap(); + assert_eq!(def.name, "metrics"); + 
 assert_eq!(def.cols.len(), 3); + assert_eq!(def.cols[2].1, DType::Str); + assert_eq!(def.ts_col, Some(0), "timestamp I64 column detected"); + } +} diff --git a/probing/memtable/src/memc/mod.rs b/probing/memtable/src/memc/mod.rs new file mode 100644 index 00000000..93e33f8d --- /dev/null +++ b/probing/memtable/src/memc/mod.rs @@ -0,0 +1,64 @@ +//! MEMC: **cold** columnar segment files — the on-disk second tier below +//! the hot MEMT ring. +//! +//! A background compactor drains sealed chunks from a hot [`MemTable`] and +//! appends them, transposed to columns and Pco-compressed, as immutable +//! **pages** inside append-only **segment** files. Segments live in a +//! [`ColdStore`] directory and are evicted oldest-first by byte budget or +//! TTL — a second-level ring that gives the system a time-retention axis +//! the fixed-capacity hot ring cannot provide on its own. +//! +//! [`MemTable`]: crate::MemTable +//! +//! ## File format (one `.memc` segment) +//! +//! ```text +//! ┌────────────────────────────────────────────┐ 0 +//! │ SegmentHeader (64 B) │ +//! │ magic "MEMC", version, BOM, flags │ +//! │ writer pid/start, created_unix_ms │ +//! │ footer_off, ts_min/ts_max, page_count │ +//! │ header_xxh │ +//! ├────────────────────────────────────────────┤ 64 +//! │ MCTB table-def block(s) — one per table │ +//! │ [BlockHeader 64B][name+columns payload] │ +//! ├────────────────────────────────────────────┤ +//! │ MCPG page block(s) — columnar, multi-table │ +//! │ [BlockHeader 64B] │ +//! │ per column: [enc][dtype][len][bytes] │ +//! │ numeric → Pco · u8/str/bytes → raw │ +//! ├────────────────────────────────────────────┤ footer_off +//! │ Footer: [MAGIC][count][len][xxh] │ +//! │ page directory: N × 56B │ +//! │ (table_id, ts_min/max, block_off/len, …) │ +//! └────────────────────────────────────────────┘ +//! ``` +//! +//! Every block header and payload carries an xxh3 checksum. Sealed +//! 
segments are read through the footer directory; if the writer crashed +//! before sealing, [`SegmentReader`] forward-scans the checksummed blocks +//! and drops the torn tail. +//! +//! ## Query path (two-level time pruning) +//! +//! Segment header `ts_min/ts_max` prunes whole files (no mmap), then the +//! page directory's per-page `(table_id, ts_min, ts_max)` prunes pages +//! before decode — mirroring the hot ring's chunk-level pruning so a query +//! planner can span hot chunks and cold pages with one time predicate. + +mod codec; +mod compactor; +mod layout; +mod reader; +mod store; +mod writer; + +pub use codec::{ColumnBuilder, ColumnData}; +pub use compactor::{Compactor, CompactorConfig, CompactorHandle}; +pub use layout::{ColEncoding, TableDef, MAGIC_MEMC, SOURCE_CHUNK_NONE, VERSION_MEMC}; +pub use reader::{PageMeta, SegmentReader}; +pub use store::{default_cold_dir, writer_id, ColdStats, ColdStore}; +pub use writer::SegmentWriter; + +#[cfg(test)] +mod tests; diff --git a/probing/memtable/src/memc/reader.rs b/probing/memtable/src/memc/reader.rs new file mode 100644 index 00000000..f684336e --- /dev/null +++ b/probing/memtable/src/memc/reader.rs @@ -0,0 +1,255 @@ +//! [`SegmentReader`]: mmap a `.memc` file and read its tables and pages. +//! +//! A sealed segment is read via its footer page directory. An unsealed or +//! torn segment (writer crashed before `seal`) falls back to a forward +//! scan of checksummed blocks, stopping at the first damaged/partial block +//! — so a half-written tail is silently dropped rather than surfaced. + +use std::collections::HashMap; +use std::io; +use std::path::{Path, PathBuf}; + +use memmap2::Mmap; + +use super::codec::{decode_column, ColumnData}; +use super::layout::{ + align64, get_u32, xxh32, BlockHeader, SegmentHeader, TableDef, BLOCK_HEADER_SIZE, MAGIC_FOOTER, + MAGIC_PAGE_BLOCK, MAGIC_TABLE_BLOCK, PAGE_DIR_ENTRY_SIZE, SEGMENT_HEADER_SIZE, +}; + +/// Metadata for one page, enough to prune before decoding. 
+#[derive(Debug, Clone)] +pub struct PageMeta { + pub table_id: u32, + pub row_count: u32, + pub col_count: u32, + pub ts_min: i64, + pub ts_max: i64, + pub block_off: u64, + pub block_len: u32, + pub source_gen: u64, + pub source_chunk: u32, +} + +/// Read-only view over a memory-mapped MEMC segment. +pub struct SegmentReader { + mmap: Mmap, + path: PathBuf, + header: SegmentHeader, + tables: HashMap, + pages: Vec, +} + +impl SegmentReader { + pub fn open(path: impl AsRef) -> io::Result { + let path = path.as_ref().to_path_buf(); + let file = std::fs::File::open(&path)?; + let mmap = unsafe { Mmap::map(&file)? }; + Self::from_mmap(mmap, path) + } + + fn from_mmap(mmap: Mmap, path: PathBuf) -> io::Result { + let header = SegmentHeader::decode(&mmap) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + let mut tables = HashMap::new(); + let mut pages = Vec::new(); + + let footer_ok = header.is_sealed() + && header.footer_off != 0 + && Self::load_footer(&mmap, &header, &mut pages); + + // Always scan blocks for table definitions (cheap; MCTB blocks live + // before pages). On footer failure this also recovers page metadata. + Self::scan_blocks(&mmap, &header, &mut tables, footer_ok, &mut pages); + + Ok(Self { + mmap, + path, + header, + tables, + pages, + }) + } + + /// Parse the footer page directory. Returns `false` (and leaves `pages` + /// untouched) if the footer is malformed or fails its checksum. 
+ fn load_footer(mmap: &[u8], header: &SegmentHeader, pages: &mut Vec) -> bool { + let foff = header.footer_off as usize; + if foff + 16 > mmap.len() || get_u32(mmap, foff) != MAGIC_FOOTER { + return false; + } + let count = get_u32(mmap, foff + 4) as usize; + let entries_len = get_u32(mmap, foff + 8) as usize; + let checksum = get_u32(mmap, foff + 12); + if count != header.page_count as usize || entries_len != count * PAGE_DIR_ENTRY_SIZE { + return false; + } + let entries_start = foff + 16; + let entries_end = entries_start + entries_len; + if entries_end > mmap.len() || xxh32(&mmap[entries_start..entries_end]) != checksum { + return false; + } + + let mut out = Vec::with_capacity(count); + for i in 0..count { + let o = entries_start + i * PAGE_DIR_ENTRY_SIZE; + out.push(PageMeta { + table_id: get_u32(mmap, o), + row_count: get_u32(mmap, o + 4), + ts_min: super::layout::get_i64(mmap, o + 8), + ts_max: super::layout::get_i64(mmap, o + 16), + block_off: super::layout::get_u64(mmap, o + 24), + block_len: get_u32(mmap, o + 32), + col_count: get_u32(mmap, o + 36), + source_gen: super::layout::get_u64(mmap, o + 40), + source_chunk: get_u32(mmap, o + 48), + }); + } + *pages = out; + true + } + + /// Forward-scan blocks from the first block to `footer_off`/EOF. + /// Collects table definitions always; collects page metadata only when + /// `footer_ok` is false (recovery path). Stops at the first block that + /// fails to decode or whose payload checksum mismatches. 
+ fn scan_blocks( + mmap: &[u8], + header: &SegmentHeader, + tables: &mut HashMap, + footer_ok: bool, + pages: &mut Vec, + ) { + let limit = if header.footer_off != 0 { + (header.footer_off as usize).min(mmap.len()) + } else { + mmap.len() + }; + let mut off = SEGMENT_HEADER_SIZE; + while off + BLOCK_HEADER_SIZE <= limit { + let Some(bh) = BlockHeader::decode(&mmap[off..]) else { + break; + }; + let payload_start = off + BLOCK_HEADER_SIZE; + let payload_end = payload_start + bh.payload_len as usize; + if payload_end > limit { + break; // torn tail + } + if xxh32(&mmap[payload_start..payload_end]) != bh.payload_xxh { + break; // corrupt payload — stop here + } + let block_len = align64(BLOCK_HEADER_SIZE + bh.payload_len as usize); + + match bh.magic { + MAGIC_TABLE_BLOCK => { + if let Ok(def) = + super::layout::decode_table_payload(bh.table_id, &mmap[payload_start..payload_end]) + { + tables.insert(bh.table_id, def); + } + } + MAGIC_PAGE_BLOCK if !footer_ok => { + pages.push(PageMeta { + table_id: bh.table_id, + row_count: bh.row_count, + col_count: bh.col_count, + ts_min: bh.ts_min, + ts_max: bh.ts_max, + block_off: off as u64, + block_len: block_len as u32, + source_gen: bh.source_gen, + source_chunk: bh.source_chunk, + }); + } + _ => {} + } + off += block_len; + } + } + + pub fn path(&self) -> &Path { + &self.path + } + + pub fn is_sealed(&self) -> bool { + self.header.is_sealed() + } + + /// Segment-wide timestamp range (sealed segments only; `None` otherwise + /// or when the segment has no timestamped rows). 
+ pub fn ts_range(&self) -> Option<(i64, i64)> { + if self.header.is_sealed() && self.header.ts_min <= self.header.ts_max { + Some((self.header.ts_min, self.header.ts_max)) + } else { + None + } + } + + pub fn table_defs(&self) -> Vec<&TableDef> { + self.tables.values().collect() + } + + pub fn table_def(&self, id: u32) -> Option<&TableDef> { + self.tables.get(&id) + } + + pub fn table_id_by_name(&self, name: &str) -> Option { + self.tables + .values() + .find(|d| d.name == name) + .map(|d| d.id) + } + + pub fn pages(&self) -> &[PageMeta] { + &self.pages + } + + /// Pages for `table_id` whose `[ts_min, ts_max]` overlaps `[lo, hi]` + /// (either bound `None` = unbounded). Pages without a ts range + /// (`ts_min > ts_max`) are always included. + pub fn pages_in_range( + &self, + table_id: u32, + lo: Option, + hi: Option, + ) -> Vec { + self.pages + .iter() + .enumerate() + .filter(|(_, p)| p.table_id == table_id) + .filter(|(_, p)| { + if p.ts_min > p.ts_max { + return true; // no ts metadata: cannot prune + } + !(lo.is_some_and(|l| p.ts_max < l) || hi.is_some_and(|h| p.ts_min > h)) + }) + .map(|(i, _)| i) + .collect() + } + + /// Decode page `index` into its columns (in schema order). 
+    ///
+    /// # Errors
+    ///
+    /// Returns a message when `index` is not a known page, when the block
+    /// header or payload lies outside the mapping or fails its checksum, or
+    /// when a column fails to decode.
+    pub fn read_page(&self, index: usize) -> Result<Vec<ColumnData>, String> {
+        let p = self.pages.get(index).ok_or("page index out of range")?;
+        // `block_off` comes from on-disk metadata: never index with it
+        // unchecked, a bad offset must be an error, not a panic.
+        let hstart =
+            usize::try_from(p.block_off).map_err(|_| "page block offset out of bounds")?;
+        let block = self
+            .mmap
+            .get(hstart..)
+            .ok_or("page block offset out of bounds")?;
+        let bh = BlockHeader::decode(block).ok_or("page block header invalid")?;
+        let payload_end = hstart
+            .checked_add(BLOCK_HEADER_SIZE)
+            .and_then(|start| start.checked_add(bh.payload_len as usize))
+            .filter(|&end| end <= self.mmap.len())
+            .ok_or("page payload out of bounds")?;
+        let payload_start = hstart + BLOCK_HEADER_SIZE;
+        if xxh32(&self.mmap[payload_start..payload_end]) != bh.payload_xxh {
+            return Err("page payload checksum mismatch".into());
+        }
+
+        let rc = bh.row_count as usize;
+        let mut cols = Vec::with_capacity(bh.col_count as usize);
+        let mut off = payload_start;
+        for _ in 0..bh.col_count {
+            // Each column consumes a prefix of the remaining payload.
+            let rest = self
+                .mmap
+                .get(off..payload_end)
+                .ok_or("page column data overruns payload")?;
+            let (col, used) = decode_column(rest, rc)?;
+            cols.push(col);
+            off = off
+                .checked_add(used)
+                .ok_or("page column data overruns payload")?;
+        }
+        Ok(cols)
+    }
+}
diff --git a/probing/memtable/src/memc/store.rs b/probing/memtable/src/memc/store.rs
new file mode 100644
index 00000000..5f0fbd28
--- /dev/null
+++ b/probing/memtable/src/memc/store.rs
@@ -0,0 +1,247 @@
+//! [`ColdStore`]: directory of MEMC segment files with capacity management.
+//!
+//! Layout (one directory per host, segments shared across all of a
+//! writer's tables):
+//!
+//! ```text
+//! <cold_dir>/
+//!   a3f2c1-000001.memc   ← writer "a3f2c1", sequence 1 (sealed)
+//!   a3f2c1-000002.memc   ← sequence 2 (current, may be unsealed)
+//!   9c81b0-000001.memc   ← another writer/process on the same host
+//! ```
+//!
+//! The store is a **second-level ring**: the hot MEMT buffer wraps by
+//! bytes, the cold store wraps by whole segment files. Eviction deletes
+//! the oldest segments once a byte budget or TTL is exceeded; because
+//! segments are immutable whole files, eviction is atomic and O(1) per
+//! file, and `unlink`ing a segment that a query still has mmap'd is safe
+//! under POSIX (the inode survives until the last mapping drops).
+ +use std::io; +use std::path::{Path, PathBuf}; +use std::time::{Duration, SystemTime}; + +use super::layout::xxh32; +use super::writer::SegmentWriter; +use crate::raw::process_start_time; + +const SEGMENT_EXT: &str = "memc"; + +/// Stable per-writer id: hash of (pid, process start time). Restarting the +/// process yields a fresh id, so sequence numbers never collide across the +/// lifetime of a host directory. +pub fn writer_id(pid: u32, start_time: u64) -> String { + let mut buf = [0u8; 12]; + buf[0..4].copy_from_slice(&pid.to_le_bytes()); + buf[4..12].copy_from_slice(&start_time.to_le_bytes()); + format!("{:06x}", xxh32(&buf) & 0x00FF_FFFF) +} + +/// Capacity snapshot of a cold store. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct ColdStats { + pub segment_count: usize, + pub total_bytes: u64, + /// Modification time of the oldest segment, ms since epoch (0 if none). + pub oldest_unix_ms: u64, +} + +/// A directory of MEMC segments owned by one writer process. +pub struct ColdStore { + dir: PathBuf, + writer_id: String, + next_seq: u32, +} + +/// Default cold-store base directory: `$PROBING_COLD_DIR`, else +/// `/probing-cold`. +pub fn default_cold_dir() -> PathBuf { + std::env::var_os("PROBING_COLD_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| std::env::temp_dir().join("probing-cold")) +} + +impl ColdStore { + /// Open (creating if needed) a cold store rooted at `dir`. + pub fn open(dir: impl AsRef) -> io::Result { + let dir = dir.as_ref().to_path_buf(); + std::fs::create_dir_all(&dir)?; + let pid = std::process::id(); + let wid = writer_id(pid, process_start_time(pid)); + let next_seq = Self::max_seq_for(&dir, &wid) + 1; + Ok(Self { + dir, + writer_id: wid, + next_seq, + }) + } + + pub fn dir(&self) -> &Path { + &self.dir + } + + pub fn writer_id(&self) -> &str { + &self.writer_id + } + + /// Highest existing sequence number for `wid` in `dir` (0 if none). 
+    ///
+    /// An unreadable directory or unreadable entries count as "none".
+    fn max_seq_for(dir: &Path, wid: &str) -> u32 {
+        let Ok(entries) = std::fs::read_dir(dir) else {
+            return 0;
+        };
+        entries
+            .flatten()
+            .filter_map(|e| {
+                let name = e.file_name();
+                parse_segment_name(&name.to_string_lossy())
+            })
+            .filter(|(w, _)| w == wid)
+            .map(|(_, seq)| seq)
+            .max()
+            .unwrap_or(0)
+    }
+
+    /// Path for the next segment (does not create the file).
+    ///
+    /// Consumes one sequence number on every call.
+    pub fn next_segment_path(&mut self) -> PathBuf {
+        let seq = self.next_seq;
+        self.next_seq += 1;
+        self.dir
+            .join(format!("{}-{:06}.{}", self.writer_id, seq, SEGMENT_EXT))
+    }
+
+    /// Create a new [`SegmentWriter`] for the next sequence number.
+    pub fn create_segment(&mut self) -> io::Result<SegmentWriter> {
+        let path = self.next_segment_path();
+        SegmentWriter::create(path)
+    }
+
+    /// All segment files in the directory (any writer), sorted oldest →
+    /// newest by modification time.
+    ///
+    /// Files whose mtime cannot be read sort first (treated as the epoch).
+    /// Equal mtimes are ordered by path, so the result is deterministic even
+    /// when the filesystem's timestamp granularity is coarse.
+    pub fn segment_paths(&self) -> Vec<PathBuf> {
+        let mut segs: Vec<(SystemTime, PathBuf)> = Vec::new();
+        if let Ok(entries) = std::fs::read_dir(&self.dir) {
+            for e in entries.flatten() {
+                let path = e.path();
+                if path.extension().and_then(|s| s.to_str()) != Some(SEGMENT_EXT) {
+                    continue;
+                }
+                let mtime = e
+                    .metadata()
+                    .and_then(|m| m.modified())
+                    .unwrap_or(SystemTime::UNIX_EPOCH);
+                segs.push((mtime, path));
+            }
+        }
+        // Tuple ordering: mtime first, path as the tie-break.
+        segs.sort();
+        segs.into_iter().map(|(_, p)| p).collect()
+    }
+
+    /// Capacity snapshot: segment count, summed file sizes, and the oldest
+    /// modification time that could be read (0 when none could).
+    pub fn stats(&self) -> ColdStats {
+        let paths = self.segment_paths();
+        let mut total = 0u64;
+        // `None` until at least one mtime was read, so no sentinel value can
+        // leak out when every `metadata` call fails (e.g. files removed
+        // concurrently).
+        let mut oldest: Option<u64> = None;
+        for p in &paths {
+            let Ok(meta) = std::fs::metadata(p) else {
+                continue;
+            };
+            total += meta.len();
+            if let Ok(mtime) = meta.modified() {
+                let ms = mtime
+                    .duration_since(SystemTime::UNIX_EPOCH)
+                    .map(|d| d.as_millis() as u64)
+                    .unwrap_or(0);
+                oldest = Some(oldest.map_or(ms, |cur| cur.min(ms)));
+            }
+        }
+        ColdStats {
+            segment_count: paths.len(),
+            total_bytes: total,
+            oldest_unix_ms: oldest.unwrap_or(0),
+        }
+    }
+
+    /// Evict oldest segments until under `max_bytes` and within `ttl`.
+ /// + /// Either limit may be `None` to disable it. The newest segment is + /// never evicted (it may be the one currently being appended). Returns + /// the paths removed. + pub fn enforce_limits( + &self, + max_bytes: Option, + ttl: Option, + ) -> Vec { + let mut paths = self.segment_paths(); + if paths.len() <= 1 { + return Vec::new(); + } + // Protect the newest segment (oldest-first order ⇒ it is last); + // it may be the one currently being appended. + paths.pop(); + + let file_len = |p: &Path| std::fs::metadata(p).map(|m| m.len()).unwrap_or(0); + let now = SystemTime::now(); + let mut total: u64 = self.stats().total_bytes; + + let mut removed = Vec::new(); + for path in paths { + let too_old = ttl + .and_then(|ttl| { + let mtime = std::fs::metadata(&path).ok()?.modified().ok()?; + now.duration_since(mtime).ok().map(|age| age > ttl) + }) + .unwrap_or(false); + let over_budget = max_bytes.is_some_and(|max| total > max); + if !(too_old || over_budget) { + break; // sorted oldest-first: nothing newer qualifies either + } + let sz = file_len(&path); + if std::fs::remove_file(&path).is_ok() { + total = total.saturating_sub(sz); + removed.push(path); + } + } + removed + } +} + +/// Parse `"-.memc"` → `(writer_id, seq)`. 
+///
+/// Returns `None` when the name lacks the segment extension, has no `-`
+/// separator, or the sequence part is not a `u32`.
+fn parse_segment_name(name: &str) -> Option<(String, u32)> {
+    // Derive the suffix from `SEGMENT_EXT` so the parser and
+    // `next_segment_path` cannot drift apart.
+    let stem = name.strip_suffix(SEGMENT_EXT)?.strip_suffix('.')?;
+    let (wid, seq) = stem.rsplit_once('-')?;
+    let seq: u32 = seq.parse().ok()?;
+    Some((wid.to_string(), seq))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn writer_id_is_stable_and_pid_sensitive() {
+        assert_eq!(writer_id(100, 5), writer_id(100, 5));
+        assert_ne!(writer_id(100, 5), writer_id(101, 5));
+        assert_ne!(writer_id(100, 5), writer_id(100, 6));
+        assert_eq!(writer_id(100, 5).len(), 6);
+    }
+
+    #[test]
+    fn parse_segment_name_roundtrip() {
+        assert_eq!(
+            parse_segment_name("a3f2c1-000007.memc"),
+            Some(("a3f2c1".to_string(), 7))
+        );
+        assert_eq!(parse_segment_name("notasegment.txt"), None);
+        assert_eq!(parse_segment_name("missingseq.memc"), None);
+    }
+
+    #[test]
+    fn sequence_numbers_increment_and_persist() {
+        let tmp = std::env::temp_dir().join(format!("memc-store-test-{}", std::process::id()));
+        let _ = std::fs::remove_dir_all(&tmp);
+        let mut store = ColdStore::open(&tmp).unwrap();
+        let p1 = store.next_segment_path();
+        let p2 = store.next_segment_path();
+        assert_ne!(p1, p2);
+        assert!(p1.to_string_lossy().contains("-000001."));
+        assert!(p2.to_string_lossy().contains("-000002."));
+        let _ = std::fs::remove_dir_all(&tmp);
+    }
+}
diff --git a/probing/memtable/src/memc/tests.rs b/probing/memtable/src/memc/tests.rs
new file mode 100644
index 00000000..556927fb
--- /dev/null
+++ b/probing/memtable/src/memc/tests.rs
@@ -0,0 +1,643 @@
+//! End-to-end tests for the MEMC cold segment format and store.
+
+use super::*;
+use crate::schema::{DType, Schema, Value};
+use crate::MemTable;
+use std::time::Duration;
+
+/// Fresh, empty scratch directory under the system temp dir. The name
+/// combines `tag`, the process id and the current thread id; any directory
+/// left over at that path is removed first.
+fn tmp_dir(tag: &str) -> std::path::PathBuf {
+    let dir = std::env::temp_dir().join(format!(
+        "memc-test-{tag}-{}-{:?}",
+        std::process::id(),
+        std::thread::current().id()
+    ));
+    let _ = std::fs::remove_dir_all(&dir);
+    std::fs::create_dir_all(&dir).unwrap();
+    dir
+}
+
+/// Three-column schema (`timestamp: I64`, `value: F64`, `tag: Str`) shared
+/// by several tests.
+fn metrics_cols() -> Vec<(String, DType)> {
+    vec![
+        ("timestamp".to_string(), DType::I64),
+        ("value".to_string(), DType::F64),
+        ("tag".to_string(), DType::Str),
+    ]
+}
+
+/// Write two pages, seal, reopen: header, page directory, table definition
+/// and column data must all read back.
+#[test]
+fn segment_roundtrip_sealed() {
+    let dir = tmp_dir("roundtrip");
+    let path = dir.join("seg.memc");
+
+    let mut w = SegmentWriter::create(&path).unwrap();
+    let tid = w.register_table("metrics", &metrics_cols()).unwrap();
+    w.append_page(
+        tid,
+        &[
+            ColumnData::I64(vec![100, 200, 300]),
+            ColumnData::F64(vec![1.0, 2.0, 3.0]),
+            ColumnData::Str(vec!["a".into(), "b".into(), "c".into()]),
+        ],
+        7,
+        0,
+    )
+    .unwrap();
+    w.append_page(
+        tid,
+        &[
+            ColumnData::I64(vec![400, 500]),
+            ColumnData::F64(vec![4.0, 5.0]),
+            ColumnData::Str(vec!["d".into(), "e".into()]),
+        ],
+        8,
+        1,
+    )
+    .unwrap();
+    w.seal().unwrap();
+
+    let r = SegmentReader::open(&path).unwrap();
+    assert!(r.is_sealed());
+    assert_eq!(r.ts_range(), Some((100, 500)));
+    assert_eq!(r.pages().len(), 2);
+
+    let id = r.table_id_by_name("metrics").unwrap();
+    let def = r.table_def(id).unwrap();
+    assert_eq!(def.cols.len(), 3);
+    assert_eq!(def.ts_col, Some(0));
+
+    let cols = r.read_page(0).unwrap();
+    assert_eq!(cols[0], ColumnData::I64(vec![100, 200, 300]));
+    assert_eq!(cols[2], ColumnData::Str(vec!["a".into(), "b".into(), "c".into()]));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// `size_bytes` and `ts_span` must advance as blocks are written, since a
+/// compactor uses them to decide when to roll.
+#[test]
+fn size_bytes_tracks_growth_for_roll_decisions() {
+    let dir = tmp_dir("sizehint");
+    let path = dir.join("seg.memc");
+
+    let mut w = SegmentWriter::create(&path).unwrap();
+    let base = w.size_bytes();
+    assert_eq!(base, 64, "starts at the 64-byte header");
+    assert_eq!(w.ts_span(), None);
+
+    let tid = w
+        .register_table("m", &[("timestamp".to_string(), DType::I64)])
+        .unwrap();
+    let after_reg = w.size_bytes();
+    assert!(after_reg > base, "table block advances the offset");
+
+    w.append_page(tid, &[ColumnData::I64(vec![10, 20, 30])], 0, 0)
+        .unwrap();
+    assert!(w.size_bytes() > after_reg, "page advances the offset");
+    assert_eq!(w.ts_span(), Some((10, 30)));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Two tables with different schemas share one segment; pages are looked
+/// up per table id.
+#[test]
+fn multi_table_segment() {
+    let dir = tmp_dir("multitable");
+    let path = dir.join("seg.memc");
+
+    let mut w = SegmentWriter::create(&path).unwrap();
+    let metrics = w.register_table("metrics", &metrics_cols()).unwrap();
+    let events = w
+        .register_table(
+            "events",
+            &[("ts".to_string(), DType::I64), ("code".to_string(), DType::I32)],
+        )
+        .unwrap();
+
+    w.append_page(
+        metrics,
+        &[
+            ColumnData::I64(vec![10, 20]),
+            ColumnData::F64(vec![0.1, 0.2]),
+            ColumnData::Str(vec!["x".into(), "y".into()]),
+        ],
+        1,
+        0,
+    )
+    .unwrap();
+    w.append_page(
+        events,
+        &[ColumnData::I64(vec![15]), ColumnData::I32(vec![42])],
+        1,
+        0,
+    )
+    .unwrap();
+    w.seal().unwrap();
+
+    let r = SegmentReader::open(&path).unwrap();
+    let mpages = r.pages_in_range(metrics, None, None);
+    let epages = r.pages_in_range(events, None, None);
+    assert_eq!(mpages.len(), 1);
+    assert_eq!(epages.len(), 1);
+    assert_eq!(r.read_page(epages[0]).unwrap()[1], ColumnData::I32(vec![42]));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// `pages_in_range` keeps only pages whose timestamp span overlaps the
+/// query window.
+#[test]
+fn page_pruning_by_time_range() {
+    let dir = tmp_dir("prune");
+    let path = dir.join("seg.memc");
+
+    let mut w = SegmentWriter::create(&path).unwrap();
+    let tid = w.register_table("m", &[("timestamp".to_string(), DType::I64)]).unwrap();
+    w.append_page(tid, &[ColumnData::I64(vec![0, 10, 20])], 0, 0)
+        .unwrap();
+    w.append_page(tid, &[ColumnData::I64(vec![100, 110, 120])], 0, 1)
+        .unwrap();
+    w.append_page(tid, &[ColumnData::I64(vec![200, 210])], 0, 2)
+        .unwrap();
+    w.seal().unwrap();
+
+    let r = SegmentReader::open(&path).unwrap();
+    // Window [105, 130] overlaps only the middle page.
+    let hit = r.pages_in_range(tid, Some(105), Some(130));
+    assert_eq!(hit.len(), 1);
+    assert_eq!(r.read_page(hit[0]).unwrap()[0], ColumnData::I64(vec![100, 110, 120]));
+
+    // Lower bound past everything → no pages.
+    assert!(r.pages_in_range(tid, Some(1000), None).is_empty());
+    // Unbounded → all three.
+    assert_eq!(r.pages_in_range(tid, None, None).len(), 3);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// A segment whose writer was dropped before `seal` is still readable.
+#[test]
+fn unsealed_segment_recovers_via_forward_scan() {
+    let dir = tmp_dir("unsealed");
+    let path = dir.join("seg.memc");
+
+    {
+        let mut w = SegmentWriter::create(&path).unwrap();
+        let tid = w.register_table("m", &[("timestamp".to_string(), DType::I64)]).unwrap();
+        w.append_page(tid, &[ColumnData::I64(vec![1, 2, 3])], 0, 0)
+            .unwrap();
+        w.append_page(tid, &[ColumnData::I64(vec![4, 5, 6])], 0, 1)
+            .unwrap();
+        // Drop WITHOUT seal — simulates a crash before footer is written.
+    }
+
+    let r = SegmentReader::open(&path).unwrap();
+    assert!(!r.is_sealed());
+    assert_eq!(r.pages().len(), 2, "forward scan must recover both pages");
+    let id = r.table_id_by_name("m").unwrap();
+    assert_eq!(r.read_page(0).unwrap()[0], ColumnData::I64(vec![1, 2, 3]));
+    assert_eq!(r.pages_in_range(id, None, None).len(), 2);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Truncating the file inside the last page's payload drops only that page.
+#[test]
+fn torn_tail_block_is_dropped() {
+    let dir = tmp_dir("torn");
+    let path = dir.join("seg.memc");
+
+    {
+        let mut w = SegmentWriter::create(&path).unwrap();
+        let tid = w.register_table("m", &[("timestamp".to_string(), DType::I64)]).unwrap();
+        w.append_page(tid, &[ColumnData::I64(vec![1, 2, 3])], 0, 0)
+            .unwrap();
+        w.append_page(tid, &[ColumnData::I64(vec![4, 5, 6])], 0, 1)
+            .unwrap();
+    }
+    // Find the second page's block, then truncate into the middle of its
+    // payload (header intact) to mimic a partial write.
+    let cut = {
+        let r = SegmentReader::open(&path).unwrap();
+        let p1 = &r.pages()[1];
+        p1.block_off + (super::layout::BLOCK_HEADER_SIZE as u64) + 8
+    };
+    let f = std::fs::OpenOptions::new().write(true).open(&path).unwrap();
+    f.set_len(cut).unwrap();
+    drop(f);
+
+    let r = SegmentReader::open(&path).unwrap();
+    assert_eq!(
+        r.pages().len(),
+        1,
+        "torn tail page must be dropped, first page survives"
+    );
+    assert_eq!(r.read_page(0).unwrap()[0], ColumnData::I64(vec![1, 2, 3]));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Segments created through the store are listed and counted, and a
+/// reopened store resumes the sequence numbering.
+#[test]
+fn cold_store_segment_creation_and_listing() {
+    let dir = tmp_dir("store-create");
+    let mut store = ColdStore::open(&dir).unwrap();
+
+    for batch in 0..3 {
+        let mut w = store.create_segment().unwrap();
+        let tid = w.register_table("m", &[("timestamp".to_string(), DType::I64)]).unwrap();
+        w.append_page(tid, &[ColumnData::I64(vec![batch, batch + 1])], 0, 0)
+            .unwrap();
+        w.seal().unwrap();
+    }
+
+    let segs = store.segment_paths();
+    assert_eq!(segs.len(), 3);
+    let stats = store.stats();
+    assert_eq!(stats.segment_count, 3);
+    assert!(stats.total_bytes > 0);
+
+    // A fresh store over the same dir continues the sequence.
+    let mut store2 = ColdStore::open(&dir).unwrap();
+    let next = store2.next_segment_path();
+    assert!(next.to_string_lossy().contains("-000004."));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// A byte budget evicts the oldest segments first and never the newest.
+#[test]
+fn eviction_respects_byte_budget_and_keeps_newest() {
+    let dir = tmp_dir("evict-bytes");
+    let mut store = ColdStore::open(&dir).unwrap();
+
+    let mut sizes = Vec::new();
+    for i in 0..5i64 {
+        let mut w = store.create_segment().unwrap();
+        let tid = w.register_table("m", &[("timestamp".to_string(), DType::I64)]).unwrap();
+        w.append_page(
+            tid,
+            &[ColumnData::I64((0..100).map(|x| x + i * 1000).collect())],
+            0,
+            0,
+        )
+        .unwrap();
+        let path = w.seal().unwrap();
+        sizes.push(std::fs::metadata(&path).unwrap().len());
+        // Ensure distinct mtimes for deterministic oldest-first ordering.
+        std::thread::sleep(Duration::from_millis(10));
+    }
+
+    let total: u64 = sizes.iter().sum();
+    // Budget that should force dropping the oldest couple of segments.
+    let budget = total - sizes[0] - sizes[1] + 1;
+    let removed = store.enforce_limits(Some(budget), None);
+    assert!(!removed.is_empty(), "expected some eviction");
+
+    let remaining = store.segment_paths();
+    assert!(remaining.len() < 5);
+    assert!(store.stats().total_bytes <= budget);
+    // Newest survives.
+    assert!(remaining
+        .last()
+        .unwrap()
+        .to_string_lossy()
+        .contains("-000005."));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// A zero TTL expires every segment except the protected newest one.
+#[test]
+fn eviction_by_ttl() {
+    let dir = tmp_dir("evict-ttl");
+    let mut store = ColdStore::open(&dir).unwrap();
+    for _ in 0..3 {
+        let mut w = store.create_segment().unwrap();
+        let tid = w.register_table("m", &[("timestamp".to_string(), DType::I64)]).unwrap();
+        w.append_page(tid, &[ColumnData::I64(vec![1, 2])], 0, 0)
+            .unwrap();
+        w.seal().unwrap();
+        std::thread::sleep(Duration::from_millis(10));
+    }
+    // TTL of 0 → every segment except the protected newest is expired.
+    let removed = store.enforce_limits(None, Some(Duration::from_millis(0)));
+    assert_eq!(removed.len(), 2);
+    assert_eq!(store.segment_paths().len(), 1);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// A large regular numeric page must take less than a third of its raw
+/// size on disk and still read back with the full row count.
+#[test]
+fn pco_compresses_large_numeric_segment() {
+    let dir = tmp_dir("compress");
+    let path = dir.join("seg.memc");
+
+    let n = 50_000i64;
+    let mut w = SegmentWriter::create(&path).unwrap();
+    let tid = w
+        .register_table(
+            "metrics",
+            &[("timestamp".to_string(), DType::I64), ("value".to_string(), DType::F64)],
+        )
+        .unwrap();
+    w.append_page(
+        tid,
+        &[
+            ColumnData::I64((0..n).map(|i| 1_700_000_000_000 + i * 1000).collect()),
+            ColumnData::F64((0..n).map(|i| (i as f64) * 0.5).collect()),
+        ],
+        0,
+        0,
+    )
+    .unwrap();
+    let sealed = w.seal().unwrap();
+
+    let on_disk = std::fs::metadata(&sealed).unwrap().len();
+    let raw = (n as u64) * (8 + 8);
+    assert!(
+        on_disk < raw / 3,
+        "expected >3x compression: {on_disk} vs {raw}"
+    );
+
+    // And it still reads back exactly.
+    let r = SegmentReader::open(&sealed).unwrap();
+    let cols = r.read_page(0).unwrap();
+    assert_eq!(cols[0].len(), n as usize);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+// ── Compactor (the roller) ───────────────────────────────────────────
+
+/// In-memory hot table with the timestamp/value/tag schema, built with the
+/// given `chunk_size` and `num_chunks`.
+fn hot_metrics(chunk_size: u32, num_chunks: u32) -> MemTable {
+    let schema = Schema::new()
+        .col("timestamp", DType::I64)
+        .col("value", DType::F64)
+        .col("tag", DType::Str);
+    MemTable::new(&schema, chunk_size, num_chunks)
+}
+
+/// Total rows across all pages of every sealed segment in `dir`.
+fn cold_row_count(dir: &std::path::Path) -> usize { + let store = ColdStore::open(dir).unwrap(); + store + .segment_paths() + .iter() + .map(|p| { + let r = SegmentReader::open(p).unwrap(); + r.pages().iter().map(|pg| pg.row_count as usize).sum::() + }) + .sum() +} + +#[test] +fn compactor_drains_only_sealed_chunks() { + let dir = tmp_dir("compact-basic"); + let mut t = hot_metrics(512, 4); + + for i in 0..3 { + t.push_row(&[Value::I64(100 + i), Value::F64(i as f64), Value::Str("a")]); + } + t.advance_chunk(); // seal chunk 0 (3 rows) + for i in 0..2 { + t.push_row(&[Value::I64(200 + i), Value::F64(i as f64), Value::Str("b")]); + } + t.advance_chunk(); // seal chunk 1 (2 rows) + // chunk 2 stays Writing — must NOT be drained + t.push_row(&[Value::I64(999), Value::F64(9.0), Value::Str("c")]); + + let store = ColdStore::open(&dir).unwrap(); + let cfg = CompactorConfig { + target_segment_bytes: 1 << 30, // never roll on size + ..Default::default() + }; + let mut c = Compactor::new(store, cfg); + let rows = c.drain_view("metrics", &t.view()).unwrap(); + assert_eq!(rows, 5, "only the two sealed chunks drain"); + + // Draining again is idempotent — nothing new sealed. 
+ assert_eq!(c.drain_view("metrics", &t.view()).unwrap(), 0); + + let sealed = c.flush().unwrap().expect("one segment sealed"); + let r = SegmentReader::open(&sealed).unwrap(); + assert!(r.is_sealed()); + assert_eq!(r.pages().len(), 2); + assert_eq!(r.ts_range(), Some((100, 201))); + + let id = r.table_id_by_name("metrics").unwrap(); + assert_eq!(r.table_def(id).unwrap().ts_col, Some(0)); + assert_eq!(r.read_page(0).unwrap()[0], ColumnData::I64(vec![100, 101, 102])); + + assert_eq!(cold_row_count(&dir), 5); + let _ = std::fs::remove_dir_all(&dir); +} + +#[test] +fn compactor_rolls_by_size_and_reregisters_table() { + let dir = tmp_dir("compact-roll"); + let mut t = hot_metrics(512, 4); + + for c in 0..3 { + for i in 0..2 { + t.push_row(&[ + Value::I64(1000 * c + i), + Value::F64(i as f64), + Value::Str("x"), + ]); + } + t.advance_chunk(); // seal each chunk + } + + let store = ColdStore::open(&dir).unwrap(); + let cfg = CompactorConfig { + target_segment_bytes: 1, // force a roll after every page + ..Default::default() + }; + let mut c = Compactor::new(store, cfg); + let rows = c.drain_view("metrics", &t.view()).unwrap(); + assert_eq!(rows, 6); + assert!(c.flush().unwrap().is_none(), "no open segment after size rolls"); + + // Three sealed chunks → three one-page segments, each independently + // carrying the table definition (re-registered on every roll). 
+ let store = ColdStore::open(&dir).unwrap(); + let paths = store.segment_paths(); + assert_eq!(paths.len(), 3); + for p in &paths { + let r = SegmentReader::open(p).unwrap(); + assert!(r.is_sealed()); + assert_eq!(r.pages().len(), 1); + assert!(r.table_id_by_name("metrics").is_some()); + } + assert_eq!(cold_row_count(&dir), 6); + let _ = std::fs::remove_dir_all(&dir); +} + +#[test] +fn compactor_multi_table_shares_segments() { + let dir = tmp_dir("compact-multi"); + let mut a = hot_metrics(512, 4); + let mut b = hot_metrics(512, 4); + for i in 0..2 { + a.push_row(&[Value::I64(i), Value::F64(0.0), Value::Str("a")]); + b.push_row(&[Value::I64(100 + i), Value::F64(1.0), Value::Str("b")]); + } + a.advance_chunk(); + b.advance_chunk(); + + let store = ColdStore::open(&dir).unwrap(); + let mut c = Compactor::new( + store, + CompactorConfig { + target_segment_bytes: 1 << 30, + ..Default::default() + }, + ); + c.drain_view("table_a", &a.view()).unwrap(); + c.drain_view("table_b", &b.view()).unwrap(); + c.flush().unwrap(); + + // Both tables land in a single shared segment file. + let store = ColdStore::open(&dir).unwrap(); + let paths = store.segment_paths(); + assert_eq!(paths.len(), 1); + let r = SegmentReader::open(&paths[0]).unwrap(); + assert_eq!(r.table_defs().len(), 2); + assert!(r.table_id_by_name("table_a").is_some()); + assert!(r.table_id_by_name("table_b").is_some()); + assert_eq!(r.pages().len(), 2); + let _ = std::fs::remove_dir_all(&dir); +} + +#[test] +fn compactor_background_thread_drains_on_stop() { + let dir = tmp_dir("compact-spawn"); + let file = dir.join("hot.memt"); + let schema = Schema::new() + .col("timestamp", DType::I64) + .col("value", DType::F64); + + // Writer handle (application side) and an independent read handle the + // compactor thread owns — same mmap'd file, lock-free reads. 
+ let mut writer = MemTable::file_at(&file, &schema, 512, 4).unwrap(); + let reader = MemTable::open_file(&file).unwrap(); + + let store = ColdStore::open(&dir).unwrap(); + let handle = Compactor::new( + store, + CompactorConfig { + target_segment_bytes: 1 << 30, + poll_interval: Duration::from_millis(10), + ..Default::default() + }, + ) + .spawn(vec![("metrics".to_string(), reader)]); + + for i in 0..4 { + writer.push_row(&[Value::I64(i), Value::F64(i as f64)]); + } + writer.advance_chunk(); + std::thread::sleep(Duration::from_millis(40)); + for i in 0..3 { + writer.push_row(&[Value::I64(100 + i), Value::F64(i as f64)]); + } + writer.advance_chunk(); + + // stop() performs a final drain + flush before joining. + handle.stop(); + + assert_eq!(cold_row_count(&dir), 7); + let _ = std::fs::remove_dir_all(&dir); +} + +#[test] +fn compactor_enforce_evicts_oldest_segments() { + let dir = tmp_dir("compact-evict"); + let mut t = hot_metrics(512, 8); + for c in 0..5 { + for i in 0..2 { + t.push_row(&[Value::I64(c * 10 + i), Value::F64(0.0), Value::Str("x")]); + } + t.advance_chunk(); + } + + let store = ColdStore::open(&dir).unwrap(); + let mut c = Compactor::new( + store, + CompactorConfig { + target_segment_bytes: 1, // one segment per page + max_total_bytes: Some(1), // keep only the protected newest + ..Default::default() + }, + ); + c.drain_view("metrics", &t.view()).unwrap(); + c.flush().unwrap(); + assert_eq!(c.stats().segment_count, 5); + + let removed = c.enforce(); + assert!(!removed.is_empty(), "over-budget segments evicted"); + // enforce_limits never deletes the newest segment. 
+ assert!(c.stats().segment_count >= 1); + assert!(c.stats().segment_count < 5); + let _ = std::fs::remove_dir_all(&dir); +} + +#[test] +fn compactor_restart_dedup_via_prime() { + let dir = tmp_dir("compact-restart"); + let mut t = hot_metrics(512, 4); + for c in 0..2 { + for i in 0..2 { + t.push_row(&[Value::I64(c * 10 + i), Value::F64(0.0), Value::Str("x")]); + } + t.advance_chunk(); // seal chunks 0 and 1 + } + + let cfg = || CompactorConfig { + target_segment_bytes: 1 << 30, + ..Default::default() + }; + + // First run: drain the two sealed chunks into cold. + { + let mut c = Compactor::new(ColdStore::open(&dir).unwrap(), cfg()); + assert_eq!(c.drain_view("metrics", &t.view()).unwrap(), 4); + c.flush().unwrap(); + } + assert_eq!(cold_row_count(&dir), 4); + + // Simulated restart over the SAME cold dir. prime_from_cold rebuilds the + // per-chunk watermark from persisted source_gen/source_chunk, so the same + // still-resident sealed chunks are recognised as already compacted. + { + let mut c = Compactor::new(ColdStore::open(&dir).unwrap(), cfg()); + c.prime_from_cold().unwrap(); + assert_eq!(c.drain_view("metrics", &t.view()).unwrap(), 0); + assert!(c.flush().unwrap().is_none(), "nothing new to seal"); + } + assert_eq!(cold_row_count(&dir), 4, "exactly-once: no duplication"); + + let _ = std::fs::remove_dir_all(&dir); +} + +#[test] +fn compactor_without_prime_redrains_on_restart() { + // Negative control: this is precisely the duplication prime_from_cold + // prevents. Without priming, a fresh compactor re-drains resident chunks. 
+ let dir = tmp_dir("compact-noprime"); + let mut t = hot_metrics(512, 4); + for i in 0..2 { + t.push_row(&[Value::I64(i), Value::F64(0.0), Value::Str("x")]); + } + t.advance_chunk(); + + let cfg = || CompactorConfig { + target_segment_bytes: 1 << 30, + ..Default::default() + }; + + { + let mut c = Compactor::new(ColdStore::open(&dir).unwrap(), cfg()); + assert_eq!(c.drain_view("metrics", &t.view()).unwrap(), 2); + c.flush().unwrap(); + } + { + let mut c = Compactor::new(ColdStore::open(&dir).unwrap(), cfg()); + // No prime_from_cold → the resident sealed chunk is drained again. + assert_eq!(c.drain_view("metrics", &t.view()).unwrap(), 2); + c.flush().unwrap(); + } + assert_eq!(cold_row_count(&dir), 4, "duplicated without priming"); + + let _ = std::fs::remove_dir_all(&dir); +} diff --git a/probing/memtable/src/memc/writer.rs b/probing/memtable/src/memc/writer.rs new file mode 100644 index 00000000..224d7304 --- /dev/null +++ b/probing/memtable/src/memc/writer.rs @@ -0,0 +1,337 @@ +//! [`SegmentWriter`]: build one `.memc` segment file incrementally. +//! +//! Lifecycle: create → `register_table`* → `append_page`* → `seal`. +//! Blocks are flushed to the file as they are produced; the footer (page +//! directory) and the sealed segment header are written last, so a crash +//! before `seal` leaves a forward-scannable, checksummed prefix. + +use std::collections::HashMap; +use std::fs::{File, OpenOptions}; +use std::io::{self, Seek, SeekFrom, Write}; +use std::path::{Path, PathBuf}; + +use super::codec::{encode_column, ColumnData}; +use super::layout::{ + align64, xxh32, BlockHeader, ColEncoding, SegmentHeader, BLOCK_HEADER_SIZE, FLAG_SEALED, + MAGIC_FOOTER, MAGIC_PAGE_BLOCK, MAGIC_TABLE_BLOCK, PAGE_DIR_ENTRY_SIZE, SEGMENT_HEADER_SIZE, + SOURCE_CHUNK_NONE, TS_MAX_INIT, TS_MIN_INIT, +}; +use crate::raw::process_start_time; +use crate::schema::DType; + +/// One page-directory entry, mirrored into the footer on seal. 
+#[derive(Debug, Clone)] +pub(crate) struct PageDirEntry { + pub table_id: u32, + pub row_count: u32, + pub col_count: u32, + pub ts_min: i64, + pub ts_max: i64, + pub block_off: u64, + pub block_len: u32, + pub source_gen: u64, + pub source_chunk: u32, +} + +struct TableInfo { + cols: Vec<(String, DType)>, + ts_col: Option, +} + +/// Incremental writer for a single MEMC segment file. +pub struct SegmentWriter { + file: File, + path: PathBuf, + offset: u64, + tables: HashMap, + next_table_id: u32, + pages: Vec, + seg_ts_min: i64, + seg_ts_max: i64, + sealed: bool, +} + +fn now_unix_ms() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0) +} + +impl SegmentWriter { + /// Create a new segment file at `path`, writing the (unsealed) header. + pub fn create(path: impl AsRef) -> io::Result { + let path = path.as_ref().to_path_buf(); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } + let mut file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&path)?; + + let pid = std::process::id(); + let header = SegmentHeader { + flags: 0, + writer_pid: pid, + writer_start: process_start_time(pid), + created_unix_ms: now_unix_ms(), + footer_off: 0, + ts_min: TS_MIN_INIT, + ts_max: TS_MAX_INIT, + page_count: 0, + }; + file.write_all(&header.encode())?; + + Ok(Self { + file, + path, + offset: SEGMENT_HEADER_SIZE as u64, + tables: HashMap::new(), + next_table_id: 1, + pages: Vec::new(), + seg_ts_min: TS_MIN_INIT, + seg_ts_max: TS_MAX_INIT, + sealed: false, + }) + } + + pub fn path(&self) -> &Path { + &self.path + } + + pub fn page_count(&self) -> usize { + self.pages.len() + } + + /// Bytes written to the segment so far (header + all blocks, before the + /// footer). A compactor polls this to decide when to seal and roll to a + /// fresh segment, bounding file size and preventing fragmentation. 
+    pub fn size_bytes(&self) -> u64 {
+        self.offset
+    }
+
+    /// Timestamp span covered so far, `None` until a timestamped page lands.
+    /// Lets a compactor also roll on a wall-clock window (e.g. seal every
+    /// 5 min) so low-rate tables don't sit unsealed indefinitely.
+    pub fn ts_span(&self) -> Option<(i64, i64)> {
+        (self.seg_ts_min <= self.seg_ts_max).then_some((self.seg_ts_min, self.seg_ts_max))
+    }
+
+    /// Register a table, write its `MCTB` definition block, return its id.
+    ///
+    /// The timestamp column is the first `I64` column whose name appears in
+    /// `crate::raw::TS_COL_NAMES`, if there is one.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidInput` if the encoded definition or the column count
+    /// does not fit the 32-bit block-header fields, and propagates any I/O
+    /// error from writing the block.
+    pub fn register_table(
+        &mut self,
+        name: &str,
+        cols: &[(String, DType)],
+    ) -> io::Result<u32> {
+        let payload = super::layout::encode_table_payload(name, cols);
+        // Reject oversize values up front; an `as u32` cast would silently
+        // truncate and write a header that contradicts its payload.
+        let too_large = |what: &'static str| io::Error::new(io::ErrorKind::InvalidInput, what);
+        let payload_len = u32::try_from(payload.len())
+            .map_err(|_| too_large("table definition payload too large"))?;
+        let col_count =
+            u32::try_from(cols.len()).map_err(|_| too_large("too many table columns"))?;
+
+        let id = self.next_table_id;
+        self.next_table_id += 1;
+
+        let header = BlockHeader {
+            magic: MAGIC_TABLE_BLOCK,
+            table_id: id,
+            row_count: 0,
+            col_count,
+            ts_min: TS_MIN_INIT,
+            ts_max: TS_MAX_INIT,
+            source_gen: 0,
+            payload_len,
+            payload_xxh: xxh32(&payload),
+            source_chunk: SOURCE_CHUNK_NONE,
+        };
+        self.write_block(&header, &payload)?;
+
+        let ts_col = cols
+            .iter()
+            .position(|(n, dt)| *dt == DType::I64 && crate::raw::TS_COL_NAMES.contains(&n.as_str()));
+        self.tables.insert(
+            id,
+            TableInfo {
+                cols: cols.to_vec(),
+                ts_col,
+            },
+        );
+        Ok(id)
+    }
+
+    /// Append a columnar page for `table_id`. `source_gen` / `source_chunk`
+    /// record the hot-ring chunk this page was compacted from (generation and
+    /// chunk index); pass `(0, SOURCE_CHUNK_NONE)` when not applicable. They
+    /// let a restarting compactor rebuild its per-chunk drain watermark.
+    ///
+    /// All columns must share the same length and match the registered
+    /// schema's dtypes in order.
+ pub fn append_page( + &mut self, + table_id: u32, + columns: &[ColumnData], + source_gen: u64, + source_chunk: u32, + ) -> io::Result<()> { + let info = self + .tables + .get(&table_id) + .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "unknown table_id"))?; + if columns.len() != info.cols.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "page column count mismatch", + )); + } + let row_count = columns.first().map(|c| c.len()).unwrap_or(0); + for (i, col) in columns.iter().enumerate() { + if col.dtype() != info.cols[i].1 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "page column dtype mismatch", + )); + } + if col.len() != row_count { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "page columns have unequal lengths", + )); + } + } + if row_count == 0 { + return Ok(()); // nothing to persist + } + + let (ts_min, ts_max) = match info.ts_col { + Some(ci) => match &columns[ci] { + ColumnData::I64(v) => v + .iter() + .fold((TS_MIN_INIT, TS_MAX_INIT), |(lo, hi), &t| { + (lo.min(t), hi.max(t)) + }), + _ => (TS_MIN_INIT, TS_MAX_INIT), + }, + None => (TS_MIN_INIT, TS_MAX_INIT), + }; + + let mut payload = Vec::new(); + for col in columns { + let sub = encode_column(col).map_err(|e| { + io::Error::new(io::ErrorKind::InvalidData, format!("column encode: {e}")) + })?; + payload.extend_from_slice(&sub); + } + + let header = BlockHeader { + magic: MAGIC_PAGE_BLOCK, + table_id, + row_count: row_count as u32, + col_count: columns.len() as u32, + ts_min, + ts_max, + source_gen, + payload_len: payload.len() as u32, + payload_xxh: xxh32(&payload), + source_chunk, + }; + let block_off = self.offset; + let block_len = self.write_block(&header, &payload)?; + + if ts_min <= ts_max { + self.seg_ts_min = self.seg_ts_min.min(ts_min); + self.seg_ts_max = self.seg_ts_max.max(ts_max); + } + self.pages.push(PageDirEntry { + table_id, + row_count: row_count as u32, + col_count: columns.len() as u32, + ts_min, + ts_max, + block_off, + 
block_len: block_len as u32, + source_gen, + source_chunk, + }); + Ok(()) + } + + /// Write the footer (page directory) and the sealed header, then flush. + /// + /// After this the file is immutable; the writer is consumed. + pub fn seal(mut self) -> io::Result { + let footer_off = self.offset; + let mut footer = Vec::with_capacity(16 + self.pages.len() * PAGE_DIR_ENTRY_SIZE); + footer.extend_from_slice(&MAGIC_FOOTER.to_le_bytes()); + footer.extend_from_slice(&(self.pages.len() as u32).to_le_bytes()); + let entries_len = (self.pages.len() * PAGE_DIR_ENTRY_SIZE) as u32; + footer.extend_from_slice(&entries_len.to_le_bytes()); + footer.extend_from_slice(&[0u8; 4]); // checksum placeholder + + let entries_start = footer.len(); + for p in &self.pages { + footer.extend_from_slice(&p.table_id.to_le_bytes()); + footer.extend_from_slice(&p.row_count.to_le_bytes()); + footer.extend_from_slice(&p.ts_min.to_le_bytes()); + footer.extend_from_slice(&p.ts_max.to_le_bytes()); + footer.extend_from_slice(&p.block_off.to_le_bytes()); + footer.extend_from_slice(&p.block_len.to_le_bytes()); + footer.extend_from_slice(&p.col_count.to_le_bytes()); + footer.extend_from_slice(&p.source_gen.to_le_bytes()); + footer.extend_from_slice(&p.source_chunk.to_le_bytes()); + footer.extend_from_slice(&[0u8; 4]); // pad to 56 + } + let checksum = xxh32(&footer[entries_start..]); + footer[12..16].copy_from_slice(&checksum.to_le_bytes()); + + self.file.write_all(&footer)?; + + // Rewrite the header with seal metadata. 
+ let pid = std::process::id(); + let header = SegmentHeader { + flags: FLAG_SEALED, + writer_pid: pid, + writer_start: process_start_time(pid), + created_unix_ms: now_unix_ms(), + footer_off, + ts_min: self.seg_ts_min, + ts_max: self.seg_ts_max, + page_count: self.pages.len() as u32, + }; + self.file.seek(SeekFrom::Start(0))?; + self.file.write_all(&header.encode())?; + self.file.flush()?; + self.file.sync_data()?; + self.sealed = true; + Ok(self.path.clone()) + } + + /// Write a block header + payload, zero-padded to a 64-byte boundary. + /// Returns the total bytes written (the block length). + fn write_block(&mut self, header: &BlockHeader, payload: &[u8]) -> io::Result { + debug_assert!(matches!( + ColEncoding::from_u8(0), + Some(ColEncoding::RawFixed) + )); + let raw = BLOCK_HEADER_SIZE + payload.len(); + let padded = align64(raw); + self.file.write_all(&header.encode())?; + self.file.write_all(payload)?; + if padded > raw { + self.file.write_all(&vec![0u8; padded - raw])?; + } + self.offset += padded as u64; + Ok(padded as u64) + } +} + +impl Drop for SegmentWriter { + fn drop(&mut self) { + // An unsealed segment on drop keeps its checksummed block prefix on + // disk; the reader's forward-scan recovery path will pick it up. 
+ if !self.sealed { + let _ = self.file.flush(); + } + } +} diff --git a/probing/memtable/src/memtable.rs b/probing/memtable/src/memtable.rs index 3c681412..6545df5d 100644 --- a/probing/memtable/src/memtable.rs +++ b/probing/memtable/src/memtable.rs @@ -4,13 +4,18 @@ use crate::layout::{ header_mut, release_write_lock, w32, CHUNK_HEADER_SIZE, FLAG_DEDUP, }; use crate::raw::{ - advance_chunk_unlocked, init_buf, validate_buf, validate_row_schema, write_row_bytes, + advance_chunk_unlocked, init_buf, note_row_ts, row_ts, validate_buf, validate_row_schema, + write_row_bytes, }; use crate::refcount::refcount; use crate::row::RowIter; use crate::schema::{Col, DType, Schema, Value}; use crate::writer::RowWriter; +use memmap2::MmapMut; use std::fmt; +use std::fs::OpenOptions; +use std::io; +use std::path::{Path, PathBuf}; use std::sync::atomic::Ordering; // ── Shared read-only accessor methods (expands inside each impl) ───── @@ -59,6 +64,37 @@ macro_rules! impl_table_reader { let cs = chunk_start_off(buf, chunk); chunk_header(buf, cs).state.load(Ordering::Acquire) } + /// Index of the designated timestamp column ([`None`] when the + /// schema has no `I64` column named `timestamp` / `ts`). + pub fn ts_col(&self) -> Option { + match header(self.as_bytes()).ts_col as usize { + 0 => None, + idx => Some(idx - 1), + } + } + /// `(min, max)` of the designated timestamp column over the rows + /// committed in `chunk`; [`None`] when the chunk is empty or the + /// table has no timestamp column. + /// + /// The `used` Acquire load pairs with the writer's Release store + /// that publishes each row, so the returned range covers every row + /// visible to this reader. Like all chunk metadata the snapshot is + /// racy: callers pruning by time must bracket it between two + /// [`chunk_generation`](Self::chunk_generation) reads. 
+ pub fn chunk_ts_range(&self, chunk: usize) -> Option<(i64, i64)> { + self.ts_col()?; + let buf = self.as_bytes(); + let cs = chunk_start_off(buf, chunk); + let ch = chunk_header(buf, cs); + let _used = ch.used.load(Ordering::Acquire); + let min = ch.min_ts.load(Ordering::Relaxed); + let max = ch.max_ts.load(Ordering::Relaxed); + if min > max { + None // sentinel values: no committed rows + } else { + Some((min, max)) + } + } pub fn rows(&self, chunk: usize) -> RowIter<'_> { let buf = self.as_bytes(); let cs = chunk_start_off(buf, chunk); @@ -76,6 +112,33 @@ macro_rules! impl_table_reader { let cs = chunk_start_off(buf, chunk); chunk_header(buf, cs).row_count.load(Ordering::Acquire) as usize } + /// Chunk indices in **logical (oldest → newest) write order**. + /// + /// The ring writes chunks in `(generation, index)` order: chunk 0 at + /// generation 1, then chunks 1..N-1 at generation 1, then wraps back + /// to chunk 0 at generation 2, and so on. Sorting non-empty chunks + /// by `(generation, index)` therefore recovers temporal order + /// regardless of the current wrap position. + /// + /// Chunks that were never written (generation 0) or hold no + /// committed rows are skipped. The snapshot is racy by design: + /// callers that read concurrently with a writer must re-check + /// [`chunk_generation`](Self::chunk_generation) after consuming a + /// chunk and discard it on mismatch. 
+ pub fn chunks_logical(&self) -> Vec { + let mut order: Vec<(u64, usize)> = (0..self.num_chunks()) + .filter_map(|i| { + let generation = self.chunk_generation(i); + if generation == 0 || self.num_rows(i) == 0 { + None + } else { + Some((generation, i)) + } + }) + .collect(); + order.sort_unstable(); + order.into_iter().map(|(_, i)| i).collect() + } pub fn creator_pid(&self) -> u32 { header(self.as_bytes()).creator_pid } @@ -112,6 +175,7 @@ fn make_row_writer<'a>( let wc = h.write_chunk.load(Ordering::Relaxed) as usize; let csz = h.chunk_size as usize; let doff = h.data_offset as usize; + let ts_col = h.ts_col; let cs = doff + wc * csz; let used = chunk_header(buf, cs).used.load(Ordering::Relaxed) as usize; RowWriter { @@ -125,6 +189,8 @@ fn make_row_writer<'a>( done: false, col_idx: 0, locked, + ts_col, + pending_ts: None, } } @@ -155,12 +221,6 @@ fn locked_append(buf: &mut [u8], values: &[Value]) -> bool { ok } -fn locked_push(buf: &mut [u8], values: &[Value]) { - acquire_write_lock(buf); - push_plain_row(buf, values); - release_write_lock(buf); -} - fn locked_advance(buf: &mut [u8]) { acquire_write_lock(buf); advance_chunk_unlocked(buf); @@ -224,6 +284,9 @@ fn append_row_dedup_bytes(buf: &mut [u8], state: &mut DedupState, values: &[Valu off += v.encode(&mut buf[off..]); } } + if let Some(ts) = row_ts(header(buf), values) { + note_row_ts(chunk_header(buf, cs), ts); + } chunk_header(buf, cs) .used .store((used + total) as u32, Ordering::Release); @@ -233,10 +296,110 @@ fn append_row_dedup_bytes(buf: &mut [u8], state: &mut DedupState, values: &[Valu true } -// ── MemTable (owned buffer) ────────────────────────────────────────── +// ── MemTable (owned buffer: heap or mmap'd shared memory) ─────────── + +/// Which kind of storage backs a [`MemTable`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BackingKind { + /// Process-private heap allocation. + Heap, + /// POSIX shared memory object (`shm_open`) — memory-only. 
+ Shm, + /// mmap'd regular file — disk-backed. + File, +} + +/// Storage behind a [`MemTable`]. +enum Backing { + /// Process-private heap allocation. Invisible to other processes; + /// freed on drop. + Heap(Vec), + /// POSIX shared memory object (`shm_open` + `mmap`). Memory-only: + /// never touches disk, gone after reboot. Other processes attach by + /// name. When `unlink_on_drop`, the creator removes the name on drop + /// (existing mappings stay valid until unmapped). + Shm { + mmap: MmapMut, + name: String, + unlink_on_drop: bool, + }, + /// mmap'd regular file. Disk-backed: contents persist after drop / + /// reboot unless `unlink_on_drop` is set (used by the discoverable + /// `//` convention, where `dir` is the parent + /// `/` directory to remove when it becomes empty). + File { + mmap: MmapMut, + path: PathBuf, + dir: Option, + unlink_on_drop: bool, + }, +} + +impl Backing { + #[inline] + fn bytes(&self) -> &[u8] { + match self { + Backing::Heap(v) => v, + Backing::Shm { mmap, .. } => mmap, + Backing::File { mmap, .. } => mmap, + } + } + #[inline] + fn bytes_mut(&mut self) -> &mut [u8] { + match self { + Backing::Heap(v) => v, + Backing::Shm { mmap, .. } => mmap, + Backing::File { mmap, .. } => mmap, + } + } +} + +/// Normalise a POSIX shm name: must start with `/`, no other slashes. +/// Keep names short — macOS limits them to 31 bytes (`PSHMNAMLEN`). +fn shm_name_cstring(name: &str) -> io::Result { + let normalised = if name.starts_with('/') { + name.to_string() + } else { + format!("/{name}") + }; + if normalised[1..].contains('/') { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "shm name must not contain '/' (apart from the leading one)", + )); + } + std::ffi::CString::new(normalised) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "shm name contains NUL")) +} + +/// `shm_open` wrapper returning an owned [`std::fs::File`]. 
+fn shm_open_file(name: &std::ffi::CString, oflag: libc::c_int) -> io::Result { + use std::os::fd::FromRawFd; + let fd = unsafe { libc::shm_open(name.as_ptr(), oflag, 0o600 as libc::c_uint) }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + Ok(unsafe { std::fs::File::from_raw_fd(fd) }) +} + +/// Ring-buffer table that owns its storage. Three backings, one API: +/// +/// | Constructor | Backing | Cross-process | Survives crash | Survives reboot | +/// |-------------|---------|--------------|----------------|-----------------| +/// | [`new`](Self::new) / [`from_buf`](Self::from_buf) | heap | no | no | no | +/// | [`shm`](Self::shm) / [`open_shm`](Self::open_shm) | POSIX shared memory | by name | yes¹ | no | +/// | [`file_at`](Self::file_at) / [`open_file`](Self::open_file) | mmap'd file | by path | yes | yes | +/// | [`shared`](Self::shared) / [`shared_in`](Self::shared_in) | mmap'd file under `//` | discovery + SQL catalog | yes¹ | — | +/// +/// ¹ until the name/file is unlinked (creator drop or stale-pid cleanup). +/// +/// On Linux the discoverable `shared` flavour lives in `/dev/shm` (tmpfs), +/// so it is effectively shared memory *with* a browsable path; `shm` is the +/// portable memory-only variant (on macOS, shm objects have no filesystem +/// path at all). pub struct MemTable { - buf: Vec, + backing: Backing, } impl MemTable { @@ -244,57 +407,324 @@ impl MemTable { compute_data_offset(schema.cols.len()) + chunk_size * num_chunks } + /// Create a **heap-backed** (process-private) table. pub fn new(schema: &Schema, chunk_size: u32, num_chunks: u32) -> Self { let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize); let mut buf = vec![0u8; size]; init_buf(&mut buf, schema, chunk_size, num_chunks); - Self { buf } + Self { + backing: Backing::Heap(buf), + } } + /// Adopt an existing heap buffer (validates the MEMT layout). 
pub fn from_buf(buf: Vec) -> Result { validate_buf(&buf)?; - Ok(Self { buf }) + Ok(Self { + backing: Backing::Heap(buf), + }) + } + + // ── POSIX shared memory (memory-only) ──────────────────────────── + + /// Create a **POSIX shared-memory** table (`shm_open`). + /// + /// Memory-only: never hits disk, vanishes on reboot. Other processes + /// attach with [`open_shm`](Self::open_shm) using the same `name` + /// (normalised to a leading `/`; keep it short — macOS caps shm names + /// at 31 bytes). The creator unlinks the name on drop; attached + /// processes keep a valid mapping until they unmap. + /// + /// Fails with `AlreadyExists` if the name is taken. + pub fn shm(name: &str, schema: &Schema, chunk_size: u32, num_chunks: u32) -> io::Result { + let cname = shm_name_cstring(name)?; + let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize); + + let file = shm_open_file(&cname, libc::O_CREAT | libc::O_EXCL | libc::O_RDWR)?; + file.set_len(size as u64)?; + + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + init_buf(&mut mmap, schema, chunk_size, num_chunks); + + Ok(Self { + backing: Backing::Shm { + mmap, + name: cname.into_string().expect("validated utf-8"), + unlink_on_drop: true, + }, + }) + } + + /// Attach to an existing POSIX shared-memory table created by + /// [`shm`](Self::shm) (validates the MEMT layout). + /// + /// The returned handle does **not** unlink the name on drop. + pub fn open_shm(name: &str) -> io::Result { + let cname = shm_name_cstring(name)?; + let file = shm_open_file(&cname, libc::O_RDWR)?; + + let mmap = unsafe { MmapMut::map_mut(&file)? }; + validate_buf(&mmap).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + Ok(Self { + backing: Backing::Shm { + mmap, + name: cname.into_string().expect("validated utf-8"), + unlink_on_drop: false, + }, + }) + } + + // ── mmap'd file (disk-backed, persistent) ──────────────────────── + + /// Create a table backed by an **mmap'd regular file** at `path`. 
+ /// + /// Disk-backed and persistent: the file is **kept** on drop and can be + /// reopened later with [`open_file`](Self::open_file) — including + /// after a process crash or reboot. Truncates any existing file. + pub fn file_at( + path: impl AsRef, + schema: &Schema, + chunk_size: u32, + num_chunks: u32, + ) -> io::Result { + let path = path.as_ref().to_path_buf(); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } + let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize); + + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&path)?; + file.set_len(size as u64)?; + + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + init_buf(&mut mmap, schema, chunk_size, num_chunks); + + Ok(Self { + backing: Backing::File { + mmap, + path, + dir: None, + unlink_on_drop: false, + }, + }) + } + + /// Reopen an existing mmap'd-file table read-write (validates the + /// MEMT layout). The file is kept on drop. + pub fn open_file(path: impl AsRef) -> io::Result { + let path = path.as_ref().to_path_buf(); + let file = OpenOptions::new().read(true).write(true).open(&path)?; + + let mmap = unsafe { MmapMut::map_mut(&file)? }; + validate_buf(&mmap).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + Ok(Self { + backing: Backing::File { + mmap, + path, + dir: None, + unlink_on_drop: false, + }, + }) + } + + // ── discoverable file (data-dir convention) ────────────────────── + + /// Create a **discoverable** mmap'd-file table in the + /// [`default_dir`](crate::discover::default_dir), at + /// `//`. + /// + /// This is the flavour the SQL catalog and cross-process discovery + /// scan for. On Linux the default dir is `/dev/shm` (tmpfs), making + /// this shared memory with a browsable path. The file is unlinked on + /// drop; after a crash it stays readable until stale-pid cleanup. 
+ pub fn shared( + name: &str, + schema: &Schema, + chunk_size: u32, + num_chunks: u32, + ) -> io::Result { + Self::shared_in( + &crate::discover::default_dir(), + name, + schema, + chunk_size, + num_chunks, + ) + } + + /// Like [`shared`](Self::shared), under a custom base directory + /// (file at `//`). + pub fn shared_in( + base_dir: &Path, + name: &str, + schema: &Schema, + chunk_size: u32, + num_chunks: u32, + ) -> io::Result { + let dir = base_dir.join(std::process::id().to_string()); + std::fs::create_dir_all(&dir)?; + + let path = dir.join(name); + let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize); + + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&path)?; + file.set_len(size as u64)?; + + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + init_buf(&mut mmap, schema, chunk_size, num_chunks); + + Ok(Self { + backing: Backing::File { + mmap, + path, + dir: Some(dir), + unlink_on_drop: true, + }, + }) + } + + // ── backing introspection ───────────────────────────────────────── + + /// Which backend stores this table. + pub fn backing_kind(&self) -> BackingKind { + match &self.backing { + Backing::Heap(_) => BackingKind::Heap, + Backing::Shm { .. } => BackingKind::Shm, + Backing::File { .. } => BackingKind::File, + } + } + + /// `true` when other processes can attach (shm or mmap'd file). + pub fn is_shared(&self) -> bool { + !matches!(self.backing, Backing::Heap(_)) + } + + /// File path of the mapping; [`None`] for heap and shm backings + /// (POSIX shm objects have no portable filesystem path). + pub fn path(&self) -> Option<&Path> { + match &self.backing { + Backing::File { path, .. } => Some(path), + _ => None, + } + } + + /// POSIX shm name (with leading `/`); [`None`] for other backings. + pub fn shm_name(&self) -> Option<&str> { + match &self.backing { + Backing::Shm { name, .. 
} => Some(name), + _ => None, + } } pub fn as_bytes(&self) -> &[u8] { - &self.buf + self.backing.bytes() } + + pub fn as_bytes_mut(&mut self) -> &mut [u8] { + self.backing.bytes_mut() + } + pub fn view(&self) -> MemTableView<'_> { - MemTableView { buf: &self.buf } + MemTableView { + buf: self.backing.bytes(), + } } impl_table_reader!(); pub fn row_writer(&mut self) -> RowWriter<'_> { - begin_row_writer(&mut self.buf, None) + begin_row_writer(self.backing.bytes_mut(), None) } pub fn append_row(&mut self, values: &[Value]) -> bool { assert!( - validate_row_schema(&self.buf, values), + validate_row_schema(self.backing.bytes(), values), "value types do not match schema" ); - locked_append(&mut self.buf, values) + locked_append(self.backing.bytes_mut(), values) } pub fn advance_chunk(&mut self) { - locked_advance(&mut self.buf) + locked_advance(self.backing.bytes_mut()) } + + /// Append a row, auto-advancing to the next chunk when full. + /// + /// # Panic safety + /// + /// The spinlock is released even if the write panics (e.g. row exceeds + /// chunk capacity) — for shared tables this prevents a deadlocked mmap + /// file that other processes may still be reading. pub fn push_row(&mut self, values: &[Value]) { assert!( - validate_row_schema(&self.buf, values), + validate_row_schema(self.backing.bytes(), values), "value types do not match schema" ); - locked_push(&mut self.buf, values); + self.push_row_unchecked(values); } pub fn push_row_unchecked(&mut self, values: &[Value]) { - locked_push(&mut self.buf, values); + let buf = self.backing.bytes_mut(); + acquire_write_lock(buf); + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + push_plain_row(buf, values); + })); + release_write_lock(buf); + if let Err(payload) = result { + std::panic::resume_unwind(payload); + } + } +} + +impl Drop for MemTable { + fn drop(&mut self) { + match &self.backing { + Backing::Heap(_) => {} + Backing::Shm { + name, + unlink_on_drop: true, + .. 
+ } => { + if let Ok(cname) = std::ffi::CString::new(name.as_str()) { + unsafe { libc::shm_unlink(cname.as_ptr()) }; + } + } + Backing::Shm { .. } => {} + Backing::File { + path, + dir, + unlink_on_drop: true, + .. + } => { + let _ = std::fs::remove_file(path); + if let Some(dir) = dir { + let _ = std::fs::remove_dir(dir); // succeeds only if empty + } + } + Backing::File { .. } => {} + } } } impl fmt::Display for MemTable { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let kind = match self.backing_kind() { + BackingKind::Heap => "heap", + BackingKind::Shm => "shm", + BackingKind::File => "file", + }; write!( f, - "MemTable({} cols, {} chunks × {} bytes)", + "MemTable({kind}, {} cols, {} chunks × {} bytes)", self.num_cols(), self.num_chunks(), self.chunk_size() @@ -501,7 +931,7 @@ impl fmt::Display for MemTableWriter<'_> { #[cfg(test)] mod tests { - use super::{MemTable, MemTableView, MemTableWriter}; + use super::{BackingKind, MemTable, MemTableView, MemTableWriter}; use crate::layout::{col_desc, header, header_mut, MAGIC, VERSION}; use crate::raw::init_buf; use crate::refcount::{acquire_ref, refcount, release_ref}; @@ -625,8 +1055,8 @@ mod tests { #[test] fn append_row_returns_false_when_full() { let schema = Schema::new().col("x", DType::I64); - // ChunkHeader=24, each I64 row=12 → 48-24=24 data bytes → 2 rows fit - let mut t = MemTable::new(&schema, 48, 1); + // ChunkHeader=40, each I64 row=12 → 64-40=24 data bytes → 2 rows fit + let mut t = MemTable::new(&schema, 64, 1); assert!(t.append_row(&[Value::I64(1)])); assert!(t.append_row(&[Value::I64(2)])); assert!(!t.append_row(&[Value::I64(3)])); @@ -636,8 +1066,8 @@ mod tests { #[test] fn ring_buffer_wrap() { let schema = Schema::new().col("v", DType::I32); - // ChunkHeader=24, each I32 row=8 → 80-24=56 data bytes → 7 rows fit - let mut t = MemTable::new(&schema, 80, 3); + // ChunkHeader=40, each I32 row=8 → 96-40=56 data bytes → 7 rows fit + let mut t = MemTable::new(&schema, 96, 3); for i in 0..7 { 
t.push_row(&[Value::I32(i)]); } @@ -652,6 +1082,197 @@ mod tests { assert_eq!(t.rows(0).next().unwrap().col_i32(0), 213); } + #[test] + fn heap_backing_is_private() { + let schema = Schema::new().col("x", DType::I32); + let mut t = MemTable::new(&schema, 1024, 2); + assert!(!t.is_shared()); + assert_eq!(t.backing_kind(), BackingKind::Heap); + assert!(t.path().is_none()); + assert!(t.shm_name().is_none()); + t.push_row(&[Value::I32(7)]); + assert_eq!(t.rows(0).next().unwrap().col_i32(0), 7); + } + + #[test] + fn shm_backing_roundtrip_and_unlink() { + // Short name: macOS caps shm names at 31 bytes. + let name = format!("/pbg_t{}", std::process::id() % 1_000_000); + // In case a previous failed run leaked the name. + if let Ok(c) = std::ffi::CString::new(name.as_str()) { + unsafe { libc::shm_unlink(c.as_ptr()) }; + } + + let schema = Schema::new().col("ts", DType::I64).col("msg", DType::Str); + let mut creator = MemTable::shm(&name, &schema, 4096, 2).unwrap(); + assert_eq!(creator.backing_kind(), BackingKind::Shm); + assert!(creator.is_shared()); + assert!(creator.path().is_none()); + assert_eq!(creator.shm_name(), Some(name.as_str())); + + creator.push_row(&[Value::I64(1), Value::Str("alpha")]); + + // Second attachment (what another process would do) sees the data… + let mut attached = MemTable::open_shm(&name).unwrap(); + assert_eq!(attached.num_rows(0), 1); + assert_eq!(attached.rows(0).next().unwrap().col_str(1), "alpha"); + + // …and writes through it are visible to the creator (same memory). + attached.push_row(&[Value::I64(2), Value::Str("beta")]); + assert_eq!(creator.num_rows(0), 2); + + // Name collision is rejected. + assert!(MemTable::shm(&name, &schema, 4096, 2).is_err()); + + // Creator drop unlinks the name; the attached mapping stays valid. 
+ drop(creator); + assert!(MemTable::open_shm(&name).is_err()); + assert_eq!(attached.num_rows(0), 2); + } + + #[test] + fn file_backing_persists_across_reopen() { + let dir = std::env::temp_dir().join(format!( + "probing_mt_file_test_{}_{}", + std::process::id(), + line!() + )); + let _ = std::fs::remove_dir_all(&dir); + let path = dir.join("persistent.mt"); + + let schema = Schema::new().col("v", DType::I64); + { + let mut t = MemTable::file_at(&path, &schema, 4096, 2).unwrap(); + assert_eq!(t.backing_kind(), BackingKind::File); + assert_eq!(t.path(), Some(path.as_path())); + t.push_row(&[Value::I64(42)]); + } + // Unlike `shared`, the file survives drop… + assert!(path.is_file()); + + // …and can be reopened read-write with data intact. + let mut t = MemTable::open_file(&path).unwrap(); + assert_eq!(t.num_rows(0), 1); + assert_eq!(t.rows(0).next().unwrap().col_i64(0), 42); + t.push_row(&[Value::I64(43)]); + assert_eq!(t.num_rows(0), 2); + + // Reopening garbage fails validation. + let bad = dir.join("garbage.mt"); + std::fs::write(&bad, vec![0u8; 256]).unwrap(); + assert!(MemTable::open_file(&bad).is_err()); + + let _ = std::fs::remove_dir_all(&dir); + } + + #[test] + fn shared_backing_roundtrip_and_cleanup() { + let base = std::env::temp_dir().join(format!( + "probing_mt_shared_test_{}_{}", + std::process::id(), + line!() + )); + let _ = std::fs::remove_dir_all(&base); + + let schema = Schema::new().col("ts", DType::I64).col("msg", DType::Str); + let path = { + let mut t = MemTable::shared_in(&base, "shm_tbl", &schema, 4096, 2).unwrap(); + assert!(t.is_shared()); + let path = t.path().unwrap().to_path_buf(); + assert!(path.is_file()); + + t.push_row(&[Value::I64(1), Value::Str("alpha")]); + t.push_row(&[Value::I64(2), Value::Str("beta")]); + + // Same write/read API as the heap backing + assert_eq!(t.num_rows(0), 2); + assert_eq!(t.chunks_logical(), vec![0]); + + // Another handle (separate mmap of the same file) sees the data — + // this is what a 
cross-process reader does. + let bytes = std::fs::read(&path).unwrap(); + let view = MemTableView::new(&bytes).unwrap(); + assert_eq!(view.num_rows(0), 2); + let row = view.rows(0).next().unwrap(); + assert_eq!(row.col_i64(0), 1); + assert_eq!(row.col_str(1), "alpha"); + + path + }; + // Drop unlinks the file and the (now empty) / directory. + assert!(!path.exists()); + assert!(!path.parent().unwrap().exists()); + + let _ = std::fs::remove_dir_all(&base); + } + + #[test] + fn shared_and_heap_share_write_semantics_across_wrap() { + let base = std::env::temp_dir().join(format!( + "probing_mt_shared_test_{}_{}", + std::process::id(), + line!() + )); + let _ = std::fs::remove_dir_all(&base); + + let schema = Schema::new().col("v", DType::I32); + let mut heap = MemTable::new(&schema, 80, 3); + let mut shm = MemTable::shared_in(&base, "wrap_tbl", &schema, 80, 3).unwrap(); + + for i in 0..20 { + heap.push_row(&[Value::I32(i)]); + shm.push_row(&[Value::I32(i)]); + } + + let collect = |t: &MemTable| -> Vec { + t.chunks_logical() + .into_iter() + .flat_map(|c| t.rows(c).map(|r| r.col_i32(0)).collect::>()) + .collect() + }; + assert_eq!(collect(&heap), collect(&shm)); + + drop(shm); + let _ = std::fs::remove_dir_all(&base); + } + + #[test] + fn chunks_logical_pre_wrap() { + let schema = Schema::new().col("v", DType::I32); + let mut t = MemTable::new(&schema, 80, 3); + // No data yet: only chunk 0 is Writing (gen 1) but has no rows + assert!(t.chunks_logical().is_empty()); + + t.push_row(&[Value::I32(1)]); + assert_eq!(t.chunks_logical(), vec![0]); + + t.advance_chunk(); + t.push_row(&[Value::I32(2)]); + assert_eq!(t.chunks_logical(), vec![0, 1]); + } + + #[test] + fn chunks_logical_post_wrap() { + let schema = Schema::new().col("v", DType::I32); + let mut t = MemTable::new(&schema, 80, 2); + t.push_row(&[Value::I32(10)]); // chunk 0, gen 1 + t.advance_chunk(); + t.push_row(&[Value::I32(20)]); // chunk 1, gen 1 + t.advance_chunk(); // wraps: chunk 0 recycled → gen 2, zeroed + 
t.push_row(&[Value::I32(30)]); // chunk 0, gen 2 + + // Logical order: oldest surviving data (chunk 1, gen 1) first, + // then the recycled chunk 0 (gen 2). + let order = t.chunks_logical(); + assert_eq!(order, vec![1, 0]); + + let values: Vec = order + .iter() + .flat_map(|&c| t.rows(c).map(|r| r.col_i32(0)).collect::>()) + .collect(); + assert_eq!(values, vec![20, 30]); + } + #[test] fn ring_buffer_with_str() { let schema = Schema::new().col("msg", DType::Str); @@ -707,7 +1328,7 @@ mod tests { fn display_format() { let schema = Schema::new().col("a", DType::I32); let t = MemTable::new(&schema, 1024, 2); - assert_eq!(format!("{t}"), "MemTable(1 cols, 2 chunks × 1024 bytes)"); + assert_eq!(format!("{t}"), "MemTable(heap, 1 cols, 2 chunks × 1024 bytes)"); } #[test] @@ -1111,8 +1732,8 @@ mod tests { #[test] fn chunk_generation_increments_on_wrap() { let schema = Schema::new().col("v", DType::I32); - // 80 bytes per chunk → 7 I32 rows per chunk - let mut t = MemTable::new(&schema, 80, 2); + // 96 bytes per chunk → 7 I32 rows per chunk (ChunkHeader=40) + let mut t = MemTable::new(&schema, 96, 2); assert_eq!(t.chunk_generation(0), 1); assert_eq!(t.chunk_generation(1), 0); @@ -1350,4 +1971,173 @@ mod tests { } assert!(total > 0, "should have rows across chunks"); } + + // ── designated timestamp column / chunk ts range ────────────────── + + #[test] + fn ts_col_detection() { + let t = MemTable::new( + &Schema::new().col("v", DType::F64).col("timestamp", DType::I64), + 1024, + 1, + ); + assert_eq!(t.ts_col(), Some(1)); + + let t = MemTable::new(&Schema::new().col("ts", DType::I64), 1024, 1); + assert_eq!(t.ts_col(), Some(0)); + + // Wrong dtype or name → no designated column + let t = MemTable::new(&Schema::new().col("timestamp", DType::F64), 1024, 1); + assert_eq!(t.ts_col(), None); + let t = MemTable::new(&Schema::new().col("when", DType::I64), 1024, 1); + assert_eq!(t.ts_col(), None); + assert_eq!(t.chunk_ts_range(0), None); + } + + #[test] + fn 
chunk_ts_range_tracks_min_max() { + let schema = Schema::new().col("ts", DType::I64).col("v", DType::I32); + let mut t = MemTable::new(&schema, 1024, 2); + assert_eq!(t.chunk_ts_range(0), None, "empty chunk has no range"); + + t.push_row(&[Value::I64(500), Value::I32(1)]); + t.push_row(&[Value::I64(100), Value::I32(2)]); // out-of-order ts + t.push_row(&[Value::I64(900), Value::I32(3)]); + assert_eq!(t.chunk_ts_range(0), Some((100, 900))); + + // Advance: new chunk starts with a fresh range + t.advance_chunk(); + assert_eq!(t.chunk_ts_range(1), None); + t.push_row(&[Value::I64(1000), Value::I32(4)]); + assert_eq!(t.chunk_ts_range(1), Some((1000, 1000))); + assert_eq!(t.chunk_ts_range(0), Some((100, 900)), "old chunk keeps range"); + } + + #[test] + fn chunk_ts_range_resets_on_wrap() { + let schema = Schema::new().col("ts", DType::I64); + // ChunkHeader=40, I64 row=12 → 64-40=24 → 2 rows per chunk + let mut t = MemTable::new(&schema, 64, 2); + t.push_row(&[Value::I64(10)]); + t.push_row(&[Value::I64(20)]); + t.push_row(&[Value::I64(30)]); // → chunk 1 + t.push_row(&[Value::I64(40)]); + t.push_row(&[Value::I64(50)]); // wrap → chunk 0 recycled + assert_eq!(t.write_chunk(), 0); + assert_eq!(t.chunk_ts_range(0), Some((50, 50)), "recycled range resets"); + assert_eq!(t.chunk_ts_range(1), Some((30, 40))); + } + + #[test] + fn row_writer_maintains_ts_range() { + let schema = Schema::new().col("timestamp", DType::I64).col("m", DType::Str); + let mut t = MemTable::new(&schema, 4096, 1); + t.row_writer().put_i64(300).put_str("a").finish(); + t.row_writer().put_i64(100).put_str("b").finish(); + assert_eq!(t.chunk_ts_range(0), Some((100, 300))); + } + + #[test] + fn dedup_writer_maintains_ts_range() { + let schema = Schema::new().col("ts", DType::I64).col("tag", DType::Str); + let size = MemTable::required_size(&schema, 4096, 1); + let mut buf = vec![0u8; size]; + let mut w = MemTableWriter::init(&mut buf, &schema, 4096, 1).dedup(); + w.push_row(&[Value::I64(7), 
Value::Str("x")]); + w.push_row(&[Value::I64(3), Value::Str("x")]); + assert_eq!(w.chunk_ts_range(0), Some((3, 7))); + } + + #[test] + fn validate_rejects_bad_ts_col() { + let schema = Schema::new().col("ts", DType::I64).col("v", DType::F64); + let mut t = MemTable::new(&schema, 1024, 1); + header_mut(t.as_bytes_mut()).ts_col = 3; // out of range (2 cols) + assert!(MemTableView::new(t.as_bytes()).is_err()); + header_mut(t.as_bytes_mut()).ts_col = 2; // col 1 is F64, not I64 + assert!(MemTableView::new(t.as_bytes()).is_err()); + header_mut(t.as_bytes_mut()).ts_col = 1; // col 0 is I64 → ok + assert!(MemTableView::new(t.as_bytes()).is_ok()); + } + + // ── robust write lock ────────────────────────────────────────────── + + /// PID of a process that no longer exists: spawn a short-lived child + /// and wait for it to exit. + fn dead_pid() -> u32 { + let mut child = std::process::Command::new("true") + .spawn() + .expect("spawn true"); + let pid = child.id(); + child.wait().expect("wait true"); + pid + } + + #[test] + fn lock_word_holds_pid_while_held() { + let schema = Schema::new().col("x", DType::I32); + let mut t = MemTable::new(&schema, 1024, 1); + let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize; + let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) }; + { + let _w = t.row_writer(); // holds the lock + assert_eq!( + lock.load(Ordering::Relaxed), + std::process::id(), + "lock word must hold the owner PID" + ); + } + assert_eq!(lock.load(Ordering::Relaxed), 0); + } + + #[test] + fn stale_lock_from_dead_process_is_stolen() { + let schema = Schema::new().col("x", DType::I32); + let mut t = MemTable::new(&schema, 1024, 1); + + // Simulate a writer that crashed inside the critical section. 
+ header(t.as_bytes()) + .write_lock + .store(dead_pid(), Ordering::SeqCst); + + let start = std::time::Instant::now(); + t.push_row(&[Value::I32(42)]); // must not deadlock + let took = start.elapsed(); + + assert_eq!(t.rows(0).next().unwrap().col_i32(0), 42); + assert_eq!(header(t.as_bytes()).write_lock.load(Ordering::Relaxed), 0); + assert!( + took >= crate::layout::LOCK_STEAL_TIMEOUT, + "steal must wait out the timeout first (took {took:?})" + ); + } + + #[test] + fn live_holder_is_not_preempted() { + let schema = Schema::new().col("x", DType::I32); + let mut t = MemTable::new(&schema, 1024, 1); + + // Another thread of this (alive) process holds the lock and + // releases it well past the steal timeout. + let me = std::process::id(); + header(t.as_bytes()).write_lock.store(me, Ordering::SeqCst); + let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize; + let hold = crate::layout::LOCK_STEAL_TIMEOUT + std::time::Duration::from_millis(200); + let releaser = std::thread::spawn(move || { + std::thread::sleep(hold); + let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) }; + lock.store(0, Ordering::Release); + }); + + let start = std::time::Instant::now(); + t.push_row(&[Value::I32(7)]); + let took = start.elapsed(); + releaser.join().unwrap(); + + assert!( + took >= hold - std::time::Duration::from_millis(50), + "live holder must be waited on, not preempted (took {took:?})" + ); + assert_eq!(t.rows(0).next().unwrap().col_i32(0), 7); + } } diff --git a/probing/memtable/src/raw.rs b/probing/memtable/src/raw.rs index 0c27050b..9eb61272 100644 --- a/probing/memtable/src/raw.rs +++ b/probing/memtable/src/raw.rs @@ -1,12 +1,42 @@ use crate::layout::{ chunk_header, col_desc, col_desc_mut, compute_data_offset, header, header_mut, r32, w32, ChunkHeader, ChunkState, Header, BYTE_ORDER_MARK, CHUNK_HEADER_SIZE, FLAGS_KNOWN, FLAG_DEDUP, - MAGIC, VERSION, + MAGIC, TS_MAX_INIT, TS_MIN_INIT, VERSION, }; use crate::schema::{DType, Schema, Value}; use 
std::mem; use std::sync::atomic::Ordering; +/// Column names recognised as the designated timestamp column (must be +/// `I64`). Matched at [`init_buf`] time and recorded in `Header::ts_col`. +pub(crate) const TS_COL_NAMES: [&str; 2] = ["timestamp", "ts"]; + +/// Fold a committed row's timestamp into the chunk's `min_ts`/`max_ts`. +/// +/// Called by the (single, lock-holding) writer **before** the `used` +/// Release store that publishes the row, so any reader that observes the +/// row also observes a covering ts range. +pub(crate) fn note_row_ts(ch: &ChunkHeader, ts: i64) { + if ts < ch.min_ts.load(Ordering::Relaxed) { + ch.min_ts.store(ts, Ordering::Relaxed); + } + if ts > ch.max_ts.load(Ordering::Relaxed) { + ch.max_ts.store(ts, Ordering::Relaxed); + } +} + +/// Extract the designated timestamp from a row, per `Header::ts_col`. +#[inline] +pub(crate) fn row_ts(h: &Header, values: &[Value]) -> Option { + match h.ts_col as usize { + 0 => None, + idx => match values.get(idx - 1) { + Some(Value::I64(ts)) => Some(*ts), + _ => None, + }, + } +} + /// Returns the kernel-reported start time of a process. 
/// /// Used to populate [`Header::creator_start_time`] and to verify liveness @@ -74,6 +104,9 @@ pub(crate) fn write_row_bytes(buf: &mut [u8], values: &[Value], row_data: usize) } unsafe { let ch = &*(ptr.add(cs) as *const ChunkHeader); + if let Some(ts) = row_ts(&*(ptr as *const Header), values) { + note_row_ts(ch, ts); + } ch.used.store((used + total) as u32, Ordering::Release); ch.row_count.fetch_add(1, Ordering::Release); } @@ -104,6 +137,8 @@ pub(crate) fn advance_chunk_unlocked(buf: &mut [u8]) { let new_ch = &*(ptr.add(cs) as *const ChunkHeader); new_ch.used.store(0, Ordering::Relaxed); new_ch.row_count.store(0, Ordering::Relaxed); + new_ch.min_ts.store(TS_MIN_INIT, Ordering::Relaxed); + new_ch.max_ts.store(TS_MAX_INIT, Ordering::Relaxed); new_ch .state .store(ChunkState::Writing as u32, Ordering::Relaxed); @@ -222,6 +257,15 @@ pub fn validate_buf(buf: &[u8]) -> Result<(), &'static str> { return Err("invalid column dtype"); } } + let ts_col = h.ts_col as usize; + if ts_col != 0 { + if ts_col > nc { + return Err("ts_col out of range"); + } + if DType::from_u32(col_desc(buf, ts_col - 1).dtype) != Some(DType::I64) { + return Err("ts_col must reference an I64 column"); + } + } let payload_cap = csz - CHUNK_HEADER_SIZE; for i in 0..h.num_chunks as usize { let cs = expected_off + i * csz; @@ -294,12 +338,21 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu CHUNK_HEADER_SIZE + 8 ); + // First I64 column with a recognised timestamp name becomes the + // designated time column (index + 1; 0 = none). + let ts_col = schema + .cols + .iter() + .position(|c| c.dtype == DType::I64 && TS_COL_NAMES.contains(&c.name.as_str())) + .map(|i| (i + 1) as u16) + .unwrap_or(0); + let h = header_mut(buf); h.magic = MAGIC; h.version = VERSION; h.header_size = mem::size_of::
() as u16; h.byte_order = u16::from_ne_bytes(BYTE_ORDER_MARK); - h._pad0 = 0; + h.ts_col = ts_col; h.flags = 0; h.num_cols = nc as u32; h.num_chunks = num_chunks; @@ -310,7 +363,7 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu h.refcount.store(1, Ordering::Relaxed); h.creator_pid = std::process::id(); h.creator_start_time = process_start_time(std::process::id()); - h._reserved = [0; 2]; + h.lock_owner_start.store(0, Ordering::Relaxed); for (i, col) in schema.cols.iter().enumerate() { let cd = col_desc_mut(buf, i); @@ -326,6 +379,8 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu ch.generation.store(0, Ordering::Relaxed); ch.used.store(0, Ordering::Relaxed); ch.row_count.store(0, Ordering::Relaxed); + ch.min_ts.store(TS_MIN_INIT, Ordering::Relaxed); + ch.max_ts.store(TS_MAX_INIT, Ordering::Relaxed); ch.state.store(ChunkState::Empty as u32, Ordering::Relaxed); } // Chunk 0 is the initial write target diff --git a/probing/memtable/src/writer.rs b/probing/memtable/src/writer.rs index f514f6ae..1a4cb428 100644 --- a/probing/memtable/src/writer.rs +++ b/probing/memtable/src/writer.rs @@ -1,5 +1,6 @@ use crate::dedup::DedupState; use crate::layout::{chunk_header, release_write_lock, w32, CHUNK_HEADER_SIZE}; +use crate::raw::note_row_ts; use std::sync::atomic::Ordering; /// Streaming row writer — **low-overhead, weak-contract** hot-path API. @@ -25,6 +26,11 @@ pub struct RowWriter<'a> { pub(crate) done: bool, pub(crate) col_idx: usize, pub(crate) locked: bool, + /// `Header::ts_col` (timestamp column index + 1; 0 = none). + pub(crate) ts_col: u16, + /// Timestamp captured by `put_i64` on the designated column, + /// folded into the chunk's min/max on a successful `finish()`. 
+ pub(crate) pending_ts: Option, } impl<'a> RowWriter<'a> { @@ -84,6 +90,9 @@ impl<'a> RowWriter<'a> { self } pub fn put_i64(&mut self, v: i64) -> &mut Self { + if self.ts_col as usize == self.col_idx + 1 { + self.pending_ts = Some(v); + } self.write_raw(&v.to_le_bytes()); self.col_idx += 1; self @@ -135,6 +144,9 @@ impl<'a> RowWriter<'a> { let row_data = self.pos - self.row_start - 4; w32(self.buf, self.row_start, row_data as u32); let new_used = (self.pos - self.chunk_start - CHUNK_HEADER_SIZE) as u32; + if let Some(ts) = self.pending_ts { + note_row_ts(chunk_header(self.buf, self.chunk_start), ts); + } chunk_header(self.buf, self.chunk_start) .used .store(new_used, Ordering::Release); @@ -197,8 +209,8 @@ mod tests { #[test] fn row_writer_overflow() { let schema = Schema::new().col("x", DType::I64); - // ChunkHeader=24, each I64 row=12 → 40-24=16 → 1 row fits, 2nd overflows - let mut t = MemTable::new(&schema, 40, 1); + // ChunkHeader=40, each I64 row=12 → 56-40=16 → 1 row fits, 2nd overflows + let mut t = MemTable::new(&schema, 56, 1); assert!(t.row_writer().put_i64(1).finish()); assert!(!t.row_writer().put_i64(2).finish()); assert_eq!(t.num_rows(0), 1); diff --git a/probing/server/Cargo.toml b/probing/server/Cargo.toml index e2f5fe85..314e3835 100644 --- a/probing/server/Cargo.toml +++ b/probing/server/Cargo.toml @@ -17,7 +17,7 @@ probing-python = { path = "../extensions/python", default-features = false } probing-proto = { path = "../proto" } probing-core = { path = "../core" } -datafusion = { version = "47.0.0", default-features = false } +datafusion = { workspace = true } anyhow = { workspace = true } log = { workspace = true } diff --git a/probing/server/src/engine.rs b/probing/server/src/engine.rs index 6f15752e..4c3749eb 100644 --- a/probing/server/src/engine.rs +++ b/probing/server/src/engine.rs @@ -30,7 +30,10 @@ pub async fn initialize_engine() -> Result<()> { #[cfg(target_os = "linux")] let builder = 
builder.with_extension(cc::TaskStatsExtension::default(), "rdma", Some("flow")); - probing_core::initialize_engine(builder).await + let result = probing_core::initialize_engine(builder).await; + // Opt-in background hot→cold compaction (PROBING_COLD=on / SET memtable.cold_compaction). + crate::memtable_ext::start_cold_compaction_from_env(); + result } pub async fn handle_query(request: Query) -> Result { diff --git a/probing/server/src/memtable_ext.rs b/probing/server/src/memtable_ext.rs index 8c49efdf..fa27daa6 100644 --- a/probing/server/src/memtable_ext.rs +++ b/probing/server/src/memtable_ext.rs @@ -1,698 +1,8 @@ -//! Mmap memtable integration for DataFusion. +//! Mmap memtable ↔ SQL integration. //! -//! ## File → SQL mapping (no hard-coded product prefix) -//! -//! Each regular file under `<dir>/<pid>/` can be queried when its name is valid: -//! -//! - **First `.` splits schema vs table** — `acme.actors` → schema `acme`, table `actors`; -//! `foo.bar.baz` → schema `foo`, table `bar.baz` (on-disk name is the full filename). -//! - **No `.`** — exposed as `memtable.<name>` (e.g. `metrics` → `memtable.metrics`). -//! -//! Schema head and table tail must be non-empty; only ASCII letters, digits, `_`, and -//! `.` inside the table tail are allowed (no `/`, `\\`). Leading-dot names are ignored. 
-use std::any::Any; -use std::collections::BTreeSet; -use std::sync::Arc; - -use async_trait::async_trait; -use datafusion::arrow::array::{ - ArrayRef, BinaryBuilder, Float32Builder, Float64Builder, GenericStringBuilder, Int32Builder, - Int64Builder, RecordBatch, UInt32Builder, UInt64Builder, UInt8Builder, -}; -use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use datafusion::catalog::CatalogProvider; -use datafusion::catalog::SchemaProvider; -use datafusion::datasource::TableProvider; -use datafusion::error::DataFusionError; -use datafusion::error::Result as DfResult; - -use probing_core::core::{ - EngineCall, EngineDatasource, EngineError, EngineExtension, EngineExtensionOption, - LazyTableSource, Plugin, PluginType, -}; -use probing_memtable::discover::default_dir; -use probing_memtable::{detect_table, DType, MemTableView, MemhView, TableKind, TypedValue}; - -/// SQL schema used for mmap files whose basename contains no `.`. -pub const DEFAULT_UNDOTTED_SCHEMA: &str = "memtable"; - -fn self_dir() -> std::path::PathBuf { - default_dir().join(std::process::id().to_string()) -} - -#[inline] -fn valid_schema_head(s: &str) -> bool { - !s.is_empty() && s.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'_') -} - -#[inline] -fn valid_table_tail(s: &str) -> bool { - !s.is_empty() - && !s.contains('/') - && !s.contains('\\') - && s.bytes() - .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.') -} - -/// Map basename `filename` → `(schema, table)` for routing; [`None`] if skipped. 
-pub fn classify_mmap_basename(filename: &str) -> Option<(String, String)> { - if filename.starts_with('.') { - return None; - } - if let Some((head, tail)) = filename.split_once('.') { - if valid_schema_head(head) && valid_table_tail(tail) { - return Some((head.to_string(), tail.to_string())); - } - return None; - } - if valid_schema_head(filename) { - Some((DEFAULT_UNDOTTED_SCHEMA.to_string(), filename.to_string())) - } else { - None - } -} - -/// On-disk filename for a `(schema, table)` pair. -pub fn mmap_filename_for(schema: &str, table: &str) -> String { - if schema == DEFAULT_UNDOTTED_SCHEMA { - table.to_string() - } else { - format!("{schema}.{table}") - } -} - -fn tables_in_schema(target_schema: &str) -> Vec { - let dir = self_dir(); - let Ok(entries) = std::fs::read_dir(&dir) else { - return vec![]; - }; - let mut out = Vec::new(); - for e in entries.flatten() { - if !e.path().is_file() { - continue; - } - let n = e.file_name().to_string_lossy().to_string(); - if let Some((sch, tbl)) = classify_mmap_basename(&n) { - if sch == target_schema { - out.push(tbl); - } - } - } - out.sort(); - out.dedup(); - out -} - -fn discover_all_schemas() -> BTreeSet { - let mut out = BTreeSet::new(); - let dir = self_dir(); - if let Ok(entries) = std::fs::read_dir(&dir) { - for e in entries.flatten() { - if !e.path().is_file() { - continue; - } - let n = e.file_name().to_string_lossy().to_string(); - if let Some((sch, _)) = classify_mmap_basename(&n) { - out.insert(sch); - } - } - } - out.insert(DEFAULT_UNDOTTED_SCHEMA.to_string()); - out -} - -fn bytes_to_lazy_table(data: &[u8], logical_name: &str) -> Arc { - match detect_table(data) { - Some(TableKind::Ring) => { - let view = match MemTableView::new(data) { - Ok(v) => v, - Err(_) => return Arc::new(LazyTableSource::default()), - }; - Arc::new(LazyTableSource { - name: logical_name.to_string(), - schema: Some(view_to_arrow_schema(&view)), - data: view_to_recordbatch(&view), - }) - } - Some(TableKind::Hash) => { - let view = 
match MemhView::new(data) { - Ok(v) => v, - Err(_) => return Arc::new(LazyTableSource::default()), - }; - Arc::new(LazyTableSource { - name: logical_name.to_string(), - schema: Some(memh_kv_schema()), - data: memh_view_to_recordbatch(&view), - }) - } - None => Arc::new(LazyTableSource::default()), - } -} - -fn dtype_to_arrow(dt: DType) -> DataType { - match dt { - DType::U8 => DataType::UInt8, - DType::U32 => DataType::UInt32, - DType::I32 => DataType::Int32, - DType::I64 => DataType::Int64, - DType::F32 => DataType::Float32, - DType::F64 => DataType::Float64, - DType::U64 => DataType::UInt64, - DType::Str => DataType::Utf8, - DType::Bytes => DataType::Binary, - } -} - -fn view_to_arrow_schema(view: &MemTableView) -> SchemaRef { - let s = view.schema(); - let fields: Vec = s - .cols - .iter() - .map(|c| Field::new(&c.name, dtype_to_arrow(c.dtype), true)) - .collect(); - SchemaRef::new(Schema::new(fields)) -} - -enum ColBuilder { - U8(UInt8Builder), - U32(UInt32Builder), - I32(Int32Builder), - I64(Int64Builder), - F32(Float32Builder), - F64(Float64Builder), - U64(UInt64Builder), - Str(GenericStringBuilder), - Bytes(BinaryBuilder), -} - -fn view_to_recordbatch(view: &MemTableView) -> Vec { - let schema = view.schema(); - let arrow_schema = view_to_arrow_schema(view); - - let mut builders: Vec = schema - .cols - .iter() - .map(|c| match c.dtype { - DType::U8 => ColBuilder::U8(UInt8Builder::new()), - DType::U32 => ColBuilder::U32(UInt32Builder::new()), - DType::I32 => ColBuilder::I32(Int32Builder::new()), - DType::I64 => ColBuilder::I64(Int64Builder::new()), - DType::F32 => ColBuilder::F32(Float32Builder::new()), - DType::F64 => ColBuilder::F64(Float64Builder::new()), - DType::U64 => ColBuilder::U64(UInt64Builder::new()), - DType::Str => ColBuilder::Str(GenericStringBuilder::new()), - DType::Bytes => ColBuilder::Bytes(BinaryBuilder::new()), - }) - .collect(); - - for chunk in 0..view.num_chunks() { - for row in view.rows(chunk) { - let mut cursor = row.cursor(); - for 
builder in builders.iter_mut() { - match builder { - ColBuilder::U8(b) => b.append_value(cursor.next_u8()), - ColBuilder::U32(b) => b.append_value(cursor.next_u32()), - ColBuilder::I32(b) => b.append_value(cursor.next_i32()), - ColBuilder::I64(b) => b.append_value(cursor.next_i64()), - ColBuilder::F32(b) => b.append_value(cursor.next_f32()), - ColBuilder::F64(b) => b.append_value(cursor.next_f64()), - ColBuilder::U64(b) => b.append_value(cursor.next_u64()), - ColBuilder::Str(b) => b.append_value(cursor.next_str()), - ColBuilder::Bytes(b) => b.append_value(cursor.next_bytes()), - } - } - } - } - - let arrays: Vec = builders - .into_iter() - .map(|b| -> ArrayRef { - match b { - ColBuilder::U8(mut b) => Arc::new(b.finish()), - ColBuilder::U32(mut b) => Arc::new(b.finish()), - ColBuilder::I32(mut b) => Arc::new(b.finish()), - ColBuilder::I64(mut b) => Arc::new(b.finish()), - ColBuilder::F32(mut b) => Arc::new(b.finish()), - ColBuilder::F64(mut b) => Arc::new(b.finish()), - ColBuilder::U64(mut b) => Arc::new(b.finish()), - ColBuilder::Str(mut b) => Arc::new(b.finish()), - ColBuilder::Bytes(mut b) => Arc::new(b.finish()), - } - }) - .collect(); - - match RecordBatch::try_new(arrow_schema, arrays) { - Ok(batch) => vec![batch], - Err(e) => { - log::error!("memtable → RecordBatch failed: {e}"); - vec![] - } - } -} - -// ── MEMH: key-value table → two-column RecordBatch ──────────────────── - -/// Fixed Arrow schema for MEMH tables: `key` (Utf8) + `value` (Utf8). -/// -/// All MEMH values are serialised to strings so that heterogeneous value types -/// (scalars, strings, bytes) can be represented in a single column and queried -/// with SQL string predicates. 
-fn memh_kv_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Utf8, true), - ])) -} - -fn typed_value_to_str(v: &TypedValue<'_>) -> String { - match v { - TypedValue::U8(n) => n.to_string(), - TypedValue::I32(n) => n.to_string(), - TypedValue::I64(n) => n.to_string(), - TypedValue::F32(n) => n.to_string(), - TypedValue::F64(n) => n.to_string(), - TypedValue::U64(n) => n.to_string(), - TypedValue::U32(n) => n.to_string(), - TypedValue::Str(s) => s.to_string(), - TypedValue::Bytes(b) => { - // Hex-encode without adding a dep; e.g. "0xdeadbeef" - let mut out = String::with_capacity(2 + b.len() * 2); - out.push_str("0x"); - for byte in *b { - use std::fmt::Write; - let _ = write!(out, "{byte:02x}"); - } - out - } - } -} - -fn memh_view_to_recordbatch(view: &MemhView<'_>) -> Vec { - let schema = memh_kv_schema(); - let mut keys: GenericStringBuilder = GenericStringBuilder::new(); - let mut values: GenericStringBuilder = GenericStringBuilder::new(); - - for (k, v) in view.iter() { - keys.append_value(k); - values.append_value(typed_value_to_str(&v)); - } - - match RecordBatch::try_new( - schema, - vec![Arc::new(keys.finish()), Arc::new(values.finish())], - ) { - Ok(batch) => vec![batch], - Err(e) => { - log::error!("memh → RecordBatch failed: {e}"); - vec![] - } - } -} - -// ── Dynamic schemas from mmap filenames ─────────────────────────────── - -/// One DataFusion schema: tables are mmap files whose basename maps here via -/// [`classify_mmap_basename`]. 
-#[derive(Debug)] -pub struct MmapFileSchemaProvider { - schema: String, -} - -impl MmapFileSchemaProvider { - pub fn new(schema: impl Into) -> Self { - Self { - schema: schema.into(), - } - } -} - -#[async_trait] -impl SchemaProvider for MmapFileSchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn table_names(&self) -> Vec { - tables_in_schema(&self.schema) - } - - async fn table(&self, name: &str) -> DfResult>> { - let names = self.table_names(); - if !names.iter().any(|n| n == name) { - return Ok(None); - } - let path = self_dir().join(mmap_filename_for(&self.schema, name)); - let data = match std::fs::read(&path) { - Ok(d) => d, - Err(_) => return Ok(None), - }; - Ok(Some(bytes_to_lazy_table(&data, name))) - } - - fn register_table( - &self, - _name: String, - _table: Arc, - ) -> DfResult>> { - Err(DataFusionError::NotImplemented( - "unable to create tables".to_string(), - )) - } - - fn deregister_table(&self, _name: &str) -> DfResult>> { - Err(DataFusionError::NotImplemented( - "unable to drop tables".to_string(), - )) - } - - fn table_exist(&self, name: &str) -> bool { - self.table_names().iter().any(|n| n == name) - } -} - -/// Wraps `probe` catalog; delegates static schemas (python, cluster, …) -/// to inner, discovers mmap-backed schemas (e.g. `pulsing.*`) at query time. 
-#[derive(Debug)] -struct DynamicMmapCatalog { - inner: Arc, -} - -impl CatalogProvider for DynamicMmapCatalog { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema_names(&self) -> Vec { - let mut names: BTreeSet = self.inner.schema_names().into_iter().collect(); - for sch in discover_all_schemas() { - names.insert(sch); - } - names.into_iter().collect() - } - - fn schema(&self, name: &str) -> Option> { - if !tables_in_schema(name).is_empty() || name == DEFAULT_UNDOTTED_SCHEMA { - return Some(Arc::new(MmapFileSchemaProvider::new(name))); - } - self.inner.schema(name) - } - - fn register_schema( - &self, - name: &str, - schema: Arc, - ) -> DfResult>> { - self.inner.register_schema(name, schema) - } -} - -/// Namespace plugin that wraps the `probe` catalog with [`DynamicMmapCatalog`] -/// for dynamic schema discovery from mmap files at query time. -#[derive(Debug, Default)] -pub struct UnifiedMemtablePlugin; - -impl Plugin for UnifiedMemtablePlugin { - fn name(&self) -> String { - "mmap_memtables".into() - } - fn kind(&self) -> PluginType { - PluginType::Namespace - } - fn namespace(&self) -> String { - "memtable".into() - } - - fn provide_catalog(&self, inner: Arc) -> Option> { - Some(Arc::new(DynamicMmapCatalog { inner })) - } -} - -// ── EngineExtension ──────────────────────────────────────────────────── - -#[derive(Debug, Default, EngineExtension)] -pub struct MemTableExtension {} - -impl EngineCall for MemTableExtension {} - -impl EngineDatasource for MemTableExtension { - fn datasrc( - &self, - _namespace: &str, - _name: Option<&str>, - ) -> Option> { - Some(Arc::new(UnifiedMemtablePlugin::default())) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use datafusion::arrow::array::{AsArray, Float64Array, Int32Array, Int64Array, UInt8Array}; - use probing_memtable::{MemTable, Schema as MtSchema, Value}; - use std::sync::Mutex; - - /// `PROBING_DATA_DIR` is process-global; serialize tests that mutate it. 
- static PROBING_DATA_DIR_LOCK: Mutex<()> = Mutex::new(()); - - #[test] - fn dtype_mapping_covers_all_variants() { - assert_eq!(dtype_to_arrow(DType::U8), DataType::UInt8); - assert_eq!(dtype_to_arrow(DType::U32), DataType::UInt32); - assert_eq!(dtype_to_arrow(DType::I32), DataType::Int32); - assert_eq!(dtype_to_arrow(DType::I64), DataType::Int64); - assert_eq!(dtype_to_arrow(DType::F32), DataType::Float32); - assert_eq!(dtype_to_arrow(DType::F64), DataType::Float64); - assert_eq!(dtype_to_arrow(DType::U64), DataType::UInt64); - assert_eq!(dtype_to_arrow(DType::Str), DataType::Utf8); - assert_eq!(dtype_to_arrow(DType::Bytes), DataType::Binary); - } - - #[test] - fn recordbatch_from_mixed_types() { - let schema = MtSchema::new() - .col("id", DType::I32) - .col("value", DType::F64) - .col("tag", DType::Str); - let mut t = MemTable::new(&schema, 4096, 2); - t.push_row(&[Value::I32(1), Value::F64(3.14), Value::Str("hello")]); - t.push_row(&[Value::I32(2), Value::F64(2.72), Value::Str("world")]); - - let view = t.view(); - let batches = view_to_recordbatch(&view); - assert_eq!(batches.len(), 1); - let batch = &batches[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 3); - - let ids = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(ids.value(0), 1); - assert_eq!(ids.value(1), 2); - - let vals = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!((vals.value(0) - 3.14).abs() < 1e-10); - assert!((vals.value(1) - 2.72).abs() < 1e-10); - - let tags: &datafusion::arrow::array::StringArray = batch.column(2).as_string(); - assert_eq!(tags.value(0), "hello"); - assert_eq!(tags.value(1), "world"); - } - - #[test] - fn recordbatch_multiple_chunks() { - let schema = MtSchema::new().col("v", DType::I64); - // Small chunk so rows spill across chunks - let mut t = MemTable::new(&schema, 128, 4); - for i in 0..20 { - t.push_row(&[Value::I64(i)]); - } - - let view = t.view(); - let batches = 
view_to_recordbatch(&view); - assert_eq!(batches.len(), 1); - let batch = &batches[0]; - // Ring buffer may have overwritten old chunks, but total rows should be > 0 - assert!(batch.num_rows() > 0); - - let col = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - // Verify values are sequential (from whatever chunks survived) - for i in 1..col.len() { - assert!(col.value(i) > col.value(i - 1)); - } - } - - #[test] - fn recordbatch_empty_table() { - let schema = MtSchema::new().col("x", DType::U8); - let t = MemTable::new(&schema, 1024, 1); - let view = t.view(); - let batches = view_to_recordbatch(&view); - assert_eq!(batches.len(), 1); - assert_eq!(batches[0].num_rows(), 0); - } - - #[test] - fn arrow_schema_matches_memtable_schema() { - let schema = MtSchema::new() - .col("ts", DType::I64) - .col("cpu", DType::F64) - .col("name", DType::Str); - let t = MemTable::new(&schema, 1024, 1); - let view = t.view(); - let arrow = view_to_arrow_schema(&view); - - assert_eq!(arrow.fields().len(), 3); - assert_eq!(arrow.field(0).name(), "ts"); - assert_eq!(*arrow.field(0).data_type(), DataType::Int64); - assert_eq!(arrow.field(1).name(), "cpu"); - assert_eq!(*arrow.field(1).data_type(), DataType::Float64); - assert_eq!(arrow.field(2).name(), "name"); - assert_eq!(*arrow.field(2).data_type(), DataType::Utf8); - } - - #[test] - fn recordbatch_u8_column() { - let schema = MtSchema::new().col("flag", DType::U8); - let mut t = MemTable::new(&schema, 1024, 1); - t.push_row(&[Value::U8(0)]); - t.push_row(&[Value::U8(255)]); - - let view = t.view(); - let batches = view_to_recordbatch(&view); - let col = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(col.value(0), 0); - assert_eq!(col.value(1), 255); - } - - fn read_lazy_from_mmap(schema: &str, table: &str) -> Arc { - let path = self_dir().join(mmap_filename_for(schema, table)); - let data = std::fs::read(path).unwrap(); - bytes_to_lazy_table(&data, table) - } - - #[test] - fn 
classify_and_mmap_roundtrip() { - assert_eq!( - classify_mmap_basename("pulsing.actors"), - Some(("pulsing".into(), "actors".into())) - ); - assert_eq!( - classify_mmap_basename("foo.bar.baz"), - Some(("foo".into(), "bar.baz".into())) - ); - assert_eq!( - classify_mmap_basename("metrics"), - Some((DEFAULT_UNDOTTED_SCHEMA.into(), "metrics".into())) - ); - assert_eq!( - mmap_filename_for(DEFAULT_UNDOTTED_SCHEMA, "metrics"), - "metrics" - ); - assert_eq!(mmap_filename_for("pulsing", "actors"), "pulsing.actors"); - assert_eq!(mmap_filename_for("foo", "bar.baz"), "foo.bar.baz"); - } - - #[test] - fn namespace_list_and_make_lazy_via_exposed_table() { - let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); - use probing_memtable::discover::ExposedTable; - - let tmp = tempfile::tempdir().unwrap(); - // Override discovery dir via env var - let orig = std::env::var("PROBING_DATA_DIR").ok(); - std::env::set_var("PROBING_DATA_DIR", tmp.path()); - - let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str); - let mut table = ExposedTable::create("test_metrics", &schema, 4096, 2).unwrap(); - { - let mut w = table.writer(); - w.push_row(&[Value::I64(100), Value::Str("alpha")]); - w.push_row(&[Value::I64(200), Value::Str("beta")]); - } - - let names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA); - assert!( - names.contains(&"test_metrics".to_string()), - "got: {names:?}" - ); - - let lazy = read_lazy_from_mmap(DEFAULT_UNDOTTED_SCHEMA, "test_metrics"); - assert_eq!(lazy.data.len(), 1); - let batch = &lazy.data[0]; - assert_eq!(batch.num_rows(), 2); - - let ts = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(ts.value(0), 100); - assert_eq!(ts.value(1), 200); - - let msgs: &datafusion::arrow::array::StringArray = batch.column(1).as_string(); - assert_eq!(msgs.value(0), "alpha"); - assert_eq!(msgs.value(1), "beta"); - - // Cleanup - drop(table); - match orig { - Some(v) => std::env::set_var("PROBING_DATA_DIR", v), - None => 
std::env::remove_var("PROBING_DATA_DIR"), - } - } - - #[test] - fn dotted_schema_isolated_from_memtable_list() { - let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap(); - use probing_memtable::discover::ExposedTable; - - let tmp = tempfile::tempdir().unwrap(); - let orig = std::env::var("PROBING_DATA_DIR").ok(); - std::env::set_var("PROBING_DATA_DIR", tmp.path()); - - let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str); - let dotted = mmap_filename_for("acme", "metrics_demo"); - let mut ring = ExposedTable::create(&dotted, &schema, 4096, 2).unwrap(); - { - let mut w = ring.writer(); - w.push_row(&[Value::I64(1), Value::Str("x")]); - } - - let mem_names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA); - assert!( - !mem_names.contains(&"metrics_demo".to_string()), - "dotted file must not appear as memtable table: {mem_names:?}" - ); - - let acme_names = tables_in_schema("acme"); - assert!( - acme_names.contains(&"metrics_demo".to_string()), - "got: {acme_names:?}" - ); - - let lazy = read_lazy_from_mmap("acme", "metrics_demo"); - assert_eq!(lazy.data.len(), 1); - assert_eq!(lazy.data[0].num_rows(), 1); +//! The implementation moved to `probing_core::core::memtable_sql` so that both +//! the server and language extensions can expose mmap memtables to SQL through +//! the same code path (logical chunk ordering, generation re-validation, and +//! zero-copy mmap reads). This module re-exports it for backward compatibility. - drop(ring); - match orig { - Some(v) => std::env::set_var("PROBING_DATA_DIR", v), - None => std::env::remove_var("PROBING_DATA_DIR"), - } - } -} +pub use probing_core::core::memtable_sql::*; From 1dd31ac8e4f40ce083d7fc25fc811f332c014fb8 Mon Sep 17 00:00:00 2001 From: Reiase Date: Sat, 13 Jun 2026 23:51:21 +0800 Subject: [PATCH 2/3] Update .gitignore, upgrade dioxus dependencies, and enhance documentation - Added `.claude/` to `.gitignore` to exclude specific files from version control. 
- Updated `dioxus` and related dependencies to version `0.7.9` in `Cargo.toml` for improved features and stability. - Revised documentation in `data-layer.md` and its Chinese counterpart to reflect changes in the header version and the single-writer model, enhancing clarity and accuracy. --- .github/actions/setup-build-env/action.yml | 2 +- .gitignore | 1 + docs/src/design/data-layer.md | 66 ++-- docs/src/design/data-layer.zh.md | 58 ++- probing/cli/src/cli/bench/args.rs | 19 +- probing/cli/src/cli/bench/runners/mixed.rs | 13 +- probing/cli/src/cli/bench/runners/mp.rs | 14 +- probing/cli/src/cli/bench/runners/write.rs | 19 +- probing/memtable/benches/memtable_report.rs | 48 --- probing/memtable/src/discover.rs | 3 +- probing/memtable/src/layout.rs | 232 ++--------- probing/memtable/src/lib.rs | 20 +- probing/memtable/src/memtable.rs | 402 +++----------------- probing/memtable/src/raw.rs | 13 +- probing/memtable/src/writer.rs | 45 +-- web/Cargo.toml | 9 +- 16 files changed, 238 insertions(+), 726 deletions(-) diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml index 6f26a114..4cadba77 100644 --- a/.github/actions/setup-build-env/action.yml +++ b/.github/actions/setup-build-env/action.yml @@ -66,7 +66,7 @@ runs: test -e ~/.cargo/bin/rnr || cargo install rnr test -e ~/.cargo/bin/cargo-nextest || cargo install --locked cargo-nextest test -e ~/.cargo/bin/cargo-binstall || cargo install cargo-binstall - test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.6 -y + test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.9 -y test -e ~/.cargo/bin/trunk || cargo install trunk --locked - name: Install Python Build Dependencies diff --git a/.gitignore b/.gitignore index b7860735..a1dd4b73 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ pkg/ venv/ python/probing/probing docs/site/ +.claude/ diff --git a/docs/src/design/data-layer.md b/docs/src/design/data-layer.md index ddcf4156..2dd565dc 100644 --- 
a/docs/src/design/data-layer.md +++ b/docs/src/design/data-layer.md @@ -53,12 +53,12 @@ The hot tier is mapped read-only at query time; the cold tier is read via `Segme Every MEMT buffer (heap, shared memory, or mmap'd file) begins with a 64-byte header (one cache line), followed by per-column descriptors, then chunk data. -**Header v3 (64 bytes):** +**Header v4 (64 bytes):** | offset | size | field | notes | |---|---|---|---| | 0 | 4 | `magic` | `0x4D454D54` (`"MEMT"`) | -| 4 | 2 | `version` | 3 | +| 4 | 2 | `version` | 4 | | 6 | 2 | `header_size` | 64 (validation) | | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` | | 10 | 2 | `ts_col` | timestamp column index + 1 (0 = none) | @@ -68,16 +68,19 @@ line), followed by per-column descriptors, then chunk data. | 24 | 4 | `chunk_size` | bytes per chunk | | 28 | 4 | `data_offset` | 64-aligned | | 32 | 4 | `write_chunk` | `AtomicU32` — current ring slot | -| 36 | 4 | `write_lock` | `AtomicU32` — 0 = free, else holder PID | -| 40 | 4 | `refcount` | `AtomicU32` | -| 44 | 4 | `creator_pid` | | -| 48 | 8 | `creator_start_time` | for PID-recycling detection | -| 56 | 8 | `lock_owner_start` | `AtomicU64` — lock holder's start time | +| 36 | 4 | `refcount` | `AtomicU32` | +| 40 | 4 | `creator_pid` | | +| 44 | 4 | `_pad0` | alignment (was `write_lock` in v3) | +| 48 | 8 | `creator_start_time` | for PID-recycling detection during discovery | +| 56 | 8 | `_reserved` | reserved (was `lock_owner_start` in v3) | Bytes 0–31 are the **cold zone** (immutable after init); bytes 32–63 are the **hot zone** (atomically mutated), split to avoid false sharing. Each chunk starts with a 40-byte `ChunkHeader` carrying a `generation` counter and per-chunk `min_ts`/`max_ts` (`AtomicI64`). +> **v4** dropped the `write_lock` and `lock_owner_start` fields: MEMT is single-writer, so there is +> no in-buffer write lock. Their byte slots are now reserved. 
+ ### Backends The same API backs three storage kinds: @@ -95,25 +98,42 @@ slot (wrapping), sealing the previous chunk. Each slot carries a monotonically i **logical (oldest → newest) order** and re-check the generation after reading — a chunk recycled mid-read is discarded rather than surfacing torn rows. -### Robust Write Lock +### Single-Writer Model (no lock) + +MEMT is **single-writer**: exactly one writer owns each buffer (the creator process; any in-process +write is serialized by the caller). There is **no in-buffer write lock** — the writer appends rows +without any CAS or fence on a lock word. Readers are lock-free and never coordinated with the writer +except through the per-chunk `used` / `row_count` `Release` stores and `generation` re-validation. + +Why this is safe and sufficient: + +- Production uses one writer per table — the Python `ExternalTable` path writes one file per process + (named `//…`); a process restart means a new PID and a fresh file. +- Readers never wrote to the lock anyway; their correctness rides the `Release`/`Acquire` ordering on + `used`/`row_count` plus the `generation` check on each chunk. +- Removing the lock also removes the fork-safety hazard the PID-stealing spinlock had to guard + against (a forked child inheriting a cached start time and being mistaken for a recycled PID). + +> The **cold tier (MEMC)** has a separate concurrency story — multiple compactor writers are +> distinguished by `writer_id` and segment isolation — and is unaffected by the MEMT single-writer +> model. -`write_lock` holds **0 (free) or the holder's PID**. A waiter spins; if it spins past -`LOCK_STEAL_TIMEOUT` (500 ms) it enters a steal decision: +### Single-Writer Fast Path -- if the holder process no longer exists (`kill(pid, 0)`), the lock is stolen; -- if the holder exists but its kernel start time differs from `lock_owner_start`, the PID was - recycled by an unrelated process — stolen after a short re-check grace. 
+Since data is generated **one row at a time**, the single-row commit path is tuned to be as cheap as +possible: -Stealing is data-safe: rows only become visible via the `Release` store of `row_count` at the end -of a write, so a half-written row from a dead holder stays uncommitted and is simply overwritten. +- **Zero per-row allocation.** The `RowWriter` streaming API encodes fields directly into the ring + chunk; no `Vec<Value>` is built per row. (The `push_row(&[Value])` convenience API still works but + asks the caller to materialize a value slice.) +- **No lock, no per-row `catch_unwind`.** With a single writer there is nothing to lock and nothing + to release on panic, so neither a per-row CAS + `Release` fence nor a `catch_unwind`/`Drop` guard + is needed. -!!! note "Fork safety" - The holder's start time is read via a per-PID cache, **not** a one-shot cache. A child that - inherited a parent's cached value would record the parent's start time and be mistaken for a - recycled PID by a waiter — exactly the hazard fork-heavy workloads (PyTorch DataLoader) - trigger. Re-reading whenever the live PID changes makes every post-fork caller observe its own - start time. (Start times come from `/proc` on Linux; on platforms without it the steal-on-recycle - path is inert.) +Reader correctness is independent of the write path: row visibility always rides the `used` / +`row_count` `Release` stores in `finish()`. Measured single-thread `metrics` throughput (M4, +release): plain `push_row` + spinlock ≈ 18.8M rows/s → streaming, lock-free ≈ 29.9M rows/s +(**+59%** end to end). ### Timestamp Metadata @@ -273,7 +293,7 @@ re-validates). - No torn rows on reads (generation re-validation); cold torn-tail recovery. - Exactly-once across tiers (query dedup) and across restarts (`prime_from_cold`). - Bounded hot memory; bounded cold bytes/TTL. -- Fork-safe locking. +- Single-writer, lock-free hot path (MEMT); readers lock-free via generation re-validation.
**Known trade-offs (P2 backlog):** diff --git a/docs/src/design/data-layer.zh.md b/docs/src/design/data-layer.zh.md index 718317fc..9303b605 100644 --- a/docs/src/design/data-layer.zh.md +++ b/docs/src/design/data-layer.zh.md @@ -47,12 +47,12 @@ graph LR 每个 MEMT 缓冲区(堆、共享内存或 mmap 文件)都以 64 字节头部(一个 cache line)开始,随后是 逐列描述符,再是 chunk 数据。 -**Header v3(64 字节):** +**Header v4(64 字节):** | 偏移 | 大小 | 字段 | 说明 | |---|---|---|---| | 0 | 4 | `magic` | `0x4D454D54`(`"MEMT"`) | -| 4 | 2 | `version` | 3 | +| 4 | 2 | `version` | 4 | | 6 | 2 | `header_size` | 64(仅校验) | | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` | | 10 | 2 | `ts_col` | 时间戳列索引 + 1(0 = 无) | @@ -62,16 +62,19 @@ graph LR | 24 | 4 | `chunk_size` | 每个 chunk 字节数 | | 28 | 4 | `data_offset` | 64 对齐 | | 32 | 4 | `write_chunk` | `AtomicU32`——当前环形槽位 | -| 36 | 4 | `write_lock` | `AtomicU32`——0 = 空闲,否则为持有者 PID | -| 40 | 4 | `refcount` | `AtomicU32` | -| 44 | 4 | `creator_pid` | | -| 48 | 8 | `creator_start_time` | 用于 PID 回收检测 | -| 56 | 8 | `lock_owner_start` | `AtomicU64`——锁持有者的进程启动时间 | +| 36 | 4 | `refcount` | `AtomicU32` | +| 40 | 4 | `creator_pid` | | +| 44 | 4 | `_pad0` | 对齐填充(v3 中为 `write_lock`) | +| 48 | 8 | `creator_start_time` | 用于发现期的 PID 回收检测 | +| 56 | 8 | `_reserved` | 预留(v3 中为 `lock_owner_start`) | 字节 0–31 是**冷区**(初始化后不可变),字节 32–63 是**热区**(运行时原子修改),二者分离以避免 伪共享。每个 chunk 以 40 字节的 `ChunkHeader` 开头,携带 `generation` 计数器及逐 chunk 的 `min_ts`/`max_ts`(`AtomicI64`)。 +> **v4** 移除了 `write_lock` 与 `lock_owner_start` 字段:MEMT 是单写者,缓冲区内不再有写锁。其字节 +> 槽位现已预留。 + ### 三种后端 同一套 API 支撑三种存储形态: @@ -86,23 +89,36 @@ graph LR 每个槽位携带单调递增的 `generation`(每次环形绕回到该槽位即自增)。读取者按**逻辑顺序(旧 → 新)** 物化 chunk,并在读取后复核 generation——若某 chunk 在读取过程中被回收,则丢弃而非暴露半行数据。 -### Robust 写锁 +### 单写者模型(无锁) + +MEMT 是**单写者**:每个缓冲区恰好一个写者拥有(创建者进程;进程内的写由调用方自行串行化)。缓冲区 +内**没有写锁**——写者直接追加行,不在任何锁字上做 CAS 或屏障。读者免锁,与写者之间仅通过逐 chunk +的 `used` / `row_count` 的 `Release` 存储以及 `generation` 复核来协调。 + +为何安全且足够: + +- 生产中每表单写者——Python `ExternalTable` 路径为每个进程写一个文件(命名为 `//…`); + 进程重启即换新 PID、换新文件; +- 
读者本就不写锁字,其正确性依赖 `used`/`row_count` 的 `Release`/`Acquire` 次序以及逐 chunk 的 + `generation` 复核; +- 去掉锁还顺带消除了 PID 抢占自旋锁必须防范的 fork 隐患(fork 出的子进程继承了缓存的启动时间,被误 + 判为 PID 回收)。 + +> **冷层(MEMC)** 是另一套并发模型——多个压实写者由 `writer_id` 与段隔离区分——不受 MEMT 单写者 +> 模型影响。 -`write_lock` 存放 **0(空闲)或持有者的 PID**。等待者自旋;若自旋超过 `LOCK_STEAL_TIMEOUT` -(500 ms),进入抢占判定: +### 单写者快路径 -- 若持有者进程已不存在(`kill(pid, 0)`),抢占该锁; -- 若持有者存在但其内核启动时间与 `lock_owner_start` 不符,说明 PID 已被无关进程回收——经短暂复核 - 宽限后抢占。 +由于数据是**单条生成**的,单行提交路径被尽量做轻: -抢占是数据安全的:行只有在写入结束时通过 `row_count` 的 `Release` 存储才可见,因此已死持有者写到 -一半的行不会提交,会被直接覆盖。 +- **每行零分配。** `RowWriter` 流式 API 直接把各字段编码进 ring chunk,不再为每行构造 + `Vec<Value>`。(`push_row(&[Value])` 便捷接口仍可用,但要求调用方先物化一个 value 切片。) +- **无锁,也无每行 `catch_unwind`。** 单写者下既无需加锁,也无需在 panic 时释放锁,因此既不需要逐行 + 的 CAS + `Release` 屏障,也不需要 `catch_unwind`/`Drop` 守卫。 -!!! note "fork 安全" - 持有者启动时间通过**按 PID 缓存**读取,而非一次性缓存。若子进程继承了父进程的缓存值,就会记录 - 父进程的启动时间,从而被等待者误判为 PID 回收——这正是大量 fork 的负载(PyTorch DataLoader) - 会触发的隐患。每当存活 PID 变化即重新读取,可让每个 fork 后的调用者观察到自己的启动时间。 - (启动时间在 Linux 上来自 `/proc`;在不具备该接口的平台上,回收抢占路径自动失效。) +读者正确性与写路径无关:行可见性始终依赖 `finish()` 中 `used` / `row_count` 的 `Release` 存储。单线程 +`metrics` 实测吞吐(M4,release):朴素 `push_row` + 自旋锁 ≈ 18.8M 行/s → 流式、免锁 ≈ 29.9M 行/s +(端到端 **+59%**)。 ### 时间戳元数据 @@ -244,7 +260,7 @@ chunk。每行恰好计数一次,且去重对环形回收免疫(generation - 读取无半行数据(generation 复核);冷层尾部撕裂可恢复; - 跨层精确一次(查询去重)与跨重启精确一次(`prime_from_cold`); - 热层内存有界;冷层字节/TTL 有界; -- fork 安全的锁。 +- 单写者、无锁热路径(MEMT);读者通过 generation 复核免锁读取。 **已知取舍(P2 待办):** diff --git a/probing/cli/src/cli/bench/args.rs b/probing/cli/src/cli/bench/args.rs index ad9b7cff..a46b4ce3 100644 --- a/probing/cli/src/cli/bench/args.rs +++ b/probing/cli/src/cli/bench/args.rs @@ -22,9 +22,9 @@ pub enum Backend { /// Streaming row writer vs. value-vector `push_row`. #[derive(ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] pub enum WriterMode { - /// `push_row` — concurrency-safe auto-advance; allocates a value row. + /// `push_row` — auto-advance on chunk full; allocates a value row.
Push, - /// `RowWriter` streaming fast path (single-threaded only). + /// `RowWriter` streaming fast path (zero per-row allocation). Streaming, } @@ -81,13 +81,14 @@ pub struct WriteArgs { #[arg(long, default_value_t = 1_000_000)] pub rows: u64, - /// Concurrent writer threads. >1 requires a shared backend - /// (shm/file/shared) to exercise the cross-handle write lock. + /// Concurrent writer threads. Only valid with `--backend heap`, where each + /// thread gets its own independent table. Shared backends are single-writer. #[arg(long, default_value_t = 1)] pub threads: usize, - /// Writer API to exercise. - #[arg(long, value_enum, default_value = "push")] + /// Writer API to exercise. `streaming` is the zero-allocation single-row + /// fast path (no per-row value vector); `push` allocates a value row. + #[arg(long, value_enum, default_value = "streaming")] pub writer: WriterMode, /// File path for `--backend file` (defaults to a temp file). @@ -178,8 +179,8 @@ pub struct MixedArgs { #[arg(long, value_enum, default_value = "shared")] pub backend: Backend, - /// Concurrent writer threads. - #[arg(long, default_value_t = 2)] + /// Writer threads. MEMT is single-writer; must be 1. + #[arg(long, default_value_t = 1)] pub writers: usize, /// Concurrent reader (scan) threads. @@ -218,7 +219,7 @@ pub struct MpArgs { #[arg(long, value_enum, default_value = "shared")] pub backend: Backend, - /// Number of writer processes. + /// Writer processes. MEMT is single-writer; must be 1. #[arg(long, default_value_t = 1)] pub writers: usize, diff --git a/probing/cli/src/cli/bench/runners/mixed.rs b/probing/cli/src/cli/bench/runners/mixed.rs index 4b92d666..ea0972d9 100644 --- a/probing/cli/src/cli/bench/runners/mixed.rs +++ b/probing/cli/src/cli/bench/runners/mixed.rs @@ -1,7 +1,7 @@ -//! `mixed` — end-to-end pipeline / soak: concurrent writers, optional -//! background compactor, and concurrent readers over one shared table for a -//! fixed duration. 
Reports per-role throughput plus the resulting cold-tier -//! footprint. +//! `mixed` — end-to-end pipeline / soak: a single writer, optional background +//! compactor, and concurrent readers over one shared table for a fixed +//! duration. Reports per-role throughput plus the resulting cold-tier +//! footprint. MEMT is single-writer, so the writer count is fixed at 1. use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; @@ -19,7 +19,10 @@ use crate::cli::bench::workload::RowGen; pub fn run(args: &MixedArgs, json: bool, seed: u64) -> Result<()> { let spec = args.schema.spec(); let row_bytes = spec.approx_row_bytes() as u64; - let writers = args.writers.max(1); + if args.writers > 1 { + bail!("mixed is single-writer (MEMT); --writers must be 1"); + } + let writers = 1usize; let readers = args.readers; // Create the shared backing; keep the creator alive for the whole run. diff --git a/probing/cli/src/cli/bench/runners/mp.rs b/probing/cli/src/cli/bench/runners/mp.rs index b5ba246e..a25e9e3e 100644 --- a/probing/cli/src/cli/bench/runners/mp.rs +++ b/probing/cli/src/cli/bench/runners/mp.rs @@ -6,9 +6,9 @@ //! wall-clock window (synchronised by a shared start instant) and prints a //! one-line JSON result; the orchestrator aggregates them. //! -//! This is the scenario the data layer is built for: independent OS processes -//! contending on the in-buffer robust write lock (writers) while others read -//! lock-free (readers) — the cross-process path threads cannot exercise. +//! This exercises the cross-process read path: a single writer process feeds +//! the shared mapping while several reader processes read lock-free. MEMT is +//! single-writer, so there is exactly one writer process. //! //! Worker vs. orchestrator is selected by the `PROBING_BENCH_MP_ROLE` //! environment variable, so the public surface stays a single `mp` command. 
@@ -41,11 +41,11 @@ pub fn run(args: &MpArgs, json: bool, seed: u64) -> Result<()> { fn orchestrate(args: &MpArgs, json: bool, seed: u64) -> Result<()> { let spec = args.schema.spec(); let row_bytes = spec.approx_row_bytes() as u64; - let writers = args.writers.max(1); - let readers = args.readers; - if writers + readers == 0 { - bail!("need at least one worker (--writers/--readers)"); + if args.writers > 1 { + bail!("mp is single-writer (MEMT); --writers must be 1"); } + let writers = 1usize; + let readers = args.readers; // Create the shared backing and keep it alive for the whole run. let (attach, _creator) = match args.backend { diff --git a/probing/cli/src/cli/bench/runners/write.rs b/probing/cli/src/cli/bench/runners/write.rs index 16fd4aa4..356b6fb8 100644 --- a/probing/cli/src/cli/bench/runners/write.rs +++ b/probing/cli/src/cli/bench/runners/write.rs @@ -1,10 +1,8 @@ //! `write` — write throughput across backends, writer counts and APIs. //! -//! With `--threads > 1` on a shared backend (`shm`/`file`/`shared`) every -//! thread opens its own handle to the same mapping, so the run genuinely -//! contends on the in-buffer robust write lock. The `heap` backend cannot be -//! shared, so multi-threaded heap runs use independent per-thread tables -//! (parallel throughput, no lock contention). +//! MEMT is single-writer, so shared backends (`shm`/`file`/`shared`) run with +//! one writer. `--threads > 1` is only valid on the `heap` backend, where each +//! thread gets its own independent table (parallel throughput, one writer each). use std::sync::Barrier; use std::time::Instant; @@ -37,10 +35,19 @@ pub fn run(args: &WriteArgs, json: bool, seed: u64) -> Result<()> { if args.writer == WriterMode::Streaming && threads > 1 { bail!("--writer streaming requires --threads 1 (advance-on-overflow is not concurrency-safe)"); } + // MEMT is single-writer. Multiple threads writing the SAME mapping is + // unsupported, so shared backends are capped to one writer. 
The heap + // backend instead gives each thread its own independent table. + if threads > 1 && args.backend != Backend::Heap { + bail!( + "--threads > 1 requires --backend heap (independent per-thread tables); \ + shared backends (shm/file/shared) are single-writer" + ); + } if threads > 1 && args.backend == Backend::Heap { eprintln!( "note: heap backend cannot be shared; --threads {threads} uses independent \ - per-thread tables (no lock contention)" + per-thread tables (parallel, single-writer each)" ); } diff --git a/probing/memtable/benches/memtable_report.rs b/probing/memtable/benches/memtable_report.rs index 6de6ac0d..d8625c9b 100644 --- a/probing/memtable/benches/memtable_report.rs +++ b/probing/memtable/benches/memtable_report.rs @@ -280,40 +280,6 @@ fn bench_push_row_unchecked_fixed(rows: &[FixedInput]) -> u64 { rows.len() as u64 } -fn bench_solo_push_row_fixed(rows: &[FixedInput]) -> u64 { - let schema = fixed_schema(); - let bytes_per_row = 4 + 8 + 8; - let chunk_size = 64 * 1024; - let total_bytes = rows.len() * bytes_per_row; - let num_chunks = ((total_bytes / chunk_size) + 2).max(2); - let size = MemTable::required_size(&schema, chunk_size, num_chunks); - let mut buf = vec![0u8; size]; - let mut sw = - MemTableWriter::init(&mut buf, &schema, chunk_size as u32, num_chunks as u32).solo(); - for &(ts, value) in rows { - sw.push_row_unchecked(&[Value::I64(ts), Value::I64(value)]); - } - black_box(sw.num_chunks()); - rows.len() as u64 -} - -fn bench_solo_row_writer_fixed(rows: &[FixedInput]) -> u64 { - let schema = fixed_schema(); - let bytes_per_row = 4 + 8 + 8; - let chunk_size = 64 * 1024; - let total_bytes = rows.len() * bytes_per_row; - let num_chunks = ((total_bytes / chunk_size) + 2).max(2); - let size = MemTable::required_size(&schema, chunk_size, num_chunks); - let mut buf = vec![0u8; size]; - let mut sw = - MemTableWriter::init(&mut buf, &schema, chunk_size as u32, num_chunks as u32).solo(); - for &(ts, value) in rows { - 
sw.row_writer().put_i64(ts).put_i64(value).finish(); - } - black_box(sw.num_chunks()); - rows.len() as u64 -} - fn bench_dedup_push_row_strings(rows: &[StringInput]) -> u64 { let schema = string_schema(); let approx_bytes_per_row = 4 + 8 + (4 + 5) + (4 + 20); @@ -509,9 +475,7 @@ fn print_report(results: &[BenchResult]) { ("baseline_memcpy_fixed", None), ("baseline_raw_append", Some("baseline_memcpy_fixed")), ("baseline_flat_encode", Some("baseline_raw_append")), - ("solo_row_writer_fixed", Some("baseline_flat_encode")), ("row_writer_fixed", Some("baseline_flat_encode")), - ("solo_push_row_fixed", Some("solo_row_writer_fixed")), ("push_row_unchecked_fixed", Some("row_writer_fixed")), ("push_row_fixed", Some("row_writer_fixed")), ], @@ -594,18 +558,6 @@ fn main() { FIXED_ROWS as u64 * fixed_bytes, || bench_push_row_unchecked_fixed(&fixed_write_inputs), ), - run_case( - "solo_push_row_fixed", - FIXED_ROWS as u64, - FIXED_ROWS as u64 * fixed_bytes, - || bench_solo_push_row_fixed(&fixed_write_inputs), - ), - run_case( - "solo_row_writer_fixed", - FIXED_ROWS as u64, - FIXED_ROWS as u64 * fixed_bytes, - || bench_solo_row_writer_fixed(&fixed_write_inputs), - ), run_case( "row_writer_fixed", FIXED_ROWS as u64, diff --git a/probing/memtable/src/discover.rs b/probing/memtable/src/discover.rs index eec04e56..4197eca0 100644 --- a/probing/memtable/src/discover.rs +++ b/probing/memtable/src/discover.rs @@ -161,8 +161,7 @@ impl ExposedTable { /// /// This is the fast path for high-frequency writes — it skips the /// O(rows × chunks) `validate_buf` that `writer()` performs on every call. - /// The spinlock is released even if the write panics, preventing a - /// deadlocked mmap file (see [`MemTable::push_row`]). + /// MEMT is single-writer, so no lock is taken (see [`MemTable::push_row`]). 
pub fn push_row(&mut self, values: &[Value]) { self.inner.push_row(values) } diff --git a/probing/memtable/src/layout.rs b/probing/memtable/src/layout.rs index f737468d..6729d7cb 100644 --- a/probing/memtable/src/layout.rs +++ b/probing/memtable/src/layout.rs @@ -1,12 +1,12 @@ //! Low-level layout: header, column descriptors, chunk headers, byte helpers. //! -//! ## Header v3 binary layout (64 bytes, 1 cache line) +//! ## Header v4 binary layout (64 bytes, 1 cache line) //! //! ```text //! offset size field notes //! ────────────────────────────────────────────────────────── //! 0 4 magic 0x4D454D54 ("MEMT" in LE) -//! 4 2 version 3 +//! 4 2 version 4 //! 6 2 header_size 64 (validation only) //! 8 2 byte_order BOM: written as [0x01, 0x02] //! 10 2 ts_col timestamp column index + 1 (0 = none) @@ -17,20 +17,24 @@ //! 28 4 data_offset (64-aligned) //! ─── 32 byte boundary (cold/hot split) ───────────────── //! 32 4 write_chunk AtomicU32 -//! 36 4 write_lock AtomicU32: 0 = unlocked, else holder PID -//! 40 4 refcount AtomicU32 -//! 44 4 creator_pid PID of creating process +//! 36 4 refcount AtomicU32 +//! 40 4 creator_pid PID of creating process +//! 44 4 _pad0 (alignment) //! 48 8 creator_start_time process start time (platform-specific) -//! 56 8 lock_owner_start AtomicU64: lock holder's start time +//! 56 8 _reserved reserved for future use //! ────────────────────────────────────────────────────────── //! ``` //! //! All multi-byte fields are little-endian. The `byte_order` BOM //! allows readers to detect endianness mismatch without guessing. +//! +//! MEMT is **single-writer**: exactly one writer owns each buffer (the +//! creator process; in-process writes are serialised by the caller). There +//! is no in-buffer write lock. Readers stay lock-free via per-chunk +//! `generation` re-validation. 
use std::mem; -use std::sync::atomic::{AtomicI64, AtomicU32, AtomicU64, Ordering}; -use std::time::{Duration, Instant}; +use std::sync::atomic::{AtomicI64, AtomicU32, AtomicU64}; // ── C-style layout structs ────────────────────────────────────────── @@ -40,10 +44,11 @@ pub(crate) const MAGIC: u32 = MAGIC_MEMT; /// Header format version for MEMT. /// -/// v3: `_pad0` became `ts_col`, `_reserved` became `lock_owner_start`, -/// `write_lock` stores the holder PID (was 0/1), and `ChunkHeader` grew -/// `min_ts`/`max_ts` (24 → 40 bytes). -pub(crate) const VERSION: u16 = 3; +/// v4: dropped the `write_lock` and `lock_owner_start` fields — MEMT is +/// single-writer, so there is no in-buffer write lock. Their bytes are now +/// `_pad0`/`_reserved`. v3 added per-chunk `min_ts`/`max_ts` and the PID +/// write lock (both since superseded). +pub(crate) const VERSION: u16 = 4; /// Byte-order mark: written as raw bytes `[0x01, 0x02]`. /// On a LE host, `u16::from_ne_bytes([0x01, 0x02])` == `0x0201`. @@ -68,8 +73,8 @@ pub(crate) const FLAGS_KNOWN: u32 = FLAG_DEDUP; /// schema dimensions, layout offsets. /// /// **Hot zone** (bytes 32–63): atomically mutated at runtime — -/// `write_chunk`, `write_lock`, `refcount`. Separated from the cold -/// zone to avoid false-sharing on different cache lines. +/// `write_chunk`, `refcount`. Separated from the cold zone to avoid +/// false-sharing on different cache lines. #[repr(C)] pub(crate) struct Header { // ── cold zone (read-only after init) ───────────────── @@ -102,24 +107,19 @@ pub(crate) struct Header { // ── hot zone (atomically mutated) ──────────────────── /// Ring buffer: index of the chunk currently being written. pub write_chunk: AtomicU32, - /// Robust writer spinlock: 0 = unlocked, otherwise the **PID** of the - /// holding process. A waiter that has spun past - /// [`LOCK_STEAL_TIMEOUT`] checks the holder's liveness and steals the - /// lock from a dead process (see [`acquire_write_lock`]). 
- pub write_lock: AtomicU32, /// Reference count for shared lifetime management. pub refcount: AtomicU32, /// PID of the process that created this table (for cross-process discovery). pub creator_pid: u32, - /// Process start time — for PID-recycling detection. + /// Padding to 8-align `creator_start_time` (was `write_lock` in v3). + pub _pad0: u32, + /// Process start time — for PID-recycling detection during discovery. /// Linux: clock ticks since boot (`/proc/<pid>/stat` field 22). /// macOS: microseconds since epoch (via `sysctl`). /// Other: 0 (falls back to PID-only liveness check). pub creator_start_time: u64, - /// Start time of the current lock holder (0 = unknown / not written - /// yet). Written by the holder right after acquiring; lets waiters - /// detect PID recycling before stealing. Advisory only. - pub lock_owner_start: AtomicU64, + /// Reserved for future use (was `lock_owner_start` in v3). + pub _reserved: u64, } /// Per-column descriptor, immediately following the Header. @@ -224,154 +224,6 @@ pub(crate) fn chunk_header(buf: &[u8], cs: usize) -> &ChunkHeader { unsafe { &*(buf[cs..].as_ptr() as *const ChunkHeader) } } -/// How long a waiter spins before checking whether the lock holder is -/// still alive (and stealing the lock from a dead process). -/// -/// Writers hold the lock for nanoseconds–microseconds; even a descheduled -/// holder resumes within milliseconds. Reaching this timeout in practice -/// means the holder crashed while holding the lock. -pub(crate) const LOCK_STEAL_TIMEOUT: Duration = Duration::from_millis(500); - -/// `true` when a process with `pid` exists (it may belong to another user). -fn process_alive(pid: u32) -> bool { - if pid == std::process::id() { - return true; - } - if unsafe { libc::kill(pid as libc::pid_t, 0) } == 0 { - return true; - } - // EPERM: the process exists but we may not signal it.
- std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM) -} - -/// This process's kernel start time, cached per PID (reads `/proc` on Linux). -/// -/// **Fork safety:** the cache is keyed on the live PID, not a one-shot -/// `OnceLock`. A child inheriting a parent's cached value would otherwise -/// record the *parent's* start time in `lock_owner_start`, and a waiter -/// comparing against the child's real start time would mistake the live child -/// for a recycled PID and steal its lock — exactly the hazard fork-heavy -/// workloads (e.g. PyTorch DataLoader) trigger. Re-reading whenever the PID -/// changes makes every post-fork caller observe its own start time. -fn my_start_time() -> u64 { - static MY_PID: AtomicU32 = AtomicU32::new(0); - static MY_START: AtomicU64 = AtomicU64::new(0); - - let pid = std::process::id(); - if MY_PID.load(Ordering::Acquire) == pid { - let cached = MY_START.load(Ordering::Acquire); - if cached != 0 { - return cached; - } - } - let start = crate::raw::process_start_time(pid); - // Publish start before PID: a reader that observes the matching PID is then - // guaranteed to also observe the start written for it. - MY_START.store(start, Ordering::Release); - MY_PID.store(pid, Ordering::Release); - start -} - -/// Decide whether the lock can be stolen from `holder`, and try to. -/// -/// Steal conditions (either): -/// - `holder` no longer exists (crashed / killed while holding the lock); -/// - `holder` exists but its kernel start time does not match the one the -/// real holder recorded in `lock_owner_start` — the PID was recycled by -/// an unrelated process. Re-checked after a grace period to rule out -/// the transient window where a fresh holder has not yet recorded its -/// start time. -/// -/// Stealing is safe with respect to data: rows only become visible via the -/// `used`/`row_count` Release stores at the end of a write, so a row half -/// written by the dead holder stays uncommitted and is simply overwritten. 
-#[cold] -#[inline(never)] -fn try_steal_lock(h: &Header, holder: u32, me: u32) -> bool { - if process_alive(holder) { - let owner_start = h.lock_owner_start.load(Ordering::Relaxed); - let actual_start = crate::raw::process_start_time(holder); - if owner_start == 0 || actual_start == 0 || actual_start == owner_start { - return false; // genuinely alive (or cannot tell) — keep waiting - } - std::thread::sleep(Duration::from_millis(10)); - if h.write_lock.load(Ordering::Relaxed) != holder - || h.lock_owner_start.load(Ordering::Relaxed) != owner_start - { - return false; // lock changed hands meanwhile — not stale - } - } - if h.write_lock - .compare_exchange(holder, me, Ordering::Acquire, Ordering::Relaxed) - .is_ok() - { - h.lock_owner_start.store(my_start_time(), Ordering::Relaxed); - return true; - } - false -} - -/// Acquire the **robust** writer spinlock with exponential back-off. -/// -/// The lock word holds the owner's PID (0 = unlocked). First few failures -/// use `spin_loop()` (pause instruction), then escalate to `yield_now()`. -/// A waiter stuck past [`LOCK_STEAL_TIMEOUT`] verifies the holder's -/// liveness and steals the lock from a dead process (see -/// [`try_steal_lock`]), so a writer crashing inside the critical section -/// cannot deadlock other writer processes forever. -/// -/// SAFETY NOTE: the buffer parameter is `&mut [u8]` (not `&[u8]`) so that -/// LLVM does **not** mark the pointer `readonly`. With `&[u8]` LLVM may -/// legally eliminate the atomic store inside `release_write_lock`, turning -/// the spin loop into an infinite loop in optimised (release) builds. 
-pub(crate) fn acquire_write_lock(buf: &mut [u8]) { - let ptr = buf.as_mut_ptr() as *const Header; - let h = unsafe { &*ptr }; - let me = std::process::id(); - let mut spins = 0u32; - let mut waiting_since: Option = None; - loop { - match h - .write_lock - .compare_exchange_weak(0, me, Ordering::Acquire, Ordering::Relaxed) - { - Ok(_) => { - h.lock_owner_start.store(my_start_time(), Ordering::Relaxed); - return; - } - Err(holder) if holder != 0 => { - let since = *waiting_since.get_or_insert_with(Instant::now); - if spins >= 16 && since.elapsed() >= LOCK_STEAL_TIMEOUT { - if try_steal_lock(h, holder, me) { - return; - } - waiting_since = Some(Instant::now()); - } - } - Err(_) => {} // spurious failure with lock free — retry CAS - } - if spins < 16 { - for _ in 0..1 << spins.min(4) { - std::hint::spin_loop(); - } - } else { - std::thread::yield_now(); - } - spins += 1; - } -} - -/// Release the writer spinlock. See [`acquire_write_lock`] for why `&mut`. -/// -/// Clears `lock_owner_start` *before* the lock word so that waiters never -/// pair the next holder's PID with this holder's start time. -pub(crate) fn release_write_lock(buf: &mut [u8]) { - let ptr = buf.as_mut_ptr() as *const Header; - unsafe { - (*ptr).lock_owner_start.store(0, Ordering::Relaxed); - (*ptr).write_lock.store(0, Ordering::Release); - } -} pub(crate) fn r32(buf: &[u8], off: usize) -> u32 { u32::from_le_bytes(buf[off..off + 4].try_into().unwrap()) } @@ -413,40 +265,4 @@ mod tests { let expected_le = u16::from_le_bytes(BYTE_ORDER_MARK); assert_eq!(bom, expected_le); } - - /// Fork safety: after `fork()`, `my_start_time()` must return the *child's* - /// own kernel start time, not a value cached for the parent before the - /// fork. With the old `OnceLock` cache the child returned the parent's - /// start time; a waiter then compared it against the child's real start - /// time and stole the lock from a live holder. 
The test process has run - /// long enough that its start tick differs from a freshly-forked child's, - /// so the stale value would be observably wrong. - /// - /// Linux-only: kernel start times come from `/proc`. On platforms without - /// it `process_start_time` returns 0, the PID-recycle steal path is inert, - /// and there is no fork hazard to guard against. - #[cfg(target_os = "linux")] - #[test] - fn my_start_time_refreshes_after_fork() { - // Warm the per-PID cache for the parent (mimics the leaked OnceLock). - let parent = my_start_time(); - assert_ne!(parent, 0, "parent start time should be readable"); - - unsafe { - let pid = libc::fork(); - assert!(pid >= 0, "fork failed"); - if pid == 0 { - // Child: the cached value must equal a fresh read for THIS pid. - let cached = my_start_time(); - let fresh = crate::raw::process_start_time(std::process::id()); - libc::_exit(if cached == fresh && cached != 0 { 0 } else { 1 }); - } - let mut status = 0; - libc::waitpid(pid, &mut status, 0); - assert!( - libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0, - "child my_start_time() must reflect its own process, not the parent's cache", - ); - } - } } diff --git a/probing/memtable/src/lib.rs b/probing/memtable/src/lib.rs index 2cbd5c04..87f02631 100644 --- a/probing/memtable/src/lib.rs +++ b/probing/memtable/src/lib.rs @@ -5,10 +5,14 @@ //! //! ## Concurrency //! -//! - **Writers** are serialized by a spinlock (`write_lock` in Header). +//! MEMT is **single-writer**: exactly one writer owns each buffer, so there +//! is no in-buffer write lock. +//! +//! - **Writer**: a single owner appends rows; the `&mut` borrow (or the +//! caller's own serialization) guarantees exclusivity. No lock is taken. //! - **Readers** are lock-free: per-chunk `used` is updated with `Release` ordering -//! by writers and loaded with `Acquire` by readers, ensuring row data visibility. -//! - `RowWriter` holds the lock for its lifetime; released by `finish()` or `Drop`. +//! 
by the writer and loaded with `Acquire` by readers, ensuring row data visibility. +//! Readers re-validate the chunk `generation` to discard rows from a recycled chunk. //! //! # Memory Layout //! @@ -16,13 +20,13 @@ //! //! ```text //! ┌──────────────────────────────────┐ 0 -//! │ Header v2 (64 bytes, repr(C)) │ +//! │ Header v4 (64 bytes, repr(C)) │ //! │ ── cold zone (read-only) ── │ //! │ magic: u32 (0x4D454D54) │ -//! │ version: u16 (2) │ +//! │ version: u16 (4) │ //! │ header_size: u16 (64) │ //! │ byte_order: u16 (BOM 0x0102) │ -//! │ _pad0: u16 │ +//! │ ts_col: u16 │ //! │ flags: u32 (feature bits) │ //! │ num_cols: u32 │ //! │ num_chunks: u32 │ @@ -30,11 +34,11 @@ //! │ data_offset: u32 │ //! │ ── hot zone (atomic) ──── │ //! │ write_chunk: AtomicU32 │ -//! │ write_lock: AtomicU32 │ //! │ refcount: AtomicU32 │ //! │ creator_pid: u32 │ +//! │ _pad0: u32 │ //! │ creator_start_time: u64 │ -//! │ _reserved: [u32; 2] │ +//! │ _reserved: u64 │ //! ├──────────────────────────────────┤ 64 //! │ ColumnDesc × N (64 bytes each) │ //! │ name: [u8; 56] (LP u16) │ diff --git a/probing/memtable/src/memtable.rs b/probing/memtable/src/memtable.rs index 6545df5d..66797556 100644 --- a/probing/memtable/src/memtable.rs +++ b/probing/memtable/src/memtable.rs @@ -1,10 +1,10 @@ use crate::dedup::DedupState; use crate::layout::{ - acquire_write_lock, chunk_header, chunk_start_off, col_desc, compute_data_offset, header, - header_mut, release_write_lock, w32, CHUNK_HEADER_SIZE, FLAG_DEDUP, + chunk_header, chunk_start_off, col_desc, compute_data_offset, header, header_mut, w32, + CHUNK_HEADER_SIZE, FLAG_DEDUP, }; use crate::raw::{ - advance_chunk_unlocked, init_buf, note_row_ts, row_ts, validate_buf, validate_row_schema, + advance_chunk_raw, init_buf, note_row_ts, row_ts, validate_buf, validate_row_schema, write_row_bytes, }; use crate::refcount::refcount; @@ -166,11 +166,7 @@ macro_rules! 
impl_table_reader { // ── Write helpers ──────────────────────────────────────────────────── -fn make_row_writer<'a>( - buf: &'a mut [u8], - dedup: Option<&'a mut DedupState>, - locked: bool, -) -> RowWriter<'a> { +fn make_row_writer<'a>(buf: &'a mut [u8], dedup: Option<&'a mut DedupState>) -> RowWriter<'a> { let h = header(buf); let wc = h.write_chunk.load(Ordering::Relaxed) as usize; let csz = h.chunk_size as usize; @@ -188,17 +184,11 @@ fn make_row_writer<'a>( overflow: false, done: false, col_idx: 0, - locked, ts_col, pending_ts: None, } } -fn begin_row_writer<'a>(buf: &'a mut [u8], dedup: Option<&'a mut DedupState>) -> RowWriter<'a> { - acquire_write_lock(buf); - make_row_writer(buf, dedup, true) -} - fn row_data_size(values: &[Value]) -> usize { values.iter().map(|v| v.encoded_size()).sum() } @@ -206,7 +196,7 @@ fn row_data_size(values: &[Value]) -> usize { pub(crate) fn push_plain_row(buf: &mut [u8], values: &[Value]) { let row_data = row_data_size(values); if !write_row_bytes(buf, values, row_data) { - advance_chunk_unlocked(buf); + advance_chunk_raw(buf); assert!( write_row_bytes(buf, values, row_data), "row exceeds chunk capacity" @@ -214,18 +204,6 @@ pub(crate) fn push_plain_row(buf: &mut [u8], values: &[Value]) { } } -fn locked_append(buf: &mut [u8], values: &[Value]) -> bool { - acquire_write_lock(buf); - let ok = write_row_bytes(buf, values, row_data_size(values)); - release_write_lock(buf); - ok -} - -fn locked_advance(buf: &mut [u8]) { - acquire_write_lock(buf); - advance_chunk_unlocked(buf); - release_write_lock(buf); -} const MAX_DEDUP_COLS: usize = 64; @@ -413,16 +391,14 @@ impl MemTable { let mut buf = vec![0u8; size]; init_buf(&mut buf, schema, chunk_size, num_chunks); Self { - backing: Backing::Heap(buf), - } + backing: Backing::Heap(buf), } } /// Adopt an existing heap buffer (validates the MEMT layout). 
pub fn from_buf(buf: Vec) -> Result { validate_buf(&buf)?; Ok(Self { - backing: Backing::Heap(buf), - }) + backing: Backing::Heap(buf), }) } // ── POSIX shared memory (memory-only) ──────────────────────────── @@ -451,8 +427,7 @@ impl MemTable { mmap, name: cname.into_string().expect("validated utf-8"), unlink_on_drop: true, - }, - }) + }, }) } /// Attach to an existing POSIX shared-memory table created by @@ -471,8 +446,7 @@ impl MemTable { mmap, name: cname.into_string().expect("validated utf-8"), unlink_on_drop: false, - }, - }) + }, }) } // ── mmap'd file (disk-backed, persistent) ──────────────────────── @@ -511,8 +485,7 @@ impl MemTable { path, dir: None, unlink_on_drop: false, - }, - }) + }, }) } /// Reopen an existing mmap'd-file table read-write (validates the @@ -530,8 +503,7 @@ impl MemTable { path, dir: None, unlink_on_drop: false, - }, - }) + }, }) } // ── discoverable file (data-dir convention) ────────────────────── @@ -591,8 +563,7 @@ impl MemTable { path, dir: Some(dir), unlink_on_drop: true, - }, - }) + }, }) } // ── backing introspection ───────────────────────────────────────── @@ -645,26 +616,23 @@ impl MemTable { impl_table_reader!(); pub fn row_writer(&mut self) -> RowWriter<'_> { - begin_row_writer(self.backing.bytes_mut(), None) + make_row_writer(self.backing.bytes_mut(), None) } pub fn append_row(&mut self, values: &[Value]) -> bool { assert!( validate_row_schema(self.backing.bytes(), values), "value types do not match schema" ); - locked_append(self.backing.bytes_mut(), values) + write_row_bytes(self.backing.bytes_mut(), values, row_data_size(values)) } pub fn advance_chunk(&mut self) { - locked_advance(self.backing.bytes_mut()) + advance_chunk_raw(self.backing.bytes_mut()) } /// Append a row, auto-advancing to the next chunk when full. /// - /// # Panic safety - /// - /// The spinlock is released even if the write panics (e.g. 
row exceeds - /// chunk capacity) — for shared tables this prevents a deadlocked mmap - /// file that other processes may still be reading. + /// MEMT is single-writer: the `&mut self` borrow guarantees exclusive + /// access, so no lock is taken. pub fn push_row(&mut self, values: &[Value]) { assert!( validate_row_schema(self.backing.bytes(), values), @@ -673,15 +641,7 @@ impl MemTable { self.push_row_unchecked(values); } pub fn push_row_unchecked(&mut self, values: &[Value]) { - let buf = self.backing.bytes_mut(); - acquire_write_lock(buf); - let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - push_plain_row(buf, values); - })); - release_write_lock(buf); - if let Err(payload) = result { - std::panic::resume_unwind(payload); - } + push_plain_row(self.backing.bytes_mut(), values); } } @@ -767,46 +727,30 @@ impl fmt::Display for MemTableView<'_> { /// Unified writer for external buffers (`&mut [u8]`). /// -/// Supports four modes via builder methods: +/// MEMT is single-writer: the `&mut [u8]` borrow guarantees exclusive +/// access, so no lock is taken. Two modes via builder methods: /// /// | Mode | Construction | /// |------|-------------| -/// | Locked, plain | `MemTableWriter::new(buf)?` | -/// | Locked, dedup | `MemTableWriter::new(buf)?.dedup()` | -/// | Solo, plain | `MemTableWriter::new(buf)?.solo()` | -/// | Solo, dedup | `MemTableWriter::new(buf)?.solo().dedup()` | -/// -/// **Locked** (default): writers are serialized via a spinlock — safe for -/// multiple writer threads sharing the same buffer through raw pointers. -/// -/// **Solo**: no spinlock — the `&mut [u8]` borrow guarantees exclusive -/// access at compile time. Saves ~5 ns/row of CAS overhead. +/// | Plain | `MemTableWriter::new(buf)?` | +/// | Dedup | `MemTableWriter::new(buf)?.dedup()` | /// /// **Dedup**: per-chunk, hash-based string/bytes dedup. Repeated values /// are stored as 4-byte back-references within the same chunk. 
pub struct MemTableWriter<'a> { buf: &'a mut [u8], dedup: Option, - locked: bool, } impl<'a> MemTableWriter<'a> { pub fn new(buf: &'a mut [u8]) -> Result { validate_buf(buf)?; - Ok(Self { - buf, - dedup: None, - locked: true, - }) + Ok(Self { buf, dedup: None }) } pub fn init(buf: &'a mut [u8], schema: &Schema, chunk_size: u32, num_chunks: u32) -> Self { init_buf(buf, schema, chunk_size, num_chunks); - Self { - buf, - dedup: None, - locked: true, - } + Self { buf, dedup: None } } /// Enable per-chunk string/bytes dedup. Sets `FLAG_DEDUP` in header. @@ -816,12 +760,6 @@ impl<'a> MemTableWriter<'a> { self } - /// Disable the spinlock (single-producer mode). - pub fn solo(mut self) -> Self { - self.locked = false; - self - } - pub fn set_min_dedup_len(&mut self, len: usize) { if let Some(ref mut s) = self.dedup { s.set_min_dedup_len(len); @@ -838,11 +776,7 @@ impl<'a> MemTableWriter<'a> { impl_table_reader!(); pub fn row_writer(&mut self) -> RowWriter<'_> { - if self.locked { - begin_row_writer(self.buf, self.dedup.as_mut()) - } else { - make_row_writer(self.buf, self.dedup.as_mut(), false) - } + make_row_writer(self.buf, self.dedup.as_mut()) } pub fn push_row(&mut self, values: &[Value]) { @@ -858,16 +792,10 @@ impl<'a> MemTableWriter<'a> { } pub fn advance_chunk(&mut self) { - if self.locked { - acquire_write_lock(self.buf); - } - advance_chunk_unlocked(self.buf); + advance_chunk_raw(self.buf); if let Some(ref mut s) = self.dedup { s.clear(); } - if self.locked { - release_write_lock(self.buf); - } } pub fn append_row(&mut self, values: &[Value]) -> bool { @@ -875,27 +803,17 @@ impl<'a> MemTableWriter<'a> { validate_row_schema(self.buf, values), "value types do not match schema" ); - if self.locked { - acquire_write_lock(self.buf); - } - let ok = if let Some(ref mut state) = self.dedup { + if let Some(ref mut state) = self.dedup { append_row_dedup_bytes(self.buf, state, values) } else { write_row_bytes(self.buf, values, row_data_size(values)) - }; - if self.locked { 
- release_write_lock(self.buf); } - ok } fn push_inner(&mut self, values: &[Value]) { - if self.locked { - acquire_write_lock(self.buf); - } if let Some(ref mut state) = self.dedup { if !append_row_dedup_bytes(self.buf, state, values) { - advance_chunk_unlocked(self.buf); + advance_chunk_raw(self.buf); state.clear(); assert!( append_row_dedup_bytes(self.buf, state, values), @@ -905,19 +823,15 @@ impl<'a> MemTableWriter<'a> { } else { push_plain_row(self.buf, values); } - if self.locked { - release_write_lock(self.buf); - } } } impl fmt::Display for MemTableWriter<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let mode = match (self.locked, self.dedup.is_some()) { - (true, false) => "locked", - (true, true) => "locked+dedup", - (false, false) => "solo", - (false, true) => "solo+dedup", + let mode = if self.dedup.is_some() { + "dedup" + } else { + "plain" }; write!( f, @@ -1349,7 +1263,6 @@ mod tests { assert_eq!(h.num_chunks, 4); assert_eq!(h.chunk_size, 1024); assert_eq!(h.write_chunk.load(Ordering::Relaxed), 0); - assert_eq!(h.write_lock.load(Ordering::Relaxed), 0); assert_eq!(h.refcount.load(Ordering::Relaxed), 1); } @@ -1488,67 +1401,15 @@ mod tests { } #[test] - fn concurrent_multiple_writers() { - use std::alloc; - use std::thread; - - let schema = Schema::new().col("tid", DType::I64).col("seq", DType::I64); - let chunk_size = 8192u32; - let num_chunks = 8u32; - let size = MemTable::required_size(&schema, chunk_size as usize, num_chunks as usize); - let layout = alloc::Layout::from_size_align(size, 64).unwrap(); - let ptr = unsafe { alloc::alloc_zeroed(layout) }; - assert!(!ptr.is_null()); - - unsafe { - let buf = std::slice::from_raw_parts_mut(ptr, size); - init_buf(buf, &schema, chunk_size, num_chunks); - } - - let num_writers = 8; - let rows_per_writer = 50; - let addr = ptr as usize; - - // 单写线程:多线程各自 `&mut` 同一块缓冲在语言层面是 UB,release 下易死锁/损坏元数据。 - let writer = thread::spawn(move || { - let buf = unsafe { std::slice::from_raw_parts_mut(addr as 
*mut u8, size) }; - let mut mt = MemTableWriter::new(buf).unwrap(); - for tid in 0..num_writers { - for seq in 0..rows_per_writer as i64 { - mt.push_row(&[Value::I64(tid as i64), Value::I64(seq)]); - } - } - }); - writer.join().unwrap(); - - unsafe { - let buf = std::slice::from_raw_parts(ptr, size); - let view = MemTableView::new(buf).unwrap(); - let total: usize = (0..view.num_chunks()).map(|c| view.num_rows(c)).sum(); - assert_eq!(total, num_writers * rows_per_writer); - - // every row should be a valid (tid, seq) pair - for chunk in 0..view.num_chunks() { - for row in view.rows(chunk) { - let mut c = row.cursor(); - let tid = c.next_i64(); - let seq = c.next_i64(); - assert!((0..num_writers as i64).contains(&tid)); - assert!((0..rows_per_writer as i64).contains(&seq)); - } - } - - alloc::dealloc(ptr, layout); - } - } - - #[test] - fn concurrent_writers_and_readers() { + fn single_writer_concurrent_readers() { use std::alloc; use std::sync::atomic::{AtomicBool, AtomicUsize}; use std::sync::{Arc, Barrier}; use std::thread; + // The production model: one writer feeds the ring while N lock-free + // readers continuously scan it. Readers must never observe a torn or + // corrupt row. let schema = Schema::new().col("val", DType::I64); let chunk_size = 4096u32; let num_chunks = 4u32; @@ -1562,16 +1423,14 @@ mod tests { init_buf(buf, &schema, chunk_size, num_chunks); } - let num_writers = 4; - let rows_per_writer = 100; + let total_rows = 400i64; let num_readers = 4; let addr = ptr as usize; let done = Arc::new(AtomicBool::new(false)); let total_reads = Arc::new(AtomicUsize::new(0)); - // 1 个写线程 + num_readers 个读线程(不能多写线程同缓冲 &mut,见 concurrent_multiple_writers) let barrier = Arc::new(Barrier::new(1 + num_readers)); - // spawn readers — continuously scan all chunks while writers are active + // Readers continuously scan all chunks while the writer is active. 
let reader_handles: Vec<_> = (0..num_readers) .map(|_| { let done = done.clone(); @@ -1604,10 +1463,8 @@ mod tests { barrier.wait(); let buf = unsafe { std::slice::from_raw_parts_mut(addr as *mut u8, size) }; let mut mt = MemTableWriter::new(buf).unwrap(); - for tid in 0..num_writers { - for seq in 0..rows_per_writer as i64 { - mt.push_row(&[Value::I64(tid as i64 * 1000 + seq)]); - } + for seq in 0..total_rows { + mt.push_row(&[Value::I64(seq)]); } }) }; @@ -1619,72 +1476,17 @@ mod tests { h.join().unwrap(); } - // readers actually read some rows assert!( total_reads.load(Ordering::Relaxed) > 0, "readers should have observed rows" ); - // final consistency: total rows == writers × rows_per_writer + // Final consistency: every written row is present. unsafe { let buf = std::slice::from_raw_parts(ptr, size); let view = MemTableView::new(buf).unwrap(); let total: usize = (0..view.num_chunks()).map(|c| view.num_rows(c)).sum(); - assert_eq!(total, num_writers * rows_per_writer); - alloc::dealloc(ptr, layout); - } - } - - #[test] - fn concurrent_row_writer_contention() { - use std::alloc; - use std::thread; - - let schema = Schema::new().col("tid", DType::I32).col("msg", DType::Str); - let chunk_size = 16384u32; - let num_chunks = 4u32; - let size = MemTable::required_size(&schema, chunk_size as usize, num_chunks as usize); - let layout = alloc::Layout::from_size_align(size, 64).unwrap(); - let ptr = unsafe { alloc::alloc_zeroed(layout) }; - assert!(!ptr.is_null()); - - unsafe { - let buf = std::slice::from_raw_parts_mut(ptr, size); - init_buf(buf, &schema, chunk_size, num_chunks); - } - - let num_writers = 8; - let rows_per_writer = 60; - let addr = ptr as usize; - - let writer = thread::spawn(move || { - let buf = unsafe { std::slice::from_raw_parts_mut(addr as *mut u8, size) }; - let mut mt = MemTableWriter::new(buf).unwrap(); - for tid in 0..num_writers { - let tag = format!("t{tid}"); - for _ in 0..rows_per_writer { - mt.row_writer().put_i32(tid as 
i32).put_str(&tag).finish(); - } - } - }); - writer.join().unwrap(); - - unsafe { - let buf = std::slice::from_raw_parts(ptr, size); - let view = MemTableView::new(buf).unwrap(); - let total: usize = (0..view.num_chunks()).map(|c| view.num_rows(c)).sum(); - assert_eq!(total, num_writers * rows_per_writer); - - for chunk in 0..view.num_chunks() { - for row in view.rows(chunk) { - let mut c = row.cursor(); - let tid = c.next_i32(); - let msg = c.next_str(); - assert!((0..num_writers as i32).contains(&tid)); - assert_eq!(msg, format!("t{tid}")); - } - } - + assert_eq!(total, total_rows as usize); alloc::dealloc(ptr, layout); } } @@ -1868,14 +1670,14 @@ mod tests { t.push_row(&[Value::Str("oops")]); // Str instead of U32 } - // ── MemTableWriter solo mode tests ────────────────────────── + // ── MemTableWriter tests ────────────────────────── #[test] - fn solo_writer_basic() { + fn mem_table_writer_basic() { let schema = Schema::new().col("ts", DType::I64).col("val", DType::F64); let size = MemTable::required_size(&schema, 4096, 2); let mut buf = vec![0u8; size]; - let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 2).solo(); + let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 2); sw.push_row(&[Value::I64(100), Value::F64(3.14)]); sw.push_row(&[Value::I64(200), Value::F64(2.72)]); @@ -1888,11 +1690,11 @@ mod tests { } #[test] - fn solo_writer_row_writer() { + fn mem_table_writer_row_writer() { let schema = Schema::new().col("id", DType::I32).col("msg", DType::Str); let size = MemTable::required_size(&schema, 4096, 1); let mut buf = vec![0u8; size]; - let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 1).solo(); + let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 1); sw.row_writer().put_i32(1).put_str("hello").finish(); sw.row_writer().put_i32(2).put_str("world").finish(); @@ -1905,28 +1707,11 @@ mod tests { } #[test] - fn solo_writer_no_lock_touched() { - let schema = Schema::new().col("x", DType::I32); - let size = 
MemTable::required_size(&schema, 1024, 1); - let mut buf = vec![0u8; size]; - let mut sw = MemTableWriter::init(&mut buf, &schema, 1024, 1).solo(); - sw.push_row(&[Value::I32(42)]); - sw.row_writer().put_i32(99).finish(); - assert_eq!( - header(sw.as_bytes()).write_lock.load(Ordering::Relaxed), - 0, - "solo mode must never touch the write_lock" - ); - } - - #[test] - fn solo_writer_dedup() { + fn mem_table_writer_dedup() { let schema = Schema::new().col("tag", DType::Str).col("seq", DType::I32); let size = MemTable::required_size(&schema, 8192, 1); let mut buf = vec![0u8; size]; - let mut sw = MemTableWriter::init(&mut buf, &schema, 8192, 1) - .solo() - .dedup(); + let mut sw = MemTableWriter::init(&mut buf, &schema, 8192, 1).dedup(); for i in 0..20 { sw.push_row(&[Value::Str("repeat"), Value::I32(i)]); @@ -1934,9 +1719,9 @@ mod tests { let used_dedup = sw.chunk_used(0); - // Compare with plain solo writer + // Compare with a plain writer let mut buf2 = vec![0u8; size]; - let mut sw2 = MemTableWriter::init(&mut buf2, &schema, 8192, 1).solo(); + let mut sw2 = MemTableWriter::init(&mut buf2, &schema, 8192, 1); for i in 0..20 { sw2.push_row(&[Value::Str("repeat"), Value::I32(i)]); } @@ -1955,11 +1740,11 @@ mod tests { } #[test] - fn solo_writer_auto_advance() { + fn mem_table_writer_auto_advance() { let schema = Schema::new().col("v", DType::I64); let size = MemTable::required_size(&schema, 64, 4); let mut buf = vec![0u8; size]; - let mut sw = MemTableWriter::init(&mut buf, &schema, 64, 4).solo(); + let mut sw = MemTableWriter::init(&mut buf, &schema, 64, 4); for i in 0..50i64 { sw.push_row_unchecked(&[Value::I64(i)]); @@ -2059,85 +1844,4 @@ mod tests { header_mut(t.as_bytes_mut()).ts_col = 1; // col 0 is I64 → ok assert!(MemTableView::new(t.as_bytes()).is_ok()); } - - // ── robust write lock ────────────────────────────────────────────── - - /// PID of a process that no longer exists: spawn a short-lived child - /// and wait for it to exit. 
- fn dead_pid() -> u32 { - let mut child = std::process::Command::new("true") - .spawn() - .expect("spawn true"); - let pid = child.id(); - child.wait().expect("wait true"); - pid - } - - #[test] - fn lock_word_holds_pid_while_held() { - let schema = Schema::new().col("x", DType::I32); - let mut t = MemTable::new(&schema, 1024, 1); - let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize; - let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) }; - { - let _w = t.row_writer(); // holds the lock - assert_eq!( - lock.load(Ordering::Relaxed), - std::process::id(), - "lock word must hold the owner PID" - ); - } - assert_eq!(lock.load(Ordering::Relaxed), 0); - } - - #[test] - fn stale_lock_from_dead_process_is_stolen() { - let schema = Schema::new().col("x", DType::I32); - let mut t = MemTable::new(&schema, 1024, 1); - - // Simulate a writer that crashed inside the critical section. - header(t.as_bytes()) - .write_lock - .store(dead_pid(), Ordering::SeqCst); - - let start = std::time::Instant::now(); - t.push_row(&[Value::I32(42)]); // must not deadlock - let took = start.elapsed(); - - assert_eq!(t.rows(0).next().unwrap().col_i32(0), 42); - assert_eq!(header(t.as_bytes()).write_lock.load(Ordering::Relaxed), 0); - assert!( - took >= crate::layout::LOCK_STEAL_TIMEOUT, - "steal must wait out the timeout first (took {took:?})" - ); - } - - #[test] - fn live_holder_is_not_preempted() { - let schema = Schema::new().col("x", DType::I32); - let mut t = MemTable::new(&schema, 1024, 1); - - // Another thread of this (alive) process holds the lock and - // releases it well past the steal timeout. 
- let me = std::process::id(); - header(t.as_bytes()).write_lock.store(me, Ordering::SeqCst); - let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize; - let hold = crate::layout::LOCK_STEAL_TIMEOUT + std::time::Duration::from_millis(200); - let releaser = std::thread::spawn(move || { - std::thread::sleep(hold); - let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) }; - lock.store(0, Ordering::Release); - }); - - let start = std::time::Instant::now(); - t.push_row(&[Value::I32(7)]); - let took = start.elapsed(); - releaser.join().unwrap(); - - assert!( - took >= hold - std::time::Duration::from_millis(50), - "live holder must be waited on, not preempted (took {took:?})" - ); - assert_eq!(t.rows(0).next().unwrap().col_i32(0), 7); - } } diff --git a/probing/memtable/src/raw.rs b/probing/memtable/src/raw.rs index 9eb61272..94948f71 100644 --- a/probing/memtable/src/raw.rs +++ b/probing/memtable/src/raw.rs @@ -113,11 +113,12 @@ pub(crate) fn write_row_bytes(buf: &mut [u8], values: &[Value], row_data: usize) true } -/// Advance the ring buffer to the next chunk (caller must hold the write lock). +/// Advance the ring buffer to the next chunk. /// -/// Takes `&mut [u8]` so that LLVM does not mark the pointer `readonly`; -/// see [`acquire_write_lock`](crate::layout::acquire_write_lock) for details. -pub(crate) fn advance_chunk_unlocked(buf: &mut [u8]) { +/// MEMT is single-writer, so no lock is taken. Takes `&mut [u8]` so that +/// LLVM does not mark the pointer `readonly` (which would let it elide the +/// atomic stores below in optimised builds). 
+pub(crate) fn advance_chunk_raw(buf: &mut [u8]) { let ptr = buf.as_mut_ptr(); unsafe { let h = &*(ptr as *const Header); @@ -359,11 +360,11 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu h.chunk_size = chunk_size; h.data_offset = data_off as u32; h.write_chunk.store(0, Ordering::Relaxed); - h.write_lock.store(0, Ordering::Relaxed); h.refcount.store(1, Ordering::Relaxed); h.creator_pid = std::process::id(); + h._pad0 = 0; h.creator_start_time = process_start_time(std::process::id()); - h.lock_owner_start.store(0, Ordering::Relaxed); + h._reserved = 0; for (i, col) in schema.cols.iter().enumerate() { let cd = col_desc_mut(buf, i); diff --git a/probing/memtable/src/writer.rs b/probing/memtable/src/writer.rs index 1a4cb428..468979d0 100644 --- a/probing/memtable/src/writer.rs +++ b/probing/memtable/src/writer.rs @@ -1,13 +1,12 @@ use crate::dedup::DedupState; -use crate::layout::{chunk_header, release_write_lock, w32, CHUNK_HEADER_SIZE}; +use crate::layout::{chunk_header, w32, CHUNK_HEADER_SIZE}; use crate::raw::note_row_ts; use std::sync::atomic::Ordering; /// Streaming row writer — **low-overhead, weak-contract** hot-path API. /// -/// When `locked` is true (default), holds the write lock from creation -/// until [`finish()`](Self::finish) (or `Drop`). -/// When `locked` is false (solo mode), no lock is touched. +/// MEMT is single-writer, so no lock is taken; the `&mut` borrow guarantees +/// exclusive access for the writer's lifetime. /// /// Callers must supply columns in schema order via the typed `put_*` /// methods; **no per-call schema validation is performed**. @@ -25,7 +24,6 @@ pub struct RowWriter<'a> { pub(crate) overflow: bool, pub(crate) done: bool, pub(crate) col_idx: usize, - pub(crate) locked: bool, /// `Header::ts_col` (timestamp column index + 1; 0 = none). 
pub(crate) ts_col: u16, /// Timestamp captured by `put_i64` on the designated column, @@ -132,7 +130,8 @@ impl<'a> RowWriter<'a> { self } - /// Commit the row and release the write lock (if held). + /// Commit the row. Returns `false` if the row overflowed the chunk (and + /// nothing was committed) or `finish` was already called. pub fn finish(&mut self) -> bool { if self.done { return false; @@ -155,27 +154,14 @@ impl<'a> RowWriter<'a> { .fetch_add(1, Ordering::Release); true }; - if self.locked { - release_write_lock(self.buf); - } ok } } -impl Drop for RowWriter<'_> { - fn drop(&mut self) { - if !self.done && self.locked { - release_write_lock(self.buf); - } - } -} - #[cfg(test)] mod tests { - use crate::layout::header; use crate::memtable::MemTable; use crate::schema::{DType, Schema, Value}; - use std::sync::atomic::Ordering; #[test] fn row_writer_basic() { @@ -217,15 +203,17 @@ mod tests { } #[test] - fn row_writer_drop_releases_lock() { + fn row_writer_drop_without_finish_commits_nothing() { let schema = Schema::new().col("x", DType::I32); let mut t = MemTable::new(&schema, 1024, 1); { - let _w = t.row_writer(); // acquires lock - // dropped without finish() → lock released by Drop + let mut w = t.row_writer(); + w.put_i32(99); // dropped without finish() → row not committed } - // lock should be free; this must not deadlock + assert_eq!(t.num_rows(0), 0, "uncommitted row must not be visible"); + // A subsequent write still works and is the first visible row. 
t.push_row(&[Value::I32(42)]); + assert_eq!(t.num_rows(0), 1); assert_eq!(t.rows(0).next().unwrap().col_i32(0), 42); } @@ -245,15 +233,14 @@ mod tests { } #[test] - fn write_lock_field_is_zero_after_operations() { + fn mixed_push_and_row_writer() { let schema = Schema::new().col("x", DType::I32); let mut t = MemTable::new(&schema, 1024, 1); t.push_row(&[Value::I32(1)]); t.row_writer().put_i32(2).finish(); - assert_eq!( - header(t.as_bytes()).write_lock.load(Ordering::Relaxed), - 0, - "write_lock must be 0 after all operations complete" - ); + assert_eq!(t.num_rows(0), 2); + let rows: Vec<_> = t.rows(0).collect(); + assert_eq!(rows[0].col_i32(0), 1); + assert_eq!(rows[1].col_i32(0), 2); } } diff --git a/web/Cargo.toml b/web/Cargo.toml index a17c9b45..99447114 100644 --- a/web/Cargo.toml +++ b/web/Cargo.toml @@ -7,9 +7,9 @@ edition = "2021" [dependencies] # Dioxus dependencies -dioxus = { version = "0.7", features = ["web"] } -dioxus-router = "0.7" -dioxus-web = "0.7" +dioxus = { version = "0.7.9", features = ["web"] } +dioxus-router = "0.7.9" +dioxus-web = "0.7.9" # Serialization serde = { version = "1.0", features = ["derive"] } @@ -41,7 +41,8 @@ probing-proto = { path = "../probing/proto", default-features = false, features [profile.release] opt-level = "z" debug = false -strip = true +# Do not set strip here — dx strips after asset extraction via rust-objcopy, +# which breaks on some nightly macOS toolchains (missing libLLVM.dylib). [profile.wasm-dev] inherits = "dev" From dcced68b117a06b6b5b1f025ea791bc6d65f6f4d Mon Sep 17 00:00:00 2001 From: Reiase Date: Sun, 14 Jun 2026 00:11:08 +0800 Subject: [PATCH 3/3] Update version to 0.2.5 across multiple files - Bumped the version of the `probing` package in `Cargo.toml`, `pyproject.toml`, and various `Cargo.lock` entries from `0.2.4` to `0.2.5`. - Updated documentation in `installation.md` and its Chinese counterpart to reflect the new version. 
- Changed the CLI version string to come from `CARGO_PKG_VERSION` (read at compile time via `env!`) instead of a hard-coded value. --- Cargo.lock | 20 ++++++++++---------- Cargo.toml | 2 +- docs/src/design/data-layer.md | 12 ++++++------ docs/src/design/data-layer.zh.md | 12 ++++++------ docs/src/installation.md | 2 +- docs/src/installation.zh.md | 2 +- probing/cli/src/cli/mod.rs | 2 +- probing/memtable/src/layout.rs | 18 ++++++++---------- probing/memtable/src/lib.rs | 4 ++-- pyproject.toml | 2 +- python/probing/__init__.py | 2 +- 11 files changed, 38 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cf4703ab..cca12cf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3329,7 +3329,7 @@ dependencies = [ [[package]] name = "probing" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "arrow", @@ -3349,7 +3349,7 @@ dependencies = [ [[package]] name = "probing-cc" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "async-trait", @@ -3365,7 +3365,7 @@ dependencies = [ [[package]] name = "probing-cli" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "clap 4.5.38", @@ -3394,7 +3394,7 @@ dependencies = [ [[package]] name = "probing-core" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "arrow", @@ -3421,7 +3421,7 @@ dependencies = [ [[package]] name = "probing-macros" -version = "0.2.4" +version = "0.2.5" dependencies = [ "probing-core", "quote", @@ -3430,7 +3430,7 @@ dependencies = [ [[package]] name = "probing-memtable" -version = "0.2.4" +version = "0.2.5" dependencies = [ "libc", "memmap2", @@ -3440,7 +3440,7 @@ dependencies = [ [[package]] name = "probing-proto" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "arrow", @@ -3456,7 +3456,7 @@ dependencies = [ [[package]] name = "probing-python" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "async-trait", @@ -3488,7 +3488,7 @@ dependencies = [ [[package]] name = "probing-server" -version = "0.2.4" +version = "0.2.5" dependencies = 
[ "anyhow", "async-trait", @@ -3519,7 +3519,7 @@ dependencies = [ [[package]] name = "probing-store" -version = "0.2.4" +version = "0.2.5" dependencies = [ "thiserror 2.0.12", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 56c8210b..faec3266 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ members = [ ] [workspace.package] -version = "0.2.4" +version = "0.2.5" authors = ["reiase "] edition = "2021" license = "Apache-2.0" diff --git a/docs/src/design/data-layer.md b/docs/src/design/data-layer.md index 2dd565dc..c51e67da 100644 --- a/docs/src/design/data-layer.md +++ b/docs/src/design/data-layer.md @@ -53,12 +53,12 @@ The hot tier is mapped read-only at query time; the cold tier is read via `Segme Every MEMT buffer (heap, shared memory, or mmap'd file) begins with a 64-byte header (one cache line), followed by per-column descriptors, then chunk data. -**Header v4 (64 bytes):** +**Header v3 (64 bytes):** | offset | size | field | notes | |---|---|---|---| | 0 | 4 | `magic` | `0x4D454D54` (`"MEMT"`) | -| 4 | 2 | `version` | 4 | +| 4 | 2 | `version` | 3 | | 6 | 2 | `header_size` | 64 (validation) | | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` | | 10 | 2 | `ts_col` | timestamp column index + 1 (0 = none) | @@ -70,16 +70,16 @@ line), followed by per-column descriptors, then chunk data. | 32 | 4 | `write_chunk` | `AtomicU32` — current ring slot | | 36 | 4 | `refcount` | `AtomicU32` | | 40 | 4 | `creator_pid` | | -| 44 | 4 | `_pad0` | alignment (was `write_lock` in v3) | +| 44 | 4 | `_pad0` | alignment (was `write_lock` in v2) | | 48 | 8 | `creator_start_time` | for PID-recycling detection during discovery | -| 56 | 8 | `_reserved` | reserved (was `lock_owner_start` in v3) | +| 56 | 8 | `_reserved` | reserved | Bytes 0–31 are the **cold zone** (immutable after init); bytes 32–63 are the **hot zone** (atomically mutated), split to avoid false sharing. 
Each chunk starts with a 40-byte `ChunkHeader` carrying a `generation` counter and per-chunk `min_ts`/`max_ts` (`AtomicI64`). -> **v4** dropped the `write_lock` and `lock_owner_start` fields: MEMT is single-writer, so there is -> no in-buffer write lock. Their byte slots are now reserved. +> **v3** vs v2: `_pad0` became `ts_col`; dropped `write_lock` (single-writer model); +> `ChunkHeader` gained `min_ts`/`max_ts` (24 → 40 bytes). ### Backends diff --git a/docs/src/design/data-layer.zh.md b/docs/src/design/data-layer.zh.md index 9303b605..d1e7a3a0 100644 --- a/docs/src/design/data-layer.zh.md +++ b/docs/src/design/data-layer.zh.md @@ -47,12 +47,12 @@ graph LR 每个 MEMT 缓冲区(堆、共享内存或 mmap 文件)都以 64 字节头部(一个 cache line)开始,随后是 逐列描述符,再是 chunk 数据。 -**Header v4(64 字节):** +**Header v3(64 字节):** | 偏移 | 大小 | 字段 | 说明 | |---|---|---|---| | 0 | 4 | `magic` | `0x4D454D54`(`"MEMT"`) | -| 4 | 2 | `version` | 4 | +| 4 | 2 | `version` | 3 | | 6 | 2 | `header_size` | 64(仅校验) | | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` | | 10 | 2 | `ts_col` | 时间戳列索引 + 1(0 = 无) | @@ -64,16 +64,16 @@ graph LR | 32 | 4 | `write_chunk` | `AtomicU32`——当前环形槽位 | | 36 | 4 | `refcount` | `AtomicU32` | | 40 | 4 | `creator_pid` | | -| 44 | 4 | `_pad0` | 对齐填充(v3 中为 `write_lock`) | +| 44 | 4 | `_pad0` | 对齐填充(v2 中为 `write_lock`) | | 48 | 8 | `creator_start_time` | 用于发现期的 PID 回收检测 | -| 56 | 8 | `_reserved` | 预留(v3 中为 `lock_owner_start`) | +| 56 | 8 | `_reserved` | 预留 | 字节 0–31 是**冷区**(初始化后不可变),字节 32–63 是**热区**(运行时原子修改),二者分离以避免 伪共享。每个 chunk 以 40 字节的 `ChunkHeader` 开头,携带 `generation` 计数器及逐 chunk 的 `min_ts`/`max_ts`(`AtomicI64`)。 -> **v4** 移除了 `write_lock` 与 `lock_owner_start` 字段:MEMT 是单写者,缓冲区内不再有写锁。其字节 -> 槽位现已预留。 +> **v3** 相对 v2:`_pad0` 改为 `ts_col`;移除 `write_lock`(单写者模型);`ChunkHeader` 新增 +> `min_ts`/`max_ts`(24 → 40 字节)。 ### 三种后端 diff --git a/docs/src/installation.md b/docs/src/installation.md index 80035d42..05dff6c4 100644 --- a/docs/src/installation.md +++ b/docs/src/installation.md @@ -53,7 +53,7 @@ probing --version 
This should print the installed version of Probing, for example: ``` -probing 0.2.4 +probing 0.2.5 ``` You can also check if the `probing` command is available: diff --git a/docs/src/installation.zh.md b/docs/src/installation.zh.md index 9ac8c4a3..52672ad2 100644 --- a/docs/src/installation.zh.md +++ b/docs/src/installation.zh.md @@ -51,7 +51,7 @@ probing --version 应该会输出已安装的 Probing 版本,例如: ``` -probing 0.2.4 +probing 0.2.5 ``` 您也可以检查 `probing` 命令是否可用: diff --git a/probing/cli/src/cli/mod.rs b/probing/cli/src/cli/mod.rs index b900e672..519cd7fa 100644 --- a/probing/cli/src/cli/mod.rs +++ b/probing/cli/src/cli/mod.rs @@ -25,7 +25,7 @@ use commands::Commands; use once_cell::sync::Lazy; fn get_build_info() -> String { - let mut info = "0.2.1".to_string(); + let mut info = env!("CARGO_PKG_VERSION").to_string(); if let Some(timestamp) = option_env!("VERGEN_BUILD_TIMESTAMP") { info.push_str(&format!("\nBuild Timestamp: {timestamp}")); diff --git a/probing/memtable/src/layout.rs b/probing/memtable/src/layout.rs index 6729d7cb..b7bc52ae 100644 --- a/probing/memtable/src/layout.rs +++ b/probing/memtable/src/layout.rs @@ -1,12 +1,12 @@ //! Low-level layout: header, column descriptors, chunk headers, byte helpers. //! -//! ## Header v4 binary layout (64 bytes, 1 cache line) +//! ## Header v3 binary layout (64 bytes, 1 cache line) //! //! ```text //! offset size field notes //! ────────────────────────────────────────────────────────── //! 0 4 magic 0x4D454D54 ("MEMT" in LE) -//! 4 2 version 4 +//! 4 2 version 3 //! 6 2 header_size 64 (validation only) //! 8 2 byte_order BOM: written as [0x01, 0x02] //! 10 2 ts_col timestamp column index + 1 (0 = none) @@ -44,11 +44,9 @@ pub(crate) const MAGIC: u32 = MAGIC_MEMT; /// Header format version for MEMT. /// -/// v4: dropped the `write_lock` and `lock_owner_start` fields — MEMT is -/// single-writer, so there is no in-buffer write lock. Their bytes are now -/// `_pad0`/`_reserved`. 
v3 added per-chunk `min_ts`/`max_ts` and the PID -/// write lock (both since superseded). -pub(crate) const VERSION: u16 = 4; +/// v3: `_pad0` became `ts_col`; dropped `write_lock` (single-writer model); +/// `ChunkHeader` grew `min_ts`/`max_ts` (24 → 40 bytes). +pub(crate) const VERSION: u16 = 3; /// Byte-order mark: written as raw bytes `[0x01, 0x02]`. /// On a LE host, `u16::from_ne_bytes([0x01, 0x02])` == `0x0201`. @@ -80,7 +78,7 @@ pub(crate) struct Header { // ── cold zone (read-only after init) ───────────────── pub magic: u32, pub version: u16, - /// Size of this header in bytes (always 64 in v2). + /// Size of this header in bytes (always 64). /// /// Used for validation only — column descriptors always start at /// offset `size_of::<Header>
()` (compile-time constant). If a @@ -111,14 +109,14 @@ pub(crate) struct Header { pub refcount: AtomicU32, /// PID of the process that created this table (for cross-process discovery). pub creator_pid: u32, - /// Padding to 8-align `creator_start_time` (was `write_lock` in v3). + /// Padding to 8-align `creator_start_time` (was `write_lock` in v2). pub _pad0: u32, /// Process start time — for PID-recycling detection during discovery. /// Linux: clock ticks since boot (`/proc/<pid>/stat` field 22). /// macOS: microseconds since epoch (via `sysctl`). /// Other: 0 (falls back to PID-only liveness check). pub creator_start_time: u64, - /// Reserved for future use (was `lock_owner_start` in v3). + /// Reserved for future use (was part of `_reserved` in v2). pub _reserved: u64, } diff --git a/probing/memtable/src/lib.rs b/probing/memtable/src/lib.rs index 87f02631..0fa43526 100644 --- a/probing/memtable/src/lib.rs +++ b/probing/memtable/src/lib.rs @@ -20,10 +20,10 @@ //! //! ```text //! ┌──────────────────────────────────┐ 0 -//! │ Header v4 (64 bytes, repr(C)) │ +//! │ Header v3 (64 bytes, repr(C)) │ //! │ ── cold zone (read-only) ── │ //! │ magic: u32 (0x4D454D54) │ -//! │ version: u16 (4) │ +//! │ version: u16 (3) │ //! │ header_size: u16 (64) │ //! │ byte_order: u16 (BOM 0x0102) │ //! │ ts_col: u16 │ diff --git a/pyproject.toml b/pyproject.toml index 4edb2329..b352432c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "probing" -version = "0.2.4" +version = "0.2.5" description = "Dynamic Performance Profiler for Distributed AI" readme = "README.md" authors = [ diff --git a/python/probing/__init__.py b/python/probing/__init__.py index 85499c26..ceca30e3 100644 --- a/python/probing/__init__.py +++ b/python/probing/__init__.py @@ -22,7 +22,7 @@ import probing.config as config from probing import _core -VERSION = "0.2.4" +VERSION = "0.2.5" # Core Primitives ExternalTable = _core.ExternalTable