From 7cac8e9ba4f64a30e8b0bd8c541fd8e6e616bced Mon Sep 17 00:00:00 2001
From: Reiase <reiase@gmail.com>
Date: Sat, 13 Jun 2026 23:50:05 +0800
Subject: [PATCH 1/3] Update dependencies and enhance documentation

- Updated `Cargo.lock` to reflect new versions of several dependencies, including `arrow` (55.1.0 to 58.1.0), `datafusion` (47.0.0 to 53.1.0), and `async-trait` (0.1.88 to 0.1.89).
- Removed the deprecated `android-tzdata` package from `Cargo.lock`; the updated `chrono` (0.4.44) no longer depends on it.
- Enhanced `Cargo.toml` to enable new features for `datafusion` and to update the `arrow` version to `58.1.0`.
- Added new documentation for the data layer in both English and Chinese, detailing its architecture and design goals.
- Introduced a hidden `bench` command in the CLI for stress testing the in-process data layer, with various subcommands for benchmarking different aspects.
- Improved the overall structure and clarity of the documentation, including navigation updates in `mkdocs.yml`.
---
 .github/actions/setup-build-env/action.yml    |    2 +-
 Cargo.lock                                    | 1032 ++++++---
 Cargo.toml                                    |   14 +-
 docs/mkdocs.yml                               |    2 +
 docs/src/design/data-layer.md                 |  293 +++
 docs/src/design/data-layer.zh.md              |  262 +++
 docs/src/design/index.md                      |    1 +
 docs/src/design/index.zh.md                   |    1 +
 probing/cli/Cargo.toml                        |    1 +
 probing/cli/src/cli/bench/args.rs             |  232 ++
 probing/cli/src/cli/bench/metrics.rs          |  311 +++
 probing/cli/src/cli/bench/mod.rs              |   61 +
 probing/cli/src/cli/bench/runners/coldscan.rs |   92 +
 probing/cli/src/cli/bench/runners/common.rs   |  167 ++
 probing/cli/src/cli/bench/runners/compact.rs  |  118 +
 probing/cli/src/cli/bench/runners/mixed.rs    |  184 ++
 probing/cli/src/cli/bench/runners/mod.rs      |   10 +
 probing/cli/src/cli/bench/runners/mp.rs       |  307 +++
 probing/cli/src/cli/bench/runners/scan.rs     |   51 +
 probing/cli/src/cli/bench/runners/write.rs    |  198 ++
 probing/cli/src/cli/bench/workload.rs         |  189 ++
 probing/cli/src/cli/commands.rs               |    4 +
 probing/cli/src/cli/mod.rs                    |    5 +
 probing/core/Cargo.toml                       |    6 +-
 probing/core/src/core/memtable_sql.rs         | 2048 +++++++++++++++++
 probing/core/src/core/mod.rs                  |    6 +
 probing/core/src/core/plugin.rs               |   93 +-
 probing/core/src/core/plugin_advanced.rs      |  590 +++++
 probing/extensions/cc/Cargo.toml              |    2 +-
 probing/extensions/python/Cargo.toml          |    2 +
 .../python/src/extensions/python/exttbls.rs   |  595 +++--
 .../python/src/extensions/python/tbls.rs      |  214 +-
 probing/memtable/Cargo.toml                   |    1 +
 probing/memtable/src/discover.rs              |  115 +-
 probing/memtable/src/layout.rs                |  222 +-
 probing/memtable/src/lib.rs                   |    3 +-
 probing/memtable/src/memc/codec.rs            |  281 +++
 probing/memtable/src/memc/compactor.rs        |  421 ++++
 probing/memtable/src/memc/layout.rs           |  434 ++++
 probing/memtable/src/memc/mod.rs              |   64 +
 probing/memtable/src/memc/reader.rs           |  255 ++
 probing/memtable/src/memc/store.rs            |  247 ++
 probing/memtable/src/memc/tests.rs            |  643 ++++++
 probing/memtable/src/memc/writer.rs           |  337 +++
 probing/memtable/src/memtable.rs              |  848 ++++++-
 probing/memtable/src/raw.rs                   |   61 +-
 probing/memtable/src/writer.rs                |   16 +-
 probing/server/Cargo.toml                     |    2 +-
 probing/server/src/engine.rs                  |    5 +-
 probing/server/src/memtable_ext.rs            |  702 +-----
 50 files changed, 10162 insertions(+), 1588 deletions(-)
 create mode 100644 docs/src/design/data-layer.md
 create mode 100644 docs/src/design/data-layer.zh.md
 create mode 100644 probing/cli/src/cli/bench/args.rs
 create mode 100644 probing/cli/src/cli/bench/metrics.rs
 create mode 100644 probing/cli/src/cli/bench/mod.rs
 create mode 100644 probing/cli/src/cli/bench/runners/coldscan.rs
 create mode 100644 probing/cli/src/cli/bench/runners/common.rs
 create mode 100644 probing/cli/src/cli/bench/runners/compact.rs
 create mode 100644 probing/cli/src/cli/bench/runners/mixed.rs
 create mode 100644 probing/cli/src/cli/bench/runners/mod.rs
 create mode 100644 probing/cli/src/cli/bench/runners/mp.rs
 create mode 100644 probing/cli/src/cli/bench/runners/scan.rs
 create mode 100644 probing/cli/src/cli/bench/runners/write.rs
 create mode 100644 probing/cli/src/cli/bench/workload.rs
 create mode 100644 probing/core/src/core/memtable_sql.rs
 create mode 100644 probing/core/src/core/plugin_advanced.rs
 create mode 100644 probing/memtable/src/memc/codec.rs
 create mode 100644 probing/memtable/src/memc/compactor.rs
 create mode 100644 probing/memtable/src/memc/layout.rs
 create mode 100644 probing/memtable/src/memc/mod.rs
 create mode 100644 probing/memtable/src/memc/reader.rs
 create mode 100644 probing/memtable/src/memc/store.rs
 create mode 100644 probing/memtable/src/memc/tests.rs
 create mode 100644 probing/memtable/src/memc/writer.rs

diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml
index 0b890dfc..6f26a114 100644
--- a/.github/actions/setup-build-env/action.yml
+++ b/.github/actions/setup-build-env/action.yml
@@ -66,7 +66,7 @@ runs:
         test -e ~/.cargo/bin/rnr || cargo install rnr
         test -e ~/.cargo/bin/cargo-nextest || cargo install --locked cargo-nextest
         test -e ~/.cargo/bin/cargo-binstall || cargo install cargo-binstall
-        test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.0 -y
+        test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.6 -y
         test -e ~/.cargo/bin/trunk || cargo install trunk --locked
 
     - name: Install Python Build Dependencies
diff --git a/Cargo.lock b/Cargo.lock
index 0566fe30..cf4703ab 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -55,12 +55,6 @@ version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
 
-[[package]]
-name = "android-tzdata"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -140,9 +134,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
 [[package]]
 name = "arrow"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87"
+checksum = "d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -161,23 +155,23 @@ dependencies = [
 
 [[package]]
 name = "arrow-arith"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575"
+checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
  "chrono",
- "num",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-array"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f"
+checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d"
 dependencies = [
  "ahash",
  "arrow-buffer",
@@ -186,30 +180,34 @@ dependencies = [
  "chrono",
  "chrono-tz",
  "half",
- "hashbrown 0.15.2",
- "num",
+ "hashbrown 0.16.1",
+ "num-complex",
+ "num-integer",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-buffer"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce"
+checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4"
 dependencies = [
  "bytes",
  "half",
- "num",
+ "num-bigint",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-cast"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff"
+checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
+ "arrow-ord",
  "arrow-schema",
  "arrow-select",
  "atoi",
@@ -218,15 +216,15 @@ dependencies = [
  "comfy-table",
  "half",
  "lexical-core",
- "num",
+ "num-traits",
  "ryu",
 ]
 
 [[package]]
 name = "arrow-csv"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a"
+checksum = "ca025bd0f38eeecb57c2153c0123b960494138e6a957bbda10da2b25415209fe"
 dependencies = [
  "arrow-array",
  "arrow-cast",
@@ -234,41 +232,42 @@ dependencies = [
  "chrono",
  "csv",
  "csv-core",
- "lazy_static",
  "regex",
 ]
 
 [[package]]
 name = "arrow-data"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d"
+checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4"
 dependencies = [
  "arrow-buffer",
  "arrow-schema",
  "half",
- "num",
+ "num-integer",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-ipc"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e"
+checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
+ "arrow-select",
  "flatbuffers",
  "lz4_flex",
 ]
 
 [[package]]
 name = "arrow-json"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033"
+checksum = "6ead0914e4861a531be48fe05858265cf854a4880b9ed12618b1d08cba9bebc8"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -277,20 +276,22 @@ dependencies = [
  "arrow-schema",
  "chrono",
  "half",
- "indexmap 2.9.0",
+ "indexmap 2.14.0",
+ "itoa",
  "lexical-core",
  "memchr",
- "num",
- "serde",
+ "num-traits",
+ "ryu",
+ "serde_core",
  "serde_json",
  "simdutf8",
 ]
 
 [[package]]
 name = "arrow-ord"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7"
+checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -301,9 +302,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-row"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8"
+checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -314,29 +315,33 @@ dependencies = [
 
 [[package]]
 name = "arrow-schema"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b"
+checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743"
+dependencies = [
+ "serde_core",
+ "serde_json",
+]
 
 [[package]]
 name = "arrow-select"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4"
+checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe"
 dependencies = [
  "ahash",
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
- "num",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-string"
-version = "55.1.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd"
+checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -344,20 +349,20 @@ dependencies = [
  "arrow-schema",
  "arrow-select",
  "memchr",
- "num",
+ "num-traits",
  "regex",
  "regex-syntax",
 ]
 
 [[package]]
 name = "async-trait"
-version = "0.1.88"
+version = "0.1.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -450,7 +455,7 @@ checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -489,9 +494,9 @@ checksum = "92fde17f91e7ba10b2a07f8dff29530b77144894bc6ae850fbc66e1276af0d28"
 
 [[package]]
 name = "bigdecimal"
-version = "0.4.8"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013"
+checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695"
 dependencies = [
  "autocfg",
  "libm",
@@ -559,9 +564,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.10.1"
+version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
 [[package]]
 name = "camino"
@@ -624,40 +629,28 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
 
 [[package]]
 name = "chrono"
-version = "0.4.40"
+version = "0.4.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
 dependencies = [
- "android-tzdata",
  "iana-time-zone",
  "js-sys",
  "num-traits",
  "serde",
  "wasm-bindgen",
- "windows-link",
+ "windows-link 0.2.1",
 ]
 
 [[package]]
 name = "chrono-tz"
-version = "0.10.3"
+version = "0.10.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3"
+checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3"
 dependencies = [
  "chrono",
- "chrono-tz-build",
  "phf",
 ]
 
-[[package]]
-name = "chrono-tz-build"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402"
-dependencies = [
- "parse-zoneinfo",
- "phf_codegen",
-]
-
 [[package]]
 name = "ciborium"
 version = "0.2.2"
@@ -735,7 +728,7 @@ dependencies = [
  "proc-macro2",
  "pulldown-cmark",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -809,7 +802,7 @@ dependencies = [
  "cookie",
  "document-features",
  "idna",
- "indexmap 2.9.0",
+ "indexmap 2.14.0",
  "log",
  "serde",
  "serde_derive",
@@ -1027,7 +1020,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim 0.11.1",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1038,7 +1031,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1063,12 +1056,11 @@ checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
 
 [[package]]
 name = "datafusion"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080"
+checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b"
 dependencies = [
  "arrow",
- "arrow-ipc",
  "arrow-schema",
  "async-trait",
  "bytes",
@@ -1078,6 +1070,7 @@ dependencies = [
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-datasource",
+ "datafusion-datasource-arrow",
  "datafusion-datasource-csv",
  "datafusion-datasource-json",
  "datafusion-execution",
@@ -1085,11 +1078,12 @@ dependencies = [
  "datafusion-expr-common",
  "datafusion-functions",
  "datafusion-functions-aggregate",
+ "datafusion-functions-nested",
  "datafusion-functions-table",
  "datafusion-functions-window",
- "datafusion-macros",
  "datafusion-optimizer",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-physical-expr-common",
  "datafusion-physical-optimizer",
  "datafusion-physical-plan",
@@ -1100,7 +1094,7 @@ dependencies = [
  "log",
  "object_store",
  "parking_lot 0.12.3",
- "rand 0.8.5",
+ "rand",
  "regex",
  "sqlparser",
  "tempfile",
@@ -1111,9 +1105,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-catalog"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66"
+checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1126,7 +1120,6 @@ dependencies = [
  "datafusion-physical-expr",
  "datafusion-physical-plan",
  "datafusion-session",
- "datafusion-sql",
  "futures",
  "itertools 0.14.0",
  "log",
@@ -1137,9 +1130,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-catalog-listing"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00"
+checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1149,28 +1142,29 @@ dependencies = [
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
- "datafusion-session",
  "futures",
+ "itertools 0.14.0",
  "log",
  "object_store",
- "tokio",
 ]
 
 [[package]]
 name = "datafusion-common"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c"
+checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2"
 dependencies = [
  "ahash",
  "arrow",
  "arrow-ipc",
- "base64 0.22.1",
+ "chrono",
  "half",
- "hashbrown 0.14.5",
- "indexmap 2.9.0",
+ "hashbrown 0.16.1",
+ "indexmap 2.14.0",
+ "itertools 0.14.0",
  "libc",
  "log",
  "object_store",
@@ -1182,9 +1176,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-common-runtime"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b"
+checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def"
 dependencies = [
  "futures",
  "log",
@@ -1193,9 +1187,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1"
+checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1206,6 +1200,7 @@ dependencies = [
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
@@ -1214,27 +1209,49 @@ dependencies = [
  "itertools 0.14.0",
  "log",
  "object_store",
- "rand 0.8.5",
+ "rand",
  "tokio",
  "url",
 ]
 
+[[package]]
+name = "datafusion-datasource-arrow"
+version = "53.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096"
+dependencies = [
+ "arrow",
+ "arrow-ipc",
+ "async-trait",
+ "bytes",
+ "datafusion-common",
+ "datafusion-common-runtime",
+ "datafusion-datasource",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-physical-expr-common",
+ "datafusion-physical-plan",
+ "datafusion-session",
+ "futures",
+ "itertools 0.14.0",
+ "object_store",
+ "tokio",
+]
+
 [[package]]
 name = "datafusion-datasource-csv"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6"
+checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca"
 dependencies = [
  "arrow",
  "async-trait",
  "bytes",
- "datafusion-catalog",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-datasource",
  "datafusion-execution",
  "datafusion-expr",
- "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
@@ -1246,20 +1263,18 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-json"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352"
+checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2"
 dependencies = [
  "arrow",
  "async-trait",
  "bytes",
- "datafusion-catalog",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-datasource",
  "datafusion-execution",
  "datafusion-expr",
- "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
@@ -1267,40 +1282,46 @@ dependencies = [
  "object_store",
  "serde_json",
  "tokio",
+ "tokio-stream",
 ]
 
 [[package]]
 name = "datafusion-doc"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292"
+checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee"
 
 [[package]]
 name = "datafusion-execution"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84"
+checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709"
 dependencies = [
  "arrow",
+ "arrow-buffer",
+ "async-trait",
+ "chrono",
  "dashmap",
  "datafusion-common",
  "datafusion-expr",
+ "datafusion-physical-expr-common",
  "futures",
  "log",
  "object_store",
  "parking_lot 0.12.3",
- "rand 0.8.5",
+ "rand",
  "tempfile",
  "url",
 ]
 
 [[package]]
 name = "datafusion-expr"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964"
+checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd"
 dependencies = [
  "arrow",
+ "async-trait",
  "chrono",
  "datafusion-common",
  "datafusion-doc",
@@ -1308,7 +1329,8 @@ dependencies = [
  "datafusion-functions-aggregate-common",
  "datafusion-functions-window-common",
  "datafusion-physical-expr-common",
- "indexmap 2.9.0",
+ "indexmap 2.14.0",
+ "itertools 0.14.0",
  "paste",
  "serde_json",
  "sqlparser",
@@ -1316,27 +1338,28 @@ dependencies = [
 
 [[package]]
 name = "datafusion-expr-common"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839"
+checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e"
 dependencies = [
  "arrow",
  "datafusion-common",
- "indexmap 2.9.0",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
  "paste",
 ]
 
 [[package]]
 name = "datafusion-functions"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e"
+checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6"
 dependencies = [
  "arrow",
  "arrow-buffer",
  "base64 0.22.1",
  "chrono",
+ "chrono-tz",
  "datafusion-common",
  "datafusion-doc",
  "datafusion-execution",
@@ -1346,7 +1369,9 @@ dependencies = [
  "hex",
  "itertools 0.14.0",
  "log",
- "rand 0.8.5",
+ "memchr",
+ "num-traits",
+ "rand",
  "regex",
  "unicode-segmentation",
  "uuid",
@@ -1354,9 +1379,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-aggregate"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba"
+checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad"
 dependencies = [
  "ahash",
  "arrow",
@@ -1370,14 +1395,15 @@ dependencies = [
  "datafusion-physical-expr-common",
  "half",
  "log",
+ "num-traits",
  "paste",
 ]
 
 [[package]]
 name = "datafusion-functions-aggregate-common"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4"
+checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47"
 dependencies = [
  "ahash",
  "arrow",
@@ -1386,11 +1412,36 @@ dependencies = [
  "datafusion-physical-expr-common",
 ]
 
+[[package]]
+name = "datafusion-functions-nested"
+version = "53.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a"
+dependencies = [
+ "arrow",
+ "arrow-ord",
+ "datafusion-common",
+ "datafusion-doc",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-expr-common",
+ "datafusion-functions",
+ "datafusion-functions-aggregate",
+ "datafusion-functions-aggregate-common",
+ "datafusion-macros",
+ "datafusion-physical-expr-common",
+ "hashbrown 0.16.1",
+ "itertools 0.14.0",
+ "itoa",
+ "log",
+ "paste",
+]
+
 [[package]]
 name = "datafusion-functions-table"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a"
+checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1404,10 +1455,11 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-window"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193"
+checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6"
 dependencies = [
+ "arrow",
  "datafusion-common",
  "datafusion-doc",
  "datafusion-expr",
@@ -1421,9 +1473,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-window-common"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313"
+checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c"
 dependencies = [
  "datafusion-common",
  "datafusion-physical-expr-common",
@@ -1431,27 +1483,28 @@ dependencies = [
 
 [[package]]
 name = "datafusion-macros"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1"
+checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd"
 dependencies = [
- "datafusion-expr",
+ "datafusion-doc",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "datafusion-optimizer"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515"
+checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace"
 dependencies = [
  "arrow",
  "chrono",
  "datafusion-common",
  "datafusion-expr",
+ "datafusion-expr-common",
  "datafusion-physical-expr",
- "indexmap 2.9.0",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
  "log",
  "regex",
@@ -1460,9 +1513,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-expr"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65"
+checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59"
 dependencies = [
  "ahash",
  "arrow",
@@ -1472,33 +1525,52 @@ dependencies = [
  "datafusion-functions-aggregate-common",
  "datafusion-physical-expr-common",
  "half",
- "hashbrown 0.14.5",
- "indexmap 2.9.0",
+ "hashbrown 0.16.1",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
- "log",
+ "parking_lot 0.12.3",
  "paste",
  "petgraph",
+ "tokio",
+]
+
+[[package]]
+name = "datafusion-physical-expr-adapter"
+version = "53.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4"
+dependencies = [
+ "arrow",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-functions",
+ "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
+ "itertools 0.14.0",
 ]
 
 [[package]]
 name = "datafusion-physical-expr-common"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863"
+checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362"
 dependencies = [
  "ahash",
  "arrow",
+ "chrono",
  "datafusion-common",
  "datafusion-expr-common",
- "hashbrown 0.14.5",
+ "hashbrown 0.16.1",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
+ "parking_lot 0.12.3",
 ]
 
 [[package]]
 name = "datafusion-physical-optimizer"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2"
+checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1508,75 +1580,86 @@ dependencies = [
  "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
+ "datafusion-pruning",
  "itertools 0.14.0",
- "log",
 ]
 
 [[package]]
 name = "datafusion-physical-plan"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55"
+checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79"
 dependencies = [
  "ahash",
  "arrow",
  "arrow-ord",
  "arrow-schema",
  "async-trait",
- "chrono",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-execution",
  "datafusion-expr",
+ "datafusion-functions",
+ "datafusion-functions-aggregate-common",
  "datafusion-functions-window-common",
  "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "futures",
  "half",
- "hashbrown 0.14.5",
- "indexmap 2.9.0",
+ "hashbrown 0.16.1",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
  "log",
+ "num-traits",
  "parking_lot 0.12.3",
  "pin-project-lite",
  "tokio",
 ]
 
 [[package]]
-name = "datafusion-session"
-version = "47.0.0"
+name = "datafusion-pruning"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f"
+checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a"
 dependencies = [
  "arrow",
- "async-trait",
- "dashmap",
  "datafusion-common",
- "datafusion-common-runtime",
- "datafusion-execution",
- "datafusion-expr",
+ "datafusion-datasource",
+ "datafusion-expr-common",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
  "datafusion-physical-plan",
- "datafusion-sql",
- "futures",
  "itertools 0.14.0",
  "log",
- "object_store",
+]
+
+[[package]]
+name = "datafusion-session"
+version = "53.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e"
+dependencies = [
+ "async-trait",
+ "datafusion-common",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-physical-plan",
  "parking_lot 0.12.3",
- "tokio",
 ]
 
 [[package]]
 name = "datafusion-sql"
-version = "47.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607"
+checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1"
 dependencies = [
  "arrow",
  "bigdecimal",
+ "chrono",
  "datafusion-common",
  "datafusion-expr",
- "indexmap 2.9.0",
+ "datafusion-functions-nested",
+ "indexmap 2.14.0",
  "log",
  "regex",
  "sqlparser",
@@ -1618,7 +1701,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1628,7 +1711,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
 dependencies = [
  "derive_builder_core",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1649,7 +1732,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1736,7 +1819,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1827,11 +1910,23 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
 [[package]]
 name = "form_urlencoded"
-version = "1.2.1"
+version = "1.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
 dependencies = [
  "percent-encoding",
 ]
@@ -1902,7 +1997,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1973,10 +2068,23 @@ checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
 dependencies = [
  "cfg-if",
  "libc",
- "r-efi",
+ "r-efi 5.2.0",
  "wasi 0.14.2+wasi-0.2.4",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
+ "wasip2",
+ "wasip3",
+]
+
 [[package]]
 name = "gimli"
 version = "0.31.1"
@@ -1991,13 +2099,14 @@ checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
 
 [[package]]
 name = "half"
-version = "2.6.0"
+version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
 dependencies = [
  "cfg-if",
  "crunchy",
  "num-traits",
+ "zerocopy 0.8.48",
 ]
 
 [[package]]
@@ -2011,16 +2120,32 @@ name = "hashbrown"
 version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
+dependencies = [
+ "foldhash 0.1.5",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
 dependencies = [
- "ahash",
  "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
 ]
 
 [[package]]
 name = "hashbrown"
-version = "0.15.2"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
+checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
 
 [[package]]
 name = "heck"
@@ -2293,9 +2418,15 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -2304,9 +2435,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
 
 [[package]]
 name = "idna"
-version = "1.0.3"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
 dependencies = [
  "idna_adapter",
  "smallvec",
@@ -2354,12 +2485,14 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.9.0"
+version = "2.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
 dependencies = [
  "equivalent",
- "hashbrown 0.15.2",
+ "hashbrown 0.17.0",
+ "serde",
+ "serde_core",
 ]
 
 [[package]]
@@ -2375,7 +2508,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
 dependencies = [
  "ahash",
- "indexmap 2.9.0",
+ "indexmap 2.14.0",
  "is-terminal",
  "itoa",
  "log",
@@ -2396,7 +2529,7 @@ dependencies = [
  "crossbeam-channel",
  "crossbeam-utils",
  "dashmap",
- "indexmap 2.9.0",
+ "indexmap 2.14.0",
  "itoa",
  "log",
  "num-format",
@@ -2486,7 +2619,7 @@ checksum = "199b7932d97e325aff3a7030e141eafe7f2c6268e1d1b24859b753a627f45254"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -2505,6 +2638,12 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
 [[package]]
 name = "lexical-core"
 version = "1.0.5"
@@ -2571,9 +2710,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.176"
+version = "0.2.186"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
 
 [[package]]
 name = "libloading"
@@ -2643,9 +2782,9 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
 
 [[package]]
 name = "lz4_flex"
-version = "0.11.3"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
+checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a"
 dependencies = [
  "twox-hash",
 ]
@@ -2658,9 +2797,9 @@ checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
 
 [[package]]
 name = "memchr"
-version = "2.7.4"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 
 [[package]]
 name = "memmap2"
@@ -2724,14 +2863,14 @@ dependencies = [
 
 [[package]]
 name = "mio"
-version = "1.0.3"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
+checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1"
 dependencies = [
  "libc",
  "log",
  "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2899,14 +3038,16 @@ dependencies = [
 
 [[package]]
 name = "object_store"
-version = "0.12.0"
+version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e"
+checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49"
 dependencies = [
  "async-trait",
  "bytes",
  "chrono",
- "futures",
+ "futures-channel",
+ "futures-core",
+ "futures-util",
  "http",
  "humantime",
  "itertools 0.14.0",
@@ -2917,6 +3058,8 @@ dependencies = [
  "tracing",
  "url",
  "walkdir",
+ "wasm-bindgen-futures",
+ "web-time",
 ]
 
 [[package]]
@@ -2996,15 +3139,6 @@ dependencies = [
  "windows-targets",
 ]
 
-[[package]]
-name = "parse-zoneinfo"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24"
-dependencies = [
- "regex",
-]
-
 [[package]]
 name = "paste"
 version = "1.0.15"
@@ -3025,9 +3159,9 @@ dependencies = [
 
 [[package]]
 name = "percent-encoding"
-version = "2.3.1"
+version = "2.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
 
 [[package]]
 name = "pete"
@@ -3043,48 +3177,30 @@ dependencies = [
 
 [[package]]
 name = "petgraph"
-version = "0.7.1"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
+checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
 dependencies = [
  "fixedbitset",
- "indexmap 2.9.0",
+ "hashbrown 0.15.2",
+ "indexmap 2.14.0",
+ "serde",
 ]
 
 [[package]]
 name = "phf"
-version = "0.11.3"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7"
 dependencies = [
  "phf_shared",
 ]
 
-[[package]]
-name = "phf_codegen"
-version = "0.11.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
-dependencies = [
- "phf_generator",
- "phf_shared",
-]
-
-[[package]]
-name = "phf_generator"
-version = "0.11.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
-dependencies = [
- "phf_shared",
- "rand 0.8.5",
-]
-
 [[package]]
 name = "phf_shared"
-version = "0.11.3"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
+checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981"
 dependencies = [
  "siphasher",
 ]
@@ -3106,7 +3222,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3198,7 +3314,17 @@ version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
 dependencies = [
- "zerocopy 0.8.25",
+ "zerocopy 0.8.48",
+]
+
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3254,6 +3380,7 @@ dependencies = [
  "nix 0.30.1",
  "once_cell",
  "pete",
+ "probing-memtable",
  "probing-proto",
  "probing-store",
  "procfs",
@@ -3280,10 +3407,12 @@ dependencies = [
  "log",
  "once_cell",
  "probing-macros",
+ "probing-memtable",
  "probing-proto",
  "serde",
  "serde_json",
  "sled",
+ "tempfile",
  "thiserror 2.0.12",
  "tokio",
  "url",
@@ -3296,7 +3425,7 @@ version = "0.2.4"
 dependencies = [
  "probing-core",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3305,6 +3434,7 @@ version = "0.2.4"
 dependencies = [
  "libc",
  "memmap2",
+ "pco",
  "xxhash-rust",
 ]
 
@@ -3344,6 +3474,7 @@ dependencies = [
  "probing-cc",
  "probing-cli",
  "probing-core",
+ "probing-memtable",
  "probing-proto",
  "probing-store",
  "pyo3",
@@ -3351,6 +3482,7 @@ dependencies = [
  "regex",
  "serde_json",
  "signal-hook-registry",
+ "tempfile",
  "tokio",
 ]
 
@@ -3425,15 +3557,6 @@ dependencies = [
  "hex",
 ]
 
-[[package]]
-name = "psm"
-version = "0.1.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "pulldown-cmark"
 version = "0.13.0"
@@ -3491,7 +3614,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3504,7 +3627,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-build-config",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3527,9 +3650,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.40"
+version = "1.0.45"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
 dependencies = [
  "proc-macro2",
 ]
@@ -3541,15 +3664,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
 
 [[package]]
-name = "rand"
-version = "0.8.5"
+name = "r-efi"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
-dependencies = [
- "libc",
- "rand_chacha 0.3.1",
- "rand_core 0.6.4",
-]
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
 
 [[package]]
 name = "rand"
@@ -3557,20 +3675,10 @@ version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
 dependencies = [
- "rand_chacha 0.9.0",
+ "rand_chacha",
  "rand_core 0.9.3",
 ]
 
-[[package]]
-name = "rand_chacha"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
-dependencies = [
- "ppv-lite86",
- "rand_core 0.6.4",
-]
-
 [[package]]
 name = "rand_chacha"
 version = "0.9.0"
@@ -3586,9 +3694,6 @@ name = "rand_core"
 version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
-dependencies = [
- "getrandom 0.2.16",
-]
 
 [[package]]
 name = "rand_core"
@@ -3628,26 +3733,6 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "recursive"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e"
-dependencies = [
- "recursive-proc-macro-impl",
- "stacker",
-]
-
-[[package]]
-name = "recursive-proc-macro-impl"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
-dependencies = [
- "quote",
- "syn 2.0.101",
-]
-
 [[package]]
 name = "redox_syscall"
 version = "0.2.16"
@@ -3688,9 +3773,9 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.11.1"
+version = "1.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -3700,9 +3785,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.4.9"
+version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -3711,9 +3796,9 @@ dependencies = [
 
 [[package]]
 name = "regex-syntax"
-version = "0.8.5"
+version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
 
 [[package]]
 name = "rgb"
@@ -3875,22 +3960,32 @@ dependencies = [
 
 [[package]]
 name = "serde"
-version = "1.0.219"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.219"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4019,34 +4114,33 @@ checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
 
 [[package]]
 name = "socket2"
-version = "0.5.9"
+version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
+checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
 dependencies = [
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "sqlparser"
-version = "0.55.0"
+version = "0.61.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11"
+checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7"
 dependencies = [
  "log",
- "recursive",
  "sqlparser_derive",
 ]
 
 [[package]]
 name = "sqlparser_derive"
-version = "0.3.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c"
+checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4055,19 +4149,6 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
 
-[[package]]
-name = "stacker"
-version = "0.1.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b"
-dependencies = [
- "cc",
- "cfg-if",
- "libc",
- "psm",
- "windows-sys 0.59.0",
-]
-
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -4136,7 +4217,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4181,9 +4262,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.101"
+version = "2.0.117"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -4204,7 +4285,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4286,7 +4367,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4297,7 +4378,7 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4364,29 +4445,40 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.45.0"
+version = "1.52.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165"
+checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6"
 dependencies = [
- "backtrace",
  "bytes",
  "libc",
  "mio",
  "pin-project-lite",
  "socket2",
  "tokio-macros",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "tokio-macros"
-version = "2.5.0"
+version = "2.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
+checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+ "tokio-util",
 ]
 
 [[package]]
@@ -4414,6 +4506,19 @@ dependencies = [
  "tungstenite 0.28.0",
 ]
 
+[[package]]
+name = "tokio-util"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
 [[package]]
 name = "tower"
 version = "0.5.2"
@@ -4462,7 +4567,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4491,7 +4596,7 @@ dependencies = [
  "http",
  "httparse",
  "log",
- "rand 0.9.1",
+ "rand",
  "sha1",
  "thiserror 2.0.12",
  "utf-8",
@@ -4508,7 +4613,7 @@ dependencies = [
  "http",
  "httparse",
  "log",
- "rand 0.9.1",
+ "rand",
  "sha1",
  "thiserror 2.0.12",
  "utf-8",
@@ -4516,13 +4621,9 @@ dependencies = [
 
 [[package]]
 name = "twox-hash"
-version = "1.6.3"
+version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
-dependencies = [
- "cfg-if",
- "static_assertions",
-]
+checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c"
 
 [[package]]
 name = "typenum"
@@ -4554,6 +4655,12 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
 
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
 [[package]]
 name = "unindent"
 version = "0.2.4"
@@ -4596,13 +4703,14 @@ dependencies = [
 
 [[package]]
 name = "url"
-version = "2.5.4"
+version = "2.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
+checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
 dependencies = [
  "form_urlencoded",
  "idna",
  "percent-encoding",
+ "serde",
 ]
 
 [[package]]
@@ -4637,13 +4745,13 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
 [[package]]
 name = "uuid"
-version = "1.16.0"
+version = "1.23.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
+checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76"
 dependencies = [
- "getrandom 0.3.2",
+ "getrandom 0.4.2",
  "js-sys",
- "serde",
+ "serde_core",
  "wasm-bindgen",
 ]
 
@@ -4723,6 +4831,24 @@ dependencies = [
  "wit-bindgen-rt",
 ]
 
+[[package]]
+name = "wasip2"
+version = "1.0.3+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
+dependencies = [
+ "wit-bindgen 0.57.1",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen 0.51.0",
+]
+
 [[package]]
 name = "wasm-bindgen"
 version = "0.2.100"
@@ -4745,10 +4871,23 @@ dependencies = [
  "log",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
  "wasm-bindgen-shared",
 ]
 
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "once_cell",
+ "wasm-bindgen",
+ "web-sys",
+]
+
 [[package]]
 name = "wasm-bindgen-macro"
 version = "0.2.100"
@@ -4767,7 +4906,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -4781,6 +4920,40 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap 2.14.0",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags 2.9.0",
+ "hashbrown 0.15.2",
+ "indexmap 2.14.0",
+ "semver",
+]
+
 [[package]]
 name = "web-sys"
 version = "0.3.77"
@@ -4840,7 +5013,7 @@ checksum = "4763c1de310c86d75a878046489e2e5ba02c649d185f21c67d4cf8a56d098980"
 dependencies = [
  "windows-implement",
  "windows-interface",
- "windows-link",
+ "windows-link 0.1.1",
  "windows-result",
  "windows-strings",
 ]
@@ -4853,7 +5026,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4864,7 +5037,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4873,13 +5046,19 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
 
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
 [[package]]
 name = "windows-result"
 version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c64fd11a4fd95df68efcfee5f44a294fe71b8bc6a91993e2791938abcc712252"
 dependencies = [
- "windows-link",
+ "windows-link 0.1.1",
 ]
 
 [[package]]
@@ -4888,7 +5067,7 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a2ba9642430ee452d5a7aa78d72907ebe8cfda358e8cb7918a2050581322f97"
 dependencies = [
- "windows-link",
+ "windows-link 0.1.1",
 ]
 
 [[package]]
@@ -4909,6 +5088,15 @@ dependencies = [
  "windows-targets",
 ]
 
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link 0.2.1",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.52.6"
@@ -4973,6 +5161,32 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
+[[package]]
+name = "wit-bindgen"
+version = "0.57.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
+
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck 0.5.0",
+ "wit-parser",
+]
+
 [[package]]
 name = "wit-bindgen-rt"
 version = "0.39.0"
@@ -4982,6 +5196,74 @@ dependencies = [
  "bitflags 2.9.0",
 ]
 
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck 0.5.0",
+ "indexmap 2.14.0",
+ "prettyplease",
+ "syn 2.0.117",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags 2.9.0",
+ "indexmap 2.14.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap 2.14.0",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
 [[package]]
 name = "write16"
 version = "1.0.0"
@@ -5020,7 +5302,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
  "synstructure",
 ]
 
@@ -5035,11 +5317,11 @@ dependencies = [
 
 [[package]]
 name = "zerocopy"
-version = "0.8.25"
+version = "0.8.48"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
+checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
 dependencies = [
- "zerocopy-derive 0.8.25",
+ "zerocopy-derive 0.8.48",
 ]
 
 [[package]]
@@ -5050,18 +5332,18 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "zerocopy-derive"
-version = "0.8.25"
+version = "0.8.48"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
+checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5081,7 +5363,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
  "synstructure",
 ]
 
@@ -5110,5 +5392,5 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 75edce8c..56c8210b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,13 @@ description = "Performance and Stability Diagnostic Tool for AI Applications"
 
 [workspace.dependencies]
 anyhow = "1.0"
-arrow = { version = "55.1.0", default-features = false, features = ["csv"] }
+arrow = { version = "58.1.0", default-features = false, features = ["csv"] }
+datafusion = { version = "53.1.0", default-features = false, features = [
+    "datetime_expressions",
+    "sql",
+    "regex_expressions",
+    "string_expressions",
+] }
 chrono = { version = "0.4", features = ["serde"] }
 ctor = { version = "0.4.1", features = [] }
 env_logger = { version = "0.11.6", default-features = false, features = [
@@ -77,7 +83,9 @@ default = ["extension-module", "use-mimalloc"]
 [dependencies]
 probing-core = { path = "probing/core" }
 probing-server = { path = "probing/server", default-features = false }
-probing-python = { path = "probing/extensions/python", default-features = false, features=["tracing"] }
+probing-python = { path = "probing/extensions/python", default-features = false, features = [
+    "tracing",
+] }
 probing-cli = { path = "probing/cli" }
 
 anyhow = { workspace = true }
@@ -110,6 +118,6 @@ tokio = { workspace = true }
 [profile.release]
 opt-level = "z"    # Optimize for size.
 panic = "unwind"
-strip = false       # Automatically strip symbols from the binary.
+strip = false      # Keep symbols in the binary (set to true to strip them).
 lto = "thin"
 codegen-units = 16
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index ecf6d041..5851dd9a 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -89,6 +89,7 @@ plugins:
             Troubleshooting: 常见问题
             Design: 设计文档
             Architecture: 系统架构
+            Data Layer: 数据层
             Profiling: 性能分析
             Distributed: 分布式
             Cluster with Pulsing: 基于 Pulsing 的集群
@@ -123,6 +124,7 @@ nav:
   - Design:
     - design/index.md
     - Architecture: design/architecture.md
+    - Data Layer: design/data-layer.md
     - Profiling: design/profiling.md
     - Debugging: design/debugging.md
     - Distributed: design/distributed.md
diff --git a/docs/src/design/data-layer.md b/docs/src/design/data-layer.md
new file mode 100644
index 00000000..ddcf4156
--- /dev/null
+++ b/docs/src/design/data-layer.md
@@ -0,0 +1,293 @@
+# Data Layer
+
+Probing's data layer is a **per-process, crash-resilient, time-retained data plane** for
+observability data (metrics, samples, traces). Every producer writes through one in-house
+columnar store, [`probing-memtable`](https://github.com/DeepLink-org/probing), and every
+consumer queries it through SQL (DataFusion). It is built as **two tiers**:
+
+- a **hot tier** (`MEMT`): a fixed-capacity ring buffer for the live window — constant memory,
+  zero-allocation writes;
+- a **cold tier** (`MEMC`): immutable, compressed segments for time retention beyond the ring,
+  with whole-file eviction.
+
+A single SQL time predicate prunes and queries both tiers at once.
+
+## Design Goals
+
+- **Bounded resource use.** The hot ring never grows; the cold store is capped by a byte budget
+  and TTL.
+- **Crash resilience.** A process killed mid-write never surfaces torn rows; cold segments
+  recover from a torn tail via forward scan.
+- **Time retention.** Data that scrolls out of the hot ring survives in cold segments and stays
+  queryable.
+- **One write path, one read path.** Producers (server, Python/Torch extensions) all write
+  `probing-memtable`; consumers all go through `probing-core::memtable_sql`.
+- **Fork safety.** Correct under fork-heavy workloads (PyTorch DataLoader workers).
+
+## Architecture
+
+```mermaid
+graph LR
+    APP[push_row / RowWriter] --> MEMT
+    subgraph HOT[Hot tier · probing-memtable]
+        MEMT[MEMT ring buffer] --> SEALED[sealed chunk\nmin/max ts + generation]
+    end
+    SEALED -->|transpose + Pco| ROLLER[Compactor / Roller]
+    ROLLER --> MEMC
+    subgraph COLD[Cold tier · MEMC segments]
+        MEMC[ColdStore\nimmutable segments]
+    end
+
+    SQL[SQL query] --> HCT[HotColdTable]
+    HCT -->|chunk pruning| MEMT
+    HCT -->|segment + page pruning| MEMC
+```
+
+The hot tier is mapped read-only at query time; the cold tier is read via `SegmentReader`. The
+`HotColdTable` provider unions them into one scan, deduplicating chunks that exist in both tiers.
+
+## Hot Tier (MEMT)
+
+### File Layout
+
+Every MEMT buffer (heap, shared memory, or mmap'd file) begins with a 64-byte header (one cache
+line), followed by per-column descriptors, then chunk data.
+
+**Header v3 (64 bytes):**
+
+| offset | size | field | notes |
+|---|---|---|---|
+| 0 | 4 | `magic` | `0x4D454D54` (`"MEMT"`) |
+| 4 | 2 | `version` | 3 |
+| 6 | 2 | `header_size` | 64 (validation) |
+| 8 | 2 | `byte_order` | BOM `[0x01,0x02]` |
+| 10 | 2 | `ts_col` | timestamp column index + 1 (0 = none) |
+| 12 | 4 | `flags` | feature bits (`FLAG_DEDUP`, …) |
+| 16 | 4 | `num_cols` | |
+| 20 | 4 | `num_chunks` | ring slot count |
+| 24 | 4 | `chunk_size` | bytes per chunk |
+| 28 | 4 | `data_offset` | 64-aligned |
+| 32 | 4 | `write_chunk` | `AtomicU32` — current ring slot |
+| 36 | 4 | `write_lock` | `AtomicU32` — 0 = free, else holder PID |
+| 40 | 4 | `refcount` | `AtomicU32` |
+| 44 | 4 | `creator_pid` | |
+| 48 | 8 | `creator_start_time` | for PID-recycling detection |
+| 56 | 8 | `lock_owner_start` | `AtomicU64` — lock holder's start time |
+
+Bytes 0–31 are the **cold zone** (immutable after init); bytes 32–63 are the **hot zone**
+(atomically mutated), split to avoid false sharing. Each chunk starts with a 40-byte
+`ChunkHeader` carrying a `generation` counter and per-chunk `min_ts`/`max_ts` (`AtomicI64`).
+
+### Backends
+
+The same API backs three storage kinds:
+
+- **Heap** — a private `Vec<u8>`; for in-process use.
+- **POSIX shared memory** (`shm_open` + `mmap`) — cross-process, named, unlinked on cleanup.
+- **File-backed mmap** — persistent, discoverable files under `<data_dir>/<pid>/`. This is what
+  the SQL layer reads.
+
+### Ring Buffer & Generations
+
+Writes append to the current chunk; when a row does not fit, the writer advances to the next ring
+slot (wrapping), sealing the previous chunk. Each slot carries a monotonically increasing
+`generation` (incremented every time the ring wraps onto it). Readers materialize chunks in
+**logical (oldest → newest) order** and re-check the generation after reading — a chunk recycled
+mid-read is discarded rather than surfacing torn rows.
+
+### Robust Write Lock
+
+`write_lock` holds **0 (free) or the holder's PID**. A waiter spins; if it spins past
+`LOCK_STEAL_TIMEOUT` (500 ms) it enters a steal decision:
+
+- if the holder process no longer exists (`kill(pid, 0)`), the lock is stolen;
+- if the holder exists but its kernel start time differs from `lock_owner_start`, the PID was
+  recycled by an unrelated process — stolen after a short re-check grace.
+
+Stealing is data-safe: rows only become visible via the `Release` store of `row_count` at the end
+of a write, so a half-written row from a dead holder stays uncommitted and is simply overwritten.
+
+!!! note "Fork safety"
+    The holder's start time is read via a per-PID cache, **not** a one-shot cache. A child that
+    inherited a parent's cached value would record the parent's start time and be mistaken for a
+    recycled PID by a waiter — exactly the hazard fork-heavy workloads (PyTorch DataLoader)
+    trigger. Re-reading whenever the live PID changes makes every post-fork caller observe its own
+    start time. (Start times come from `/proc` on Linux; on platforms without it the steal-on-recycle
+    path is inert.)
+
+### Timestamp Metadata
+
+When the schema has an `I64` column named `timestamp` (or `ts`), `ts_col` records it and the write
+path maintains per-chunk `min_ts`/`max_ts`. This is the basis for chunk-level time pruning at query
+time, and it is **structurally identical** to the cold tier's page/segment time ranges.
+
+## Cold Tier (MEMC)
+
+### Directory & File Naming
+
+Cold segments live in `<data_dir>/<pid>/cold` — co-located with, and scoped like, the hot ring
+files, so cold data never mixes across processes. Each segment is named
+`<writer_id>-<seq>.memc`, where `writer_id` is a hash of `(pid, start_time)` and `seq` is a
+monotonically increasing sequence the `ColdStore` recovers on open.
+
+### Segment Format
+
+A segment is a sequence of 64-aligned blocks. All integrity checks use **xxh3-64 truncated to 32
+bits**.
+
+**Segment header (64 bytes):** `magic` (`"MEMC"`), `version` (1), BOM, `flags` (bit 0 = sealed),
+`writer_pid`, `writer_start`, `created_unix_ms`, `footer_off` (0 until sealed), segment-wide
+`ts_min`/`ts_max`, `page_count`, header checksum.
+
+**Blocks** share a 64-byte header:
+
+| magic | meaning |
+|---|---|
+| `MCTB` | table-definition block — declares a `table_id`, name, column dtypes, ts column |
+| `MCPG` | page block — one columnar page for a `table_id` |
+| `MCFT` | footer — page directory written on seal |
+
+The page/block header carries `table_id`, `row_count`, `col_count`, `ts_min`/`ts_max`,
+`payload_len`, `payload_xxh`, and — crucially for restart dedup — `source_gen` and `source_chunk`
+(the hot-ring chunk generation and index this page was drained from; `u32::MAX` = not applicable).
+The header is itself checksummed (covering `source_chunk`).
+
+A single segment holds pages from **multiple tables**, distinguished by `table_id`. This decouples
+file/directory count from table count: hundreds of tables share one set of segment files.
+
+### Column Encodings
+
+Each column is encoded independently (`ColEncoding`):
+
+- **`Pco`** — numeric columns (`i32/i64/f32/f64/u32/u64`), compressed with Pco (level 8). Monotonic
+  timestamp columns compress > 4×.
+- **`RawFixed`** — `u8` (Pco offers no benefit for byte columns).
+- **`RawVarLen`** — `Str`/`Bytes`, stored as concatenated `[u32 len][bytes]` entries (Pco has no
+  string support).
+
+### Crash Recovery
+
+- A **sealed** segment is read via its footer page directory — O(1) location of every page.
+- An **unsealed or torn** segment is recovered by **forward scan**: walk blocks from the start,
+  verifying each block's header and payload checksum, stopping at the first bad block and dropping
+  the torn tail. Table-definition blocks are always scanned (cheap, and they precede pages).
+
+There is no heuristic that tries to repair a half-written record.
+
+!!! warning "Durability"
+    Pages are not `fsync`'d individually (only `sync_data` on seal). A `SIGKILL` may lose
+    not-yet-flushed tail pages of the open segment. This is acceptable for observability data but
+    is an explicit trade-off.
+
+## Compactor (Roller)
+
+The `Compactor` drains newly-sealed hot chunks into cold segments.
+
+- **Drain semantics.** Only `Sealed` chunks are drained (never the currently-writing chunk). Rows
+  are transposed to columns; the chunk's `generation` is re-checked before and after — if the ring
+  recycled it, the page is dropped and retried next pass. Draining is **idempotent**: a per-chunk
+  `drained_gen` high-water mark skips already-compacted chunk generations.
+- **Rolling.** The open segment is sealed and a new one started when it reaches
+  `target_segment_bytes` (default 64 MiB — the main fragmentation knob), or when it exceeds
+  `max_segment_age` (default 300 s, so low-rate tables still become queryable), or on explicit
+  flush.
+- **Eviction.** `enforce` deletes oldest segments past a byte budget (`max_total_bytes`) or TTL,
+  always protecting the newest segment.
+
+### Exactly-Once Across Restarts
+
+`drained_gen` is in-memory, so a naive restart over a persistent cold dir would re-compact
+chunks still resident in the hot ring, producing duplicate rows. `prime_from_cold()` rebuilds the
+watermark on startup: it scans existing cold segments and, per `(table, source_chunk)`, takes the
+max `source_gen`, merging it into `drained_gen` the first time a table is seen. The result is
+**exactly-once** even across restarts.
+
+## Runtime Owner
+
+`ColdCompactor` is a process-global singleton (modeled on the task-stats worker) that gives the
+compactor a single lifecycle home:
+
+- a background thread **rediscovers** ring files under `<data_dir>/<pid>/` each pass (tables appear
+  over time), drains each into the shared `ColdStore`, rolls by age, and enforces the budget;
+- on startup it calls `prime_from_cold()`; on stop it flushes (seals the open segment).
+
+It is **opt-in** (off by default) to avoid spawning a compaction thread in every forked worker.
+Configuration is applied via the `MemTableExtension` option surface or environment variables; the
+server calls `start_cold_compaction_from_env()` at engine init.
+
+## SQL Integration
+
+### Catalog Discovery
+
+mmap files under `<data_dir>/<pid>/` are exposed as DataFusion tables, with the filename mapping
+to `(schema, table)`:
+
+- first `.` splits schema vs table — `acme.actors` → schema `acme`, table `actors`;
+- no `.` → schema `memtable` (e.g. `metrics` → `memtable.metrics`).
+
+`DynamicMmapCatalog` merges these dynamic schemas with the static `probe` catalog. A query like
+`SELECT … FROM probe.memtable.metrics` resolves through `MmapFileSchemaProvider::table()`.
+
+### Providers
+
+- **`RingMmapTable`** — lazy provider over a hot ring file. Materializes Arrow batches at `scan()`
+  time, pruning chunks whose `[min_ts, max_ts]` cannot match the query's time predicate.
+- **`HotColdTable`** — unions a hot ring with its cold segments under one logical table (keyed by
+  on-disk basename, so names never collide across schemas). This is what the catalog returns for
+  ring tables.
+
+### Three-Level Time Pruning
+
+One time predicate prunes both tiers, in increasing granularity:
+
+1. **Segment level** — skip a sealed cold segment whose header `ts_range` cannot match (no mmap).
+2. **Page level** — skip cold pages outside the range via the page directory.
+3. **Chunk level** — skip hot chunks outside the range via their `min_ts`/`max_ts`.
+
+Hot and cold batches are handed to the scan as two partitions, so projection, filter, and limit
+pushdown apply uniformly across both.
+
+### Hot∪Cold Exactly-Once
+
+A compacted chunk still lives in the hot ring until overwritten, so a naive union would
+double-count it. `cold_scan` returns the set of `(source_chunk, source_gen)` the cold pages came
+from; the hot side then **excludes** any chunk whose `(index, current generation)` is in that set.
+Each row is counted exactly once, and the dedup is immune to ring recycling (the generation check
+re-validates).
+
+## Configuration Reference
+
+| `SET memtable.*` | env | meaning | default |
+|---|---|---|---|
+| `cold_compaction` | `PROBING_COLD` | run the background compactor (`on`/`off`) | off |
+| `cold_max_total_mb` | `PROBING_COLD_MAX_TOTAL_MB` | cold-store byte budget (MiB) | unlimited |
+| `cold_ttl_secs` | `PROBING_COLD_TTL_SECS` | evict cold segments older than this | none |
+| — | `PROBING_COLD_TARGET_MB` | segment roll size (MiB) | 64 |
+| — | `PROBING_COLD_POLL_MS` | drain-pass interval | 2000 |
+| — | `PROBING_COLD_MAX_AGE_SECS` | seal idle open segment after | 300 |
+
+## Guarantees & Known Limits
+
+**Guaranteed:**
+
+- No torn rows on reads (generation re-validation); cold torn-tail recovery.
+- Exactly-once across tiers (query dedup) and across restarts (`prime_from_cold`).
+- Bounded hot memory; bounded cold bytes/TTL.
+- Fork-safe locking.
+
+**Known trade-offs (P2 backlog):**
+
+- **Cold dir is per-PID.** Cross-process isolation is clean, but cold data is not persistent across
+  restarts by default (a new PID is a new cold dir). `prime_from_cold` makes restart dedup correct
+  whenever a persistent cold dir is configured.
+- **No per-page `fsync`** — a `SIGKILL` may lose the open segment's not-yet-flushed tail.
+- **No segment-level manifest** — multi-segment queries open each segment header to prune.
+- **Pco level is fixed (8)** — not adapted per column.
+- **Runtime is single-process** — cross-process / cluster aggregation is not yet wired.
+
+## Testing
+
+The data layer ships with unit and end-to-end tests: hot-ring lock/recycle/fork tests
+(`probing-memtable`), MEMC format/recovery/compactor tests (including restart-dedup with a negative
+control), and SQL end-to-end tests that drain through the runtime owner and query the union through
+the real catalog path (`probing-core::memtable_sql`).
diff --git a/docs/src/design/data-layer.zh.md b/docs/src/design/data-layer.zh.md
new file mode 100644
index 00000000..718317fc
--- /dev/null
+++ b/docs/src/design/data-layer.zh.md
@@ -0,0 +1,262 @@
+# 数据层
+
+Probing 的数据层是一个面向观测数据（指标、采样、trace）的**按进程隔离、抗崩溃、带时间保留的数据面**。
+所有生产者都通过同一套自研列式存储 [`probing-memtable`](https://github.com/DeepLink-org/probing)
+写入，所有消费者都通过 SQL（DataFusion）读取。它由**两层**构成：
+
+- **热层**（`MEMT`）：固定容量的环形缓冲区，承载实时窗口——常量内存、零分配写入；
+- **冷层**（`MEMC`）：不可变、压缩的段文件，用于超出环形窗口的时间保留，按整文件淘汰。
+
+一条 SQL 时间谓词即可同时对两层做剪枝与查询。
+
+## 设计目标
+
+- **资源有界。** 热层环形缓冲永不增长；冷层受字节预算与 TTL 双重约束。
+- **抗崩溃。** 写入中途被杀的进程不会暴露半行数据；冷段可从尾部撕裂中通过前向扫描恢复。
+- **时间保留。** 滚出热层环形窗口的数据落入冷段，依然可查。
+- **一条写路径、一条读路径。** 生产者（server、Python/Torch 扩展）统一写 `probing-memtable`；
+  消费者统一走 `probing-core::memtable_sql`。
+- **fork 安全。** 在大量 fork 的负载（如 PyTorch DataLoader worker）下依然正确。
+
+## 总体架构
+
+```mermaid
+graph LR
+    APP[push_row / RowWriter] --> MEMT
+    subgraph HOT[热层 · probing-memtable]
+        MEMT[MEMT 环形缓冲] --> SEALED[已封存 chunk\nmin/max ts + generation]
+    end
+    SEALED -->|转置 + Pco| ROLLER[Compactor / Roller]
+    ROLLER --> MEMC
+    subgraph COLD[冷层 · MEMC 段]
+        MEMC[ColdStore\n不可变段]
+    end
+
+    SQL[SQL 查询] --> HCT[HotColdTable]
+    HCT -->|chunk 剪枝| MEMT
+    HCT -->|段 + page 剪枝| MEMC
+```
+
+查询时热层以只读方式 mmap，冷层通过 `SegmentReader` 读取。`HotColdTable` provider 将两者合并为
+一次扫描，并对同时存在于两层的 chunk 做去重。
+
+## 热层（MEMT）
+
+### 文件布局
+
+每个 MEMT 缓冲区（堆、共享内存或 mmap 文件）都以 64 字节头部（一个 cache line）开始，随后是
+逐列描述符，再是 chunk 数据。
+
+**Header v3（64 字节）：**
+
+| 偏移 | 大小 | 字段 | 说明 |
+|---|---|---|---|
+| 0 | 4 | `magic` | `0x4D454D54`（`"MEMT"`） |
+| 4 | 2 | `version` | 3 |
+| 6 | 2 | `header_size` | 64（仅校验） |
+| 8 | 2 | `byte_order` | BOM `[0x01,0x02]` |
+| 10 | 2 | `ts_col` | 时间戳列索引 + 1（0 = 无） |
+| 12 | 4 | `flags` | 特性位（`FLAG_DEDUP` 等） |
+| 16 | 4 | `num_cols` | |
+| 20 | 4 | `num_chunks` | 环形槽位数 |
+| 24 | 4 | `chunk_size` | 每个 chunk 字节数 |
+| 28 | 4 | `data_offset` | 64 对齐 |
+| 32 | 4 | `write_chunk` | `AtomicU32`——当前环形槽位 |
+| 36 | 4 | `write_lock` | `AtomicU32`——0 = 空闲，否则为持有者 PID |
+| 40 | 4 | `refcount` | `AtomicU32` |
+| 44 | 4 | `creator_pid` | |
+| 48 | 8 | `creator_start_time` | 用于 PID 回收检测 |
+| 56 | 8 | `lock_owner_start` | `AtomicU64`——锁持有者的进程启动时间 |
+
+字节 0–31 是**冷区**（初始化后不可变），字节 32–63 是**热区**（运行时原子修改），二者分离以避免
+伪共享。每个 chunk 以 40 字节的 `ChunkHeader` 开头，携带 `generation` 计数器及逐 chunk 的
+`min_ts`/`max_ts`（`AtomicI64`）。
+
+### 三种后端
+
+同一套 API 支撑三种存储形态：
+
+- **堆内存**——私有 `Vec<u8>`，用于进程内使用；
+- **POSIX 共享内存**（`shm_open` + `mmap`）——跨进程、具名、清理时 unlink；
+- **文件 mmap**——持久、可发现的文件，位于 `<data_dir>/<pid>/`。SQL 层读取的正是这种。
+
+### 环形缓冲与 generation
+
+写入追加到当前 chunk；当一行放不下时，写入者推进到下一个环形槽位（绕回），同时封存上一个 chunk。
+每个槽位携带单调递增的 `generation`（每次环形绕回到该槽位即自增）。读取者按**逻辑顺序（旧 → 新）**
+物化 chunk，并在读取后复核 generation——若某 chunk 在读取过程中被回收，则丢弃而非暴露半行数据。
+
+### Robust 写锁
+
+`write_lock` 存放 **0（空闲）或持有者的 PID**。等待者自旋；若自旋超过 `LOCK_STEAL_TIMEOUT`
+（500 ms），进入抢占判定：
+
+- 若持有者进程已不存在（`kill(pid, 0)`），抢占该锁；
+- 若持有者存在但其内核启动时间与 `lock_owner_start` 不符，说明 PID 已被无关进程回收——经短暂复核
+  宽限后抢占。
+
+抢占是数据安全的：行只有在写入结束时通过 `row_count` 的 `Release` 存储才可见，因此已死持有者写到
+一半的行不会提交，会被直接覆盖。
+
+!!! note "fork 安全"
+    持有者启动时间通过**按 PID 缓存**读取，而非一次性缓存。若子进程继承了父进程的缓存值，就会记录
+    父进程的启动时间，从而被等待者误判为 PID 回收——这正是大量 fork 的负载（PyTorch DataLoader）
+    会触发的隐患。每当存活 PID 变化即重新读取，可让每个 fork 后的调用者观察到自己的启动时间。
+    （启动时间在 Linux 上来自 `/proc`；在不具备该接口的平台上，回收抢占路径自动失效。）
+
+### 时间戳元数据
+
+当 schema 含有名为 `timestamp`（或 `ts`）的 `I64` 列时，`ts_col` 记录其位置，写入路径维护逐
+chunk 的 `min_ts`/`max_ts`。这是查询时 chunk 级时间剪枝的基础，并且与冷层的 page/段时间范围在
+**结构上完全一致**。
+
+## 冷层（MEMC）
+
+### 目录与文件命名
+
+冷段位于 `<data_dir>/<pid>/cold`——与热层环形文件同处一地、同样按进程隔离，因此冷数据绝不会跨进程
+混淆。每个段命名为 `<writer_id>-<seq>.memc`，其中 `writer_id` 是 `(pid, start_time)` 的哈希，
+`seq` 是 `ColdStore` 打开时恢复的单调递增序号。
+
+### 段格式
+
+一个段是一系列 64 对齐的 block。所有完整性校验都使用 **xxh3-64 截断为 32 位**。
+
+**段头部（64 字节）：** `magic`（`"MEMC"`）、`version`（1）、BOM、`flags`（bit 0 = 已封存）、
+`writer_pid`、`writer_start`、`created_unix_ms`、`footer_off`（封存前为 0）、段级
+`ts_min`/`ts_max`、`page_count`、头部校验和。
+
+**Block** 共享 64 字节头部：
+
+| magic | 含义 |
+|---|---|
+| `MCTB` | 表定义 block——声明一个 `table_id`、表名、列 dtype、时间戳列 |
+| `MCPG` | page（数据）block——某个 `table_id` 的一个列式 page |
+| `MCFT` | footer——封存时写入的 page 目录 |
+
+page/block 头部携带 `table_id`、`row_count`、`col_count`、`ts_min`/`ts_max`、`payload_len`、
+`payload_xxh`，以及对重启去重至关重要的 `source_gen` 与 `source_chunk`（该 page 从哪个热层
+chunk 的 generation 和索引抽取而来；`u32::MAX` = 不适用）。头部本身也带校验和（覆盖
+`source_chunk`）。
+
+单个段可容纳**多张表**的 page，以 `table_id` 区分。这让文件/目录数量与表数量解耦：成百上千张表
+共享同一组段文件。
+
+### 列编码
+
+每列独立编码（`ColEncoding`）：
+
+- **`Pco`**——数值列（`i32/i64/f32/f64/u32/u64`），用 Pco（level 8）压缩。单调时间戳列压缩比 > 4×；
+- **`RawFixed`**——`u8`（Pco 对字节列无收益）；
+- **`RawVarLen`**——`Str`/`Bytes`，以连续的 `[u32 len][bytes]` 条目存储（Pco 不支持字符串）。
+
+### 崩溃恢复
+
+- **已封存**段通过 footer 的 page 目录读取——O(1) 定位每个 page；
+- **未封存或撕裂**的段通过**前向扫描**恢复：从头遍历 block，校验每个 block 的头部和 payload
+  校验和，在第一个坏 block 处停止并丢弃撕裂的尾部。表定义 block 总会被扫描（开销小，且位于 page 之前）。
+
+不存在任何试图修复半行记录的启发式逻辑。
+
+!!! warning "持久性"
+    page 不会逐个 `fsync`（仅在封存时 `sync_data`）。`SIGKILL` 可能丢失当前打开段尚未刷盘的尾部
+    page。对观测数据可接受，但这是一个明确的取舍。
+
+## Compactor（Roller）
+
+`Compactor` 将新封存的热层 chunk 徕出（drain）到冷段。
+
+- **徕出语义。** 只徕出 `Sealed` 状态的 chunk（绝不动正在写入的 chunk）。行被转置为列；徕出前后
+  复核该 chunk 的 `generation`——若环形已回收它，丢弃该 page 并在下一轮重试。徕出是**幂等**的：
+  逐 chunk 的 `drained_gen` 高水位跳过已压缩的 chunk generation。
+- **滚动。** 当打开的段达到 `target_segment_bytes`（默认 64 MiB——主要的碎片化调节旋钮）、超过
+  `max_segment_age`（默认 300 s，让低速率表也能及时可查），或显式 flush 时，封存当前段并新开一个。
+- **淘汰。** `enforce` 在超出字节预算（`max_total_bytes`）或 TTL 时删除最旧的段，并始终保护最新段。
+
+### 跨重启的精确一次
+
+`drained_gen` 在内存中，因此朴素重启面对持久冷目录时会重新压缩仍驻留在热层环形中的 chunk，产生重复
+行。`prime_from_cold()` 在启动时重建高水位：扫描已有冷段，按 `(表, source_chunk)` 取 `source_gen`
+的最大值，在首次见到某表时合并进 `drained_gen`。结果即使跨重启也保证**精确一次**。
+
+## 运行时 Owner
+
+`ColdCompactor` 是进程级全局单例（仿照 task-stats worker），为 compactor 提供唯一的生命周期归宿：
+
+- 后台线程每轮**重新发现** `<data_dir>/<pid>/` 下的环形文件（表会随时间出现），将每个徕出到共享的
+  `ColdStore`，按时长滚动，并执行预算约束；
+- 启动时调用 `prime_from_cold()`；停止时 flush（封存打开的段）。
+
+它**默认关闭**（opt-in），以避免在每个 fork 出来的 worker 中都启动一个压缩线程。配置通过
+`MemTableExtension` 选项面或环境变量下发；server 在引擎初始化时调用
+`start_cold_compaction_from_env()`。
+
+## SQL 集成
+
+### Catalog 发现
+
+`<data_dir>/<pid>/` 下的 mmap 文件被暴露为 DataFusion 表，文件名映射到 `(schema, table)`：
+
+- 首个 `.` 分隔 schema 与 table——`acme.actors` → schema `acme`、table `actors`；
+- 无 `.` → schema `memtable`（例如 `metrics` → `memtable.metrics`）。
+
+`DynamicMmapCatalog` 将这些动态 schema 与静态 `probe` catalog 合并。形如
+`SELECT … FROM probe.memtable.metrics` 的查询经 `MmapFileSchemaProvider::table()` 解析。
+
+### Provider
+
+- **`RingMmapTable`**——热层环形文件之上的惰性 provider。在 `scan()` 时才物化 Arrow batch，并剪掉
+  `[min_ts, max_ts]` 无法匹配查询时间谓词的 chunk。
+- **`HotColdTable`**——将一个热层环形与其冷段合并为同一张逻辑表（以磁盘 basename 为键，使表名跨
+  schema 永不冲突）。这是 catalog 为环形表返回的 provider。
+
+### 三级时间剪枝
+
+一条时间谓词以递增粒度剪枝两层：
+
+1. **段级**——跳过头部 `ts_range` 无法匹配的已封存冷段（无需 mmap）；
+2. **page 级**——通过 page 目录跳过范围外的冷 page；
+3. **chunk 级**——通过 `min_ts`/`max_ts` 跳过范围外的热 chunk。
+
+热、冷 batch 作为两个分区交给扫描，因此投影、过滤、limit 下推对两层一致生效。
+
+### 热∪冷的精确一次
+
+被压缩的 chunk 在被覆盖前仍存活于热层环形中，朴素的合并会重复计数。`cold_scan` 返回冷 page 来源的
+`(source_chunk, source_gen)` 集合；热侧据此**排除**任何 `(索引, 当前 generation)` 落在该集合中的
+chunk。每行恰好计数一次，且去重对环形回收免疫（generation 复核会重新验证）。
+
+## 配置参考
+
+| `SET memtable.*` | 环境变量 | 含义 | 默认 |
+|---|---|---|---|
+| `cold_compaction` | `PROBING_COLD` | 运行后台 compactor（`on`/`off`） | 关闭 |
+| `cold_max_total_mb` | `PROBING_COLD_MAX_TOTAL_MB` | 冷层字节预算（MiB） | 无限 |
+| `cold_ttl_secs` | `PROBING_COLD_TTL_SECS` | 淘汰早于此时长的冷段 | 无 |
+| — | `PROBING_COLD_TARGET_MB` | 段滚动大小（MiB） | 64 |
+| — | `PROBING_COLD_POLL_MS` | 徕出轮询间隔 | 2000 |
+| — | `PROBING_COLD_MAX_AGE_SECS` | 空闲打开段多久后封存 | 300 |
+
+## 保证与已知边界
+
+**已保证：**
+
+- 读取无半行数据（generation 复核）；冷层尾部撕裂可恢复；
+- 跨层精确一次（查询去重）与跨重启精确一次（`prime_from_cold`）；
+- 热层内存有界；冷层字节/TTL 有界；
+- fork 安全的锁。
+
+**已知取舍（P2 待办）：**
+
+- **冷目录按 PID 隔离。** 跨进程隔离干净，但默认不跨重启持久化（新 PID = 新冷目录）。在配置了持久
+  冷目录时，`prime_from_cold` 保证重启去重正确。
+- **无逐 page `fsync`**——`SIGKILL` 可能丢失打开段尚未刷盘的尾部。
+- **无段级 manifest**——多段查询需打开每个段头部做剪枝。
+- **Pco level 固定（8）**——未按列自适应。
+- **运行时为单进程**——跨进程 / 集群聚合尚未打通。
+
+## 测试
+
+数据层附带单元与端到端测试：热层环形的锁/回收/fork 测试（`probing-memtable`）、MEMC 的
+格式/恢复/compactor 测试（含带反例的重启去重），以及经运行时 owner 徕出、再通过真实 catalog 路径
+查询合并结果的 SQL 端到端测试（`probing-core::memtable_sql`）。
diff --git a/docs/src/design/index.md b/docs/src/design/index.md
index 409aba92..54fd89bf 100644
--- a/docs/src/design/index.md
+++ b/docs/src/design/index.md
@@ -42,6 +42,7 @@ Probing's core mission is simple: **make distributed systems feel Pythonic again
 | Document | Description |
 |----------|-------------|
 | [Architecture](architecture.md) | System structure and components |
+| [Data Layer](data-layer.md) | Hot/cold columnar store and SQL integration |
 | [Profiling](profiling.md) | Performance data collection |
 | [Debugging](debugging.md) | Debugging capabilities |
 | [Distributed](distributed.md) | Multi-node support |
diff --git a/docs/src/design/index.zh.md b/docs/src/design/index.zh.md
index 6ab7c14a..e80bac50 100644
--- a/docs/src/design/index.zh.md
+++ b/docs/src/design/index.zh.md
@@ -42,6 +42,7 @@ Probing 的核心使命很简单：**让分布式系统重新变得 Pythonic**
 | 文档 | 描述 |
 |------|------|
 | [系统架构](architecture.md) | 系统结构和组件 |
+| [数据层](data-layer.md) | 冷热分层列式存储与 SQL 集成 |
 | [性能分析](profiling.md) | 性能数据收集 |
 | [调试](debugging.md) | 调试能力 |
 | [分布式](distributed.md) | 多节点支持 |
diff --git a/probing/cli/Cargo.toml b/probing/cli/Cargo.toml
index 352e9500..5377752a 100644
--- a/probing/cli/Cargo.toml
+++ b/probing/cli/Cargo.toml
@@ -20,6 +20,7 @@ path = "src/lib.rs"
 probing-proto = { path = "../proto", default-features = false, features = [] }
 probing-store = { path = "../crates/store", default-features = false, features = [
 ] }
+probing-memtable = { path = "../memtable" }
 
 anyhow = { workspace = true }
 log = { workspace = true }
diff --git a/probing/cli/src/cli/bench/args.rs b/probing/cli/src/cli/bench/args.rs
new file mode 100644
index 00000000..ad9b7cff
--- /dev/null
+++ b/probing/cli/src/cli/bench/args.rs
@@ -0,0 +1,232 @@
+//! Argument structs for the `bench` subcommands (clap derive).
+
+use std::path::PathBuf;
+
+use clap::{Args, ValueEnum};
+
+use super::workload::{SchemaKind, WorkloadSpec};
+
+/// Storage backend under test.
+#[derive(ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Backend {
+    /// Process-private heap buffer (no cross-handle sharing).
+    Heap,
+    /// POSIX shared memory (`shm_open`).
+    Shm,
+    /// mmap'd regular file at an explicit path.
+    File,
+    /// Discoverable mmap'd file under the data dir (SQL-visible).
+    Shared,
+}
+
+/// Streaming row writer vs. value-vector `push_row`.
+#[derive(ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
+pub enum WriterMode {
+    /// `push_row` — concurrency-safe auto-advance; allocates a value row.
+    Push,
+    /// `RowWriter` streaming fast path (single-threaded only).
+    Streaming,
+}
+
+/// Schema / row-shape options shared by every subcommand.
+#[derive(Args, Debug, Clone)]
+pub struct SchemaArgs {
+    /// Built-in column layout.
+    #[arg(long, value_enum, default_value = "metrics")]
+    pub schema: SchemaKind,
+
+    /// Number of f64 columns for `--schema wide`.
+    #[arg(long, default_value_t = 16)]
+    pub wide_cols: usize,
+
+    /// Byte length of the `msg` payload for `--schema logs`.
+    #[arg(long, default_value_t = 32)]
+    pub str_len: usize,
+}
+
+impl SchemaArgs {
+    /// Build the workload description from the parsed flags; `wide_cols` is
+    /// raised to at least 1.
+    pub fn spec(&self) -> WorkloadSpec {
+        let (kind, str_len) = (self.schema, self.str_len);
+        let wide_cols = self.wide_cols.max(1);
+        WorkloadSpec { kind, wide_cols, str_len }
+    }
+}
+
+/// Ring geometry shared by subcommands that build a hot table.
+#[derive(Args, Debug, Clone)]
+pub struct RingArgs {
+    /// Bytes per ring chunk.
+    #[arg(long, default_value_t = 256 * 1024)]
+    pub chunk_size: u32,
+
+    /// Number of ring chunks (slots).
+    #[arg(long, default_value_t = 64)]
+    pub chunks: u32,
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct WriteArgs {
+    #[command(flatten)]
+    pub schema: SchemaArgs,
+    #[command(flatten)]
+    pub ring: RingArgs,
+
+    /// Storage backend.
+    #[arg(long, value_enum, default_value = "heap")]
+    pub backend: Backend,
+
+    /// Total rows to write (across all threads).
+    #[arg(long, default_value_t = 1_000_000)]
+    pub rows: u64,
+
+    /// Concurrent writer threads. >1 requires a shared backend
+    /// (shm/file/shared) to exercise the cross-handle write lock.
+    #[arg(long, default_value_t = 1)]
+    pub threads: usize,
+
+    /// Writer API to exercise.
+    #[arg(long, value_enum, default_value = "push")]
+    pub writer: WriterMode,
+
+    /// File path for `--backend file` (defaults to a temp file).
+    #[arg(long)]
+    pub path: Option<PathBuf>,
+
+    /// Record a per-row latency histogram (adds measurable overhead).
+    #[arg(long)]
+    pub latency: bool,
+
+    /// Warm-up rows per thread, excluded from measurement.
+    #[arg(long, default_value_t = 0)]
+    pub warmup: u64,
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct ScanArgs {
+    #[command(flatten)]
+    pub schema: SchemaArgs,
+    #[command(flatten)]
+    pub ring: RingArgs,
+
+    /// Rows to pre-populate before scanning.
+    #[arg(long, default_value_t = 1_000_000)]
+    pub rows: u64,
+
+    /// Number of full scan passes to time.
+    #[arg(long, default_value_t = 5)]
+    pub iters: usize,
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct CompactArgs {
+    #[command(flatten)]
+    pub schema: SchemaArgs,
+    #[command(flatten)]
+    pub ring: RingArgs,
+
+    /// Rows to ingest and compact.
+    #[arg(long, default_value_t = 2_000_000)]
+    pub rows: u64,
+
+    /// Segment roll size in MiB (`target_segment_bytes`).
+    #[arg(long, default_value_t = 8)]
+    pub target_mb: u64,
+
+    /// Cold directory (defaults to a temp dir; removed on exit unless --keep).
+    #[arg(long)]
+    pub dir: Option<PathBuf>,
+
+    /// Keep the cold directory after the run.
+    #[arg(long)]
+    pub keep: bool,
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct ColdscanArgs {
+    #[command(flatten)]
+    pub schema: SchemaArgs,
+    #[command(flatten)]
+    pub ring: RingArgs,
+
+    /// Read an existing cold directory instead of building one.
+    #[arg(long)]
+    pub dir: Option<PathBuf>,
+
+    /// Rows to ingest when building a cold store (ignored with --dir).
+    #[arg(long, default_value_t = 2_000_000)]
+    pub rows: u64,
+
+    /// Segment roll size in MiB when building (ignored with --dir).
+    #[arg(long, default_value_t = 8)]
+    pub target_mb: u64,
+
+    /// Number of full read passes to time.
+    #[arg(long, default_value_t = 3)]
+    pub iters: usize,
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct MixedArgs {
+    #[command(flatten)]
+    pub schema: SchemaArgs,
+    #[command(flatten)]
+    pub ring: RingArgs,
+
+    /// Shared backend for the pipeline.
+    #[arg(long, value_enum, default_value = "shared")]
+    pub backend: Backend,
+
+    /// Concurrent writer threads.
+    #[arg(long, default_value_t = 2)]
+    pub writers: usize,
+
+    /// Concurrent reader (scan) threads.
+    #[arg(long, default_value_t = 1)]
+    pub readers: usize,
+
+    /// Run duration in seconds.
+    #[arg(long, default_value_t = 10)]
+    pub duration: u64,
+
+    /// Disable the background compactor (hot-only pipeline).
+    #[arg(long)]
+    pub no_compact: bool,
+
+    /// Segment roll size in MiB.
+    #[arg(long, default_value_t = 8)]
+    pub target_mb: u64,
+
+    /// Cold-store byte budget in MiB (eviction trigger).
+    #[arg(long)]
+    pub max_total_mb: Option<u64>,
+
+    /// Cold-store TTL in seconds.
+    #[arg(long)]
+    pub ttl_secs: Option<u64>,
+}
+
+#[derive(Args, Debug, Clone)]
+pub struct MpArgs {
+    #[command(flatten)]
+    pub schema: SchemaArgs,
+    #[command(flatten)]
+    pub ring: RingArgs,
+
+    /// Shared backend (must be cross-process: shm/file/shared).
+    #[arg(long, value_enum, default_value = "shared")]
+    pub backend: Backend,
+
+    /// Number of writer processes.
+    #[arg(long, default_value_t = 1)]
+    pub writers: usize,
+
+    /// Number of reader processes.
+    #[arg(long, default_value_t = 2)]
+    pub readers: usize,
+
+    /// Measurement window in seconds (the soak is time-driven, not row-driven).
+    #[arg(long, default_value_t = 10)]
+    pub duration: u64,
+}
diff --git a/probing/cli/src/cli/bench/metrics.rs b/probing/cli/src/cli/bench/metrics.rs
new file mode 100644
index 00000000..59d0d11b
--- /dev/null
+++ b/probing/cli/src/cli/bench/metrics.rs
@@ -0,0 +1,311 @@
+//! Measurement primitives: a bounded latency reservoir and a small
+//! report builder that renders either as an aligned table or as JSON.
+
+use std::time::Duration;
+
+/// Latency recorder (nanoseconds) backed by a fixed-size reservoir sample.
+///
+/// Timing every operation on a hot write path has a measurable cost, so
+/// capture is opt-in. While recording we hold a uniform sample of at most
+/// `cap` observations (reservoir sampling) and track `min`/`max`/`sum`/`count`
+/// exactly, giving stable tail-quantile estimates in bounded memory.
+pub struct Latency {
+    samples: Vec<u64>,
+    cap: usize,
+    seen: u64,
+    min: u64,
+    max: u64,
+    sum: u128,
+    rng: u64,
+}
+
+impl Latency {
+    /// Create an empty recorder keeping at most `cap` samples (minimum 1).
+    pub fn new(cap: usize) -> Self {
+        Self {
+            samples: Vec::with_capacity(cap.min(1 << 16)),
+            cap: cap.max(1),
+            seen: 0,
+            min: u64::MAX,
+            max: 0,
+            sum: 0,
+            rng: 0x9E37_79B9_7F4A_7C15,
+        }
+    }
+
+    /// Advance the xorshift64* generator and return its next output.
+    #[inline]
+    fn next_rng(&mut self) -> u64 {
+        let mut state = self.rng;
+        state ^= state >> 12;
+        state ^= state << 25;
+        state ^= state >> 27;
+        self.rng = state;
+        state.wrapping_mul(0x2545_F491_4F6C_DD1D)
+    }
+
+    /// Record one observation of `ns` nanoseconds.
+    #[inline]
+    pub fn record(&mut self, ns: u64) {
+        self.seen += 1;
+        self.sum += u128::from(ns);
+        self.min = ns.min(self.min);
+        self.max = ns.max(self.max);
+        if self.samples.len() < self.cap {
+            self.samples.push(ns);
+            return;
+        }
+        let slot = (self.next_rng() % self.seen) as usize;
+        if slot < self.cap {
+            self.samples[slot] = ns;
+        }
+    }
+
+    /// Re-record `other`'s samples here; only `min`/`max` use its exact tallies.
+    pub fn merge(&mut self, other: &Latency) {
+        other.samples.iter().for_each(|&ns| self.record(ns));
+        if other.seen > 0 {
+            self.min = self.min.min(other.min);
+            self.max = self.max.max(other.max);
+        }
+    }
+
+    /// Number of observations recorded.
+    pub fn count(&self) -> u64 {
+        self.seen
+    }
+
+    /// Mean in nanoseconds; `0.0` when nothing was recorded.
+    pub fn mean_ns(&self) -> f64 {
+        match self.seen {
+            0 => 0.0,
+            n => self.sum as f64 / n as f64,
+        }
+    }
+
+    /// Smallest observation, or `0` when nothing was recorded.
+    pub fn min_ns(&self) -> u64 {
+        match self.seen {
+            0 => 0,
+            _ => self.min,
+        }
+    }
+
+    /// Largest observation (`0` when nothing was recorded).
+    pub fn max_ns(&self) -> u64 {
+        self.max
+    }
+
+    /// Estimated quantile (`q` in `[0,1]`) from the reservoir sample.
+    pub fn quantile_ns(&self, q: f64) -> u64 {
+        if self.samples.is_empty() {
+            return 0;
+        }
+        let frac = q.clamp(0.0, 1.0);
+        let mut sorted = self.samples.to_vec();
+        sorted.sort_unstable();
+        let rank = ((sorted.len() as f64 - 1.0) * frac).round() as usize;
+        sorted[rank]
+    }
+}
+
+// ── Report ───────────────────────────────────────────────────────────
+
+/// One labelled measurement: a human string plus a machine-readable value.
+struct Entry {
+    key: String,
+    display: String,
+    json: serde_json::Value,
+}
+
+/// Accumulates labelled results and renders them as an aligned table or
+/// a JSON object. The table lists entries in insertion order.
+pub struct Report {
+    title: String,
+    entries: Vec<Entry>,
+}
+
+impl Report {
+    /// Start an empty report headed by `title`.
+    pub fn new(title: impl Into<String>) -> Self {
+        Self {
+            title: title.into(),
+            entries: Vec::new(),
+        }
+    }
+
+    /// Append one entry; every typed helper below funnels through here.
+    fn push(&mut self, key: &str, display: String, json: serde_json::Value) -> &mut Self {
+        self.entries.push(Entry {
+            key: key.to_string(),
+            display,
+            json,
+        });
+        self
+    }
+
+    /// Add a free-form string row.
+    pub fn text(&mut self, key: &str, value: impl Into<String>) -> &mut Self {
+        let v = value.into();
+        let json = serde_json::Value::String(v.clone());
+        self.push(key, v, json)
+    }
+
+    /// Add an integer row, displayed with thousands separators.
+    pub fn count(&mut self, key: &str, n: u64) -> &mut Self {
+        self.push(key, group_thousands(n), serde_json::Value::from(n))
+    }
+
+    /// Add a float row shown with three decimals and an optional unit suffix.
+    pub fn float(&mut self, key: &str, v: f64, suffix: &str) -> &mut Self {
+        let disp = if suffix.is_empty() {
+            format!("{v:.3}")
+        } else {
+            format!("{v:.3} {suffix}")
+        };
+        self.push(key, disp, json_f64(v))
+    }
+
+    /// Add a ratio row shown with two decimals and an `x` suffix.
+    pub fn ratio(&mut self, key: &str, v: f64) -> &mut Self {
+        self.push(key, format!("{v:.2}x"), json_f64(v))
+    }
+
+    /// Add a byte-count row shown in binary units; JSON keeps the raw count.
+    pub fn bytes(&mut self, key: &str, n: u64) -> &mut Self {
+        self.push(key, human_bytes(n), serde_json::Value::from(n))
+    }
+
+    /// Add a duration row in seconds.
+    pub fn duration(&mut self, key: &str, d: Duration) -> &mut Self {
+        let secs = d.as_secs_f64();
+        self.push(key, format!("{secs:.3} s"), json_f64(secs))
+    }
+
+    /// Throughput in ops/second, displayed with an SI suffix.
+    pub fn rate(&mut self, key: &str, ops: u64, elapsed: Duration, unit: &str) -> &mut Self {
+        let per_sec = rate_per_sec(ops, elapsed);
+        self.push(key, format!("{} {unit}/s", si(per_sec)), json_f64(per_sec))
+    }
+
+    /// Throughput in bytes/second, displayed as MiB/s.
+    pub fn byte_rate(&mut self, key: &str, bytes: u64, elapsed: Duration) -> &mut Self {
+        let per_sec = rate_per_sec(bytes, elapsed);
+        self.push(
+            key,
+            format!("{:.2} MiB/s", per_sec / (1024.0 * 1024.0)),
+            json_f64(per_sec),
+        )
+    }
+
+    /// Append the standard quantile rows for a latency recorder.
+    pub fn latency(&mut self, prefix: &str, lat: &Latency) -> &mut Self {
+        if lat.count() == 0 {
+            return self;
+        }
+        self.float(&format!("{prefix} min"), lat.min_ns() as f64, "ns");
+        self.float(&format!("{prefix} mean"), lat.mean_ns(), "ns");
+        self.float(&format!("{prefix} p50"), lat.quantile_ns(0.50) as f64, "ns");
+        self.float(&format!("{prefix} p99"), lat.quantile_ns(0.99) as f64, "ns");
+        self.float(&format!("{prefix} p999"), lat.quantile_ns(0.999) as f64, "ns");
+        self.float(&format!("{prefix} max"), lat.max_ns() as f64, "ns");
+        self
+    }
+
+    /// Print the title, a rule, and one `key  value` line per entry to stdout.
+    /// Widths are counted in chars rather than bytes, so titles and keys with
+    /// non-ASCII characters (e.g. `·`) get a matching rule and padding.
+    pub fn print_table(&self) {
+        let width = self
+            .entries
+            .iter()
+            .map(|e| e.key.chars().count())
+            .max()
+            .unwrap_or(0);
+        println!("\n  {}", self.title);
+        println!("  {}", "─".repeat(self.title.chars().count().max(20)));
+        for e in &self.entries {
+            println!("  {:<width$}  {}", e.key, e.display, width = width);
+        }
+        println!();
+    }
+
+    /// Render as `{"benchmark": <title>, "metrics": {<key>: <value>, ...}}`.
+    /// A repeated key keeps only its last value.
+    pub fn to_json(&self) -> serde_json::Value {
+        let mut map = serde_json::Map::new();
+        for e in &self.entries {
+            map.insert(e.key.clone(), e.json.clone());
+        }
+        serde_json::json!({ "benchmark": self.title, "metrics": map })
+    }
+
+    /// Write the report to stdout: pretty JSON when `json` is set, else the table.
+    pub fn emit(&self, json: bool) {
+        if !json {
+            self.print_table();
+            return;
+        }
+        let text = serde_json::to_string_pretty(&self.to_json());
+        println!("{}", text.unwrap_or_else(|_| "{}".to_string()));
+    }
+}
+
+// ── formatting helpers ─────────────────────────────────────────────────
+
+/// JSON number for `v`, or `null` when `serde_json` cannot represent it
+/// (`Number::from_f64` returned `None`).
+fn json_f64(v: f64) -> serde_json::Value {
+    serde_json::Number::from_f64(v).map_or(serde_json::Value::Null, serde_json::Value::Number)
+}
+
+/// Events per second for `n` events over `elapsed`; `0.0` for a zero duration.
+pub fn rate_per_sec(n: u64, elapsed: Duration) -> f64 {
+    let secs = elapsed.as_secs_f64();
+    if secs > 0.0 {
+        return n as f64 / secs;
+    }
+    0.0
+}
+
+/// SI-suffixed magnitude (K/M/G) for human-readable rates.
+///
+/// Magnitudes below 1000 print as a rounded integer; larger ones are scaled
+/// and printed with two decimals, e.g. `1500.0` -> `"1.50K"`.
+pub fn si(v: f64) -> String {
+    let (scale, suffix) = match v.abs() {
+        a if a >= 1e9 => (1e9, "G"),
+        a if a >= 1e6 => (1e6, "M"),
+        a if a >= 1e3 => (1e3, "K"),
+        _ => return format!("{v:.0}"),
+    };
+    format!("{:.2}{suffix}", v / scale)
+}
+
+/// Format a byte count in binary units (B..TiB); below 1 KiB it is exact.
+pub fn human_bytes(n: u64) -> String {
+    const UNITS: [&str; 5] = ["B", "KiB", "MiB", "GiB", "TiB"];
+    let mut scaled = n as f64;
+    let mut unit = 0usize;
+    while unit + 1 < UNITS.len() && scaled >= 1024.0 {
+        scaled /= 1024.0;
+        unit += 1;
+    }
+    match unit {
+        0 => format!("{n} B"),
+        _ => format!("{scaled:.2} {}", UNITS[unit]),
+    }
+}
+
+/// Decimal rendering of `n` with a comma between each group of three digits.
+fn group_thousands(n: u64) -> String {
+    let digits = n.to_string();
+    let total = digits.len();
+    let mut grouped = String::with_capacity(total + total / 3);
+    for (pos, ch) in digits.char_indices() {
+        if pos != 0 && (total - pos).is_multiple_of(3) {
+            grouped.push(',');
+        }
+        grouped.push(ch);
+    }
+    grouped
+}
diff --git a/probing/cli/src/cli/bench/mod.rs b/probing/cli/src/cli/bench/mod.rs
new file mode 100644
index 00000000..263de0bf
--- /dev/null
+++ b/probing/cli/src/cli/bench/mod.rs
@@ -0,0 +1,61 @@
+//! Hidden `bench` command: a load generator and stress harness for the
+//! probing data layer (hot MEMT ring + cold MEMC segments).
+//!
+//! This is an internal/diagnostic command (hidden from `--help`). Run
+//! `probing bench <subcommand> --help` for per-workload options.
+
+pub mod args;
+pub mod metrics;
+pub mod runners;
+pub mod workload;
+
+use anyhow::Result;
+use clap::{Args, Subcommand};
+
+use args::{ColdscanArgs, CompactArgs, MixedArgs, MpArgs, ScanArgs, WriteArgs};
+
+/// Stress and benchmark the in-process data layer.
+#[derive(Args, Debug)]
+pub struct BenchCommand {
+    /// Emit machine-readable JSON instead of a formatted table.
+    #[arg(long, global = true)]
+    pub json: bool,
+
+    /// PRNG seed for reproducible synthetic data.
+    #[arg(long, global = true, default_value_t = 0x00C0_FFEE)]
+    pub seed: u64,
+
+    #[command(subcommand)]
+    pub command: BenchSub,
+}
+
+#[derive(Subcommand, Debug)]
+pub enum BenchSub {
+    /// Write throughput across storage backends and writer counts.
+    Write(WriteArgs),
+    /// Sequential scan throughput over a freshly populated hot ring.
+    Scan(ScanArgs),
+    /// Cold-tier compaction throughput and hot→cold compression ratio.
+    Compact(CompactArgs),
+    /// Cold-segment read + decode throughput.
+    Coldscan(ColdscanArgs),
+    /// End-to-end pipeline: writers + background compactor + readers.
+    Mixed(MixedArgs),
+    /// Multi-process, time-driven soak: writer + reader processes share a table.
+    Mp(MpArgs),
+}
+
+impl BenchCommand {
+    /// Run the selected subcommand with the global `--json` / `--seed` flags.
+    pub fn run(&self) -> Result<()> {
+        let (json, seed) = (self.json, self.seed);
+        match &self.command {
+            BenchSub::Write(args) => runners::write::run(args, json, seed),
+            BenchSub::Scan(args) => runners::scan::run(args, json, seed),
+            BenchSub::Compact(args) => runners::compact::run(args, json, seed),
+            BenchSub::Coldscan(args) => runners::coldscan::run(args, json, seed),
+            BenchSub::Mixed(args) => runners::mixed::run(args, json, seed),
+            BenchSub::Mp(args) => runners::mp::run(args, json, seed),
+        }
+    }
+}
diff --git a/probing/cli/src/cli/bench/runners/coldscan.rs b/probing/cli/src/cli/bench/runners/coldscan.rs
new file mode 100644
index 00000000..19ffbe44
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/coldscan.rs
@@ -0,0 +1,92 @@
+//! `coldscan` — read + decode throughput over MEMC cold segments.
+//!
+//! Opens every `.memc` segment in a cold directory and decodes every page
+//! (Pco / raw), folding row counts into a sink. Reports both the logical
+//! (decoded) throughput and the on-disk (compressed) read rate.
+
+use std::time::Instant;
+
+use anyhow::{bail, Result};
+use probing_memtable::memc::{ColdStore, SegmentReader};
+
+use crate::cli::bench::args::ColdscanArgs;
+use crate::cli::bench::metrics::Report;
+
+/// Run the `coldscan` benchmark. Without `--dir` a temporary cold store is
+/// built first; it is removed afterwards even when the run fails.
+pub fn run(args: &ColdscanArgs, json: bool, seed: u64) -> Result<()> {
+    let dir = match &args.dir {
+        Some(d) => d.clone(),
+        None => super::common::temp_dir("coldscan")?,
+    };
+    // Fallible work lives in `scan` so cleanup below runs on every exit path.
+    let outcome = scan(args, &dir, json, seed);
+    if args.dir.is_none() {
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+    outcome
+}
+
+/// Build the store under `dir` (only when no `--dir` was given), then time
+/// `iters` full decode passes over its segments and emit the report.
+fn scan(args: &ColdscanArgs, dir: &std::path::Path, json: bool, seed: u64) -> Result<()> {
+    let spec = args.schema.spec();
+    let row_bytes = spec.approx_row_bytes() as u64;
+    let built = if args.dir.is_none() {
+        super::common::build_cold(dir, &spec, &args.ring, args.rows, args.target_mb, seed)?
+    } else {
+        0
+    };
+
+    let store = ColdStore::open(dir)?;
+    let segments = store.segment_paths();
+    if segments.is_empty() {
+        bail!("no .memc segments found under {}", dir.display());
+    }
+
+    let iters = args.iters.max(1);
+    let mut rows_per_pass = 0u64;
+    let mut disk_per_pass = 0u64;
+    let mut sink = 0u64;
+    let start = Instant::now();
+    for _ in 0..iters {
+        let mut rows = 0u64;
+        let mut disk = 0u64;
+        for path in &segments {
+            let reader = SegmentReader::open(path)
+                .map_err(|e| anyhow::anyhow!("open {}: {e}", path.display()))?;
+            for (i, page) in reader.pages().iter().enumerate() {
+                disk += page.block_len as u64;
+                let cols = reader
+                    .read_page(i)
+                    .map_err(|e| anyhow::anyhow!("decode page {i}: {e}"))?;
+                let n = cols.first().map(|c| c.len()).unwrap_or(0) as u64;
+                rows += n;
+                sink = sink.wrapping_add(n);
+            }
+        }
+        rows_per_pass = rows;
+        disk_per_pass = disk;
+    }
+    let elapsed = start.elapsed();
+    std::hint::black_box(sink);
+    let rows_total = rows_per_pass * iters as u64;
+    let disk_total = disk_per_pass * iters as u64;
+    let logical_total = rows_total * row_bytes;
+
+    let mut report = Report::new(format!("coldscan · {:?}", args.schema.schema));
+    report.text("schema", format!("{:?}", args.schema.schema));
+    if built > 0 {
+        report.count("rows built", built);
+    }
+    report
+        .count("segments", segments.len() as u64)
+        .count("rows/pass", rows_per_pass)
+        .count("read passes", iters as u64)
+        .duration("elapsed", elapsed)
+        .rate("decode rate", rows_total, elapsed, "rows")
+        .byte_rate("logical rate", logical_total, elapsed)
+        .byte_rate("on-disk read", disk_total, elapsed);
+    report.emit(json);
+    Ok(())
+}
diff --git a/probing/cli/src/cli/bench/runners/common.rs b/probing/cli/src/cli/bench/runners/common.rs
new file mode 100644
index 00000000..cdb5f846
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/common.rs
@@ -0,0 +1,167 @@
+//! Shared helpers: unique names/paths, temp dirs, and hot-ring population.
+
+use std::io;
+use std::path::PathBuf;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+
+use anyhow::Result;
+use probing_memtable::memc::{ColdStore, Compactor, CompactorConfig};
+use probing_memtable::{DType, MemTable};
+
+use crate::cli::bench::args::RingArgs;
+use crate::cli::bench::workload::{RowGen, WorkloadSpec};
+
+/// Descriptor for attaching to an already-created shared table (used by the
+/// multi-handle runners). There is no heap variant: heap cannot be shared.
+#[derive(Clone)]
+pub enum Attach {
+    Shm(String),
+    File(PathBuf),
+}
+
+impl Attach {
+    /// Open a handle onto the table this descriptor names.
+    pub fn open(&self) -> io::Result<MemTable> {
+        match self {
+            Self::Shm(name) => MemTable::open_shm(name),
+            Self::File(path) => MemTable::open_file(path),
+        }
+    }
+
+    /// Serialize for passing to a child process (`shm:<name>` / `file:<path>`).
+    pub fn encode(&self) -> String {
+        match self {
+            Self::Shm(name) => format!("shm:{name}"),
+            Self::File(path) => format!("file:{}", path.display()),
+        }
+    }
+
+    /// Parse a descriptor written by [`Attach::encode`]; errors on any other prefix.
+    pub fn parse(s: &str) -> Result<Self> {
+        match s.split_once(':') {
+            Some(("shm", name)) => Ok(Self::Shm(name.to_string())),
+            Some(("file", path)) => Ok(Self::File(PathBuf::from(path))),
+            _ => anyhow::bail!("invalid attach descriptor: {s}"),
+        }
+    }
+}
+
+/// Walk every resident row of `table` once with a cursor, reading each column
+/// per `dtypes` and folding it into a sink. Returns `(value_sink, row_count)`.
+pub fn scan_all(table: &MemTable, dtypes: &[DType]) -> (u64, u64) {
+    let (mut sink, mut rows) = (0u64, 0u64);
+    for chunk in table.chunks_logical() {
+        for row in table.rows(chunk) {
+            let mut cur = row.cursor();
+            for dt in dtypes {
+                let folded = match dt {
+                    DType::U8 => cur.next_u8() as u64,
+                    DType::U32 => cur.next_u32() as u64,
+                    DType::I32 => cur.next_i32() as u64,
+                    DType::I64 => cur.next_i64() as u64,
+                    DType::U64 => cur.next_u64(),
+                    DType::F32 => cur.next_f32().to_bits() as u64,
+                    DType::F64 => cur.next_f64().to_bits(),
+                    DType::Str => cur.next_str().len() as u64,
+                    DType::Bytes => cur.next_bytes().len() as u64,
+                };
+                sink = sink.wrapping_add(folded);
+            }
+            rows += 1;
+        }
+    }
+    (sink, rows)
+}
+
+/// Naming token `<pid>-<n>` for temp files / shm objects, where `<n>` is the
+/// wall-clock nanosecond count modulo 1_000_000 (0 if the clock read fails).
+pub fn unique_token() -> String {
+    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH);
+    let nanos = since_epoch.map_or(0, |d| d.as_nanos());
+    let pid = std::process::id();
+    format!("{pid}-{}", nanos % 1_000_000)
+}
+
+/// Path `<tmp>/probing-bench-<label>-<token>.memt`; the file is not created.
+pub fn temp_path(label: &str) -> PathBuf {
+    std::env::temp_dir().join(format!("probing-bench-{label}-{}.memt", unique_token()))
+}
+
+/// Create the directory `<tmp>/probing-bench-<label>-<token>` and return its path.
+pub fn temp_dir(label: &str) -> Result<PathBuf> {
+    let dir = std::env::temp_dir().join(format!("probing-bench-{label}-{}", unique_token()));
+    std::fs::create_dir_all(&dir)?;
+    Ok(dir)
+}
+
+/// POSIX shm name `/pb<n>`, with `<n>` the wall-clock nanoseconds modulo 1e9.
+///
+/// At most 12 bytes long, so it stays under macOS' 31-byte cap.
+pub fn shm_name() -> String {
+    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH);
+    let nanos = since_epoch.map_or(0, |d| d.as_nanos());
+    format!("/pb{}", nanos % 1_000_000_000)
+}
+
+/// Fill a hot table with `rows` generated rows via `push_row_unchecked`,
+/// returning `rows`. Intended for setup paths where ingest speed is not the
+/// measured quantity.
+pub fn populate(table: &mut MemTable, spec: &WorkloadSpec, rows: u64, seed: u64) -> u64 {
+    let mut gen = RowGen::new(spec.clone(), seed, 0);
+    let mut scratch: Vec<f64> = Vec::new();
+    for _ in 0..rows {
+        let values = gen.values(&mut scratch);
+        table.push_row_unchecked(&values);
+    }
+    rows
+}
+
+/// Generate `rows` rows and compact them into MEMC segments under `dir`.
+/// Drains are interleaved with ingest so the ring never overwrites undrained
+/// chunks. Returns the number of rows actually drained to cold.
+pub fn build_cold(
+    dir: &std::path::Path,
+    spec: &WorkloadSpec,
+    ring: &RingArgs,
+    rows: u64,
+    target_mb: u64,
+    seed: u64,
+) -> Result<u64> {
+    let row_bytes = spec.approx_row_bytes() as u64;
+    let mut table = MemTable::new(&spec.schema(), ring.chunk_size, ring.chunks);
+    let mut compactor = Compactor::new(
+        ColdStore::open(dir)?,
+        CompactorConfig {
+            target_segment_bytes: target_mb * 1024 * 1024,
+            max_segment_age: Duration::from_secs(3600),
+            poll_interval: Duration::from_millis(1),
+            max_total_bytes: None,
+            ttl: None,
+        },
+    );
+
+    let rows_per_chunk = ((ring.chunk_size as u64).saturating_sub(40)) / (row_bytes + 4).max(1);
+    let batch = (rows_per_chunk * (ring.chunks as u64 / 2).max(1)).max(1);
+
+    let mut gen = RowGen::new(spec.clone(), seed, 0);
+    let mut scratch: Vec<f64> = Vec::new();
+    let (mut ingested, mut drained) = (0u64, 0u64);
+    while ingested < rows {
+        let take = batch.min(rows - ingested);
+        for _ in 0..take {
+            let values = gen.values(&mut scratch);
+            table.push_row_unchecked(&values);
+        }
+        ingested += take;
+        drained += compactor.drain_view("bench", &table.view())? as u64;
+    }
+    // Keep draining until a pass moves nothing, then flush the compactor.
+    loop {
+        match compactor.drain_view("bench", &table.view())? as u64 {
+            0 => break,
+            more => drained += more,
+        }
+    }
+    compactor.flush()?;
+    Ok(drained)
+}
diff --git a/probing/cli/src/cli/bench/runners/compact.rs b/probing/cli/src/cli/bench/runners/compact.rs
new file mode 100644
index 00000000..56d5a6d7
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/compact.rs
@@ -0,0 +1,118 @@
+//! `compact` — cold-tier roller throughput and hot→cold compression ratio.
+//!
+//! Ingest is interleaved with drain passes so sealed chunks are compacted
+//! before the ring can recycle them; we time the drain work separately from
+//! the end-to-end wall clock.
+
+use std::time::{Duration, Instant};
+
+use anyhow::Result;
+use probing_memtable::memc::{ColdStore, Compactor, CompactorConfig};
+use probing_memtable::MemTable;
+
+use crate::cli::bench::args::CompactArgs;
+use crate::cli::bench::metrics::Report;
+use crate::cli::bench::workload::RowGen;
+
+/// Run the `compact` benchmark. A temp cold dir (no `--dir`, no `--keep`) is
+/// removed afterwards even when the run fails.
+pub fn run(args: &CompactArgs, json: bool, seed: u64) -> Result<()> {
+    let dir = match &args.dir {
+        Some(d) => {
+            std::fs::create_dir_all(d)?;
+            d.clone()
+        }
+        None => super::common::temp_dir("compact")?,
+    };
+    let outcome = measure(args, &dir, json, seed);
+    if args.dir.is_none() && !args.keep {
+        let _ = std::fs::remove_dir_all(&dir);
+    } else {
+        report_dir(&dir, json);
+    }
+    outcome
+}
+
+/// Ingest and drain into a cold store under `dir`, then emit the report.
+fn measure(args: &CompactArgs, dir: &std::path::Path, json: bool, seed: u64) -> Result<()> {
+    let spec = args.schema.spec();
+    let row_bytes = spec.approx_row_bytes() as u64;
+    let mut table = MemTable::new(&spec.schema(), args.ring.chunk_size, args.ring.chunks);
+    let store = ColdStore::open(dir)?;
+    let config = CompactorConfig {
+        target_segment_bytes: args.target_mb * 1024 * 1024,
+        max_segment_age: Duration::from_secs(3600),
+        poll_interval: Duration::from_millis(1),
+        max_total_bytes: None,
+        ttl: None,
+    };
+    let mut compactor = Compactor::new(store, config);
+
+    // Drain every ~half-ring worth of rows so undrained sealed chunks never
+    // exceed ring capacity.
+    let rows_per_chunk =
+        ((args.ring.chunk_size as u64).saturating_sub(40)) / (row_bytes + 4).max(1);
+    let batch = (rows_per_chunk * (args.ring.chunks as u64 / 2).max(1)).max(1);
+
+    let mut gen = RowGen::new(spec.clone(), seed, 0);
+    let mut scratch: Vec<f64> = Vec::new();
+    let (mut ingested, mut drained) = (0u64, 0u64);
+    let mut drain_time = Duration::ZERO;
+    let wall = Instant::now();
+    while ingested < args.rows {
+        let n = batch.min(args.rows - ingested);
+        for _ in 0..n {
+            let values = gen.values(&mut scratch);
+            table.push_row_unchecked(&values);
+        }
+        ingested += n;
+        let t = Instant::now();
+        drained += compactor.drain_view("bench", &table.view())? as u64;
+        drain_time += t.elapsed();
+    }
+
+    // Final drains (sealed-but-not-yet-drained chunks) + seal the tail.
+    loop {
+        let t = Instant::now();
+        let n = compactor.drain_view("bench", &table.view())? as u64;
+        drain_time += t.elapsed();
+        drained += n;
+        if n == 0 {
+            break;
+        }
+    }
+    let t = Instant::now();
+    compactor.flush()?;
+    drain_time += t.elapsed();
+    let wall = wall.elapsed();
+    let stats = compactor.stats();
+    let logical = drained * row_bytes;
+    let ratio = if stats.total_bytes > 0 {
+        logical as f64 / stats.total_bytes as f64
+    } else {
+        0.0
+    };
+
+    let mut report = Report::new(format!("compact · {:?}", args.schema.schema));
+    report
+        .text("schema", format!("{:?}", args.schema.schema))
+        .count("rows ingested", ingested)
+        .count("rows drained", drained)
+        .count("cold segments", stats.segment_count as u64)
+        .bytes("hot logical", logical)
+        .bytes("cold on-disk", stats.total_bytes)
+        .ratio("compression", ratio)
+        .duration("drain time", drain_time)
+        .duration("wall time", wall)
+        .rate("compact rate", drained, drain_time, "rows")
+        .byte_rate("cold write rate", stats.total_bytes, drain_time);
+    report.emit(json);
+    Ok(())
+}
+
+/// Print the cold directory's path, unless JSON output was requested.
+fn report_dir(dir: &std::path::Path, json: bool) {
+    if !json {
+        println!("  cold dir: {}", dir.display());
+    }
+}
diff --git a/probing/cli/src/cli/bench/runners/mixed.rs b/probing/cli/src/cli/bench/runners/mixed.rs
new file mode 100644
index 00000000..4b92d666
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/mixed.rs
@@ -0,0 +1,184 @@
+//! `mixed` — end-to-end pipeline / soak: concurrent writers, optional
+//! background compactor, and concurrent readers over one shared table for a
+//! fixed duration. Reports per-role throughput plus the resulting cold-tier
+//! footprint.
+
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use anyhow::{bail, Result};
+use probing_memtable::memc::{ColdStore, Compactor, CompactorConfig};
+use probing_memtable::{DType, MemTable};
+
+use crate::cli::bench::args::{Backend, MixedArgs};
+use crate::cli::bench::metrics::Report;
+use super::common::{scan_all, shm_name, temp_dir, temp_path, unique_token, Attach};
+use crate::cli::bench::workload::RowGen;
+
+/// `mixed` runner: writer threads, an optional background compactor and reader threads
+/// share one table for `args.duration` seconds; reports per-role throughput and cold stats.
+pub fn run(args: &MixedArgs, json: bool, seed: u64) -> Result<()> {
+    let spec = args.schema.spec();
+    let row_bytes = spec.approx_row_bytes() as u64;
+    let writers = args.writers.max(1);
+    let readers = args.readers;
+
+    // Create the shared backing; keep the creator alive for the whole run.
+    let mut cleanup_file: Option<std::path::PathBuf> = None;
+    let (attach, _creator) = match args.backend {
+        Backend::Heap => bail!("mixed requires a shared backend (shm/file/shared), not heap"),
+        Backend::Shm => {
+            let name = shm_name();
+            let creator =
+                MemTable::shm(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            (Attach::Shm(name), creator)
+        }
+        Backend::File => {
+            let path = temp_path("mixed");
+            cleanup_file = Some(path.clone());
+            let creator =
+                MemTable::file_at(&path, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            (Attach::File(path), creator)
+        }
+        Backend::Shared => {
+            let name = format!("bench-{}", unique_token());
+            let creator =
+                MemTable::shared(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            let path = creator.path().expect("shared path").to_path_buf();
+            (Attach::File(path), creator)
+        }
+    };
+
+    let dtypes: Vec<DType> = spec.schema().cols.iter().map(|c| c.dtype).collect();
+
+    let stop = Arc::new(AtomicBool::new(false));
+    let write_rows = Arc::new(AtomicU64::new(0));
+    let read_rows = Arc::new(AtomicU64::new(0));
+    let read_passes = Arc::new(AtomicU64::new(0));
+
+    // Background compactor (own read handle to the shared mapping).
+    let cold_dir = temp_dir("mixed-cold")?;
+    let compactor_handle = if args.no_compact {
+        None
+    } else {
+        let store = ColdStore::open(&cold_dir)?;
+        let config = CompactorConfig {
+            target_segment_bytes: args.target_mb * 1024 * 1024,
+            max_segment_age: Duration::from_secs(args.duration.max(1)),
+            poll_interval: Duration::from_millis(50),
+            max_total_bytes: args.max_total_mb.map(|m| m * 1024 * 1024),
+            ttl: args.ttl_secs.map(Duration::from_secs),
+        };
+        let handle = attach.open()?;
+        Some(Compactor::new(store, config).spawn(vec![("bench".to_string(), handle)]))
+    };
+
+    let mut threads = Vec::new();
+
+    for tid in 0..writers {
+        let attach = attach.clone();
+        let spec = spec.clone();
+        let stop = stop.clone();
+        let write_rows = write_rows.clone();
+        let seed = seed ^ (0x9E37_79B9_u64.wrapping_mul(tid as u64 + 1));
+        threads.push(std::thread::spawn(move || -> Result<()> {
+            let mut table = attach.open()?;
+            let mut gen = RowGen::new(spec.clone(), seed, (tid as i64) * 1_000_000_000);
+            let mut scratch: Vec<f64> = Vec::new();
+            let mut local = 0u64;
+            while !stop.load(Ordering::Relaxed) {
+                for _ in 0..256 {
+                    let values = gen.values(&mut scratch);
+                    table.push_row_unchecked(&values);
+                }
+                local += 256;
+            }
+            write_rows.fetch_add(local, Ordering::Relaxed);
+            Ok(())
+        }));
+    }
+
+    for _ in 0..readers {
+        let attach = attach.clone();
+        let stop = stop.clone();
+        let read_rows = read_rows.clone();
+        let read_passes = read_passes.clone();
+        let dtypes = dtypes.clone();
+        threads.push(std::thread::spawn(move || -> Result<()> {
+            let table = attach.open()?;
+            let mut rows = 0u64;
+            let mut passes = 0u64;
+            let mut sink = 0u64;
+            while !stop.load(Ordering::Relaxed) {
+                let (s, n) = scan_all(&table, &dtypes);
+                sink = sink.wrapping_add(s);
+                rows += n;
+                passes += 1;
+            }
+            std::hint::black_box(sink);
+            read_rows.fetch_add(rows, Ordering::Relaxed);
+            read_passes.fetch_add(passes, Ordering::Relaxed);
+            Ok(())
+        }));
+    }
+
+    let start = Instant::now();
+    std::thread::sleep(Duration::from_secs(args.duration.max(1)));
+    stop.store(true, Ordering::Relaxed);
+    // Join all workers before surfacing an error, so the stop + cleanup below always run.
+    let joined: Vec<Result<()>> = threads.into_iter().map(|t| t.join().unwrap()).collect();
+    let elapsed = start.elapsed();
+
+    if let Some(h) = compactor_handle {
+        h.stop();
+    }
+
+    let total_writes = write_rows.load(Ordering::Relaxed);
+    let total_reads = read_rows.load(Ordering::Relaxed);
+    let passes = read_passes.load(Ordering::Relaxed);
+
+    let cold = if args.no_compact {
+        None
+    } else {
+        ColdStore::open(&cold_dir).ok().map(|s| s.stats())
+    };
+
+    if let Some(p) = cleanup_file {
+        let _ = std::fs::remove_file(p);
+    }
+    let _ = std::fs::remove_dir_all(&cold_dir);
+    joined.into_iter().collect::<Result<()>>()?;
+
+    let mut report = Report::new(format!("mixed · {:?} · {:?}", args.backend, args.schema.schema));
+    report
+        .text("backend", format!("{:?}", args.backend))
+        .text("schema", format!("{:?}", args.schema.schema))
+        .count("writers", writers as u64)
+        .count("readers", readers as u64)
+        .text("compactor", if args.no_compact { "off" } else { "on" })
+        .duration("duration", elapsed)
+        .count("rows written", total_writes)
+        .rate("write rate", total_writes, elapsed, "rows")
+        .byte_rate("write bw", total_writes * row_bytes, elapsed);
+    if readers > 0 {
+        report
+            .count("scan passes", passes)
+            .count("rows scanned", total_reads)
+            .rate("read rate", total_reads, elapsed, "rows");
+    }
+    if let Some(c) = cold {
+        let logical = total_writes * row_bytes;
+        let ratio = if c.total_bytes > 0 {
+            logical as f64 / c.total_bytes as f64
+        } else {
+            0.0
+        };
+        report
+            .count("cold segments", c.segment_count as u64)
+            .bytes("cold on-disk", c.total_bytes)
+            .ratio("compression*", ratio);
+    }
+    report.emit(json);
+    Ok(())
+}
diff --git a/probing/cli/src/cli/bench/runners/mod.rs b/probing/cli/src/cli/bench/runners/mod.rs
new file mode 100644
index 00000000..0f9b0a37
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/mod.rs
@@ -0,0 +1,10 @@
+//! Workload runners. Each `run` function executes one `bench` subcommand
+//! and prints a [`Report`](super::metrics::Report).
+
+pub mod coldscan;
+pub mod common;
+pub mod compact;
+pub mod mixed;
+pub mod mp;
+pub mod scan;
+pub mod write;
diff --git a/probing/cli/src/cli/bench/runners/mp.rs b/probing/cli/src/cli/bench/runners/mp.rs
new file mode 100644
index 00000000..b5ba246e
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/mp.rs
@@ -0,0 +1,307 @@
+//! `mp` — fully multi-process, time-driven soak.
+//!
+//! The orchestrator process creates a shared table, then re-execs itself to
+//! spawn one (or more) writer processes and several reader processes, each
+//! attaching to the same mapping by name/path. Every worker runs for a fixed
+//! wall-clock window (synchronised by a shared start instant) and prints a
+//! one-line JSON result; the orchestrator aggregates them.
+//!
+//! This is the scenario the data layer is built for: independent OS processes
+//! contending on the in-buffer robust write lock (writers) while others read
+//! lock-free (readers) — the cross-process path threads cannot exercise.
+//!
+//! Worker vs. orchestrator is selected by the `PROBING_BENCH_MP_ROLE`
+//! environment variable, so the public surface stays a single `mp` command.
+
+use std::process::{Child, Command, Stdio};
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+
+use anyhow::{bail, Context, Result};
+use clap::ValueEnum;
+use probing_memtable::{DType, MemTable};
+
+use super::common::{scan_all, shm_name, temp_path, unique_token, Attach};
+use crate::cli::bench::args::{Backend, MpArgs};
+use crate::cli::bench::metrics::Report;
+use crate::cli::bench::workload::RowGen;
+
+const ENV_ROLE: &str = "PROBING_BENCH_MP_ROLE";
+const ENV_ATTACH: &str = "PROBING_BENCH_MP_ATTACH";
+const ENV_START_MS: &str = "PROBING_BENCH_MP_START_MS";
+
+/// Entry point for `mp`: runs as a worker when the role variable is set, else orchestrates.
+pub fn run(args: &MpArgs, json: bool, seed: u64) -> Result<()> {
+    // Any failure to read the variable (unset or not valid Unicode) selects the orchestrator.
+    let Ok(role) = std::env::var(ENV_ROLE) else { return orchestrate(args, json, seed) };
+    run_worker(args, &role, seed)
+}
+
+// ── orchestrator ───────────────────────────────────────────────────────
+
+/// Orchestrator half of `mp`: create the shared table, re-exec this binary once per
+/// worker, then sum the JSON result line each worker prints into a single report.
+fn orchestrate(args: &MpArgs, json: bool, seed: u64) -> Result<()> {
+    let spec = args.schema.spec();
+    let row_bytes = spec.approx_row_bytes() as u64;
+    // `max(1)` guarantees a writer, so the run always has at least one worker.
+    let writers = args.writers.max(1);
+    let readers = args.readers;
+
+    // Create the shared backing and keep it alive for the whole run.
+    let (attach, _creator) = match args.backend {
+        Backend::Heap => bail!("mp requires a shared backend (shm/file/shared), not heap"),
+        Backend::Shm => {
+            let name = shm_name();
+            let creator =
+                MemTable::shm(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            (Attach::Shm(name), creator)
+        }
+        Backend::File => {
+            let path = temp_path("mp");
+            let creator =
+                MemTable::file_at(&path, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            (Attach::File(path), creator)
+        }
+        Backend::Shared => {
+            let name = format!("mp-{}", unique_token());
+            let creator =
+                MemTable::shared(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            let path = creator.path().expect("shared path").to_path_buf();
+            (Attach::File(path), creator)
+        }
+    };
+
+    let exe = std::env::current_exe().context("resolve current executable")?;
+    let passthrough = passthrough_args(args);
+    // Give every child time to launch and attach before the measured window.
+    let start_ms = now_ms() + 1_000;
+
+    let mut children: Vec<(String, Child)> = Vec::with_capacity(writers + readers);
+    for i in 0..writers {
+        children.push((
+            "writer".into(),
+            spawn_worker(&exe, &passthrough, "writer", &attach, start_ms, seed ^ (i as u64 + 1))?,
+        ));
+    }
+    for i in 0..readers {
+        children.push((
+            "reader".into(),
+            spawn_worker(&exe, &passthrough, "reader", &attach, start_ms, seed ^ (0x100 + i as u64))?,
+        ));
+    }
+
+    // Collect results (each worker self-terminates after the window).
+    let mut write_rows = 0u64;
+    let mut read_rows = 0u64;
+    let mut read_passes = 0u64;
+    let mut max_elapsed = 0.0f64;
+    let mut failures = 0usize;
+
+    for (role, child) in children {
+        let out = child.wait_with_output().context("await worker")?;
+        if !out.status.success() {
+            failures += 1;
+            eprintln!("worker {role} exited with {:?}", out.status.code());
+            continue;
+        }
+        let stdout = String::from_utf8_lossy(&out.stdout);
+        let Some(line) = stdout.lines().rev().find(|l| !l.trim().is_empty()) else {
+            failures += 1;
+            eprintln!("worker {role} printed no result line");
+            continue;
+        };
+        // A malformed line is one failed worker, not a reason to abandon the remaining
+        // children (and the temp-file cleanup below) with an early return.
+        let Ok(v) = serde_json::from_str::<serde_json::Value>(line.trim()) else {
+            failures += 1;
+            eprintln!("worker {role} printed unparseable output: {line}");
+            continue;
+        };
+        let rows = v.get("rows").and_then(|x| x.as_u64()).unwrap_or(0);
+        let passes = v.get("passes").and_then(|x| x.as_u64()).unwrap_or(0);
+        let elapsed = v.get("elapsed_s").and_then(|x| x.as_f64()).unwrap_or(0.0);
+        max_elapsed = max_elapsed.max(elapsed);
+        match role.as_str() {
+            "writer" => write_rows += rows,
+            "reader" => {
+                read_rows += rows;
+                read_passes += passes;
+            }
+            _ => {}
+        }
+    }
+
+    let window = Duration::from_secs_f64(max_elapsed.max(1e-9));
+    let mut report = Report::new(format!("mp · {:?} · {:?}", args.backend, args.schema.schema));
+    report
+        .text("backend", format!("{:?}", args.backend))
+        .text("schema", format!("{:?}", args.schema.schema))
+        .count("writer procs", writers as u64)
+        .count("reader procs", readers as u64)
+        .duration("window", window)
+        .count("rows written", write_rows)
+        .rate("write rate", write_rows, window, "rows")
+        .byte_rate("write bw", write_rows * row_bytes, window);
+    if readers > 0 {
+        report
+            .count("scan passes", read_passes)
+            .count("rows scanned", read_rows)
+            .rate("read rate", read_rows, window, "rows")
+            .byte_rate("read bw", read_rows * row_bytes, window);
+    }
+    if failures > 0 {
+        report.count("failed workers", failures as u64);
+    }
+    report.emit(json);
+
+    // Only the `file` backend's temp file is removed here.
+    if let (Backend::File, Attach::File(p)) = (&args.backend, &attach) {
+        let _ = std::fs::remove_file(p);
+    }
+    if failures > 0 {
+        bail!("{failures} worker(s) failed");
+    }
+    Ok(())
+}
+
+/// Command-line tail that rebuilds the same table geometry in a child process.
+/// Role, attach target and start time are not included; they go via the environment.
+fn passthrough_args(args: &MpArgs) -> Vec<String> {
+    // Use clap's value names; the literal fallbacks apply when a variant reports none.
+    let schema_name = args
+        .schema
+        .schema
+        .to_possible_value()
+        .map_or_else(|| "metrics".to_string(), |v| v.get_name().to_string());
+    let backend_name = args
+        .backend
+        .to_possible_value()
+        .map_or_else(|| "shared".to_string(), |v| v.get_name().to_string());
+
+    // Flag/value pairs, in the order they appear on the child's command line.
+    let flags = [
+        ("--schema", schema_name),
+        ("--wide-cols", args.schema.wide_cols.to_string()),
+        ("--str-len", args.schema.str_len.to_string()),
+        ("--chunk-size", args.ring.chunk_size.to_string()),
+        ("--chunks", args.ring.chunks.to_string()),
+        ("--backend", backend_name),
+        ("--duration", args.duration.to_string()),
+    ];
+
+    // Subcommand path first, then every flag immediately followed by its value.
+    let mut argv = vec!["bench".to_string(), "mp".to_string()];
+    for (flag, value) in flags {
+        argv.push(flag.to_string());
+        argv.push(value);
+    }
+    argv
+}
+
+/// Re-exec `exe` as one `role` worker whose stdout is piped back to the caller.
+fn spawn_worker(
+    exe: &std::path::Path,
+    passthrough: &[String],
+    role: &str,
+    attach: &Attach,
+    start_ms: u128,
+    seed: u64,
+) -> Result<Child> {
+    let mut cmd = Command::new(exe);
+    cmd.args(passthrough).arg("--seed").arg(seed.to_string());
+    // Role, attach target and start instant travel through the environment.
+    cmd.env(ENV_ROLE, role)
+        .env(ENV_ATTACH, attach.encode())
+        .env(ENV_START_MS, start_ms.to_string());
+    // stdout is captured by the parent; stderr is shared with it.
+    cmd.stdout(Stdio::piped()).stderr(Stdio::inherit());
+    cmd.spawn().with_context(|| format!("spawn {role} worker"))
+}
+
+// ── worker ───────────────────────────────────────────────────────────────
+
+/// Worker half of `mp`: attach to the shared table, wait for the common start instant, run
+/// the `role` loop for the configured duration, then print one JSON result line on stdout.
+fn run_worker(args: &MpArgs, role: &str, seed: u64) -> Result<()> {
+    let attach = Attach::parse(&std::env::var(ENV_ATTACH).context("missing attach env")?)?;
+    let start_ms: u128 = std::env::var(ENV_START_MS)
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or_else(now_ms);
+    let duration = Duration::from_secs(args.duration.max(1));
+    let spec = args.schema.spec();
+
+    // Attach to the shared table (retry briefly in case of a startup race).
+    let mut table = open_with_retry(&attach)?;
+
+    spin_until(start_ms);
+    let t0 = Instant::now();
+    let (rows, passes) = match role {
+        "writer" => {
+            let mut gen = RowGen::new(spec.clone(), seed, (std::process::id() as i64) << 20);
+            let mut scratch: Vec<f64> = Vec::new();
+            let mut rows = 0u64;
+            while t0.elapsed() < duration {
+                for _ in 0..256 { // the clock is only checked once per 256-row batch
+                    let values = gen.values(&mut scratch);
+                    table.push_row_unchecked(&values);
+                }
+                rows += 256;
+            }
+            (rows, 0u64)
+        }
+        "reader" => {
+            let dtypes: Vec<DType> = spec.schema().cols.iter().map(|c| c.dtype).collect();
+            let mut rows = 0u64;
+            let mut passes = 0u64;
+            let mut sink = 0u64;
+            while t0.elapsed() < duration {
+                let (s, n) = scan_all(&table, &dtypes);
+                sink = sink.wrapping_add(s);
+                rows += n;
+                passes += 1;
+            }
+            std::hint::black_box(sink);
+            (rows, passes)
+        }
+        other => bail!("unknown worker role: {other}"),
+    };
+    let elapsed = t0.elapsed().as_secs_f64();
+
+    let out = serde_json::json!({
+        "role": role,
+        "pid": std::process::id(),
+        "rows": rows,
+        "passes": passes,
+        "elapsed_s": elapsed,
+    });
+    println!("{out}");
+    Ok(())
+}
+
+fn open_with_retry(attach: &Attach) -> Result<MemTable> {
+    let deadline = Instant::now() + Duration::from_secs(5);
+    loop {
+        match attach.open() {
+            Ok(t) => return Ok(t),
+            Err(e) if Instant::now() < deadline => {
+                let _ = e;
+                std::thread::sleep(Duration::from_millis(10));
+            }
+            Err(e) => return Err(e).context("attach to shared table"),
+        }
+    }
+}
+
+/// Wall-clock milliseconds since the Unix epoch; 0 if the clock cannot be compared to it.
+fn now_ms() -> u128 {
+    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH);
+    // `duration_since` fails only when the clock reads earlier than the epoch.
+    since_epoch.map_or(0, |d| d.as_millis())
+}
+
+/// Block until `now_ms()` reaches `start_ms`, sleeping 1 ms between checks.
+fn spin_until(start_ms: u128) {
+    let tick = Duration::from_millis(1);
+    while now_ms() < start_ms { std::thread::sleep(tick); }
+}
diff --git a/probing/cli/src/cli/bench/runners/scan.rs b/probing/cli/src/cli/bench/runners/scan.rs
new file mode 100644
index 00000000..01ebdc2c
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/scan.rs
@@ -0,0 +1,51 @@
+//! `scan` — sequential read throughput over a populated hot ring.
+//!
+//! Reads every committed row in logical (oldest→newest) order through the
+//! O(1)-per-column cursor, folding values into a sink so the work is not
+//! optimised away.
+
+use std::time::Instant;
+
+use anyhow::Result;
+use probing_memtable::{DType, MemTable};
+
+use crate::cli::bench::args::ScanArgs;
+use crate::cli::bench::metrics::Report;
+use super::common::{populate, scan_all};
+
+/// `scan` runner: fill a `MemTable::new` table, then time `iters` full scans over it.
+pub fn run(args: &ScanArgs, json: bool, seed: u64) -> Result<()> {
+    let spec = args.schema.spec();
+    let mut table = MemTable::new(&spec.schema(), args.ring.chunk_size, args.ring.chunks);
+    populate(&mut table, &spec, args.rows, seed);
+
+    let dtypes: Vec<DType> = (0..table.num_cols())
+        .map(|col| table.col_dtype(col).expect("known dtype"))
+        .collect();
+
+    // Untimed warm-up pass; its row count is how many rows survived the ring.
+    let (_, resident_rows) = scan_all(&table, &dtypes);
+
+    let passes = args.iters.max(1);
+    let timer = Instant::now();
+    // Fold every pass's checksum into one value so the scans cannot be optimised away.
+    let sink = (0..passes).fold(0u64, |acc, _| acc.wrapping_add(scan_all(&table, &dtypes).0));
+    let elapsed = timer.elapsed();
+    std::hint::black_box(sink);
+
+    // Totals assume each timed pass covers the same rows the warm-up pass counted.
+    let scanned_rows = resident_rows * passes as u64;
+    let scanned_bytes = scanned_rows * spec.approx_row_bytes() as u64;
+
+    let mut report = Report::new(format!("scan · {:?}", args.schema.schema));
+    report
+        .text("schema", format!("{:?}", args.schema.schema))
+        .count("rows ingested", args.rows)
+        .count("rows resident", resident_rows)
+        .count("scan passes", passes as u64)
+        .duration("elapsed", elapsed)
+        .rate("throughput", scanned_rows, elapsed, "rows")
+        .byte_rate("bandwidth", scanned_bytes, elapsed);
+    report.emit(json);
+    Ok(())
+}
diff --git a/probing/cli/src/cli/bench/runners/write.rs b/probing/cli/src/cli/bench/runners/write.rs
new file mode 100644
index 00000000..16fd4aa4
--- /dev/null
+++ b/probing/cli/src/cli/bench/runners/write.rs
@@ -0,0 +1,198 @@
+//! `write` — write throughput across backends, writer counts and APIs.
+//!
+//! With `--threads > 1` on a shared backend (`shm`/`file`/`shared`) every
+//! thread opens its own handle to the same mapping, so the run genuinely
+//! contends on the in-buffer robust write lock. The `heap` backend cannot be
+//! shared, so multi-threaded heap runs use independent per-thread tables
+//! (parallel throughput, no lock contention).
+
+use std::sync::Barrier;
+use std::time::Instant;
+
+use anyhow::{bail, Result};
+use probing_memtable::MemTable;
+
+use super::common;
+use crate::cli::bench::args::{Backend, RingArgs, WriteArgs, WriterMode};
+use crate::cli::bench::metrics::{Latency, Report};
+use crate::cli::bench::workload::{RowGen, WorkloadSpec};
+
+/// How a worker thread obtains its table handle.
+enum Source {
+    Heap, // no shared backing: `open_handle` builds a fresh `MemTable::new` per call
+    Shm(String), // name handed to `MemTable::open_shm`
+    File(std::path::PathBuf), // path handed to `MemTable::open_file`
+}
+
+struct WorkerOut { // what one worker thread hands back after its measured writes
+    rows: u64, // value returned by the measured `run_rows` call
+    bytes: u64, // `rows * spec.approx_row_bytes()`
+    latency: Option<Latency>, // `Some` only when `args.latency` is set
+}
+
+/// `write` runner: split `args.rows` row writes across `args.threads` workers and time them.
+/// Workers warm up, rendezvous on a barrier, then write their share inside the timed window.
+pub fn run(args: &WriteArgs, json: bool, seed: u64) -> Result<()> {
+    let spec = args.schema.spec();
+    let threads = args.threads.max(1);
+
+    if args.writer == WriterMode::Streaming && threads > 1 {
+        bail!("--writer streaming requires --threads 1 (advance-on-overflow is not concurrency-safe)");
+    }
+    if threads > 1 && args.backend == Backend::Heap {
+        eprintln!(
+            "note: heap backend cannot be shared; --threads {threads} uses independent \
+             per-thread tables (no lock contention)"
+        );
+    }
+
+    // Set up the backing for shared backends; keep the creator handle alive
+    // for the whole run so attached worker handles stay valid.
+    let mut cleanup_file: Option<std::path::PathBuf> = None;
+    let (source, _creator) = match args.backend {
+        Backend::Heap => (Source::Heap, None),
+        Backend::Shm => {
+            let name = common::shm_name();
+            let creator = MemTable::shm(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            (Source::Shm(name), Some(creator))
+        }
+        Backend::File => {
+            let path = args.path.clone().unwrap_or_else(|| common::temp_path("write"));
+            if args.path.is_none() {
+                cleanup_file = Some(path.clone());
+            }
+            let creator =
+                MemTable::file_at(&path, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            (Source::File(path), Some(creator))
+        }
+        Backend::Shared => {
+            let name = format!("bench-{}", common::unique_token());
+            let creator =
+                MemTable::shared(&name, &spec.schema(), args.ring.chunk_size, args.ring.chunks)?;
+            let path = creator.path().expect("shared table has a path").to_path_buf();
+            (Source::File(path), Some(creator))
+        }
+    };
+
+    let per_thread = args.rows / threads as u64;
+    let remainder = args.rows % threads as u64;
+    let barrier = Barrier::new(threads + 1);
+    let lat_cap = if args.latency { 1 << 16 } else { 0 };
+
+    let (outs, elapsed) = std::thread::scope(|scope| {
+        let mut handles = Vec::with_capacity(threads);
+        for tid in 0..threads {
+            let rows = per_thread + if (tid as u64) < remainder { 1 } else { 0 };
+            let spec = spec.clone();
+            let source = &source;
+            let barrier = &barrier;
+            let ring = args.ring.clone();
+            let writer = args.writer;
+            let warmup = args.warmup;
+            handles.push(scope.spawn(move || -> Result<WorkerOut> {
+                let seed = seed ^ (0x9E37_79B9_u64.wrapping_mul(tid as u64 + 1));
+                // Distinct time windows per writer.
+                let start_ts = (tid as i64) * 1_000_000_000;
+                let mut gen = RowGen::new(spec.clone(), seed, start_ts);
+
+                // Open + warm up fallibly but always reach the barrier: it counts every worker,
+                // so bailing out before `wait()` would hang the main thread and the others.
+                let prepared = open_handle(source, &spec, &ring).map(|mut table| {
+                    run_rows(&mut table, &mut gen, writer, warmup, &mut None);
+                    table
+                });
+                barrier.wait();
+                let mut table = prepared?;
+                let mut lat = (lat_cap > 0).then(|| Latency::new(lat_cap));
+                let written = run_rows(&mut table, &mut gen, writer, rows, &mut lat);
+                Ok(WorkerOut {
+                    rows: written,
+                    bytes: written * spec.approx_row_bytes() as u64,
+                    latency: lat,
+                })
+            }));
+        }
+
+        // Release the workers together, then time the full write window.
+        barrier.wait();
+        let start = Instant::now();
+        let outs: Vec<Result<WorkerOut>> = handles.into_iter().map(|h| h.join().unwrap()).collect();
+        (outs, start.elapsed())
+    });
+
+    // Drop the temp file before `?` below can return, so a failed worker does not leak it.
+    if let Some(p) = cleanup_file {
+        let _ = std::fs::remove_file(p);
+    }
+    let mut total_rows = 0u64;
+    let mut total_bytes = 0u64;
+    let mut merged = Latency::new(lat_cap.max(1));
+    for o in outs {
+        let o = o?;
+        total_rows += o.rows;
+        total_bytes += o.bytes;
+        if let Some(l) = o.latency {
+            merged.merge(&l);
+        }
+    }
+
+    let mut report = Report::new(format!("write · {:?} · {:?}", args.backend, args.schema.schema));
+    report
+        .text("backend", format!("{:?}", args.backend))
+        .text("schema", format!("{:?}", args.schema.schema))
+        .text("writer", format!("{:?}", args.writer))
+        .count("threads", threads as u64)
+        .count("rows", total_rows)
+        .duration("elapsed", elapsed)
+        .rate("throughput", total_rows, elapsed, "rows")
+        .byte_rate("bandwidth", total_bytes, elapsed)
+        .rate("per-thread", total_rows / threads as u64, elapsed, "rows");
+    if args.latency {
+        report.latency("latency", &merged);
+    }
+    report.emit(json);
+    Ok(())
+}
+
+fn open_handle(source: &Source, spec: &WorkloadSpec, ring: &RingArgs) -> Result<MemTable> {
+    match source { // one arm per way a worker can obtain its table handle
+        Source::Heap => Ok(MemTable::new(&spec.schema(), ring.chunk_size, ring.chunks)),
+        Source::Shm(shm_name) => Ok(MemTable::open_shm(shm_name)?),
+        Source::File(file_path) => Ok(MemTable::open_file(file_path)?),
+    }
+}
+
+/// Write `rows` rows, optionally recording per-row latency. Returns rows actually written.
+fn run_rows(
+    table: &mut MemTable,
+    gen: &mut RowGen,
+    mode: WriterMode,
+    rows: u64,
+    lat: &mut Option<Latency>,
+) -> u64 {
+    let mut scratch: Vec<f64> = Vec::new();
+    let mut dropped = 0u64; // streaming rows that failed again after `advance_chunk`
+    for _ in 0..rows {
+        let t = lat.as_ref().map(|_| Instant::now());
+        match mode {
+            WriterMode::Push => {
+                let values = gen.values(&mut scratch);
+                table.push_row_unchecked(&values);
+            }
+            WriterMode::Streaming => {
+                let ok = {
+                    let mut w = table.row_writer();
+                    gen.write_into(&mut w)
+                };
+                if !ok { // first attempt failed: advance, retry once; a second failure is a lost row
+                    table.advance_chunk();
+                    dropped += u64::from(!gen.write_into(&mut table.row_writer()));
+                }
+            }
+        }
+        if let (Some(l), Some(t)) = (lat.as_mut(), t) {
+            l.record(t.elapsed().as_nanos() as u64);
+        }
+    }
+    rows - dropped
+}
diff --git a/probing/cli/src/cli/bench/workload.rs b/probing/cli/src/cli/bench/workload.rs
new file mode 100644
index 00000000..b073f9ea
--- /dev/null
+++ b/probing/cli/src/cli/bench/workload.rs
@@ -0,0 +1,189 @@
+//! Synthetic schemas and deterministic row generation.
+//!
+//! Generators are driven by a seedable xorshift PRNG so a run is fully
+//! reproducible given `--seed`. The timestamp column is named `timestamp`
+//! (recognised by the memtable as the designated time column) and is kept
+//! monotonically increasing, which is both realistic for observability data
+//! and the case Pco compresses best.
+
+use std::str::FromStr;
+
+use probing_memtable::{DType, RowWriter, Schema, Value};
+
+/// Built-in column layouts covering the main compression / width regimes.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
+pub enum SchemaKind {
+    /// `timestamp:i64, value:f64, tag:u32` — narrow numeric, the common
+    /// metrics shape; compresses very well in the cold tier.
+    Metrics,
+    /// `timestamp:i64` + N `f64` columns — wide numeric rows.
+    Wide, // N is `WorkloadSpec::wide_cols`; the columns are named `f0`, `f1`, …
+    /// `timestamp:i64, level:u32, msg:str` — variable-length string payload
+    /// (no Pco, exercises the raw var-len path).
+    Logs,
+}
+
+impl FromStr for SchemaKind {
+    type Err = String;
+    /// Case-insensitive parse of `metrics`, `wide` or `logs`.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let lowered = s.to_ascii_lowercase();
+        Ok(match lowered.as_str() {
+            "metrics" => Self::Metrics,
+            "wide" => Self::Wide,
+            "logs" => Self::Logs,
+            other => return Err(format!("unknown schema '{other}' (expected metrics|wide|logs)")),
+        })
+    }
+}
+
+/// Parameters that shape a generated workload.
+#[derive(Debug, Clone)]
+pub struct WorkloadSpec {
+    pub kind: SchemaKind, // selects the column layout built by `schema()`
+    /// Number of `f64` columns that follow `timestamp` for [`SchemaKind::Wide`].
+    pub wide_cols: usize,
+    /// Length of the `msg` payload (one ASCII byte per character) for [`SchemaKind::Logs`].
+    pub str_len: usize,
+}
+
+impl WorkloadSpec {
+    /// Column layout for this workload. Every kind starts with a `timestamp:i64`
+    /// column; `Wide` then appends `wide_cols` `f64` columns named `f0`, `f1`, ….
+    pub fn schema(&self) -> Schema {
+        let base = Schema::new().col("timestamp", DType::I64);
+        match self.kind {
+            SchemaKind::Metrics => base.col("value", DType::F64).col("tag", DType::U32),
+            // Fold the numbered columns onto the timestamp-only base, in index order.
+            SchemaKind::Wide => {
+                (0..self.wide_cols).fold(base, |acc, i| acc.col(&format!("f{i}"), DType::F64))
+            }
+            SchemaKind::Logs => base.col("level", DType::U32).col("msg", DType::Str),
+        }
+    }
+
+    /// Rough logical size in bytes of one encoded row, not counting the 4-byte
+    /// row-length prefix. Used to turn a row count into a logical byte rate.
+    pub fn approx_row_bytes(&self) -> usize {
+        const I64: usize = 8;
+        const F64: usize = 8;
+        const U32: usize = 4;
+        // Everything after the leading timestamp. The second `U32` for logs
+        // presumably models the string's length prefix.
+        let payload = match self.kind {
+            SchemaKind::Metrics => F64 + U32,
+            SchemaKind::Wide => self.wide_cols * F64,
+            SchemaKind::Logs => U32 + U32 + self.str_len,
+        };
+        I64 + payload
+    }
+}
+
+/// Deterministic per-thread row generator.
+pub struct RowGen {
+    spec: WorkloadSpec,
+    rng: u64, // PRNG state, advanced by every `next()` call
+    ts: i64, // last timestamp handed out; bumped by 1..=4 per generated row
+    msg: String, // reusable buffer holding the current logs `msg` payload
+}
+
+impl RowGen {
+    /// `seed` should differ per thread for independent streams (only `0` is remapped,
+    /// because xorshift needs non-zero state); `start_ts` offsets the monotonic
+    /// timestamp so concurrent streams don't fully overlap in time.
+    pub fn new(spec: WorkloadSpec, seed: u64, start_ts: i64) -> Self {
+        let str_len = spec.str_len;
+        Self {
+            spec,
+            rng: if seed == 0 { 0x9E37_79B9_7F4A_7C15 } else { seed },
+            ts: start_ts,
+            msg: String::with_capacity(str_len),
+        }
+    }
+
+    #[inline]
+    fn next(&mut self) -> u64 {
+        let mut x = self.rng; // xorshift64*: shifts 12/25/27, then a fixed odd multiplier
+        x ^= x >> 12;
+        x ^= x << 25;
+        x ^= x >> 27;
+        self.rng = x;
+        x.wrapping_mul(0x2545_F491_4F6C_DD1D)
+    }
+
+    /// Write one row through the streaming [`RowWriter`] fast path.
+    ///
+    /// Returns the value of [`RowWriter::finish`] — `false` means the row
+    /// did not fit the current chunk (the caller should advance and retry).
+    #[inline]
+    pub fn write_into(&mut self, w: &mut RowWriter) -> bool {
+        // Timestamp advances by a small positive jitter (1..=4): monotone,
+        // realistic, Pco-friendly.
+        self.ts += 1 + (self.next() & 0x3) as i64;
+        let ts = self.ts;
+        match self.spec.kind {
+            SchemaKind::Metrics => {
+                let v = (self.next() % 1_000_000) as f64 * 0.001;
+                w.put_i64(ts)
+                    .put_f64(v)
+                    .put_u32((self.next() % 1024) as u32)
+                    .finish()
+            }
+            SchemaKind::Wide => {
+                let mut wr = w.put_i64(ts);
+                for _ in 0..self.spec.wide_cols {
+                    let v = (self.next() % 1_000_000) as f64 * 0.001;
+                    wr = wr.put_f64(v);
+                }
+                wr.finish()
+            }
+            SchemaKind::Logs => {
+                self.fill_msg();
+                w.put_i64(ts)
+                    .put_u32((self.next() % 5) as u32)
+                    .put_str(&self.msg)
+                    .finish()
+            }
+        }
+    }
+
+    fn fill_msg(&mut self) {
+        self.msg.clear();
+        const ALPHABET: &[u8] = b"abcdefghijklmnopqrstuvwxyz0123456789 ";
+        for _ in 0..self.spec.str_len {
+            let c = ALPHABET[(self.next() as usize) % ALPHABET.len()];
+            self.msg.push(c as char);
+        }
+    }
+
+    /// Build a borrowed [`Value`] row for the `push_row` path, drawing from the PRNG
+    /// in the same order as [`Self::write_into`]. The result borrows `self.msg` for
+    /// the logs schema, so it must be consumed before the next call.
+    pub fn values<'a>(&'a mut self, scratch: &'a mut Vec<f64>) -> Vec<Value<'a>> {
+        self.ts += 1 + (self.next() & 0x3) as i64;
+        let ts = self.ts;
+        match self.spec.kind {
+            SchemaKind::Metrics => {
+                let v = (self.next() % 1_000_000) as f64 * 0.001;
+                vec![Value::I64(ts), Value::F64(v), Value::U32((self.next() % 1024) as u32)]
+            }
+            SchemaKind::Wide => {
+                scratch.clear();
+                for _ in 0..self.spec.wide_cols {
+                    scratch.push((self.next() % 1_000_000) as f64 * 0.001);
+                }
+                let mut row = Vec::with_capacity(1 + scratch.len());
+                row.push(Value::I64(ts));
+                for v in scratch.iter() {
+                    row.push(Value::F64(*v));
+                }
+                row
+            }
+            SchemaKind::Logs => {
+                self.fill_msg(); // message first, then level — matches `write_into`
+                let level = (self.next() % 5) as u32;
+                vec![Value::I64(ts), Value::U32(level), Value::Str(&self.msg)]
+            }
+        }
+    }
+}
diff --git a/probing/cli/src/cli/commands.rs b/probing/cli/src/cli/commands.rs
index 520593f0..b28f26c9 100644
--- a/probing/cli/src/cli/commands.rs
+++ b/probing/cli/src/cli/commands.rs
@@ -170,4 +170,8 @@ pub enum Commands {
     /// Access various storage backends
     #[command(subcommand = false, hide = true)]
     Store(StoreCommand),
+
+    /// Stress and benchmark the in-process data layer
+    #[command(hide = true)]
+    Bench(super::bench::BenchCommand),
 }
diff --git a/probing/cli/src/cli/mod.rs b/probing/cli/src/cli/mod.rs
index 26da682c..b900e672 100644
--- a/probing/cli/src/cli/mod.rs
+++ b/probing/cli/src/cli/mod.rs
@@ -2,6 +2,7 @@ use anyhow::Result;
 use clap::Parser;
 use probing_proto::prelude::Query;
 
+pub mod bench;
 pub mod commands;
 pub mod ctrl;
 pub mod repl;
@@ -75,6 +76,9 @@ impl Cli {
             Some(Commands::Store(cmd)) => {
                 return cmd.run().await;
             }
+            Some(Commands::Bench(cmd)) => {
+                return cmd.run();
+            }
             _ => {}
         }
 
@@ -170,6 +174,7 @@ impl Cli {
             Commands::Launch { .. }
             | Commands::List { .. }
             | Commands::Store(..)
+            | Commands::Bench(..)
             | Commands::External(..) => {
                 unreachable!("These commands should be handled in run() method")
             }
diff --git a/probing/core/Cargo.toml b/probing/core/Cargo.toml
index 8acd1bc8..dcf48226 100644
--- a/probing/core/Cargo.toml
+++ b/probing/core/Cargo.toml
@@ -12,6 +12,7 @@ crate-type = ["rlib"]
 [dependencies]
 probing-proto = { path = "../proto" }
 probing-macros = { path = "../macros" }
+probing-memtable = { path = "../memtable" }
 
 anyhow = { workspace = true }
 arrow = { workspace = true }
@@ -24,10 +25,13 @@ serde_json = { workspace = true }
 thiserror = { workspace = true }
 
 async-trait = "0.1.83"
-datafusion = { version = "47.0.0", default-features = false, features = [] }
+datafusion = { workspace = true }
 futures = "0.3.31"
 sled = "0.34.7"
 bincode = "1.3.3"
 uuid = { version = "1.0", features = ["v4", "serde"] }
 url = "2.5"
 libc = "0.2"
+
+[dev-dependencies]
+tempfile = "3.8"
diff --git a/probing/core/src/core/memtable_sql.rs b/probing/core/src/core/memtable_sql.rs
new file mode 100644
index 00000000..99d381ba
--- /dev/null
+++ b/probing/core/src/core/memtable_sql.rs
@@ -0,0 +1,2048 @@
+//! Mmap memtable ↔ SQL catalog integration.
+//!
+//! Exposes mmap'd memtable files (MEMT rings / MEMH hash tables) under
+//! `<data_dir>/<pid>/` as DataFusion tables. Shared by the server and the
+//! language extensions so that every data producer writes through
+//! `probing-memtable` and every consumer queries through this module.
+//!
+//! ## File → SQL mapping (no hard-coded product prefix)
+//!
+//! - **First `.` splits schema vs table** — `acme.actors` → schema `acme`, table `actors`;
+//!   `foo.bar.baz` → schema `foo`, table `bar.baz` (on-disk name is the full filename).
+//! - **No `.`** — exposed as `memtable.<filename>` (e.g. `metrics` → `memtable.metrics`).
+//!
+//! Schema head and table tail must be non-empty and consist of ASCII letters, digits and
+//! `_`; the table tail may also contain `.` (never `/` or `\`). Leading-dot names are ignored.
+//!
+//! ## Read semantics (ring tables)
+//!
+//! - Files are **mmap'd read-only** (no full-file heap copy); only touched
+//!   pages are faulted in.
+//! - Chunks are materialised in **logical (oldest → newest) write order**
+//!   via [`MemTableView::chunks_logical`], one Arrow `RecordBatch` per chunk.
+//! - Each chunk's `generation` is re-checked after reading: a chunk recycled
+//!   by the writer mid-read is **discarded** instead of surfacing torn rows.
+//! - When the table has a designated timestamp column, chunks whose
+//!   `[min_ts, max_ts]` range cannot satisfy the query's time predicates are
+//!   **pruned** before materialisation ([`RingMmapTable`]).
+
+use std::any::Any;
+use std::collections::{BTreeSet, HashSet};
+use std::panic::AssertUnwindSafe;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread::JoinHandle;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use once_cell::sync::Lazy;
+use datafusion::arrow::array::{
+    ArrayRef, BinaryArray, BinaryBuilder, Float32Array, Float32Builder, Float64Array,
+    Float64Builder, GenericStringBuilder, Int32Array, Int32Builder, Int64Array, Int64Builder,
+    RecordBatch, StringArray, UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, UInt8Array,
+    UInt8Builder,
+};
+use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::catalog::CatalogProvider;
+use datafusion::catalog::SchemaProvider;
+use datafusion::catalog::Session;
+use datafusion::datasource::{TableProvider, TableType};
+use datafusion::error::DataFusionError;
+use datafusion::error::Result as DfResult;
+use datafusion::logical_expr::{Expr, Operator, TableProviderFilterPushDown};
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::scalar::ScalarValue;
+
+use probing_memtable::discover::{default_dir, MappedFile};
+use probing_memtable::memc::{ColdStats, ColdStore, ColumnData, Compactor, CompactorConfig, SegmentReader};
+use probing_memtable::{detect_table, DType, MemTableView, MemhView, TableKind, TypedValue};
+
+use super::plugin_advanced::{scan_memory_partitions, supports_filters_pushdown_for_schema};
+use super::{
+    EngineCall, EngineDatasource, EngineError, EngineExtension, EngineExtensionOption, Maybe,
+    Plugin, PluginAdvancedTable, PluginType,
+};
+use probing_macros::EngineExtension as EngineExtensionDerive;
+
+/// SQL schema that mmap files with no `.` in their basename are exposed under.
+pub const DEFAULT_UNDOTTED_SCHEMA: &str = "memtable";
+
+fn self_dir() -> std::path::PathBuf {
+    default_dir().join(std::process::id().to_string()) // `<default_dir>/<pid>` of this process
+}
+
+/// Cold-segment directory for this process: `<data_dir>/<pid>/cold`.
+///
+/// This is the `cold` subdirectory of the per-process directory that holds the
+/// hot ring files, so cold data never mixes across processes, and the compactor
+/// writer and this read path agree on one location without extra configuration.
+pub fn cold_dir() -> std::path::PathBuf {
+    self_dir().join("cold")
+}
+
+#[inline]
+fn valid_schema_head(s: &str) -> bool {
+    !s.is_empty() && s.chars().all(|c| c == '_' || c.is_ascii_alphanumeric())
+}
+
+/// Table tail: non-empty, ASCII alphanumerics plus `_` and `.` only.
+#[inline]
+fn valid_table_tail(s: &str) -> bool {
+    // Path separators (`/`, `\`) are outside this allow-list, so a valid tail
+    // never contains one.
+    let allowed = |b: u8| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'.');
+    !s.is_empty() && s.bytes().all(allowed)
+}
+
+/// Route a file basename to its `(schema, table)` pair, or [`None`] to skip it.
+///
+/// Names starting with `.` are skipped. Otherwise the first `.` separates the
+/// schema from the table; a name without `.` goes to [`DEFAULT_UNDOTTED_SCHEMA`].
+pub fn classify_mmap_basename(filename: &str) -> Option<(String, String)> {
+    if filename.starts_with('.') {
+        return None;
+    }
+    let (schema, table) = match filename.split_once('.') {
+        Some((head, tail)) if valid_schema_head(head) && valid_table_tail(tail) => (head, tail),
+        Some(_) => return None,
+        // Undotted: the whole name is the table and must satisfy the head rules.
+        None if valid_schema_head(filename) => (DEFAULT_UNDOTTED_SCHEMA, filename),
+        None => return None,
+    };
+    Some((schema.to_string(), table.to_string()))
+}
+
+/// On-disk filename backing `schema.table`: the bare table name for
+/// [`DEFAULT_UNDOTTED_SCHEMA`], `<schema>.<table>` for any other schema.
+pub fn mmap_filename_for(schema: &str, table: &str) -> String {
+    match schema {
+        DEFAULT_UNDOTTED_SCHEMA => table.to_owned(),
+        _ => format!("{schema}.{table}"),
+    }
+}
+
+/// Table names in `target_schema`, derived from the file names in this
+/// process's data directory.
+///
+/// Non-file entries and basenames rejected by [`classify_mmap_basename`] are
+/// skipped. The result is sorted and de-duplicated; an unreadable directory
+/// yields an empty list.
+fn tables_in_schema(target_schema: &str) -> Vec<String> {
+    let Ok(entries) = std::fs::read_dir(self_dir()) else {
+        return Vec::new();
+    };
+    let mut tables: Vec<String> = entries
+        .flatten()
+        .filter(|entry| entry.path().is_file())
+        .filter_map(|entry| classify_mmap_basename(&entry.file_name().to_string_lossy()))
+        .filter(|(schema, _)| schema.as_str() == target_schema)
+        .map(|(_, table)| table)
+        .collect();
+    tables.sort();
+    tables.dedup();
+    tables
+}
+
+/// Every schema that currently has at least one mmap file in this process's
+/// directory, plus [`DEFAULT_UNDOTTED_SCHEMA`], which is always reported.
+fn discover_all_schemas() -> BTreeSet<String> {
+    let mut schemas = BTreeSet::from([DEFAULT_UNDOTTED_SCHEMA.to_string()]);
+    // An unreadable or missing directory yields the default schema only.
+    let Ok(entries) = std::fs::read_dir(self_dir()) else {
+        return schemas;
+    };
+    schemas.extend(
+        entries
+            .flatten()
+            .filter(|entry| entry.path().is_file())
+            .filter_map(|entry| classify_mmap_basename(&entry.file_name().to_string_lossy()))
+            .map(|(schema, _)| schema),
+    );
+    schemas
+}
+
+/// Whether an mmap file backs `schema.table` right now. Both identifiers are
+/// validated first so user-supplied SQL names can never escape the data dir.
+fn mmap_table_exists(schema: &str, table: &str) -> bool {
+    if !valid_schema_head(schema) || !valid_table_tail(table) {
+        return false;
+    }
+    self_dir().join(mmap_filename_for(schema, table)).is_file()
+}
+
+/// Build a [`PluginAdvancedTable`] from raw MEMT ring / MEMH bytes so DataFusion can push
+/// filters and limits into the scan; unrecognised or invalid bytes give an empty sentinel table.
+pub fn bytes_to_pushdown_table(data: &[u8], logical_name: &str) -> Arc<dyn TableProvider> {
+    match detect_table(data) {
+        Some(TableKind::Ring) => {
+            let view = match MemTableView::new(data) {
+                Ok(v) => v,
+                Err(_) => return Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)),
+            };
+            let schema = view_to_arrow_schema(&view);
+            let batches = view_to_recordbatches(&view); // never empty: falls back to one empty batch
+            match PluginAdvancedTable::try_new(logical_name, schema, batches) {
+                Ok(t) => Arc::new(t),
+                Err(e) => {
+                    log::error!("memtable PluginAdvancedTable (ring): {e}");
+                    Arc::new(PluginAdvancedTable::empty_sentinel(logical_name))
+                }
+            }
+        }
+        Some(TableKind::Hash) => {
+            let view = match MemhView::new(data) {
+                Ok(v) => v,
+                Err(_) => return Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)),
+            };
+            let schema = memh_kv_schema();
+            let batches = memh_view_to_recordbatch(&view);
+            if batches.is_empty() {
+                return Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)); // nothing to expose
+            }
+            match PluginAdvancedTable::try_new(logical_name, schema, batches) {
+                Ok(t) => Arc::new(t),
+                Err(e) => {
+                    log::error!("memtable PluginAdvancedTable (memh): {e}");
+                    Arc::new(PluginAdvancedTable::empty_sentinel(logical_name))
+                }
+            }
+        }
+        None => Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)), // neither ring nor hash
+    }
+}
+
+fn dtype_to_arrow(dt: DType) -> DataType {
+    match dt {
+        DType::U8 => DataType::UInt8, // unsigned integers
+        DType::U32 => DataType::UInt32,
+        DType::U64 => DataType::UInt64,
+        DType::I32 => DataType::Int32, // signed integers
+        DType::I64 => DataType::Int64,
+        DType::F32 => DataType::Float32, // floats
+        DType::F64 => DataType::Float64,
+        DType::Str => DataType::Utf8, // variable-length payloads
+        DType::Bytes => DataType::Binary,
+    }
+}
+
+/// Arrow schema for a ring view: one nullable field per memtable column, in
+/// column order, with types mapped through [`dtype_to_arrow`].
+pub fn view_to_arrow_schema(view: &MemTableView) -> SchemaRef {
+    let layout = view.schema();
+    let mut fields: Vec<Field> = Vec::new();
+    for col in layout.cols.iter() {
+        fields.push(Field::new(&col.name, dtype_to_arrow(col.dtype), true));
+    }
+    SchemaRef::new(Schema::new(fields))
+}
+
+enum ColBuilder { // one Arrow array builder variant per memtable `DType`
+    U8(UInt8Builder),
+    U32(UInt32Builder),
+    I32(Int32Builder),
+    I64(Int64Builder),
+    F32(Float32Builder),
+    F64(Float64Builder),
+    U64(UInt64Builder),
+    Str(GenericStringBuilder<i32>), // i32 offsets, matching Arrow `Utf8`
+    Bytes(BinaryBuilder),
+}
+
+fn make_builders(view: &MemTableView) -> Vec<ColBuilder> {
+    let mut builders = Vec::new(); // one empty builder per column, in schema order
+    for col in view.schema().cols.iter() {
+        builders.push(match col.dtype {
+            DType::U8 => ColBuilder::U8(UInt8Builder::new()),
+            DType::U32 => ColBuilder::U32(UInt32Builder::new()),
+            DType::I32 => ColBuilder::I32(Int32Builder::new()),
+            DType::I64 => ColBuilder::I64(Int64Builder::new()),
+            DType::F32 => ColBuilder::F32(Float32Builder::new()),
+            DType::F64 => ColBuilder::F64(Float64Builder::new()),
+            DType::U64 => ColBuilder::U64(UInt64Builder::new()),
+            DType::Str => ColBuilder::Str(GenericStringBuilder::new()),
+            DType::Bytes => ColBuilder::Bytes(BinaryBuilder::new()),
+        });
+    }
+    builders
+}
+
+/// Materialise one chunk into a `RecordBatch`.
+///
+/// Returns [`None`] when the chunk was recycled while being read (its
+/// generation moved) or reading panicked on a torn ref — the bytes can no
+/// longer be trusted, so the chunk is dropped rather than surfacing corrupt
+/// rows to SQL — and also when the chunk has no rows or batch assembly fails.
+fn chunk_to_recordbatch(
+    view: &MemTableView,
+    chunk: usize,
+    arrow_schema: &SchemaRef,
+) -> Option<RecordBatch> {
+    let generation_before = view.chunk_generation(chunk);
+
+    let arrays = std::panic::catch_unwind(AssertUnwindSafe(|| {
+        let mut builders = make_builders(view);
+        // RowIter itself stops yielding once it observes a generation change;
+        // rows read before that may still be torn, hence the re-check below.
+        for row in view.rows(chunk) {
+            let mut cursor = row.cursor();
+            for builder in builders.iter_mut() {
+                match builder {
+                    ColBuilder::U8(b) => b.append_value(cursor.next_u8()),
+                    ColBuilder::U32(b) => b.append_value(cursor.next_u32()),
+                    ColBuilder::I32(b) => b.append_value(cursor.next_i32()),
+                    ColBuilder::I64(b) => b.append_value(cursor.next_i64()),
+                    ColBuilder::F32(b) => b.append_value(cursor.next_f32()),
+                    ColBuilder::F64(b) => b.append_value(cursor.next_f64()),
+                    ColBuilder::U64(b) => b.append_value(cursor.next_u64()),
+                    ColBuilder::Str(b) => b.append_value(cursor.next_str()),
+                    ColBuilder::Bytes(b) => b.append_value(cursor.next_bytes()),
+                }
+            }
+        }
+        builders
+            .into_iter()
+            .map(|b| -> ArrayRef {
+                match b {
+                    ColBuilder::U8(mut b) => Arc::new(b.finish()),
+                    ColBuilder::U32(mut b) => Arc::new(b.finish()),
+                    ColBuilder::I32(mut b) => Arc::new(b.finish()),
+                    ColBuilder::I64(mut b) => Arc::new(b.finish()),
+                    ColBuilder::F32(mut b) => Arc::new(b.finish()),
+                    ColBuilder::F64(mut b) => Arc::new(b.finish()),
+                    ColBuilder::U64(mut b) => Arc::new(b.finish()),
+                    ColBuilder::Str(mut b) => Arc::new(b.finish()),
+                    ColBuilder::Bytes(mut b) => Arc::new(b.finish()),
+                }
+            })
+            .collect::<Vec<ArrayRef>>()
+    }))
+    .map_err(|_| {
+        log::debug!("memtable chunk {chunk} recycled mid-read; dropping");
+    })
+    .ok()?; // a caught panic is logged above and turns into `None`
+
+    if view.chunk_generation(chunk) != generation_before {
+        log::debug!("memtable chunk {chunk} recycled during materialisation; dropping");
+        return None;
+    }
+
+    match RecordBatch::try_new(arrow_schema.clone(), arrays) {
+        Ok(batch) if batch.num_rows() > 0 => Some(batch),
+        Ok(_) => None, // zero rows: nothing worth emitting for this chunk
+        Err(e) => {
+            log::error!("memtable chunk {chunk} → RecordBatch failed: {e}");
+            None
+        }
+    }
+}
+
+/// Convert a ring view into record batches, walking chunks in **logical
+/// (oldest → newest) order** and emitting one batch per chunk that survives
+/// materialisation.
+///
+/// The result is never empty: if no chunk yields rows, a single empty batch
+/// carrying the real schema is returned instead.
+pub fn view_to_recordbatches(view: &MemTableView) -> Vec<RecordBatch> {
+    let arrow_schema = view_to_arrow_schema(view);
+    let mut batches = Vec::new();
+    for chunk in view.chunks_logical() {
+        batches.extend(chunk_to_recordbatch(view, chunk, &arrow_schema));
+    }
+    if batches.is_empty() {
+        batches.push(RecordBatch::new_empty(arrow_schema));
+    }
+    batches
+}
+
+// ── Time-range pruning (chunk level) ──────────────────────────────────
+
+/// Inclusive time window extracted from query predicates on the designated
+/// timestamp column. `None` on either side = unbounded.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct TsBounds {
+    pub lower: Option<i64>, // inclusive minimum timestamp, if any predicate set one
+    pub upper: Option<i64>, // inclusive maximum timestamp, if any predicate set one
+}
+
+impl TsBounds {
+    fn is_unbounded(&self) -> bool {
+        matches!((self.lower, self.upper), (None, None))
+    }
+
+    fn tighten_lower(&mut self, v: i64) {
+        self.lower = self.lower.map(|cur| cur.max(v)).or(Some(v)); // keep the larger lower bound
+    }
+
+    fn tighten_upper(&mut self, v: i64) {
+        self.upper = self.upper.map(|cur| cur.min(v)).or(Some(v)); // keep the smaller upper bound
+    }
+}
+
+/// Integer value of a literal usable as a timestamp bound.
+///
+/// Accepts non-null `Int64`, `Int32`, `UInt32`, `UInt64` (only when it fits in
+/// `i64`) and `TimestampMicrosecond` literals; anything else gives `None`.
+fn literal_as_i64(expr: &Expr) -> Option<i64> {
+    match expr {
+        Expr::Literal(ScalarValue::Int64(Some(v)), _) => Some(*v),
+        Expr::Literal(ScalarValue::Int32(Some(v)), _) => Some(i64::from(*v)),
+        Expr::Literal(ScalarValue::UInt32(Some(v)), _) => Some(i64::from(*v)),
+        Expr::Literal(ScalarValue::UInt64(Some(v)), _) => i64::try_from(*v).ok(),
+        Expr::Literal(ScalarValue::TimestampMicrosecond(Some(v), _), _) => Some(*v),
+        _ => None,
+    }
+}
+
+fn is_ts_column(expr: &Expr, ts_name: &str) -> bool {
+    matches!(expr, Expr::Column(c) if c.name == ts_name) // bare column only; name compared exactly
+}
+
+/// Fold one predicate into `bounds`. Conservative: comparisons are widened
+/// to inclusive bounds (`>` treated as `>=`), unrecognised shapes are
+/// ignored — pruning may keep too much, never too little.
+fn fold_ts_predicate(expr: &Expr, ts_name: &str, bounds: &mut TsBounds) {
+    match expr {
+        Expr::BinaryExpr(be) if be.op == Operator::And => {
+            fold_ts_predicate(&be.left, ts_name, bounds); // both conjuncts constrain the window
+            fold_ts_predicate(&be.right, ts_name, bounds);
+        }
+        Expr::BinaryExpr(be) => {
+            let (op, lit) = if is_ts_column(&be.left, ts_name) {
+                let Some(v) = literal_as_i64(&be.right) else {
+                    return;
+                };
+                (be.op, v)
+            } else if is_ts_column(&be.right, ts_name) {
+                // `lit op ts` — mirror the comparison.
+                let Some(v) = literal_as_i64(&be.left) else {
+                    return;
+                };
+                let mirrored = match be.op {
+                    Operator::Gt => Operator::Lt,
+                    Operator::GtEq => Operator::LtEq,
+                    Operator::Lt => Operator::Gt,
+                    Operator::LtEq => Operator::GtEq,
+                    other => other, // `=` is symmetric; other operators are ignored below
+                };
+                (mirrored, v)
+            } else {
+                return; // neither side is the bare timestamp column
+            };
+            match op {
+                Operator::Gt | Operator::GtEq => bounds.tighten_lower(lit),
+                Operator::Lt | Operator::LtEq => bounds.tighten_upper(lit),
+                Operator::Eq => {
+                    bounds.tighten_lower(lit);
+                    bounds.tighten_upper(lit);
+                }
+                _ => {} // any other operator leaves the window untouched
+            }
+        }
+        Expr::Between(b) if !b.negated && is_ts_column(&b.expr, ts_name) => {
+            if let Some(lo) = literal_as_i64(&b.low) {
+                bounds.tighten_lower(lo);
+            }
+            if let Some(hi) = literal_as_i64(&b.high) {
+                bounds.tighten_upper(hi);
+            }
+        }
+        _ => {} // every other expression shape is ignored
+    }
+}
+
+/// Time window on column `ts_name` implied by `filters`. DataFusion ANDs the
+/// entries together, so each one is folded into the same accumulator; with no
+/// usable predicate the result is unbounded.
+pub fn ts_bounds_from_filters(filters: &[Expr], ts_name: &str) -> TsBounds {
+    filters.iter().fold(TsBounds::default(), |mut bounds, filter| {
+        fold_ts_predicate(filter, ts_name, &mut bounds);
+        bounds
+    })
+}
+
+/// Returns `false` only if the chunk's committed `[min_ts, max_ts]` is provably
+/// disjoint from `bounds`. A missing range or a generation change during the
+/// read keeps the chunk (`true`); materialisation re-validates the generation.
+fn chunk_may_match(view: &MemTableView, chunk: usize, bounds: &TsBounds) -> bool {
+    if bounds.is_unbounded() {
+        return true;
+    }
+    let generation_before = view.chunk_generation(chunk);
+    let Some((min_ts, max_ts)) = view.chunk_ts_range(chunk) else {
+        return true;
+    };
+    let stable = view.chunk_generation(chunk) == generation_before; // else range untrustworthy
+    let after_lower = bounds.lower.map_or(true, |lo| max_ts >= lo);
+    let before_upper = bounds.upper.map_or(true, |hi| min_ts <= hi);
+    !stable || (after_lower && before_upper)
+}
+
+/// Same as [`view_to_recordbatches`], but chunks that cannot match `bounds` are skipped.
+pub fn view_to_recordbatches_pruned(view: &MemTableView, bounds: &TsBounds) -> Vec<RecordBatch> {
+    let arrow_schema = view_to_arrow_schema(view);
+    let mut batches = Vec::new();
+    for chunk in view.chunks_logical() {
+        if chunk_may_match(view, chunk, bounds) {
+            batches.extend(chunk_to_recordbatch(view, chunk, &arrow_schema));
+        }
+    }
+    if batches.is_empty() {
+        batches.push(RecordBatch::new_empty(arrow_schema));
+    }
+    batches
+}
+
+/// Same as [`view_to_recordbatches_pruned`], but also skips every chunk whose
+/// `(index, current generation)` appears in `excluded` — hot chunks already served
+/// from the cold tier — so a hot∪cold union counts each row exactly once.
+fn view_to_recordbatches_pruned_excluding(
+    view: &MemTableView,
+    bounds: &TsBounds,
+    excluded: &HashSet<(usize, u64)>,
+) -> Vec<RecordBatch> {
+    let arrow_schema = view_to_arrow_schema(view);
+    let mut batches = Vec::new();
+    for chunk in view.chunks_logical() {
+        if chunk_may_match(view, chunk, bounds)
+            && !excluded.contains(&(chunk, view.chunk_generation(chunk)))
+        {
+            batches.extend(chunk_to_recordbatch(view, chunk, &arrow_schema));
+        }
+    }
+    if batches.is_empty() {
+        batches.push(RecordBatch::new_empty(arrow_schema));
+    }
+    batches
+}
+
+// ── Lazy ring TableProvider (prunes + materialises at scan time) ──────
+
+/// [`TableProvider`] over an mmap'd MEMT ring file that defers Arrow
+/// materialisation to `scan()`, where the query's filters are known:
+/// chunks whose `[min_ts, max_ts]` cannot match the time predicates are
+/// skipped without faulting in their pages.
+#[derive(Debug)]
+pub struct RingMmapTable {
+    mapped: Arc<MappedFile>, // backing file; a fresh `MemTableView` is built over it per call
+    schema: SchemaRef, // Arrow schema captured once in `try_new`
+}
+
+impl RingMmapTable {
+    /// Wrap a mapped ring file; fails with the view's error if the bytes are
+    /// not a valid ring. The Arrow schema is captured once, up front.
+    pub fn try_new(mapped: MappedFile) -> Result<Self, &'static str> {
+        // The temporary view only lives for this statement, so `mapped` can
+        // be moved into the `Arc` right after.
+        let schema = view_to_arrow_schema(&MemTableView::new(mapped.as_bytes())?);
+        let mapped = Arc::new(mapped);
+        Ok(Self { mapped, schema })
+    }
+
+    /// Time window implied by `filters` on this ring's designated timestamp
+    /// column (unbounded when there is no ts column or the file is torn).
+    pub fn bounds_for(&self, filters: &[Expr]) -> TsBounds {
+        let Ok(view) = MemTableView::new(self.mapped.as_bytes()) else {
+            return TsBounds::default();
+        };
+        match view.ts_col() {
+            Some(i) => ts_bounds_from_filters(filters, view.col_name(i)),
+            None => TsBounds::default(),
+        }
+    }
+
+    /// Materialise surviving chunks within `bounds`, one batch per chunk.
+    pub fn pruned_batches(&self, bounds: &TsBounds) -> Vec<RecordBatch> {
+        MemTableView::new(self.mapped.as_bytes())
+            .map(|view| view_to_recordbatches_pruned(&view, bounds))
+            .unwrap_or_else(|_| vec![RecordBatch::new_empty(Arc::clone(&self.schema))])
+    }
+
+    /// Like [`pruned_batches`](Self::pruned_batches), skipping chunks whose
+    /// `(index, generation)` is already represented in the cold tier.
+    fn pruned_batches_excluding(
+        &self,
+        bounds: &TsBounds,
+        excluded: &HashSet<(usize, u64)>,
+    ) -> Vec<RecordBatch> {
+        let Ok(view) = MemTableView::new(self.mapped.as_bytes()) else {
+            return vec![RecordBatch::new_empty(Arc::clone(&self.schema))];
+        };
+        view_to_recordbatches_pruned_excluding(&view, bounds, excluded)
+    }
+}
+
+#[async_trait]
+impl TableProvider for RingMmapTable {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema) // schema cached at construction; cheap refcount bump
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> DfResult<Vec<TableProviderFilterPushDown>> {
+        supports_filters_pushdown_for_schema(&self.schema, filters) // shared per-schema policy
+    }
+
+    async fn scan(
+        &self,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> DfResult<Arc<dyn ExecutionPlan>> {
+        let bounds = self.bounds_for(filters); // time window from the pushed-down filters
+        let batches = self.pruned_batches(&bounds); // materialise only chunks inside that window
+        scan_memory_partitions(
+            state,
+            Arc::clone(&self.schema),
+            &[batches], // all batches form a single partition
+            projection,
+            filters,
+            limit,
+        )
+        .await
+    }
+}
+
+// ── Cold segments (MEMC) → Arrow, with two-level time pruning ─────────
+
+/// `.memc` segment paths in `dir`, sorted for a reproducible scan order (empty
+/// if the dir is unreadable). Read-only: never creates it (unlike `ColdStore::open`).
+fn cold_segment_paths(dir: &std::path::Path) -> Vec<std::path::PathBuf> {
+    let Ok(entries) = std::fs::read_dir(dir) else {
+        return Vec::new();
+    };
+    let mut out: Vec<std::path::PathBuf> = entries
+        .flatten()
+        .map(|e| e.path())
+        .filter(|p| p.extension().and_then(|s| s.to_str()) == Some("memc"))
+        .collect();
+    out.sort_unstable();
+    out
+}
+
+/// Convert one decoded cold column into the Arrow array of the matching type.
+fn cold_column_to_array(col: ColumnData) -> ArrayRef {
+    match col {
+        ColumnData::U8(vals) => Arc::new(UInt8Array::from(vals)),
+        ColumnData::U32(vals) => Arc::new(UInt32Array::from(vals)),
+        ColumnData::U64(vals) => Arc::new(UInt64Array::from(vals)),
+        ColumnData::I32(vals) => Arc::new(Int32Array::from(vals)),
+        ColumnData::I64(vals) => Arc::new(Int64Array::from(vals)),
+        ColumnData::F32(vals) => Arc::new(Float32Array::from(vals)),
+        ColumnData::F64(vals) => Arc::new(Float64Array::from(vals)),
+        ColumnData::Str(vals) => Arc::new(StringArray::from_iter_values(vals)),
+        ColumnData::Bytes(vals) => Arc::new(BinaryArray::from_iter_values(vals)),
+    }
+}
+
+/// Decode the cold pages of `table` within `bounds`, returning the batches and
+/// the hot-ring `(chunk index, generation)` pairs those pages came from, so the
+/// caller can drop still-resident hot chunks and never double-count a chunk.
+/// Segments are pruned by their header `ts_range`, then pages via the page
+/// directory. A page counts as covered only once it has decoded, so an
+/// unreadable cold page leaves its hot chunk visible instead of losing rows.
+fn cold_scan(
+    dir: &std::path::Path,
+    table: &str,
+    schema: &SchemaRef,
+    bounds: &TsBounds,
+) -> (Vec<RecordBatch>, HashSet<(usize, u64)>) {
+    let mut out = Vec::new();
+    let mut covered: HashSet<(usize, u64)> = HashSet::new();
+    for path in cold_segment_paths(dir) {
+        let Ok(reader) = SegmentReader::open(&path) else {
+            continue; // unreadable/foreign file: skip rather than fail the scan
+        };
+        if let Some((smin, smax)) = reader.ts_range() {
+            if bounds.lower.is_some_and(|lo| smax < lo) || bounds.upper.is_some_and(|hi| smin > hi)
+            {
+                continue; // segment-level prune: whole file out of range
+            }
+        }
+        let Some(tid) = reader.table_id_by_name(table) else {
+            continue; // this segment holds no pages for the queried table
+        };
+        let pages = reader.pages();
+        for idx in reader.pages_in_range(tid, bounds.lower, bounds.upper) {
+            // Hot-ring provenance of this page, if it records one.
+            let source = pages
+                .get(idx)
+                .filter(|p| p.source_chunk != probing_memtable::memc::SOURCE_CHUNK_NONE)
+                .map(|p| (p.source_chunk as usize, p.source_gen));
+            match reader.read_page(idx) {
+                Ok(cols) => {
+                    let arrays = cols.into_iter().map(cold_column_to_array).collect();
+                    match RecordBatch::try_new(Arc::clone(schema), arrays) {
+                        Ok(b) if b.num_rows() > 0 => {
+                            covered.extend(source);
+                            out.push(b);
+                        }
+                        Ok(_) => covered.extend(source),
+                        Err(e) => log::error!("cold page {idx} → RecordBatch failed: {e}"),
+                    }
+                }
+                Err(e) => log::debug!("cold page {idx} decode skipped: {e}"),
+            }
+        }
+    }
+    (out, covered)
+}
+
+/// [`TableProvider`] that presents a hot ring and its cold MEMC segments as one
+/// logical table. A single time predicate prunes both tiers: hot chunks by
+/// `[min_ts, max_ts]`, cold segments/pages by their recorded ranges. The scan
+/// receives hot and cold batches as separate partitions, so projection,
+/// filter, and limit pushdown apply uniformly across both.
+#[derive(Debug)]
+pub struct HotColdTable {
+    hot: RingMmapTable, // hot tier: the mmap'd ring
+    cold_dir: std::path::PathBuf, // directory scanned for `.memc` segments
+    table: String, // table name looked up inside each cold segment
+    schema: SchemaRef, // taken from `hot` at construction
+}
+
+impl HotColdTable {
+    /// Union `hot` with the cold segments in `cold_dir`; `table` names it inside them.
+    pub fn new(hot: RingMmapTable, cold_dir: std::path::PathBuf, table: impl Into<String>) -> Self {
+        Self {
+            schema: hot.schema(),
+            hot,
+            cold_dir,
+            table: table.into(),
+        }
+    }
+}
+
+#[async_trait]
+impl TableProvider for HotColdTable {
+    /// Expose `self` for downcasting.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Schema taken from the hot ring at construction.
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    /// Reported to DataFusion as a base table.
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    /// Pushdown support is decided from the schema by the shared helper.
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> DfResult<Vec<TableProviderFilterPushDown>> {
+        supports_filters_pushdown_for_schema(&self.schema, filters)
+    }
+
+    /// Scan both tiers under one time window: cold pages are decoded first so
+    /// the hot chunks they already cover can be skipped, then both sets go to
+    /// the in-memory scan helper together with the pushdown arguments.
+    async fn scan(
+        &self,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> DfResult<Arc<dyn ExecutionPlan>> {
+        let window = self.hot.bounds_for(filters);
+        let (cold, covered) = cold_scan(&self.cold_dir, &self.table, &self.schema, &window);
+        // Hot chunks that cold already holds are excluded so rows appear once.
+        let hot = self.hot.pruned_batches_excluding(&window, &covered);
+
+        // Hot is always the first partition; cold is appended only when non-empty.
+        let mut partitions = vec![hot];
+        if !cold.is_empty() {
+            partitions.push(cold);
+        }
+        let schema = self.schema.clone();
+        scan_memory_partitions(state, schema, &partitions, projection, filters, limit).await
+    }
+}
+
+/// Pick the [`TableProvider`] for an mmap'd file: rings get the lazy pruning
+/// provider (empty sentinel if it cannot be built); the rest stay eager.
+pub fn mapped_file_to_table(mapped: MappedFile, logical_name: &str) -> Arc<dyn TableProvider> {
+    if !matches!(detect_table(mapped.as_bytes()), Some(TableKind::Ring)) {
+        return bytes_to_pushdown_table(mapped.as_bytes(), logical_name);
+    }
+    match RingMmapTable::try_new(mapped) {
+        Ok(ring) => Arc::new(ring),
+        Err(_) => Arc::new(PluginAdvancedTable::empty_sentinel(logical_name)),
+    }
+}
+
+// ── MEMH: key-value table → two-column RecordBatch ────────────────────
+
+/// Arrow schema shared by every MEMH table: a non-nullable Utf8 `key` and a
+/// nullable Utf8 `value`.
+///
+/// Values are rendered as strings so that heterogeneous value types
+/// (scalars, strings, bytes) fit in a single column and can be queried with
+/// SQL string predicates.
+fn memh_kv_schema() -> SchemaRef {
+    let key = Field::new("key", DataType::Utf8, false);
+    let value = Field::new("value", DataType::Utf8, true);
+    Arc::new(Schema::new(vec![key, value]))
+}
+
+/// Render a MEMH value as text; bytes become lowercase hex, e.g. "0xdeadbeef".
+fn typed_value_to_str(v: &TypedValue<'_>) -> String {
+    use std::fmt::Write;
+    match v {
+        TypedValue::U8(n) => n.to_string(),
+        TypedValue::U32(n) => n.to_string(),
+        TypedValue::U64(n) => n.to_string(),
+        TypedValue::I32(n) => n.to_string(),
+        TypedValue::I64(n) => n.to_string(),
+        TypedValue::F32(n) => n.to_string(),
+        TypedValue::F64(n) => n.to_string(),
+        TypedValue::Str(s) => s.to_string(),
+        TypedValue::Bytes(b) => {
+            let mut hex = String::with_capacity(2 + b.len() * 2);
+            hex.push_str("0x");
+            for byte in b.iter() {
+                let _ = write!(hex, "{byte:02x}");
+            }
+            hex
+        }
+    }
+}
+
+/// Flatten a MEMH view into a single two-column (`key`, `value`) batch, with
+/// every value rendered through `typed_value_to_str`.
+/// Returns an empty vec (after logging) if the batch cannot be built.
+fn memh_view_to_recordbatch(view: &MemhView<'_>) -> Vec<RecordBatch> {
+    let mut key_col = GenericStringBuilder::<i32>::new();
+    let mut value_col = GenericStringBuilder::<i32>::new();
+
+    for (key, value) in view.iter() {
+        key_col.append_value(key);
+        value_col.append_value(typed_value_to_str(&value));
+    }
+
+    let columns: Vec<ArrayRef> = vec![Arc::new(key_col.finish()), Arc::new(value_col.finish())];
+    match RecordBatch::try_new(memh_kv_schema(), columns) {
+        Ok(batch) => vec![batch],
+        Err(e) => {
+            log::error!("memh → RecordBatch failed: {e}");
+            Vec::new()
+        }
+    }
+}
+
+// ── Dynamic schemas from mmap filenames ───────────────────────────────
+
+/// One DataFusion schema combining mmap-backed tables with an optional inner
+/// (static) provider.
+///
+/// Lookup order: mmap file first, then `inner`. Mmap files only exist when a
+/// producer explicitly created them, so they take precedence over static
+/// providers — some of which (e.g. lazy namespaces) claim every name exists.
+#[derive(Debug)]
+pub struct MmapFileSchemaProvider {
+    schema: String, // schema name used to locate this schema's mmap files
+    inner: Option<Arc<dyn SchemaProvider>>, // static provider consulted after the mmap lookup
+}
+
+impl MmapFileSchemaProvider {
+    /// Provider that serves only the mmap files of `schema`, with no static
+    /// fallback provider.
+    pub fn new(schema: impl Into<String>) -> Self {
+        Self::with_inner(schema, None)
+    }
+
+    /// Provider for `schema` layered over a static `inner` provider. Mmap
+    /// tables shadow `inner` only on exact-name collision; every other
+    /// lookup falls through to it.
+    pub fn with_inner(schema: impl Into<String>, inner: Option<Arc<dyn SchemaProvider>>) -> Self {
+        Self {
+            schema: schema.into(),
+            inner,
+        }
+    }
+}
+
+#[async_trait]
+impl SchemaProvider for MmapFileSchemaProvider {
+    /// Expose `self` for downcasting.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Sorted, de-duplicated union of the mmap-backed and `inner` table names.
+    fn table_names(&self) -> Vec<String> {
+        let mut merged = tables_in_schema(&self.schema);
+        if let Some(inner) = self.inner.as_ref() {
+            merged.extend(inner.table_names());
+        }
+        merged.sort();
+        merged.dedup();
+        merged
+    }
+
+    /// Resolve `name`: an openable mmap file wins, otherwise defer to `inner`.
+    ///
+    /// The file is mapped rather than copied to the heap. A ring is wrapped
+    /// in a [`HotColdTable`] keyed by its on-disk basename so one query spans
+    /// the hot ring and its cold MEMC segments; a ring that fails to load
+    /// becomes an empty sentinel. Other kinds go through `mapped_file_to_table`.
+    async fn table(&self, name: &str) -> DfResult<Option<Arc<dyn TableProvider>>> {
+        if mmap_table_exists(&self.schema, name) {
+            let basename = mmap_filename_for(&self.schema, name);
+            let path = self_dir().join(&basename);
+            if let Ok(mapped) = MappedFile::open(&path) {
+                if !matches!(detect_table(mapped.as_bytes()), Some(TableKind::Ring)) {
+                    return Ok(Some(mapped_file_to_table(mapped, name)));
+                }
+                // Ring: union the hot ring with its cold tier under the basename.
+                let provider: Arc<dyn TableProvider> = match RingMmapTable::try_new(mapped) {
+                    Ok(ring) => Arc::new(HotColdTable::new(ring, cold_dir(), basename)),
+                    Err(_) => Arc::new(PluginAdvancedTable::empty_sentinel(name)),
+                };
+                return Ok(Some(provider));
+            }
+        }
+        match self.inner.as_ref() {
+            Some(inner) => inner.table(name).await,
+            None => Ok(None),
+        }
+    }
+
+    /// Registration is delegated to `inner`; without one it is not implemented.
+    fn register_table(
+        &self,
+        name: String,
+        table: Arc<dyn TableProvider>,
+    ) -> DfResult<Option<Arc<dyn TableProvider>>> {
+        let Some(inner) = self.inner.as_ref() else {
+            return Err(DataFusionError::NotImplemented("unable to create tables".to_string()));
+        };
+        inner.register_table(name, table)
+    }
+
+    /// Deregistration is delegated to `inner`; without one it is not implemented.
+    fn deregister_table(&self, name: &str) -> DfResult<Option<Arc<dyn TableProvider>>> {
+        let Some(inner) = self.inner.as_ref() else {
+            return Err(DataFusionError::NotImplemented("unable to drop tables".to_string()));
+        };
+        inner.deregister_table(name)
+    }
+
+    /// True when an mmap file for `name` exists or `inner` reports the table.
+    fn table_exist(&self, name: &str) -> bool {
+        mmap_table_exists(&self.schema, name)
+            || self.inner.as_ref().is_some_and(|inner| inner.table_exist(name))
+    }
+}
+
+/// Wraps the `probe` catalog: static schemas (python, cluster, …) keep
+/// working, mmap-backed schemas are discovered at query time, and when both
+/// exist for the same name they are **merged** (mmap tables first) instead of
+/// the mmap side shadowing the static provider.
+#[derive(Debug)]
+struct DynamicMmapCatalog {
+    inner: Arc<dyn CatalogProvider>, // wrapped catalog that supplies the static schemas
+}
+
+impl CatalogProvider for DynamicMmapCatalog {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Sorted, de-duplicated union of the inner catalog's and the discovered schema names.
+    fn schema_names(&self) -> Vec<String> {
+        let mut names: BTreeSet<String> = self.inner.schema_names().into_iter().collect();
+        names.extend(discover_all_schemas());
+        names.into_iter().collect()
+    }
+
+    /// Schema `name` with mmap tables layered over the inner provider. The default
+    /// undotted schema always takes the layered path, even with no mmap files yet.
+    fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
+        let inner = self.inner.schema(name);
+        let has_mmap = name == DEFAULT_UNDOTTED_SCHEMA || !tables_in_schema(name).is_empty();
+        if has_mmap {
+            return Some(Arc::new(MmapFileSchemaProvider::with_inner(name, inner)));
+        }
+        inner
+    }
+
+    fn register_schema(
+        &self,
+        name: &str,
+        schema: Arc<dyn SchemaProvider>,
+    ) -> DfResult<Option<Arc<dyn SchemaProvider>>> {
+        self.inner.register_schema(name, schema)
+    }
+}
+
+/// Namespace plugin whose catalog hook wraps the `probe` catalog in a
+/// [`DynamicMmapCatalog`], so mmap-file schemas are discovered at query time.
+#[derive(Debug, Default)]
+pub struct UnifiedMemtablePlugin;
+
+impl Plugin for UnifiedMemtablePlugin {
+    fn name(&self) -> String {
+        String::from("mmap_memtables")
+    }
+    fn kind(&self) -> PluginType {
+        PluginType::Namespace
+    }
+    fn namespace(&self) -> String {
+        String::from("memtable")
+    }
+    /// Wrap the incoming catalog so mmap-backed schemas are layered over it.
+    fn provide_catalog(&self, inner: Arc<dyn CatalogProvider>) -> Option<Arc<dyn CatalogProvider>> {
+        Some(Arc::new(DynamicMmapCatalog { inner }))
+    }
+}
+
+// ── Cold compaction runtime owner ─────────────────────────────────────
+
+/// Tunables for the background hot→cold compactor (defaults from `Default`).
+#[derive(Clone, Debug)]
+pub struct ColdRuntimeConfig {
+    /// Whether the background compactor thread runs (default off; env `PROBING_COLD`).
+    pub enabled: bool,
+    /// Sleep between drain passes (default 2s; env `PROBING_COLD_POLL_MS`, min 50ms).
+    pub poll: Duration,
+    /// Seal + roll a segment once it reaches this size (default 64 MiB; fragmentation knob).
+    pub target_segment_bytes: u64,
+    /// Seal an idle open segment after this long so it becomes queryable (default 300s).
+    pub max_segment_age: Duration,
+    /// Cold-store byte budget; oldest segments evicted past it (default: none).
+    pub max_total_bytes: Option<u64>,
+    /// Drop cold segments older than this (default: none).
+    pub ttl: Option<Duration>,
+}
+
+impl Default for ColdRuntimeConfig {
+    fn default() -> Self {
+        Self {
+            enabled: false, // opt-in
+            poll: Duration::from_secs(2),
+            target_segment_bytes: 64 * 1024 * 1024, // 64 MiB
+            max_segment_age: Duration::from_secs(5 * 60),
+            max_total_bytes: None, // no byte budget
+            ttl: None, // no age-based eviction
+        }
+    }
+}
+
+impl ColdRuntimeConfig {
+    fn to_compactor(&self) -> CompactorConfig {
+        CompactorConfig {
+            target_segment_bytes: self.target_segment_bytes,
+            max_segment_age: self.max_segment_age,
+            poll_interval: self.poll,
+            max_total_bytes: self.max_total_bytes,
+            ttl: self.ttl,
+        }
+    }
+
+    /// Build a config from `PROBING_COLD*` env vars (opt-in, off by default).
+    /// The flag is case-insensitive; a zero budget/TTL means "unset", as in `SET`.
+    pub fn from_env() -> Self {
+        fn env_u64(k: &str) -> Option<u64> {
+            std::env::var(k).ok().and_then(|v| v.trim().parse().ok())
+        }
+        let mut c = Self::default();
+        if let Ok(v) = std::env::var("PROBING_COLD") {
+            c.enabled = matches!(&*v.trim().to_ascii_lowercase(), "1" | "on" | "true" | "yes");
+        }
+        if let Some(mb) = env_u64("PROBING_COLD_TARGET_MB") {
+            c.target_segment_bytes = mb.saturating_mul(1024 * 1024);
+        }
+        if let Some(mb) = env_u64("PROBING_COLD_MAX_TOTAL_MB") {
+            c.max_total_bytes = (mb > 0).then(|| mb.saturating_mul(1024 * 1024));
+        }
+        if let Some(s) = env_u64("PROBING_COLD_TTL_SECS") {
+            c.ttl = (s > 0).then(|| Duration::from_secs(s));
+        }
+        if let Some(ms) = env_u64("PROBING_COLD_POLL_MS") {
+            c.poll = Duration::from_millis(ms.max(50));
+        }
+        if let Some(s) = env_u64("PROBING_COLD_MAX_AGE_SECS") {
+            c.max_segment_age = Duration::from_secs(s);
+        }
+        c
+    }
+}
+
+/// Files directly under `self_dir()` whose basename `classify_mmap_basename`
+/// accepts, as `(on-disk basename, path)`. The basename doubles as the cold
+/// table identity used by the SQL read path; non-file entries are skipped.
+fn cold_source_candidates() -> Vec<(String, std::path::PathBuf)> {
+    let Ok(entries) = std::fs::read_dir(self_dir()) else {
+        return Vec::new();
+    };
+    entries
+        .flatten()
+        .filter_map(|entry| {
+            let path = entry.path();
+            if !path.is_file() {
+                return None;
+            }
+            let name = entry.file_name().to_string_lossy().to_string();
+            let keep = classify_mmap_basename(&name).is_some();
+            keep.then_some((name, path))
+        })
+        .collect()
+}
+
+/// Process-global owner of the background hot→cold compactor thread.
+///
+/// Modeled on the task-stats worker: a lazy singleton with start/stop, so the
+/// compactor has a single lifecycle home regardless of how many producers
+/// create hot tables. The loop rediscovers ring files each pass (tables appear
+/// over time), drains newly-sealed chunks into the shared cold store, rolls
+/// segments by age, and enforces the byte/TTL budget.
+pub struct ColdCompactor {
+    running: Arc<AtomicBool>, // loop flag shared with the worker; cleared to request exit
+    handle: Mutex<Option<JoinHandle<()>>>, // join handle of the current worker, if one was spawned
+}
+
+impl ColdCompactor {
+    pub fn instance() -> &'static Self {
+        static INSTANCE: Lazy<ColdCompactor> = Lazy::new(|| ColdCompactor {
+            running: Arc::new(AtomicBool::new(false)),
+            handle: Mutex::new(None),
+        });
+        &INSTANCE
+    }
+
+    pub fn is_running(&self) -> bool {
+        self.running.load(Ordering::Acquire)
+    }
+
+    /// (Re)apply `cfg`: stop any running thread, then start a fresh one when
+    /// `cfg.enabled`. The single entry point for config changes.
+    pub fn apply(&self, cfg: ColdRuntimeConfig) {
+        self.stop();
+        if cfg.enabled {
+            self.start(cfg);
+        }
+    }
+
+    fn start(&self, cfg: ColdRuntimeConfig) {
+        if self.running.swap(true, Ordering::SeqCst) {
+            return; // already running
+        }
+        let dir = cold_dir();
+        let store = match ColdStore::open(&dir) {
+            Ok(s) => s,
+            Err(e) => {
+                log::error!("cold compactor: cannot open {}: {e}", dir.display());
+                self.running.store(false, Ordering::SeqCst);
+                return;
+            }
+        };
+        let mut compactor = Compactor::new(store, cfg.to_compactor());
+        // Recover per-chunk watermarks from on-disk segments (exactly-once across restarts).
+        if let Err(e) = compactor.prime_from_cold() {
+            log::warn!("cold compactor: prime_from_cold failed: {e}");
+        }
+        let (running, poll) = (self.running.clone(), cfg.poll);
+        let spawned = std::thread::Builder::new()
+            .name("memc-compactor".into())
+            .spawn(move || {
+                while running.load(Ordering::SeqCst) {
+                    for (name, path) in cold_source_candidates() {
+                        let Ok(mapped) = MappedFile::open(&path) else {
+                            continue;
+                        };
+                        if !matches!(detect_table(mapped.as_bytes()), Some(TableKind::Ring)) {
+                            continue; // only ring tables tier to cold
+                        }
+                        if let Ok(view) = MemTableView::new(mapped.as_bytes()) {
+                            if let Err(e) = compactor.drain_view(&name, &view) {
+                                log::debug!("cold compactor: drain {name}: {e}");
+                            }
+                        }
+                    }
+                    let _ = compactor.maybe_roll_on_age();
+                    let _ = compactor.enforce();
+                    sleep_interruptible(&running, poll);
+                }
+                // Final flush so the last open segment is sealed on shutdown.
+                if let Err(e) = compactor.flush() {
+                    log::debug!("cold compactor: final flush: {e}");
+                }
+            });
+        // Spawning can fail (e.g. thread limits); report it instead of panicking.
+        if let Err(e) = &spawned {
+            log::error!("cold compactor: cannot spawn thread: {e}");
+            self.running.store(false, Ordering::SeqCst);
+        }
+        *self.handle.lock().unwrap() = spawned.ok();
+    }
+
+    /// Signal the thread to flush and exit, then join it.
+    pub fn stop(&self) {
+        if !self.running.swap(false, Ordering::SeqCst) {
+            return;
+        }
+        if let Some(h) = self.handle.lock().unwrap().take() {
+            let _ = h.join();
+        }
+    }
+
+    pub fn stats(&self) -> Option<ColdStats> {
+        ColdStore::open(cold_dir()).ok().map(|s| s.stats())
+    }
+}
+
+/// Sleep for at most `total` in <=200ms slices, returning early once `running` is false.
+fn sleep_interruptible(running: &AtomicBool, total: Duration) {
+    const SLICE: Duration = Duration::from_millis(200);
+    let mut remaining = total;
+    while !remaining.is_zero() && running.load(Ordering::SeqCst) {
+        let nap = remaining.min(SLICE);
+        std::thread::sleep(nap);
+        remaining -= nap;
+    }
+}
+
+/// Apply the `PROBING_COLD*` env config to the global compactor (off unless enabled).
+pub fn start_cold_compaction_from_env() {
+    let cfg = ColdRuntimeConfig::from_env();
+    ColdCompactor::instance().apply(cfg);
+}
+
+// ── EngineExtension ────────────────────────────────────────────────────
+
+/// Exposes mmap memtables to SQL and owns the cold-compaction config surface.
+///
+/// Config knobs (settable via `SET memtable.<key> = ...`; dotted aliases exist):
+/// - `cold_compaction` / `cold.compaction` — `on`/`off` for the background compactor.
+/// - `cold_max_total_mb` / `cold.max_total_mb` — byte budget in MiB; `<= 0` clears it.
+/// - `cold_ttl_secs` / `cold.ttl_secs` — evict older segments; `<= 0` clears it.
+#[derive(Debug, Default, EngineExtensionDerive)]
+pub struct MemTableExtension {
+    /// Background hot→cold compaction switch: "on" or "off".
+    #[option(aliases = ["cold.compaction"])]
+    cold_compaction: Maybe<String>,
+    /// Cold-store byte budget in MiB (oldest segments evicted past it).
+    #[option(aliases = ["cold.max_total_mb"])]
+    cold_max_total_mb: Maybe<i64>,
+    /// Evict cold segments older than this many seconds.
+    #[option(aliases = ["cold.ttl_secs"])]
+    cold_ttl_secs: Maybe<i64>,
+}
+
+impl MemTableExtension {
+    fn cold_enabled(&self) -> bool {
+        let Maybe::Just(ref s) = self.cold_compaction else {
+            return false;
+        };
+        matches!(&*s.trim().to_ascii_lowercase(), "1" | "on" | "true" | "yes")
+    }
+
+    /// Merge the option fields that are set over the env-derived defaults; unset
+    /// options (including the on/off switch) keep whatever the environment chose.
+    fn cold_config(&self) -> ColdRuntimeConfig {
+        let mut cfg = ColdRuntimeConfig::from_env();
+        if matches!(self.cold_compaction, Maybe::Just(_)) {
+            cfg.enabled = self.cold_enabled();
+        }
+        if let Maybe::Just(mb) = self.cold_max_total_mb {
+            cfg.max_total_bytes = (mb > 0).then(|| (mb as u64).saturating_mul(1024 * 1024));
+        }
+        if let Maybe::Just(s) = self.cold_ttl_secs {
+            cfg.ttl = (s > 0).then(|| Duration::from_secs(s as u64));
+        }
+        cfg
+    }
+
+    fn apply_cold(&self) {
+        ColdCompactor::instance().apply(self.cold_config());
+    }
+    fn set_cold_compaction(&mut self, v: Maybe<String>) -> Result<(), EngineError> {
+        self.cold_compaction = v;
+        self.apply_cold();
+        Ok(())
+    }
+    fn set_cold_max_total_mb(&mut self, v: Maybe<i64>) -> Result<(), EngineError> {
+        self.cold_max_total_mb = v;
+        self.apply_cold();
+        Ok(())
+    }
+    fn set_cold_ttl_secs(&mut self, v: Maybe<i64>) -> Result<(), EngineError> {
+        self.cold_ttl_secs = v;
+        self.apply_cold();
+        Ok(())
+    }
+}
+
+impl EngineCall for MemTableExtension {} // empty impl: only the trait's provided defaults apply
+
+impl EngineDatasource for MemTableExtension {
+    fn datasrc(
+        &self,
+        _namespace: &str,
+        _name: Option<&str>,
+    ) -> Option<Arc<dyn Plugin + Sync + Send>> {
+        Some(Arc::new(UnifiedMemtablePlugin)) // both arguments are ignored: always the same plugin
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion::arrow::array::{AsArray, Float64Array, Int32Array, Int64Array, UInt8Array};
+    use probing_memtable::{MemTable, Schema as MtSchema, Value};
+    use std::sync::Mutex;
+
+    /// Held by tests that mutate the process-global `PROBING_DATA_DIR`, so they run one at a time.
+    static PROBING_DATA_DIR_LOCK: Mutex<()> = Mutex::new(());
+
+    /// Int64 values of column `col` across `batches`, in order (panics on a type mismatch).
+    fn concat_i64(batches: &[RecordBatch], col: usize) -> Vec<i64> {
+        let mut values = Vec::new();
+        for b in batches {
+            let ints = b.column(col).as_any().downcast_ref::<Int64Array>().unwrap();
+            values.extend((0..ints.len()).map(|i| ints.value(i)));
+        }
+        values
+    }
+
+    /// Int32 values of column 0 across `batches`, in order (panics on a type mismatch).
+    fn collect_i32(batches: &[RecordBatch]) -> Vec<i32> {
+        let mut values = Vec::new();
+        for b in batches {
+            let ints = b.column(0).as_any().downcast_ref::<Int32Array>().unwrap();
+            values.extend((0..ints.len()).map(|i| ints.value(i)));
+        }
+        values
+    }
+
+    #[test]
+    fn dtype_mapping_covers_all_variants() {
+        assert_eq!(dtype_to_arrow(DType::U8), DataType::UInt8);
+        assert_eq!(dtype_to_arrow(DType::U32), DataType::UInt32);
+        assert_eq!(dtype_to_arrow(DType::U64), DataType::UInt64);
+        assert_eq!(dtype_to_arrow(DType::I32), DataType::Int32);
+        assert_eq!(dtype_to_arrow(DType::I64), DataType::Int64);
+        assert_eq!(dtype_to_arrow(DType::F32), DataType::Float32);
+        assert_eq!(dtype_to_arrow(DType::F64), DataType::Float64);
+        assert_eq!(dtype_to_arrow(DType::Str), DataType::Utf8);
+        assert_eq!(dtype_to_arrow(DType::Bytes), DataType::Binary);
+    }
+
+    #[test]
+    fn recordbatch_from_mixed_types() {
+        let schema = MtSchema::new()
+            .col("id", DType::I32)
+            .col("value", DType::F64)
+            .col("tag", DType::Str);
+        let mut t = MemTable::new(&schema, 4096, 2); // 4096-byte chunks, ring of 2
+        t.push_row(&[Value::I32(1), Value::F64(3.14), Value::Str("hello")]);
+        t.push_row(&[Value::I32(2), Value::F64(2.72), Value::Str("world")]);
+
+        let view = t.view();
+        let batches = view_to_recordbatches(&view);
+        assert_eq!(batches.len(), 1); // both rows are expected in a single batch
+        let batch = &batches[0];
+        assert_eq!(batch.num_rows(), 2);
+        assert_eq!(batch.num_columns(), 3);
+
+        let ids = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(ids.value(0), 1);
+        assert_eq!(ids.value(1), 2);
+
+        let vals = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+        assert!((vals.value(0) - 3.14).abs() < 1e-10); // tolerance-based float comparison
+        assert!((vals.value(1) - 2.72).abs() < 1e-10);
+
+        let tags: &datafusion::arrow::array::StringArray = batch.column(2).as_string();
+        assert_eq!(tags.value(0), "hello");
+        assert_eq!(tags.value(1), "world");
+    }
+
+    #[test]
+    fn recordbatches_multiple_chunks_in_logical_order() {
+        let schema = MtSchema::new().col("v", DType::I64);
+        // Small chunks (128 bytes, ring of 4) so the 20 rows spread over several chunks
+        let mut t = MemTable::new(&schema, 128, 4);
+        for i in 0..20 {
+            t.push_row(&[Value::I64(i)]);
+        }
+
+        let view = t.view();
+        let batches = view_to_recordbatches(&view);
+        assert!(!batches.is_empty());
+
+        // Concatenated in logical order, the surviving values must be strictly
+        // increasing, whether or not the ring has wrapped.
+        let values = concat_i64(&batches, 0);
+        assert!(!values.is_empty());
+        for w in values.windows(2) {
+            assert!(w[1] > w[0], "values not in logical order: {values:?}");
+        }
+        // The most recently pushed row (19) must still be present and last.
+        assert_eq!(*values.last().unwrap(), 19);
+    }
+
+    #[test]
+    fn recordbatches_logical_order_after_wrap() {
+        let schema = MtSchema::new().col("v", DType::I64);
+        let mut t = MemTable::new(&schema, 80, 2); // ring of two 80-byte chunks
+        t.push_row(&[Value::I64(10)]); // chunk 0, gen 1
+        t.advance_chunk();
+        t.push_row(&[Value::I64(20)]); // chunk 1, gen 1
+        t.advance_chunk(); // wrap: chunk 0 → gen 2
+        t.push_row(&[Value::I64(30)]); // chunk 0, gen 2
+
+        let view = t.view();
+        let batches = view_to_recordbatches(&view);
+        // Row 10 was overwritten by the wrap; chunk 1 (older) comes first, then recycled chunk 0
+        assert_eq!(concat_i64(&batches, 0), vec![20, 30]);
+    }
+
+    #[test]
+    fn recordbatch_empty_table_keeps_schema() {
+        let schema = MtSchema::new().col("x", DType::U8);
+        let t = MemTable::new(&schema, 1024, 1);
+        let view = t.view();
+        let batches = view_to_recordbatches(&view);
+        assert_eq!(batches.len(), 1); // an empty table still yields exactly one batch
+        assert_eq!(batches[0].num_rows(), 0);
+        assert_eq!(batches[0].schema().field(0).name(), "x"); // schema survives with zero rows
+    }
+
+    #[test]
+    fn arrow_schema_matches_memtable_schema() {
+        let schema = MtSchema::new()
+            .col("ts", DType::I64)
+            .col("cpu", DType::F64)
+            .col("name", DType::Str);
+        let t = MemTable::new(&schema, 1024, 1);
+        let view = t.view();
+        let arrow = view_to_arrow_schema(&view); // field order and types must mirror `schema`
+
+        assert_eq!(arrow.fields().len(), 3);
+        assert_eq!(arrow.field(0).name(), "ts");
+        assert_eq!(*arrow.field(0).data_type(), DataType::Int64);
+        assert_eq!(arrow.field(1).name(), "cpu");
+        assert_eq!(*arrow.field(1).data_type(), DataType::Float64);
+        assert_eq!(arrow.field(2).name(), "name");
+        assert_eq!(*arrow.field(2).data_type(), DataType::Utf8);
+    }
+
+    #[test]
+    fn recordbatch_u8_column() {
+        let schema = MtSchema::new().col("flag", DType::U8);
+        let mut t = MemTable::new(&schema, 1024, 1);
+        t.push_row(&[Value::U8(0)]); // lowest u8 value
+        t.push_row(&[Value::U8(255)]); // highest u8 value
+
+        let view = t.view();
+        let batches = view_to_recordbatches(&view);
+        let col = batches[0]
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt8Array>()
+            .unwrap();
+        assert_eq!(col.value(0), 0);
+        assert_eq!(col.value(1), 255);
+    }
+
+    // ── time-range pruning ─────────────────────────────────────────────
+
+    #[test]
+    fn ts_bounds_extraction_from_filters() {
+        use datafusion::prelude::{col, lit};
+
+        // Conjunction across filter entries
+        let b = ts_bounds_from_filters(
+            &[col("ts").gt_eq(lit(100i64)), col("ts").lt(lit(200i64))],
+            "ts",
+        );
+        assert_eq!(
+            b,
+            TsBounds {
+                lower: Some(100),
+                upper: Some(200)
+            }
+        );
+
+        // AND inside one entry + tightening
+        let f = col("ts").gt(lit(10i64)).and(col("ts").gt(lit(50i64)));
+        assert_eq!(ts_bounds_from_filters(&[f], "ts").lower, Some(50));
+
+        // Literal on the left mirrors the comparison: 300 <= ts
+        let f = lit(300i64).lt_eq(col("ts"));
+        assert_eq!(ts_bounds_from_filters(&[f], "ts").lower, Some(300));
+
+        // BETWEEN
+        let f = col("ts").between(lit(10i64), lit(20i64));
+        let b = ts_bounds_from_filters(&[f], "ts");
+        assert_eq!((b.lower, b.upper), (Some(10), Some(20)));
+
+        // Equality pins both sides
+        let f = col("ts").eq(lit(42i64));
+        let b = ts_bounds_from_filters(&[f], "ts");
+        assert_eq!((b.lower, b.upper), (Some(42), Some(42)));
+
+        // OR cannot be folded → unbounded (conservative)
+        let f = col("ts").gt(lit(5i64)).or(col("v").eq(lit(1i64)));
+        let b = ts_bounds_from_filters(&[f], "ts");
+        assert_eq!((b.lower, b.upper), (None, None));
+
+        // Predicates on other columns are ignored
+        let b = ts_bounds_from_filters(&[col("v").gt(lit(5i64))], "ts");
+        assert_eq!((b.lower, b.upper), (None, None));
+    }
+
+    #[test]
+    fn pruned_batches_skip_out_of_range_chunks() {
+        let schema = MtSchema::new().col("ts", DType::I64);
+        // ChunkHeader=40, I64 row=12 → 64-40=24 → 2 rows per chunk
+        let mut t = MemTable::new(&schema, 64, 4);
+        for ts in [10i64, 20, 30, 40, 50, 60] {
+            t.push_row(&[Value::I64(ts)]);
+        }
+        let view = t.view();
+        assert_eq!(view_to_recordbatches(&view).len(), 3);
+
+        // lower bound falls inside chunk 1: chunk 0 (max 20) pruned
+        let pruned = view_to_recordbatches_pruned(
+            &view,
+            &TsBounds {
+                lower: Some(35),
+                upper: None,
+            },
+        );
+        assert_eq!(concat_i64(&pruned, 0), vec![30, 40, 50, 60]);
+
+        // tight window: only the chunk containing [50, 60] survives
+        let pruned = view_to_recordbatches_pruned(
+            &view,
+            &TsBounds {
+                lower: Some(55),
+                upper: Some(58),
+            },
+        );
+        assert_eq!(concat_i64(&pruned, 0), vec![50, 60]);
+
+        // window past all data: everything pruned, schema kept
+        let pruned = view_to_recordbatches_pruned(
+            &view,
+            &TsBounds {
+                lower: Some(1000),
+                upper: None,
+            },
+        );
+        assert_eq!(pruned.len(), 1);
+        assert_eq!(pruned[0].num_rows(), 0);
+        assert_eq!(pruned[0].schema().field(0).name(), "ts");
+
+        // unbounded: identical to the unpruned materialisation
+        let unpruned = view_to_recordbatches_pruned(&view, &TsBounds::default());
+        assert_eq!(concat_i64(&unpruned, 0), vec![10, 20, 30, 40, 50, 60]);
+    }
+
+    #[test]
+    fn tables_without_ts_col_are_never_pruned() {
+        let schema = MtSchema::new().col("v", DType::I64); // not a ts name
+        let mut t = MemTable::new(&schema, 64, 4);
+        for v in [1i64, 2, 3, 4] {
+            t.push_row(&[Value::I64(v)]);
+        }
+        let view = t.view();
+        assert_eq!(view.ts_col(), None);
+        // Even with bounds set, chunks without ts metadata must survive.
+        let batches = view_to_recordbatches_pruned(
+            &view,
+            &TsBounds {
+                lower: Some(100),
+                upper: None,
+            },
+        );
+        assert_eq!(concat_i64(&batches, 0), vec![1, 2, 3, 4]);
+    }
+
+    #[tokio::test]
+    async fn ring_mmap_table_sql_end_to_end() {
+        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
+        use datafusion::prelude::SessionContext;
+        use probing_memtable::discover::ExposedTable;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let orig = std::env::var("PROBING_DATA_DIR").ok();
+        std::env::set_var("PROBING_DATA_DIR", tmp.path());
+
+        let schema = MtSchema::new()
+            .col("timestamp", DType::I64)
+            .col("v", DType::I32);
+        // 2 rows per chunk → 12 rows fill 6 of the ring's 8 chunks
+        let mut table = ExposedTable::create("prune_demo", &schema, 80, 8).unwrap();
+        for i in 1i64..=12 {
+            table.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]);
+        }
+
+        let path = self_dir().join("prune_demo");
+        let mapped = MappedFile::open(&path).unwrap();
+        let provider = mapped_file_to_table(mapped, "prune_demo");
+        assert!(
+            provider.as_any().downcast_ref::<RingMmapTable>().is_some(),
+            "ring files must get the lazy pruning provider"
+        );
+
+        let ctx = SessionContext::new();
+        ctx.register_table("prune_demo", provider).unwrap();
+        let batches = ctx
+            .sql("SELECT v FROM prune_demo WHERE timestamp >= 700 AND timestamp < 1100 ORDER BY v")
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+        let got: Vec<i32> = batches
+            .iter()
+            .flat_map(|b| {
+                let a = b
+                    .column(0)
+                    .as_any()
+                    .downcast_ref::<Int32Array>()
+                    .unwrap();
+                (0..a.len()).map(|i| a.value(i)).collect::<Vec<_>>()
+            })
+            .collect();
+        assert_eq!(got, vec![7, 8, 9, 10]);
+
+        drop(table);
+        match orig {
+            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
+            None => std::env::remove_var("PROBING_DATA_DIR"),
+        }
+    }
+
+    #[tokio::test]
+    async fn hot_cold_union_dedups_and_spans_time() {
+        use datafusion::prelude::SessionContext;
+        use probing_memtable::memc::{ColdStore, Compactor, CompactorConfig};
+
+        let tmp = tempfile::tempdir().unwrap();
+        let hot_path = tmp.path().join("hc_demo");
+        let cold = tmp.path().join("cold");
+
+        let schema = MtSchema::new()
+            .col("timestamp", DType::I64)
+            .col("v", DType::I32);
+        // 2 rows per chunk, 4 chunks.
+        let mut t = MemTable::file_at(&hot_path, &schema, 80, 4).unwrap();
+        for i in 1i64..=6 {
+            t.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]);
+        }
+        // chunks 0,1 sealed (ts 100,200 / 300,400); chunk 2 full-but-writing (500,600).
+
+        {
+            let store = ColdStore::open(&cold).unwrap();
+            let mut c = Compactor::new(
+                store,
+                CompactorConfig {
+                    target_segment_bytes: 1 << 30,
+                    ..Default::default()
+                },
+            );
+            let drained = c.drain_view("hc_demo", &t.view()).unwrap();
+            assert_eq!(drained, 4, "two sealed chunks → 4 rows compacted");
+            c.flush().unwrap();
+        }
+
+        let mapped = MappedFile::open(&hot_path).unwrap();
+        let ring = RingMmapTable::try_new(mapped).unwrap();
+        let provider: Arc<dyn TableProvider> =
+            Arc::new(HotColdTable::new(ring, cold.clone(), "hc_demo"));
+
+        let ctx = SessionContext::new();
+        ctx.register_table("hc_demo", provider).unwrap();
+
+        // Full scan: cold (4) + hot tail (2), with the still-resident compacted
+        // chunks deduped out of hot — exactly-once across tiers.
+        let all = ctx
+            .sql("SELECT v FROM hc_demo ORDER BY v")
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+        assert_eq!(collect_i32(&all), vec![1, 2, 3, 4, 5, 6]);
+
+        // One time predicate prunes both tiers and selects across the boundary.
+        let span = ctx
+            .sql("SELECT v FROM hc_demo WHERE timestamp >= 200 AND timestamp <= 500 ORDER BY v")
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+        assert_eq!(collect_i32(&span), vec![2, 3, 4, 5]);
+
+        drop(t);
+    }
+
+    #[tokio::test]
+    async fn cold_compactor_runtime_drains_and_is_queryable() {
+        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
+        use datafusion::prelude::SessionContext;
+        use probing_memtable::discover::ExposedTable;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let orig = std::env::var("PROBING_DATA_DIR").ok();
+        std::env::set_var("PROBING_DATA_DIR", tmp.path());
+
+        let schema = MtSchema::new()
+            .col("timestamp", DType::I64)
+            .col("v", DType::I32);
+        let mut table = ExposedTable::create("rt_demo", &schema, 80, 8).unwrap();
+        for i in 1i64..=6 {
+            table.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]);
+        }
+        // chunks 0,1 sealed; chunk 2 full-but-writing (stays hot-only).
+
+        // The runtime owner discovers the ring on its own and drains it.
+        ColdCompactor::instance().apply(ColdRuntimeConfig {
+            enabled: true,
+            poll: Duration::from_millis(50),
+            ..Default::default()
+        });
+
+        let mut waited = 0;
+        while ColdCompactor::instance()
+            .stats()
+            .map(|s| s.segment_count)
+            .unwrap_or(0)
+            == 0
+            && waited < 5000
+        {
+            tokio::time::sleep(Duration::from_millis(50)).await;
+            waited += 50;
+        }
+        ColdCompactor::instance().stop(); // final flush seals the open segment
+        assert!(
+            ColdCompactor::instance()
+                .stats()
+                .map(|s| s.segment_count)
+                .unwrap_or(0)
+                >= 1,
+            "compactor should have produced a cold segment"
+        );
+
+        // Query through the same hot∪cold provider the catalog builds.
+        let path = self_dir().join("rt_demo");
+        let mapped = MappedFile::open(&path).unwrap();
+        let ring = RingMmapTable::try_new(mapped).unwrap();
+        let provider: Arc<dyn TableProvider> =
+            Arc::new(HotColdTable::new(ring, cold_dir(), "rt_demo"));
+        let ctx = SessionContext::new();
+        ctx.register_table("rt_demo", provider).unwrap();
+        let all = ctx
+            .sql("SELECT v FROM rt_demo ORDER BY v")
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+        assert_eq!(collect_i32(&all), vec![1, 2, 3, 4, 5, 6]);
+
+        drop(table);
+        ColdCompactor::instance().stop();
+        match orig {
+            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
+            None => std::env::remove_var("PROBING_DATA_DIR"),
+        }
+    }
+
+    #[tokio::test]
+    async fn engine_catalog_query_unions_cold_tier() {
+        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
+        use datafusion::catalog::MemoryCatalogProvider;
+        use datafusion::prelude::SessionContext;
+        use probing_memtable::discover::ExposedTable;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let orig = std::env::var("PROBING_DATA_DIR").ok();
+        std::env::set_var("PROBING_DATA_DIR", tmp.path());
+
+        let schema = MtSchema::new()
+            .col("timestamp", DType::I64)
+            .col("v", DType::I32);
+        let mut table = ExposedTable::create("metrics", &schema, 80, 8).unwrap();
+        for i in 1i64..=6 {
+            table.push_row(&[Value::I64(i * 100), Value::I32(i as i32)]);
+        }
+
+        // Drain the sealed chunks to cold via the runtime owner.
+        ColdCompactor::instance().apply(ColdRuntimeConfig {
+            enabled: true,
+            poll: Duration::from_millis(50),
+            ..Default::default()
+        });
+        let mut waited = 0;
+        while ColdCompactor::instance()
+            .stats()
+            .map(|s| s.segment_count)
+            .unwrap_or(0)
+            == 0
+            && waited < 5000
+        {
+            tokio::time::sleep(Duration::from_millis(50)).await;
+            waited += 50;
+        }
+        ColdCompactor::instance().stop();
+
+        // Real query path: register the dynamic catalog and resolve the table
+        // purely by name — DynamicMmapCatalog → MmapFileSchemaProvider →
+        // HotColdTable, exactly as the engine does.
+        let ctx = SessionContext::new();
+        let catalog = Arc::new(DynamicMmapCatalog {
+            inner: Arc::new(MemoryCatalogProvider::new()),
+        });
+        ctx.register_catalog("probe", catalog);
+
+        let all = ctx
+            .sql("SELECT v FROM probe.memtable.metrics ORDER BY v")
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+        assert_eq!(collect_i32(&all), vec![1, 2, 3, 4, 5, 6], "hot∪cold once");
+
+        // One time predicate prunes across both tiers through the catalog.
+        let span = ctx
+            .sql(
+                "SELECT v FROM probe.memtable.metrics \
+                 WHERE timestamp >= 200 AND timestamp <= 500 ORDER BY v",
+            )
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+        assert_eq!(collect_i32(&span), vec![2, 3, 4, 5]);
+
+        drop(table);
+        ColdCompactor::instance().stop();
+        match orig {
+            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
+            None => std::env::remove_var("PROBING_DATA_DIR"),
+        }
+    }
+
+    #[test]
+    fn classify_and_mmap_roundtrip() {
+        assert_eq!(
+            classify_mmap_basename("pulsing.actors"),
+            Some(("pulsing".into(), "actors".into()))
+        );
+        assert_eq!(
+            classify_mmap_basename("foo.bar.baz"),
+            Some(("foo".into(), "bar.baz".into()))
+        );
+        assert_eq!(
+            classify_mmap_basename("metrics"),
+            Some((DEFAULT_UNDOTTED_SCHEMA.into(), "metrics".into()))
+        );
+        assert_eq!(
+            mmap_filename_for(DEFAULT_UNDOTTED_SCHEMA, "metrics"),
+            "metrics"
+        );
+        assert_eq!(mmap_filename_for("pulsing", "actors"), "pulsing.actors");
+        assert_eq!(mmap_filename_for("foo", "bar.baz"), "foo.bar.baz");
+    }
+
+    #[test]
+    fn mmap_table_exists_rejects_path_traversal() {
+        assert!(!mmap_table_exists("memtable", "../../etc/passwd"));
+        assert!(!mmap_table_exists("memtable", "a/b"));
+        assert!(!mmap_table_exists("memtable", ""));
+    }
+
+    fn read_pushdown_from_mmap(schema: &str, table: &str) -> Arc<dyn TableProvider> {
+        let path = self_dir().join(mmap_filename_for(schema, table));
+        let mapped = MappedFile::open(path).unwrap();
+        bytes_to_pushdown_table(mapped.as_bytes(), table)
+    }
+
+    #[test]
+    fn namespace_list_and_mmap_read_via_exposed_table() {
+        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
+        use probing_memtable::discover::ExposedTable;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let orig = std::env::var("PROBING_DATA_DIR").ok();
+        std::env::set_var("PROBING_DATA_DIR", tmp.path());
+
+        let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str);
+        let mut table = ExposedTable::create("test_metrics", &schema, 4096, 2).unwrap();
+        {
+            let mut w = table.writer();
+            w.push_row(&[Value::I64(100), Value::Str("alpha")]);
+            w.push_row(&[Value::I64(200), Value::Str("beta")]);
+        }
+
+        let names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA);
+        assert!(
+            names.contains(&"test_metrics".to_string()),
+            "got: {names:?}"
+        );
+        assert!(mmap_table_exists(DEFAULT_UNDOTTED_SCHEMA, "test_metrics"));
+
+        let provider = read_pushdown_from_mmap(DEFAULT_UNDOTTED_SCHEMA, "test_metrics");
+        assert!(provider
+            .as_any()
+            .downcast_ref::<PluginAdvancedTable>()
+            .is_some());
+
+        let path = self_dir().join(mmap_filename_for(DEFAULT_UNDOTTED_SCHEMA, "test_metrics"));
+        let mapped = MappedFile::open(&path).unwrap();
+        let view = MemTableView::new(mapped.as_bytes()).unwrap();
+        let batches = view_to_recordbatches(&view);
+        assert_eq!(batches.len(), 1);
+        let batch = &batches[0];
+        assert_eq!(batch.num_rows(), 2);
+
+        let ts = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap();
+        assert_eq!(ts.value(0), 100);
+        assert_eq!(ts.value(1), 200);
+
+        let msgs: &datafusion::arrow::array::StringArray = batch.column(1).as_string();
+        assert_eq!(msgs.value(0), "alpha");
+        assert_eq!(msgs.value(1), "beta");
+
+        drop(table);
+        match orig {
+            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
+            None => std::env::remove_var("PROBING_DATA_DIR"),
+        }
+    }
+
+    #[test]
+    fn dotted_schema_isolated_from_memtable_list() {
+        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
+        use probing_memtable::discover::ExposedTable;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let orig = std::env::var("PROBING_DATA_DIR").ok();
+        std::env::set_var("PROBING_DATA_DIR", tmp.path());
+
+        let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str);
+        let dotted = mmap_filename_for("acme", "metrics_demo");
+        let mut ring = ExposedTable::create(&dotted, &schema, 4096, 2).unwrap();
+        {
+            let mut w = ring.writer();
+            w.push_row(&[Value::I64(1), Value::Str("x")]);
+        }
+
+        let mem_names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA);
+        assert!(
+            !mem_names.contains(&"metrics_demo".to_string()),
+            "dotted file must not appear as memtable table: {mem_names:?}"
+        );
+
+        let acme_names = tables_in_schema("acme");
+        assert!(
+            acme_names.contains(&"metrics_demo".to_string()),
+            "got: {acme_names:?}"
+        );
+
+        let provider = read_pushdown_from_mmap("acme", "metrics_demo");
+        assert!(provider
+            .as_any()
+            .downcast_ref::<PluginAdvancedTable>()
+            .is_some());
+
+        drop(ring);
+        match orig {
+            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
+            None => std::env::remove_var("PROBING_DATA_DIR"),
+        }
+    }
+
+    #[tokio::test]
+    async fn merged_schema_provider_does_not_shadow_inner() {
+        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
+        use datafusion::catalog::MemorySchemaProvider;
+        use datafusion::datasource::MemTable as DfMemTable;
+        use probing_memtable::discover::ExposedTable;
+
+        let tmp = tempfile::tempdir().unwrap();
+        let orig = std::env::var("PROBING_DATA_DIR").ok();
+        std::env::set_var("PROBING_DATA_DIR", tmp.path());
+
+        // Static (inner) provider with one table
+        let inner = Arc::new(MemorySchemaProvider::new());
+        let static_schema = Arc::new(Schema::new(vec![Field::new(
+            "x",
+            DataType::Int64,
+            false,
+        )]));
+        let static_batch = RecordBatch::try_new(
+            static_schema.clone(),
+            vec![Arc::new(Int64Array::from(vec![42i64]))],
+        )
+        .unwrap();
+        inner
+            .register_table(
+                "static_tbl".to_string(),
+                Arc::new(DfMemTable::try_new(static_schema, vec![vec![static_batch]]).unwrap()),
+            )
+            .unwrap();
+
+        // Mmap table in schema "python"
+        let mt_schema = MtSchema::new().col("v", DType::I64);
+        let mut ring =
+            ExposedTable::create(&mmap_filename_for("python", "extern_tbl"), &mt_schema, 4096, 2)
+                .unwrap();
+        ring.push_row(&[Value::I64(7)]);
+
+        let merged = MmapFileSchemaProvider::with_inner("python", Some(inner.clone() as _));
+
+        // Both tables visible
+        let names = merged.table_names();
+        assert!(names.contains(&"extern_tbl".to_string()), "got {names:?}");
+        assert!(names.contains(&"static_tbl".to_string()), "got {names:?}");
+
+        // Static table still resolvable through the merged provider
+        assert!(merged.table("static_tbl").await.unwrap().is_some());
+        // Mmap table resolvable too
+        assert!(merged.table("extern_tbl").await.unwrap().is_some());
+        assert!(merged.table_exist("static_tbl"));
+        assert!(merged.table_exist("extern_tbl"));
+
+        drop(ring);
+        match orig {
+            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
+            None => std::env::remove_var("PROBING_DATA_DIR"),
+        }
+    }
+}
diff --git a/probing/core/src/core/mod.rs b/probing/core/src/core/mod.rs
index d8c34a42..4cbe12c3 100644
--- a/probing/core/src/core/mod.rs
+++ b/probing/core/src/core/mod.rs
@@ -4,6 +4,8 @@ pub mod cluster_model;
 mod engine;
 mod error;
 pub mod extension;
+pub mod memtable_sql;
+mod plugin_advanced;
 mod plugin;
 
 pub use engine::Engine;
@@ -19,8 +21,12 @@ pub use plugin::CustomNamespaceDataSource;
 pub use plugin::CustomTable;
 pub use plugin::LazyTableSource;
 pub use plugin::NamespacePluginHelper;
+pub use plugin_advanced::PluginAdvancedTable;
 pub use plugin::TablePluginHelper;
 
+pub use memtable_sql::MemTableExtension;
+pub use memtable_sql::UnifiedMemtablePlugin;
+
 pub use extension::EngineCall;
 pub use extension::EngineDatasource;
 pub use extension::EngineExtension;
diff --git a/probing/core/src/core/plugin.rs b/probing/core/src/core/plugin.rs
index 8adf70f9..e7efdef5 100644
--- a/probing/core/src/core/plugin.rs
+++ b/probing/core/src/core/plugin.rs
@@ -6,19 +6,26 @@ use async_trait::async_trait;
 use datafusion::arrow::array::RecordBatch;
 use datafusion::arrow::datatypes::SchemaRef;
 use datafusion::catalog::{CatalogProvider, SchemaProvider, Session, TableProvider};
-use datafusion::datasource::memory::DataSourceExec;
-use datafusion::datasource::memory::MemorySourceConfig;
+use datafusion::common::Statistics;
 use datafusion::datasource::TableType;
 use datafusion::error::{DataFusionError, Result};
 use datafusion::execution::SessionState;
+use datafusion::logical_expr::TableProviderFilterPushDown;
+use datafusion::physical_plan::common::compute_record_batch_statistics;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::Expr;
 
+use super::plugin_advanced::{scan_memory_partitions, supports_filters_pushdown_for_schema};
+
 /// Trait defining a custom table with static/dynamic schema and data
 ///
 /// Implement this to create tables that:
 /// - Have a fixed name
 /// - Use a predefined schema
+///
+/// The default [`TableDataSource`] integration applies **conservative** `WHERE` / `LIMIT`
+/// pushdown (same rules as [`super::plugin_advanced`](super::plugin_advanced)): simple predicates
+/// whose columns all exist on the table may run inside the scan; others stay in a planner `Filter`.
 pub trait CustomTable {
     /// Returns the table name (must be constant)
     fn name() -> &'static str;
@@ -120,21 +127,47 @@ impl<T: CustomTable + Default + Debug + Send + Sync + 'static> TableProvider
         TableType::Base
     }
 
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        supports_filters_pushdown_for_schema(&T::schema(), filters)
+    }
+
+    fn statistics(&self) -> Option<Statistics> {
+        let partitions = vec![T::data()];
+        Some(compute_record_batch_statistics(
+            &partitions,
+            T::schema().as_ref(),
+            None,
+        ))
+    }
+
     async fn scan(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         projection: Option<&Vec<usize>>,
-        // filters and limit can be used here to inject some push-down operations if needed
-        _filters: &[Expr],
-        _limit: Option<usize>,
+        filters: &[Expr],
+        limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let data = T::data();
-        let srccfg = MemorySourceConfig::try_new(&[data], T::schema(), projection.cloned())?;
-        let exec = DataSourceExec::new(Arc::new(srccfg));
-        Ok(Arc::new(exec))
+        let batches = T::data();
+        let partitions = vec![batches];
+        scan_memory_partitions(
+            state,
+            T::schema(),
+            &partitions,
+            projection,
+            filters,
+            limit,
+        )
+        .await
     }
 }
 
+/// Eager in-memory table built from pre-materialized [`RecordBatch`]es (e.g. mmap → Arrow).
+///
+/// Supports the same **conservative** `WHERE` / `LIMIT` pushdown as [`TableDataSource`] via
+/// [`super::plugin_advanced::scan_memory_partitions`](super::plugin_advanced::scan_memory_partitions).
 #[derive(Default, Debug)]
 pub struct LazyTableSource {
     pub name: String,
@@ -163,13 +196,31 @@ impl TableProvider for LazyTableSource {
         TableType::Base
     }
 
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        supports_filters_pushdown_for_schema(&self.schema(), filters)
+    }
+
+    fn statistics(&self) -> Option<Statistics> {
+        if self.data.is_empty() {
+            return None;
+        }
+        let partitions = std::slice::from_ref(&self.data);
+        Some(compute_record_batch_statistics(
+            partitions,
+            self.schema().as_ref(),
+            None,
+        ))
+    }
+
     async fn scan(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         projection: Option<&Vec<usize>>,
-        // filters and limit can be used here to inject some push-down operations if needed
-        _filters: &[Expr],
-        _limit: Option<usize>,
+        filters: &[Expr],
+        limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let data = &self.data;
         if data.is_empty() {
@@ -178,10 +229,16 @@ impl TableProvider for LazyTableSource {
             ));
         }
         let schema = data[0].schema();
-        let srccfg =
-            MemorySourceConfig::try_new(std::slice::from_ref(data), schema, projection.cloned())?;
-        let exec = DataSourceExec::new(Arc::new(srccfg));
-        Ok(Arc::new(exec))
+        let partitions = vec![self.data.clone()];
+        scan_memory_partitions(
+            state,
+            schema,
+            &partitions,
+            projection,
+            filters,
+            limit,
+        )
+        .await
     }
 }
 
diff --git a/probing/core/src/core/plugin_advanced.rs b/probing/core/src/core/plugin_advanced.rs
new file mode 100644
index 00000000..9f1fa941
--- /dev/null
+++ b/probing/core/src/core/plugin_advanced.rs
@@ -0,0 +1,590 @@
+//! Advanced [`TableProvider`] path and **shared pushdown helpers** for in-memory Arrow batches.
+//!
+//! [`PluginAdvancedTable`] is aimed at internal callers (e.g. mmap memtables). The same filter /
+//! limit / stats behaviour is reused by [`super::plugin::TableDataSource`](super::plugin::TableDataSource)
+//! and [`super::plugin::LazyTableSource`](super::plugin::LazyTableSource) via [`scan_memory_partitions`]
+//! and [`supports_filters_pushdown_for_schema`].
+
+use std::any::Any;
+use std::collections::HashSet;
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use datafusion::arrow::array::Int64Array;
+use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::arrow::record_batch::{RecordBatch, RecordBatchOptions};
+use datafusion::catalog::Session;
+use datafusion::common::tree_node::TreeNode;
+use datafusion::common::DFSchema;
+use datafusion::common::Statistics;
+use datafusion::datasource::memory::{DataSourceExec, MemorySourceConfig};
+use datafusion::datasource::{TableProvider, TableType};
+use datafusion::error::{DataFusionError, Result};
+use datafusion::logical_expr::{Expr, TableProviderFilterPushDown};
+use datafusion::physical_expr::utils::conjunction;
+use datafusion::physical_plan::common::compute_record_batch_statistics;
+use datafusion::physical_plan::filter::FilterExecBuilder;
+use datafusion::physical_plan::ExecutionPlan;
+
+/// In-memory table: one or more partitions of [`RecordBatch`]es sharing `schema`.
+///
+/// - Declares **filter push-down** for predicates that pass a conservative structural check
+///   (no subqueries or placeholders, all referenced columns exist on the table schema).
+/// - Applies pushed filters in `scan` via `FilterExec` on top of [`MemorySourceConfig`].
+/// - Applies **`LIMIT` / fetch** on the memory source when there are no pushed filters, and on
+///   `FilterExec` when filters are present (so limit still applies with pushdown).
+/// - Exposes **row / null-count style statistics** via [`TableProvider::statistics`].
+#[derive(Debug)]
+pub struct PluginAdvancedTable {
+    /// Logical table name (for `Debug` / tracing only).
+    label: String,
+    schema: SchemaRef, // the `try_new*` constructors check every batch against this schema
+    /// Partition layout expected by [`MemorySourceConfig`].
+    partitions: Vec<Vec<RecordBatch>>,
+}
+
+impl PluginAdvancedTable {
+    /// Name given to the table at construction time.
+    pub fn label(&self) -> &str {
+        self.label.as_str()
+    }
+
+    /// Build a table whose only partition is `batches`.
+    ///
+    /// # Errors
+    /// Returns a plan error if any batch's schema differs from `schema`.
+    pub fn try_new(
+        label: impl Into<String>,
+        schema: SchemaRef,
+        batches: Vec<RecordBatch>,
+    ) -> Result<Self> {
+        // A single partition is the degenerate case of the multi-partition constructor.
+        Self::try_new_partitions(label, schema, vec![batches])
+    }
+
+    /// Build a table from an explicit partition layout (most callers use [`Self::try_new`]).
+    ///
+    /// # Errors
+    /// Returns a plan error if any batch's schema differs from `schema`.
+    pub fn try_new_partitions(
+        label: impl Into<String>,
+        schema: SchemaRef,
+        partitions: Vec<Vec<RecordBatch>>,
+    ) -> Result<Self> {
+        let label = label.into();
+        partitions
+            .iter()
+            .flatten()
+            .try_for_each(|batch| Self::check_batch_schema(&label, &schema, batch))?;
+        Ok(Self {
+            label,
+            schema,
+            partitions,
+        })
+    }
+
+    /// Sentinel for invalid mmap / empty inputs: a nullable `_empty: Int64` column and one
+    /// zero-row batch.
+    pub fn empty_sentinel(label: impl Into<String>) -> Self {
+        let label = label.into();
+        let column = Field::new("_empty", DataType::Int64, true);
+        let schema = Arc::new(Schema::new(vec![column]));
+        let no_values = Int64Array::from(Vec::<i64>::new());
+        let options = RecordBatchOptions::new().with_row_count(Some(0));
+        let batch = RecordBatch::try_new_with_options(
+            Arc::clone(&schema),
+            vec![Arc::new(no_values)],
+            &options,
+        )
+        .expect("empty batch");
+        Self {
+            label,
+            schema,
+            partitions: vec![vec![batch]],
+        }
+    }
+
+    /// Accept `batch` only if its schema equals `expected`; `label` is used in the error text.
+    fn check_batch_schema(label: &str, expected: &SchemaRef, batch: &RecordBatch) -> Result<()> {
+        let got = batch.schema();
+        if got.as_ref() == expected.as_ref() {
+            return Ok(());
+        }
+        Err(DataFusionError::Plan(format!(
+            "PluginAdvancedTable {label}: batch schema mismatch (expected {expected}, got {got})"
+        )))
+    }
+}
+
+/// `true` if any node of `expr` cannot be evaluated inside a plain memory scan.
+pub(crate) fn has_unsupported_pushdown_subexpr(expr: &Expr) -> bool {
+    let found = expr.exists(|node| {
+        let blocked = matches!(
+            node,
+            Expr::ScalarSubquery(_)
+                | Expr::InSubquery(_)
+                | Expr::Exists { .. }
+                | Expr::Placeholder(_)
+                | Expr::OuterReferenceColumn(_, _)
+                | Expr::GroupingSet(_)
+        );
+        Ok(blocked)
+    });
+    found.unwrap_or(true)
+}
+
+/// Structural gate for [`TableProvider::supports_filters_pushdown`] without a [`Session`].
+///
+/// `expr` passes when it has no unsupported sub-expression and names only table columns.
+pub(crate) fn can_push_filter_exact_for_schema(schema: &SchemaRef, expr: &Expr) -> bool {
+    if has_unsupported_pushdown_subexpr(expr) {
+        return false;
+    }
+    let known: HashSet<&str> = schema
+        .fields()
+        .iter()
+        .map(|field| field.name().as_str())
+        .collect();
+    // Columns are matched by bare name; relation qualifiers are not consulted.
+    expr.column_refs()
+        .iter()
+        .all(|column| known.contains(column.name()))
+}
+
+/// Classify each filter, in order, as `Exact` (passes the structural gate) or `Unsupported`.
+pub(crate) fn supports_filters_pushdown_for_schema(
+    schema: &SchemaRef,
+    filters: &[&Expr],
+) -> Result<Vec<TableProviderFilterPushDown>> {
+    let mut verdicts = Vec::with_capacity(filters.len());
+    for &filter in filters {
+        verdicts.push(if can_push_filter_exact_for_schema(schema, filter) {
+            TableProviderFilterPushDown::Exact
+        } else {
+            TableProviderFilterPushDown::Unsupported
+        });
+    }
+    Ok(verdicts)
+}
+
+/// Plan a scan of in-memory `partitions`, applying pushed-down `filters` and `limit` if given.
+pub(crate) async fn scan_memory_partitions(
+    state: &dyn Session,
+    schema: SchemaRef,
+    partitions: &[Vec<RecordBatch>],
+    projection: Option<&Vec<usize>>,
+    filters: &[Expr],
+    limit: Option<usize>,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    let show_sizes = state.config_options().explain.show_sizes;
+
+    // Fast path: without predicates the memory source handles projection and limit itself.
+    if filters.is_empty() {
+        let source = MemorySourceConfig::try_new(partitions, schema.clone(), projection.cloned())?
+            .with_show_sizes(show_sizes)
+            .with_limit(limit);
+        let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(source);
+        return Ok(plan);
+    }
+
+    // The predicates below are compiled against the FULL table schema, so the
+    // source is scanned unprojected. Projecting first would make the column
+    // indices inside the physical predicate resolve against the narrowed batch
+    // (e.g. `a > 1` silently evaluating on column `b`). FilterExec applies the
+    // requested projection on its output instead.
+    let df_schema = DFSchema::try_from(Arc::clone(&schema))?;
+    let compiled = filters
+        .iter()
+        .map(|filter| state.create_physical_expr(filter.clone(), &df_schema))
+        .collect::<Result<Vec<_>>>()?;
+    let predicate = conjunction(compiled);
+
+    let source = MemorySourceConfig::try_new(partitions, schema.clone(), None)?
+        .with_show_sizes(show_sizes);
+    let unprojected: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(source);
+    let filtered = FilterExecBuilder::new(predicate, unprojected)
+        .apply_projection(projection.cloned())?
+        .with_fetch(limit)
+        .build()?;
+    Ok(Arc::new(filtered))
+}
+
+#[async_trait]
+impl TableProvider for PluginAdvancedTable {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    /// `Exact` for filters that pass the shared structural gate, `Unsupported` otherwise.
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        supports_filters_pushdown_for_schema(&self.schema, filters)
+    }
+
+    /// Statistics computed over every partition, with no projection applied.
+    fn statistics(&self) -> Option<Statistics> {
+        let stats = compute_record_batch_statistics(&self.partitions, &self.schema, None);
+        Some(stats)
+    }
+
+    /// Delegates to [`scan_memory_partitions`] over this table's schema and partitions.
+    async fn scan(
+        &self,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        scan_memory_partitions(
+            state,
+            Arc::clone(&self.schema),
+            &self.partitions,
+            projection,
+            filters,
+            limit,
+        )
+        .await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion::arrow::array::Int32Array;
+    use datafusion::common::stats::Precision;
+    use datafusion::datasource::TableProvider;
+    use datafusion::execution::context::TaskContext;
+    use datafusion::logical_expr::expr_fn::{out_ref_col, placeholder};
+    use datafusion::logical_expr::TableProviderFilterPushDown;
+    use datafusion::physical_plan::collect;
+    use datafusion::prelude::{col, lit, SessionContext};
+    use std::sync::Arc;
+
+    fn test_schema_id() -> SchemaRef {
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]))
+    }
+
+    fn batch_ids(schema: &SchemaRef, values: Vec<i32>) -> Result<RecordBatch> {
+        RecordBatch::try_new(
+            Arc::clone(schema),
+            vec![Arc::new(Int32Array::from(values))],
+        )
+        .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
+    }
+
+    // --- construction (schema validation, partition layout, sentinel) ---
+
+    #[test]
+    fn try_new_accepts_matching_schema() -> Result<()> {
+        let schema = test_schema_id();
+        let b = batch_ids(&schema, vec![1, 2])?;
+        let t = PluginAdvancedTable::try_new("x", Arc::clone(&schema), vec![b])?;
+        assert_eq!(t.label(), "x");
+        assert_eq!(t.schema().fields().len(), 1);
+        Ok(())
+    }
+
+    #[test]
+    fn try_new_rejects_schema_mismatch() {
+        let expected = test_schema_id();
+        let wrong = Arc::new(Schema::new(vec![Field::new(
+            "other",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = batch_ids(&wrong, vec![1]).unwrap();
+        let err = PluginAdvancedTable::try_new("bad", expected, vec![batch]).unwrap_err();
+        let msg = err.to_string();
+        assert!(
+            msg.contains("batch schema mismatch"),
+            "unexpected error: {msg}"
+        );
+    }
+
+    #[test]
+    fn try_new_partitions_validates_all_batches() {
+        let schema = test_schema_id();
+        let wrong = Arc::new(Schema::new(vec![Field::new(
+            "x",
+            DataType::Int32,
+            false,
+        )]));
+        let good = batch_ids(&schema, vec![1]).unwrap();
+        let bad = batch_ids(&wrong, vec![2]).unwrap();
+        let err = PluginAdvancedTable::try_new_partitions(
+            "p",
+            Arc::clone(&schema),
+            vec![vec![good], vec![bad]],
+        )
+        .unwrap_err();
+        assert!(err.to_string().contains("batch schema mismatch"));
+    }
+
+    #[test]
+    fn try_new_partitions_succeeds() -> Result<()> {
+        let schema = test_schema_id();
+        let p0 = batch_ids(&schema, vec![1, 2])?;
+        let p1 = batch_ids(&schema, vec![3])?;
+        let t = PluginAdvancedTable::try_new_partitions("m", schema.clone(), vec![vec![p0], vec![p1]])?;
+        let s = t.statistics().expect("stats");
+        assert_eq!(s.num_rows, Precision::Exact(3)); // 2 + 1 rows across both partitions
+        Ok(())
+    }
+
+    #[test]
+    fn empty_sentinel_zero_rows_and_schema() {
+        let t = PluginAdvancedTable::empty_sentinel("mmap-empty");
+        assert_eq!(t.label(), "mmap-empty");
+        assert_eq!(t.schema().fields().len(), 1);
+        assert_eq!(t.schema().field(0).name(), "_empty");
+        let s = t.statistics().expect("stats");
+        assert_eq!(s.num_rows, Precision::Exact(0));
+    }
+
+    // --- pushdown helpers (pure functions, no session needed) ---
+
+    #[test]
+    fn has_unsupported_detects_placeholder_outer_ref() {
+        assert!(has_unsupported_pushdown_subexpr(&placeholder("$1")));
+        assert!(has_unsupported_pushdown_subexpr(&out_ref_col(
+            DataType::Int32,
+            "c"
+        )));
+        assert!(!has_unsupported_pushdown_subexpr(&col("id")));
+        assert!(!has_unsupported_pushdown_subexpr(&col("id").gt(lit(0i32))));
+    }
+
+    #[test]
+    fn can_push_filter_exact_for_schema_gate() {
+        let schema = test_schema_id();
+        assert!(can_push_filter_exact_for_schema(
+            &schema,
+            &col("id").gt(lit(1i32))
+        ));
+        assert!(!can_push_filter_exact_for_schema(
+            &schema,
+            &col("missing").gt(lit(1i32))
+        ));
+        assert!(!can_push_filter_exact_for_schema(
+            &schema,
+            &placeholder("$1")
+        ));
+    }
+
+    #[test]
+    fn supports_filters_pushdown_for_schema_mixed() -> Result<()> {
+        let schema = test_schema_id();
+        let f1 = col("id").gt(lit(0i32));
+        let f2 = col("nope").eq(lit(1i32));
+        let v = supports_filters_pushdown_for_schema(&schema, &[&f1, &f2])?;
+        assert_eq!(v.len(), 2);
+        assert_eq!(v[0], TableProviderFilterPushDown::Exact);
+        assert_eq!(v[1], TableProviderFilterPushDown::Unsupported);
+        Ok(())
+    }
+
+    // --- scan_memory_partitions (plans are executed with a default TaskContext) ---
+
+    #[tokio::test]
+    async fn scan_memory_partitions_limit_without_filter() -> Result<()> {
+        let schema = test_schema_id();
+        let batch = batch_ids(&schema, vec![10, 20, 30, 40])?;
+        let ctx = SessionContext::new();
+        let state = ctx.state();
+        let plan = scan_memory_partitions(
+            &state,
+            Arc::clone(&schema),
+            &[vec![batch]],
+            None,
+            &[],
+            Some(2), // limit only, no filter
+        )
+        .await?;
+        let batches = collect(plan, Arc::new(TaskContext::default())).await?;
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 2);
+        let arr = batches[0]
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(arr.value(0), 10);
+        assert_eq!(arr.value(1), 20);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn scan_memory_partitions_filter_and_limit() -> Result<()> {
+        let schema = test_schema_id();
+        let batch = batch_ids(&schema, vec![1, 2, 3, 4, 5])?;
+        let filter = col("id").gt(lit(2i32));
+        let ctx = SessionContext::new();
+        let state = ctx.state();
+        let plan = scan_memory_partitions(
+            &state,
+            Arc::clone(&schema),
+            &[vec![batch]],
+            None,
+            std::slice::from_ref(&filter),
+            Some(2), // fetch is applied to the filtered rows
+        )
+        .await?;
+        let batches = collect(plan, Arc::new(TaskContext::default())).await?;
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 2);
+        let arr = batches[0]
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(arr.value(0), 3);
+        assert_eq!(arr.value(1), 4);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn scan_memory_partitions_filter_with_projection_uses_full_schema() -> Result<()> {
+        // Regression: predicate column (`a`) is NOT part of the projection.
+        // The filter must still evaluate against the full schema instead of
+        // resolving indices on the projected batch.
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+        ]));
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int64Array::from(vec![1i64, 2, 3])),
+                Arc::new(Int64Array::from(vec![10i64, 20, 30])),
+            ],
+        )?;
+        let filter = col("a").gt(lit(1i64));
+        let ctx = SessionContext::new();
+        let state = ctx.state();
+        let plan = scan_memory_partitions(
+            &state,
+            Arc::clone(&schema),
+            &[vec![batch]],
+            Some(&vec![1usize]), // project only `b`
+            std::slice::from_ref(&filter),
+            None,
+        )
+        .await?;
+        let batches = collect(plan, Arc::new(TaskContext::default())).await?;
+        assert_eq!(batches.len(), 1);
+        let out = &batches[0];
+        assert_eq!(out.num_columns(), 1);
+        assert_eq!(out.schema().field(0).name(), "b");
+        let arr = out
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap();
+        assert_eq!(arr.len(), 2);
+        assert_eq!(arr.value(0), 20);
+        assert_eq!(arr.value(1), 30);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn scan_memory_partitions_invalid_column_in_filter_errors() -> Result<()> {
+        let schema = test_schema_id();
+        let batch = batch_ids(&schema, vec![1])?;
+        let bad_filter = col("unknown").eq(lit(1i32));
+        let ctx = SessionContext::new();
+        let state = ctx.state();
+        let err = scan_memory_partitions(
+            &state,
+            schema,
+            &[vec![batch]],
+            None,
+            std::slice::from_ref(&bad_filter),
+            None,
+        )
+        .await
+        .unwrap_err();
+        let msg = err.to_string();
+        assert!(
+            msg.contains("unknown") || msg.contains("column"),
+            "unexpected: {msg}"
+        );
+        Ok(())
+    }
+
+    // --- TableProvider (trait surface of PluginAdvancedTable) ---
+
+    #[test]
+    fn table_provider_as_any_and_table_type() -> Result<()> {
+        let schema = test_schema_id();
+        let t = PluginAdvancedTable::try_new("t", schema, vec![batch_ids(&test_schema_id(), vec![1])?])?;
+        assert!(t.as_any().downcast_ref::<PluginAdvancedTable>().is_some());
+        assert_eq!(t.table_type(), TableType::Base);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn table_provider_supports_filters_pushdown_delegates() -> Result<()> {
+        let schema = test_schema_id();
+        let t = PluginAdvancedTable::try_new("t", schema, vec![batch_ids(&test_schema_id(), vec![1])?])?;
+        let f = col("id").gt(lit(0i32));
+        let v = t.supports_filters_pushdown(&[&f])?;
+        assert_eq!(v, vec![TableProviderFilterPushDown::Exact]);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn filter_and_limit_pushdown_scan() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))],
+        )?;
+        let table = Arc::new(PluginAdvancedTable::try_new("t", Arc::clone(&schema), vec![batch])?);
+        let ctx = SessionContext::new();
+        ctx.register_table("t", table)?;
+        let df = ctx.sql("SELECT id FROM t WHERE id > 2 LIMIT 2").await?;
+        let batches = df.collect().await?;
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 2);
+        let arr = batches[0]
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(arr.value(0), 3);
+        assert_eq!(arr.value(1), 4);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn statistics_reports_row_count() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "id",
+            DataType::Int32,
+            false,
+        )]));
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(vec![10, 20]))],
+        )?;
+        let table = PluginAdvancedTable::try_new("t", schema, vec![batch])?;
+        let s = table.statistics().expect("stats");
+        assert_eq!(s.num_rows, Precision::Exact(2));
+        Ok(())
+    }
+}
diff --git a/probing/extensions/cc/Cargo.toml b/probing/extensions/cc/Cargo.toml
index 5081cd0d..06721d59 100644
--- a/probing/extensions/cc/Cargo.toml
+++ b/probing/extensions/cc/Cargo.toml
@@ -22,7 +22,7 @@ thiserror = { workspace = true }
 
 async-trait = "0.1.83"
 rmesg = { version = "1.0.21", optional = true }
-datafusion = { version = "47.0.0", default-features = false, features = [] }
+datafusion = { workspace = true }
 
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs = { version = "0.17.0", default-features = false, features = ["chrono"] }
diff --git a/probing/extensions/python/Cargo.toml b/probing/extensions/python/Cargo.toml
index 887bc8cc..f06f9c32 100644
--- a/probing/extensions/python/Cargo.toml
+++ b/probing/extensions/python/Cargo.toml
@@ -14,6 +14,7 @@ default = ["extension-module", "tracing"]
 [dependencies]
 probing-cc = { path = "../cc" }
 probing-core = { path = "../../core" }
+probing-memtable = { path = "../../memtable" }
 probing-proto = { path = "../../proto" }
 probing-store = { path = "../../crates/store" }
 probing-cli = { path = "../../cli" }
@@ -53,6 +54,7 @@ regex = ">=1.6.0"
 
 [dev-dependencies]
 tokio = { workspace = true }
+tempfile = "3.8"
 
 [build-dependencies]
 pyo3-build-config = "0.25.1"
diff --git a/probing/extensions/python/src/extensions/python/exttbls.rs b/probing/extensions/python/src/extensions/python/exttbls.rs
index 8c5132eb..f4571d21 100644
--- a/probing/extensions/python/src/extensions/python/exttbls.rs
+++ b/probing/extensions/python/src/extensions/python/exttbls.rs
@@ -1,19 +1,50 @@
-use std::sync::Arc;
-use std::{collections::HashMap, sync::Mutex};
+//! Python-facing `ExternalTable`, backed by **mmap memtables**.
+//!
+//! Each table is an [`ExposedTable`] (MEMT ring buffer) under
+//! `<PROBING_DATA_DIR>/<pid>/python.<name>`, so:
+//!
+//! - data **survives a crash** of the producing process (postmortem-readable),
+//! - any process can query it via the mmap SQL catalog
+//!   (`probing_core::core::memtable_sql`) as `python.<name>`,
+//! - the training process only ever pays the cost of an mmap row write —
+//!   query-side materialisation happens in whoever runs the SQL.
+//!
+//! The first appended row fixes the column dtypes (the Python API only
+//! declares column names). A leading `timestamp` column (microseconds since
+//! epoch, `I64`) is always present, matching the previous TimeSeries layout.
+
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
 
 use once_cell::sync::Lazy;
-use probing_proto::prelude::{Ele, TimeSeries};
-use probing_proto::types::series::DiscardStrategy;
+use probing_memtable::discover::ExposedTable;
+use probing_memtable::{DType, Schema as MtSchema, Value};
+use probing_proto::prelude::Ele;
 use pyo3::prelude::*;
 use pyo3::types::{PyDict, PyType};
 use pyo3::{pyclass, pymethods, Bound, PyObject, PyResult, Python};
 
 use crate::features::convert::{ele_to_python, python_to_ele};
 
-fn value_to_object(py: Python, v: &probing_proto::prelude::Ele) -> PyObject {
+/// SQL schema (and filename prefix) for Python extern tables.
+pub const EXTERN_TABLE_SCHEMA: &str = "python";
+
+/// Ring layout: fixed chunk count; chunk byte size derives from capacity.
+const NUM_CHUNKS: u32 = 8;
+const MIN_CHUNK_BYTES: usize = 4 * 1024; // 4 KiB floor per chunk
+const MAX_CHUNK_BYTES: usize = 8 * 1024 * 1024; // 8 MiB ceiling per chunk
+
+fn value_to_object(py: Python, v: &Ele) -> PyObject {
     ele_to_python(py, v).unwrap_or_else(|_| py.None())
 }
 
+fn now_micros() -> i64 {
+    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
+        Ok(since_epoch) => since_epoch.as_micros() as i64,
+        Err(_) => 0, // system clock reads earlier than the Unix epoch
+    }
+}
+
 #[pyclass]
 pub struct PyExternalTableConfig {
     #[pyo3(get)]
@@ -49,22 +80,6 @@ impl FromPyObject<'_> for PyExternalTableConfig {
     }
 }
 
-impl From<PyExternalTableConfig> for DiscardStrategy {
-    fn from(py_config: PyExternalTableConfig) -> Self {
-        match py_config.discard_strategy.as_str() {
-            "BaseElementCount" => DiscardStrategy::BaseElementCount {
-                discard_threshold: py_config.discard_threshold,
-                chunk_size: py_config.chunk_size,
-            },
-            "BaseMemorySize" => DiscardStrategy::BaseMemorySize {
-                discard_threshold: py_config.discard_threshold,
-                chunk_size: py_config.chunk_size,
-            },
-            _ => DiscardStrategy::None,
-        }
-    }
-}
-
 #[pymethods]
 impl PyExternalTableConfig {
     #[new]
@@ -76,9 +91,10 @@ impl PyExternalTableConfig {
         }
     }
 
+    #[allow(clippy::wrong_self_convention)] // Python-facing method name, kept for API compat
     fn into_py(&self, py: Python<'_>) -> PyObject {
         let dict = PyDict::new(py);
-        dict.set_item("chunk_size", &self.chunk_size).unwrap();
+        dict.set_item("chunk_size", self.chunk_size).unwrap();
         dict.set_item("discard_threshold", self.discard_threshold)
             .unwrap();
         dict.set_item("discard_strategy", &self.discard_strategy)
@@ -87,12 +103,231 @@ impl PyExternalTableConfig {
     }
 }
 
-pub static EXTERN_TABLES: Lazy<Mutex<HashMap<String, Arc<Mutex<TimeSeries>>>>> =
+/// Byte budget for the whole ring, derived from the legacy discard settings and clamped to
+/// `[MIN_CHUNK_BYTES * NUM_CHUNKS, 1 GiB]`. `"BaseMemorySize"` treats `discard_threshold` as
+/// bytes, `"BaseElementCount"` budgets 64 bytes per row, and any other strategy gets 16 MiB.
+fn ring_capacity_bytes(discard_threshold: usize, strategy: &str) -> usize {
+    const CEILING_BYTES: usize = 1 << 30;
+    let floor_bytes = MIN_CHUNK_BYTES * NUM_CHUNKS as usize;
+    let requested = match strategy {
+        "BaseElementCount" => discard_threshold.saturating_mul(64),
+        "BaseMemorySize" => discard_threshold,
+        _ => 16 * 1024 * 1024,
+    };
+    requested.clamp(floor_bytes, CEILING_BYTES)
+}
+
+fn ring_chunk_bytes(capacity: usize) -> u32 { // capacity split over NUM_CHUNKS, then clamped
+    (capacity / NUM_CHUNKS as usize).clamp(MIN_CHUNK_BYTES, MAX_CHUNK_BYTES) as u32
+}
+
+/// Column dtype chosen for a value seen in the first appended row.
+fn ele_dtype(e: &Ele) -> DType {
+    match e {
+        Ele::Text(_) | Ele::Url(_) | Ele::Nil => DType::Str, // Nil has no type; it maps to Str
+        Ele::BOOL(_) => DType::U8,
+        Ele::DataTime(_) => DType::U64,
+        Ele::I32(_) => DType::I32,
+        Ele::I64(_) => DType::I64,
+        Ele::F32(_) => DType::F32,
+        Ele::F64(_) => DType::F64,
+    }
+}
+
+/// Owned cell value: an [`Ele`] already coerced to its column's dtype. A row of
+/// `Value`s is built by borrowing from a `Vec<OwnedVal>` (see `owned_to_value`).
+enum OwnedVal {
+    U8(u8),
+    I32(i32),
+    I64(i64),
+    F32(f32),
+    F64(f64),
+    U64(u64), // also produced for `DType::U32` columns
+    S(String), // backs both `DType::Str` and `DType::Bytes` columns
+}
+
+/// Coerce `e` into the owned cell type required by a column of dtype `dt`.
+fn ele_to_owned(e: &Ele, dt: DType) -> OwnedVal {
+    let as_f64 = |e: &Ele| match e {
+        Ele::I32(v) => *v as f64,
+        Ele::I64(v) => *v as f64,
+        Ele::F32(v) => *v as f64,
+        Ele::F64(v) => *v,
+        Ele::BOOL(v) => *v as u8 as f64,
+        Ele::DataTime(v) => *v as f64,
+        _ => 0.0,
+    };
+    match (dt, e) {
+        // Same-width integers skip the f64 detour, which rounds magnitudes above 2^53.
+        (DType::I64, Ele::I64(v)) => OwnedVal::I64(*v),
+        (DType::U64, Ele::DataTime(v)) => OwnedVal::U64(*v),
+        (DType::U8, _) => OwnedVal::U8(as_f64(e) as u8),
+        (DType::I32, _) => OwnedVal::I32(as_f64(e) as i32),
+        (DType::I64, _) => OwnedVal::I64(as_f64(e) as i64),
+        (DType::F32, _) => OwnedVal::F32(as_f64(e) as f32),
+        (DType::F64, _) => OwnedVal::F64(as_f64(e)),
+        (DType::U64 | DType::U32, _) => OwnedVal::U64(as_f64(e) as u64),
+        (DType::Str | DType::Bytes, _) => OwnedVal::S(match e {
+            Ele::Text(s) | Ele::Url(s) => s.clone(),
+            Ele::Nil => String::new(),
+            other => other.to_string(),
+        }),
+    }
+}
+
+fn owned_to_value(o: &OwnedVal) -> Value<'_> {
+    match *o {
+        OwnedVal::S(ref text) => Value::Str(text), // borrows the owned string
+        OwnedVal::U8(n) => Value::U8(n),
+        OwnedVal::U64(n) => Value::U64(n),
+        OwnedVal::I32(n) => Value::I32(n),
+        OwnedVal::I64(n) => Value::I64(n),
+        OwnedVal::F32(x) => Value::F32(x),
+        OwnedVal::F64(x) => Value::F64(x),
+    }
+}
+
+/// State behind one extern table. The mmap ring is created lazily on the
+/// first append because the Python API declares names but not types.
+pub struct ExternBacking {
+    name: String, // table name; the mmap file is `<EXTERN_TABLE_SCHEMA>.<name>`
+    columns: Vec<String>, // user column names, excluding the leading `timestamp`
+    capacity_bytes: usize, // total ring budget, split into chunks on creation
+    dtypes: Vec<DType>, // per-column dtypes; empty until the first append
+    table: Option<ExposedTable>, // `None` until the first append creates the ring
+}
+
+impl ExternBacking {
+    fn new(name: &str, columns: Vec<String>, capacity_bytes: usize) -> Self {
+        Self {
+            name: name.to_string(),
+            columns,
+            capacity_bytes,
+            dtypes: vec![], // filled in by `ensure_table` on the first append
+            table: None,
+        }
+    }
+
+    fn ensure_table(&mut self, first_row: &[Ele]) -> Result<(), String> {
+        if self.table.is_some() {
+            return Ok(()); // ring already exists; dtypes stay as first inferred
+        }
+        let dtypes: Vec<DType> = first_row.iter().map(ele_dtype).collect();
+        let mut schema = MtSchema::new().col("timestamp", DType::I64);
+        for (name, dt) in self.columns.iter().zip(dtypes.iter()) {
+            schema = schema.col(name, *dt);
+        }
+        let chunk_bytes = ring_chunk_bytes(self.capacity_bytes);
+        let filename = format!("{EXTERN_TABLE_SCHEMA}.{}", self.name);
+        let table = ExposedTable::create(&filename, &schema, chunk_bytes, NUM_CHUNKS)
+            .map_err(|e| format!("failed to create mmap table {filename}: {e}"))?;
+        self.dtypes = dtypes; // committed only after the ring was created successfully
+        self.table = Some(table);
+        Ok(())
+    }
+
+    fn append(&mut self, timestamp: i64, values: &[Ele]) -> Result<(), String> {
+        if values.len() != self.columns.len() {
+            return Err("column count mismatch".to_string());
+        }
+        self.ensure_table(values)?; // no-op after the first successful append
+
+        let owned: Vec<OwnedVal> = values
+            .iter()
+            .zip(self.dtypes.iter())
+            .map(|(e, dt)| ele_to_owned(e, *dt))
+            .collect();
+        let mut row: Vec<Value> = Vec::with_capacity(owned.len() + 1);
+        row.push(Value::I64(timestamp)); // leading `timestamp` column
+        row.extend(owned.iter().map(owned_to_value));
+
+        // ExposedTable::push_row validates schema and auto-advances chunks.
+        self.table
+            .as_mut()
+            .expect("ensured above")
+            .push_row(&row);
+        Ok(())
+    }
+
+    /// All stored rows as `(timestamp, values)` pairs in chronological order; with `limit`,
+    /// only the most recent `limit` rows are kept (still oldest → newest).
+    fn take(&self, limit: Option<usize>) -> Vec<(Ele, Vec<Ele>)> {
+        let Some(table) = &self.table else {
+            return vec![]; // nothing has been appended yet
+        };
+        let view = table.view();
+        let mut out: Vec<(Ele, Vec<Ele>)> = Vec::new();
+        for chunk in view.chunks_logical() {
+            for row in view.rows(chunk) {
+                let mut cursor = row.cursor();
+                let ts = Ele::I64(cursor.next_i64()); // leading `timestamp` column comes first
+                let vals: Vec<Ele> = self
+                    .dtypes
+                    .iter()
+                    .map(|dt| match dt {
+                        DType::U8 => Ele::BOOL(cursor.next_u8() != 0),
+                        DType::I32 => Ele::I32(cursor.next_i32()),
+                        DType::I64 => Ele::I64(cursor.next_i64()),
+                        DType::F32 => Ele::F32(cursor.next_f32()),
+                        DType::F64 => Ele::F64(cursor.next_f64()),
+                        DType::U64 => Ele::DataTime(cursor.next_u64()),
+                        DType::U32 => Ele::I64(cursor.next_u32() as i64),
+                        DType::Str => Ele::Text(cursor.next_str().to_string()),
+                        DType::Bytes => Ele::Text(String::from_utf8_lossy(cursor.next_bytes()).to_string()),
+                    })
+                    .collect();
+                out.push((ts, vals));
+            }
+        }
+        if let Some(limit) = limit {
+            if out.len() > limit {
+                out.drain(..out.len() - limit); // discard the oldest surplus rows
+            }
+        }
+        out
+    }
+}
+
+impl std::fmt::Debug for ExternBacking {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ExternBacking")
+            .field("name", &self.name)
+            .field("columns", &self.columns)
+            .field("created", &self.table.is_some()) // whether the mmap ring exists yet
+            .finish()
+    }
+}
+
+pub static EXTERN_TABLES: Lazy<Mutex<HashMap<String, Arc<Mutex<ExternBacking>>>>> =
     Lazy::new(|| Mutex::new(Default::default()));
 
 #[pyclass]
 #[derive(Clone, Debug)]
-pub struct ExternalTable(Arc<Mutex<TimeSeries>>, usize);
+pub struct ExternalTable(Arc<Mutex<ExternBacking>>, usize);
+
+impl ExternalTable {
+    /// Convert Python objects to `Ele`s under the GIL; failed conversions become `Ele::Nil`.
+    fn extract_eles(values: Vec<PyObject>) -> Vec<Ele> {
+        Python::with_gil(|py| {
+            let mut eles = Vec::with_capacity(values.len());
+            for object in values {
+                eles.push(python_to_ele(object.bind(py)).unwrap_or(Ele::Nil));
+            }
+            eles
+        })
+    }
+
+    /// Shared backing with no mmap ring yet, sized from the legacy discard settings.
+    fn create_backing(
+        name: &str,
+        columns: Vec<String>,
+        discard_threshold: usize,
+        discard_strategy: &str,
+    ) -> Arc<Mutex<ExternBacking>> {
+        let capacity_bytes = ring_capacity_bytes(discard_threshold, discard_strategy);
+        Arc::new(Mutex::new(ExternBacking::new(name, columns, capacity_bytes)))
+    }
+}
 
 #[pymethods]
 impl ExternalTable {
@@ -105,32 +340,22 @@ impl ExternalTable {
         discard_threshold: usize,
         discard_strategy: String,
     ) -> Self {
+        let _ = chunk_size; // ring chunking is byte-based; kept for API compat
         let ncolumn = columns.len();
-        let config = PyExternalTableConfig {
-            chunk_size,
-            discard_threshold,
-            discard_strategy,
-        };
-        let config: DiscardStrategy = config.into();
-        let ts = Arc::new(Mutex::new(
-            TimeSeries::builder_with_config(config)
-                .with_columns(columns)
-                .build(),
-        ));
+        let backing = Self::create_backing(name, columns, discard_threshold, &discard_strategy);
         EXTERN_TABLES
             .lock()
             .unwrap()
-            .insert(name.to_string(), ts.clone());
-        ExternalTable(ts, ncolumn)
+            .insert(name.to_string(), backing.clone());
+        ExternalTable(backing, ncolumn)
     }
 
     #[classmethod]
     fn get(_cls: &Bound<'_, PyType>, name: &str) -> PyResult<ExternalTable> {
         let binding = EXTERN_TABLES.lock().unwrap();
-        let ts = binding.get(name);
-        if let Some(ts) = ts {
-            let ncolumn = ts.lock().unwrap().cols.len();
-            Ok(ExternalTable(ts.clone(), ncolumn))
+        if let Some(backing) = binding.get(name) {
+            let ncolumn = backing.lock().unwrap().columns.len();
+            Ok(ExternalTable(backing.clone(), ncolumn))
         } else {
             Err(pyo3::exceptions::PyValueError::new_err(format!(
                 "table {name} not found"
@@ -148,37 +373,30 @@ impl ExternalTable {
         discard_threshold: usize,
         discard_strategy: String,
     ) -> PyResult<ExternalTable> {
+        let _ = chunk_size;
         let mut binding = EXTERN_TABLES.lock().unwrap();
-        let ts = binding.get(name);
-        if let Some(ts) = ts {
-            let ncolumn = ts.lock().unwrap().cols.len();
-            Ok(ExternalTable(ts.clone(), ncolumn))
+        if let Some(backing) = binding.get(name) {
+            let ncolumn = backing.lock().unwrap().columns.len();
+            Ok(ExternalTable(backing.clone(), ncolumn))
         } else {
             let ncolumn = columns.len();
-            let config = PyExternalTableConfig {
-                chunk_size,
-                discard_threshold,
-                discard_strategy,
-            };
-            let config: DiscardStrategy = config.into();
-            let ts = Arc::new(Mutex::new(
-                TimeSeries::builder_with_config(config)
-                    .with_columns(columns)
-                    .build(),
-            ));
-            binding.insert(name.to_string(), ts.clone());
-            Ok(ExternalTable(ts, ncolumn))
+            let backing =
+                Self::create_backing(name, columns, discard_threshold, &discard_strategy);
+            binding.insert(name.to_string(), backing.clone());
+            Ok(ExternalTable(backing, ncolumn))
         }
     }
 
     #[classmethod]
     fn drop(_cls: &Bound<'_, PyType>, name: &str) -> PyResult<()> {
+        // Only the registry's Arc is released here. Once the last ExternalTable
+        // handle is gone too, the ExposedTable drops: mmap file unlinked, SQL table removed.
         let _ = EXTERN_TABLES.lock().unwrap().remove(name);
         Ok(())
     }
 
     fn names(&self) -> Vec<String> {
-        self.0.lock().unwrap().names.clone()
+        self.0.lock().unwrap().columns.clone()
     }
 
     fn append(&mut self, values: Vec<PyObject>) -> PyResult<()> {
@@ -187,23 +405,12 @@ impl ExternalTable {
                 "column count mismatch",
             ));
         }
-        let t = std::time::SystemTime::now()
-            .duration_since(std::time::UNIX_EPOCH)
+        let eles = Self::extract_eles(values);
+        self.0
+            .lock()
             .unwrap()
-            .as_micros() as i64;
-        let values: Vec<Ele> = Python::with_gil(|py| {
-            values
-                .into_iter()
-                .map(|v| {
-                    let bound = v.bind(py);
-                    python_to_ele(&bound).unwrap_or(Ele::Nil)
-                })
-                .collect()
-        });
-        match self.0.lock().unwrap().append(t.into(), values) {
-            Ok(_) => Ok(()),
-            Err(e) => Err(pyo3::exceptions::PyValueError::new_err(e.to_string())),
-        }
+            .append(now_micros(), &eles)
+            .map_err(pyo3::exceptions::PyValueError::new_err)
     }
 
     fn append_ts(&mut self, t: i64, values: Vec<PyObject>) -> PyResult<()> {
@@ -212,26 +419,25 @@ impl ExternalTable {
                 "column count mismatch",
             ));
         }
-        let values: Vec<Ele> = Python::with_gil(|py| {
-            values
-                .into_iter()
-                .map(|v| {
-                    let bound = v.bind(py);
-                    python_to_ele(&bound).unwrap_or(Ele::Nil)
-                })
-                .collect()
-        });
-        let _ = self.0.lock().unwrap().append(t.into(), values);
+        let eles = Self::extract_eles(values);
+        self.0
+            .lock()
+            .unwrap()
+            .append(t, &eles)
+            .map_err(pyo3::exceptions::PyValueError::new_err)
+    }
+
+    fn append_many(&mut self, rows: Vec<Vec<PyObject>>) -> PyResult<()> {
+        for row in rows {
+            self.append(row)?;
+        }
         Ok(())
     }
 
     #[pyo3(signature = (limit=None))]
     fn take(&self, limit: Option<usize>) -> PyResult<Vec<(PyObject, Vec<PyObject>)>> {
-        let result: Vec<(PyObject, Vec<PyObject>)> = self
-            .0
-            .lock()
-            .unwrap()
-            .take(limit)
+        let rows = self.0.lock().unwrap().take(limit);
+        let result = rows
             .iter()
             .map(|(t, vals)| {
                 Python::with_gil(|py| {
@@ -252,21 +458,23 @@ impl ExternalTable {
 mod tests {
     use super::*;
     use crate::extensions::python::PythonPlugin;
-    use probing_cc::extensions::envs::EnvPlugin;
-    use probing_cc::extensions::files::FilesPlugin;
-    use probing_core::core::Engine;
+    use probing_core::core::{Engine, UnifiedMemtablePlugin};
     use pyo3::ffi::c_str;
 
+    /// Route all mmap files of this test process into one tempdir.
+    static TEST_DATA_DIR: Lazy<tempfile::TempDir> = Lazy::new(|| {
+        let dir = tempfile::tempdir().unwrap();
+        std::env::set_var("PROBING_DATA_DIR", dir.path());
+        dir
+    });
+
     fn setup() {
-        // Module registration is now handled automatically via _core module
-        // In test environment, we need to manually set up the probing module
-        // since _core may not be importable as a Python module
+        let _ = &*TEST_DATA_DIR;
         pyo3::prepare_freethreaded_python();
         Python::with_gil(|py| {
             use pyo3::types::PyModule;
             use pyo3::PyTypeInfo;
 
-            // Get or create probing module
             let sys = PyModule::import(py, "sys").unwrap();
             let modules = sys.getattr("modules").unwrap();
 
@@ -278,8 +486,6 @@ mod tests {
                 m
             };
 
-            // Manually add ExternalTable to probing module for tests
-            // This mimics what _core module does
             if !probing.hasattr("ExternalTable").unwrap_or(false) {
                 probing
                     .setattr("ExternalTable", ExternalTable::type_object(py))
@@ -288,19 +494,23 @@ mod tests {
         });
     }
 
-    fn setup_table3() {
+    /// Create a table with a unique name and three rows; idempotent per name.
+    fn setup_table(name: &str) {
         setup();
         Python::with_gil(|py| {
             py.run(
-                c_str!(
+                &std::ffi::CString::new(format!(
                     r#"
 import probing
-table3 = probing.ExternalTable.get_or_create("table3", ["a", "b"])
-table3.append([1, 2])
-table3.append([3, 4])
-table3.append([5, 6])
-                "#
-                ),
+if not hasattr(probing, "_made_{name}"):
+    t = probing.ExternalTable.get_or_create("{name}", ["a", "b"])
+    t.append([1, 2])
+    t.append([3, 4])
+    t.append([5, 6])
+    probing._made_{name} = True
+"#
+                ))
+                .unwrap(),
                 None,
                 None,
             )
@@ -308,6 +518,16 @@ table3.append([5, 6])
         });
     }
 
+    async fn engine_with_python() -> Engine {
+        Engine::builder()
+            .with_default_namespace("probe")
+            .with_plugin(PythonPlugin::create("python"))
+            .with_plugin(Arc::new(UnifiedMemtablePlugin))
+            .build()
+            .await
+            .unwrap()
+    }
+
     #[test]
     fn test_create_new_table() {
         setup();
@@ -337,8 +557,7 @@ table = probing.ExternalTable.get_or_create("table2", ["a", "b"])
             )
             .unwrap();
             let binding = EXTERN_TABLES.lock().unwrap();
-            let table1 = binding.get("table2");
-            assert!(table1.is_some());
+            assert!(binding.contains_key("table2"));
         });
     }
 
@@ -346,25 +565,12 @@ table = probing.ExternalTable.get_or_create("table2", ["a", "b"])
     fn test_drop_table_in_python() {
         setup();
         Python::with_gil(|py| {
-            // Create the table first
-            py.run(
-                c_str!(
-                    r#"
-import probing
-probing.ExternalTable.get_or_create("table2", ["a", "b"])
-                    "#
-                ),
-                None,
-                None,
-            )
-            .unwrap();
-
-            // Now drop it
             py.run(
                 c_str!(
                     r#"
 import probing
-probing.ExternalTable.drop("table2")
+probing.ExternalTable.get_or_create("table_to_drop", ["a", "b"])
+probing.ExternalTable.drop("table_to_drop")
                     "#
                 ),
                 None,
@@ -372,28 +578,68 @@ probing.ExternalTable.drop("table2")
             )
             .unwrap();
             let binding = EXTERN_TABLES.lock().unwrap();
-            let table1 = binding.get("table2");
-            assert!(table1.is_none());
+            assert!(!binding.contains_key("table_to_drop"));
+        });
+    }
+
+    #[test]
+    fn test_append_take_roundtrip_and_mmap_file() {
+        setup();
+        let mut table = ExternalTable::new(
+            "roundtrip",
+            vec!["x".to_string(), "msg".to_string()],
+            10000,
+            1_000_000,
+            "BaseMemorySize".to_string(),
+        );
+        Python::with_gil(|py| {
+            let vals: Vec<PyObject> = vec![
+                1i64.into_pyobject(py).unwrap().into_any().unbind(),
+                "hello".into_pyobject(py).unwrap().into_any().unbind(),
+            ];
+            table.append(vals).unwrap();
+            let vals: Vec<PyObject> = vec![
+                2i64.into_pyobject(py).unwrap().into_any().unbind(),
+                "world".into_pyobject(py).unwrap().into_any().unbind(),
+            ];
+            table.append(vals).unwrap();
+        });
+
+        // mmap file exists on disk under <data_dir>/<pid>/python.roundtrip
+        let path = probing_memtable::discover::default_dir()
+            .join(std::process::id().to_string())
+            .join("python.roundtrip");
+        assert!(path.is_file(), "mmap file missing: {path:?}");
+
+        // take() returns rows oldest → newest, with coerced values
+        let rows = table.take(None).unwrap();
+        assert_eq!(rows.len(), 2);
+        Python::with_gil(|py| {
+            let (_, vals) = &rows[0];
+            assert_eq!(vals[0].extract::<i64>(py).unwrap(), 1);
+            assert_eq!(vals[1].extract::<String>(py).unwrap(), "hello");
+            let (_, vals) = &rows[1];
+            assert_eq!(vals[0].extract::<i64>(py).unwrap(), 2);
+            assert_eq!(vals[1].extract::<String>(py).unwrap(), "world");
+        });
+
+        // take(limit) keeps the most recent rows
+        let rows = table.take(Some(1)).unwrap();
+        assert_eq!(rows.len(), 1);
+        Python::with_gil(|py| {
+            assert_eq!(rows[0].1[1].extract::<String>(py).unwrap(), "world");
         });
     }
 
     #[test]
     fn test_see_py_table_in_engine() {
-        setup_table3();
+        setup_table("table3");
         let rt = tokio::runtime::Builder::new_multi_thread()
             .worker_threads(4)
             .enable_all()
             .build()
             .unwrap();
-        let engine = rt
-            .block_on(async {
-                Engine::builder()
-                    .with_default_namespace("probe")
-                    .with_plugin(PythonPlugin::create("python"))
-                    .build()
-                    .await
-            })
-            .unwrap();
+        let engine = rt.block_on(engine_with_python());
         let tables = rt.block_on(async {
             engine
                 .async_query(
@@ -402,10 +648,8 @@ probing.ExternalTable.drop("table2")
                 .await
                 .unwrap()
         });
-        // Query may return None if no tables found
         let df = tables.expect("Table 'table3' should be found in information_schema.tables");
         assert!(!df.cols.is_empty(), "Should have at least one column");
-        // Check if we have any rows - DataFrame.len() returns number of rows
         assert!(
             df.len() > 0,
             "Table 'table3' should be found in information_schema.tables"
@@ -414,85 +658,86 @@ probing.ExternalTable.drop("table2")
 
     #[test]
     fn test_see_py_table_data_in_engine() {
-        setup_table3();
+        setup_table("table4");
         let rt = tokio::runtime::Builder::new_multi_thread()
             .worker_threads(4)
             .enable_all()
             .build()
             .unwrap();
-        let engine = rt
-            .block_on(async {
-                Engine::builder()
-                    .with_default_namespace("probe")
-                    .with_plugin(PythonPlugin::create("python"))
-                    .build()
-                    .await
-            })
-            .unwrap();
+        let engine = rt.block_on(engine_with_python());
         let tables = rt.block_on(async {
             engine
-                .async_query("select * from python.table3 ")
+                .async_query("select * from python.table4 ")
                 .await
                 .unwrap()
         });
-        let df = tables.expect("Table 'table3' should be queryable");
-        // DataFrame.len() returns number of rows
+        let df = tables.expect("Table 'table4' should be queryable");
         assert_eq!(df.len(), 3, "Should have 3 rows");
+        // timestamp + a + b
+        assert_eq!(df.names.len(), 3, "Should have 3 columns: {:?}", df.names);
+        assert_eq!(df.names[0], "timestamp");
     }
 
     #[test]
     fn test_calculate_in_sql_with_filter() {
-        setup_table3();
+        setup_table("table5");
         let rt = tokio::runtime::Builder::new_multi_thread()
             .worker_threads(4)
             .enable_all()
             .build()
             .unwrap();
-        let engine = rt
-            .block_on(async {
-                Engine::builder()
-                    .with_default_namespace("probe")
-                    .with_plugin(PythonPlugin::create("python"))
-                    .build()
-                    .await
-            })
-            .unwrap();
+        let engine = rt.block_on(engine_with_python());
         let tables = rt.block_on(async {
             engine
-                .async_query("select a + b as c from python.table3 where a > 1")
+                .async_query("select a + b as c from python.table5 where a > 1")
                 .await
                 .unwrap()
         });
         let df = tables.expect("Query should return results");
-        // DataFrame.len() returns number of rows
         assert_eq!(df.len(), 2, "Should have 2 rows where a > 1");
     }
 
     #[test]
     fn test_aggregate_in_sql() {
-        setup_table3();
+        setup_table("table6");
         let rt = tokio::runtime::Builder::new_multi_thread()
             .worker_threads(4)
             .enable_all()
             .build()
             .unwrap();
-        let engine = rt
-            .block_on(async {
-                Engine::builder()
-                    .with_default_namespace("probe")
-                    .with_plugin(PythonPlugin::create("python"))
-                    .build()
-                    .await
-            })
-            .unwrap();
+        let engine = rt.block_on(engine_with_python());
         let tables = rt.block_on(async {
             engine
-                .async_query("select sum(a), sum(b) from python.table3")
+                .async_query("select sum(a), sum(b) from python.table6")
                 .await
                 .unwrap()
         });
         let df = tables.expect("Aggregation query should return results");
-        println!("{df:?}");
         assert!(!df.cols.is_empty(), "Should have aggregation results");
     }
+
+    #[test]
+    fn test_static_python_tables_not_shadowed() {
+        // Extern mmap tables under schema `python` must not hide the static
+        // namespace (backtrace, expression tables) — the merged catalog
+        // resolves mmap first, then falls through to the inner provider.
+        setup_table("table7");
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(2)
+            .enable_all()
+            .build()
+            .unwrap();
+        let engine = rt.block_on(engine_with_python());
+        // `python.\`time.time()\`` is served by the static namespace's
+        // expression path; it must still resolve with extern tables present.
+        let result = rt.block_on(async {
+            engine
+                .async_query("select * from python.`time.time()`")
+                .await
+        });
+        assert!(
+            result.is_ok(),
+            "static python namespace shadowed: {result:?}"
+        );
+    }
 }
diff --git a/probing/extensions/python/src/extensions/python/tbls.rs b/probing/extensions/python/src/extensions/python/tbls.rs
index 7dfa7d1a..a16a20b6 100644
--- a/probing/extensions/python/src/extensions/python/tbls.rs
+++ b/probing/extensions/python/src/extensions/python/tbls.rs
@@ -5,13 +5,12 @@ use std::sync::Arc;
 use anyhow::Result;
 
 use log::error;
+use probing_core::core::LazyTableSource;
 use probing_core::core::{
     ArrayRef, CustomNamespace, DataType, Field, Float64Array, Int64Array, NamespacePluginHelper,
     RecordBatch, Schema, SchemaRef, StringArray,
 };
-use probing_core::core::{Float32Array, Int32Array, LazyTableSource};
-use probing_proto::prelude::{CallFrame, Ele, TimeSeries};
-use probing_proto::types;
+use probing_proto::prelude::CallFrame;
 use pyo3::types::PyAnyMethods;
 use pyo3::types::PyDict;
 use pyo3::types::PyDictMethods;
@@ -161,27 +160,6 @@ impl PythonNamespace {
         })
     }
 
-    fn data_from_extern(expr: &str) -> Result<Vec<RecordBatch>> {
-        let binding = super::exttbls::EXTERN_TABLES
-            .lock()
-            .map_err(|e| anyhow::anyhow!("Failed to lock EXTERN_TABLES: {:?}", e))?;
-
-        let table = binding
-            .get(expr)
-            .ok_or_else(|| anyhow::anyhow!("Table '{}' not found", expr))?;
-
-        let names = table
-            .lock()
-            .map_err(|e| anyhow::anyhow!("Failed to lock table: {:?}", e))?
-            .names
-            .clone();
-
-        let ts = table
-            .lock()
-            .map_err(|e| anyhow::anyhow!("Failed to lock table: {:?}", e))?;
-
-        Self::time_series_to_recordbatch(names, &ts)
-    }
 }
 
 impl CustomNamespace for PythonNamespace {
@@ -190,15 +168,10 @@ impl CustomNamespace for PythonNamespace {
     }
 
     fn list() -> Vec<String> {
-        let mut tables = super::exttbls::EXTERN_TABLES.lock().map_or_else(
-            |e| {
-                log::error!("Failed to lock EXTERN_TABLES: {e:?}");
-                vec![]
-            },
-            |binding| binding.keys().cloned().collect(),
-        );
-        tables.push("backtrace".to_string()); // Add backtrace to the list
-        tables
+        // Extern tables (`probing.ExternalTable`) are mmap-backed and served
+        // by the mmap SQL catalog (`probing_core::core::memtable_sql`), not
+        // by this namespace.
+        vec!["backtrace".to_string()]
     }
 
     fn data(expr: &str) -> Vec<RecordBatch> {
@@ -210,14 +183,6 @@ impl CustomNamespace for PythonNamespace {
                     vec![]
                 }
             }
-        } else if Self::list().contains(&expr.to_string()) {
-            match Self::data_from_extern(expr) {
-                Ok(batches) => batches,
-                Err(e) => {
-                    error!("Error getting data from extern: {e:?}");
-                    vec![]
-                }
-            }
         } else {
             match Self::data_from_python(expr) {
                 Ok(batches) => batches,
@@ -244,168 +209,21 @@ impl CustomNamespace for PythonNamespace {
             });
         }
 
-        let binding = super::exttbls::EXTERN_TABLES.lock().map_or_else(
-            |e| {
-                log::error!("Failed to lock EXTERN_TABLES: {e:?}");
-                Default::default()
-            },
-            |binding| binding.clone(),
-        );
-
-        if binding.contains_key(expr) {
-            let table = binding.get(expr).unwrap();
-            let names = table.lock().unwrap().names.clone();
-            let dtypes = table
-                .lock()
-                .unwrap()
-                .cols
-                .iter()
-                .map(|x| x.dtype())
-                .collect::<Vec<_>>();
-            let mut fields = Vec::new();
-
-            // Check if table already has a timestamp column
-            let has_timestamp = names.iter().any(|n| n == "timestamp");
-
-            // Only add timestamp if it doesn't already exist
-            if !has_timestamp {
-                fields.push(Field::new("timestamp", DataType::Int64, true));
-            }
-
-            for (name, dtype) in names.iter().zip(dtypes.iter()) {
-                fields.push(Field::new(
-                    name,
-                    match dtype {
-                        types::EleType::I64 => DataType::Int64,
-                        types::EleType::F64 => DataType::Float64,
-                        types::EleType::I32 => DataType::Int32,
-                        types::EleType::F32 => DataType::Float32,
-                        _ => DataType::Utf8,
-                    },
-                    false,
-                ));
-            }
-
-            let schema = Some(SchemaRef::new(Schema::new(fields)));
-
-            Arc::new(LazyTableSource {
-                name: expr.to_string(),
-                schema,
-                data: Self::data_from_extern(expr).unwrap_or_default(),
-            })
+        let data: Vec<RecordBatch> = Self::data_from_python(expr).unwrap_or_default();
+        let schema = if data.is_empty() {
+            None
         } else {
-            let data: Vec<RecordBatch> = Self::data_from_python(expr).unwrap_or_default();
-            let schema = if data.is_empty() {
-                None
-            } else {
-                Some(data[0].schema().clone())
-            };
-            Arc::new(LazyTableSource {
-                name: expr.to_string(),
-                schema,
-                data,
-            })
-        }
+            Some(data[0].schema().clone())
+        };
+        Arc::new(LazyTableSource {
+            name: expr.to_string(),
+            schema,
+            data,
+        })
     }
 }
 
 impl PythonNamespace {
-    pub fn time_series_to_recordbatch(
-        names: Vec<String>,
-        ts: &TimeSeries,
-    ) -> Result<Vec<RecordBatch>> {
-        let mut fields: Vec<Field> = vec![];
-        let mut columns: Vec<ArrayRef> = vec![];
-
-        fields.push(Field::new("timestamp", DataType::Int64, true));
-        names.iter().zip(ts.cols.iter()).for_each(|(name, col)| {
-            let data_type = match col.dtype() {
-                types::EleType::I64 => DataType::Int64,
-                types::EleType::F64 => DataType::Float64,
-                types::EleType::I32 => DataType::Int32,
-                types::EleType::F32 => DataType::Float32,
-                _ => DataType::Utf8,
-            };
-            fields.push(Field::new(name, data_type, false));
-        });
-
-        let length = ts.len();
-
-        let timeseries = ts
-            .timestamp
-            .iter()
-            .take(length)
-            .map(|x| match x {
-                Ele::I64(x) => x,
-                _ => 0,
-            })
-            .collect::<Vec<_>>();
-        columns.push(Arc::new(Int64Array::from(timeseries)));
-
-        for col in ts.cols.iter() {
-            let col = match col.dtype() {
-                types::EleType::I64 => Arc::new(Int64Array::from(
-                    col.iter()
-                        .take(length)
-                        .map(|x| match x {
-                            Ele::I64(x) => x,
-                            _ => 0,
-                        })
-                        .collect::<Vec<_>>(),
-                )) as ArrayRef,
-                types::EleType::F64 => Arc::new(Float64Array::from(
-                    col.iter()
-                        .take(length)
-                        .map(|x| match x {
-                            Ele::F64(x) => x,
-                            _ => 0.0,
-                        })
-                        .collect::<Vec<_>>(),
-                )) as ArrayRef,
-                types::EleType::I32 => Arc::new(Int32Array::from(
-                    col.iter()
-                        .take(length)
-                        .map(|x| match x {
-                            Ele::I32(x) => x,
-                            _ => 0,
-                        })
-                        .collect::<Vec<_>>(),
-                )) as ArrayRef,
-                types::EleType::F32 => Arc::new(Float32Array::from(
-                    col.iter()
-                        .take(length)
-                        .map(|x| match x {
-                            Ele::F32(x) => x,
-                            _ => 0.0,
-                        })
-                        .collect::<Vec<_>>(),
-                )) as ArrayRef,
-                types::EleType::Text => Arc::new(StringArray::from(
-                    col.iter()
-                        .take(length)
-                        .map(|x| match x {
-                            Ele::Text(x) => x,
-                            _ => x.to_string(),
-                        })
-                        .collect::<Vec<_>>(),
-                )) as ArrayRef,
-                _ => Arc::new(StringArray::from(
-                    col.iter()
-                        .take(length)
-                        .map(|x| x.to_string())
-                        .collect::<Vec<_>>(),
-                )) as ArrayRef,
-            };
-
-            columns.push(col);
-        }
-
-        Ok(vec![RecordBatch::try_new(
-            SchemaRef::new(Schema::new(fields)),
-            columns,
-        )?])
-    }
-
     pub fn object_to_recordbatch(obj: Bound<'_, PyAny>) -> Result<Vec<RecordBatch>> {
         let mut fields: Vec<Field> = vec![];
         let mut columns: Vec<ArrayRef> = vec![];
diff --git a/probing/memtable/Cargo.toml b/probing/memtable/Cargo.toml
index c6a99f39..2c5652e3 100644
--- a/probing/memtable/Cargo.toml
+++ b/probing/memtable/Cargo.toml
@@ -10,6 +10,7 @@ description = "Self-describing columnar memory table with ring buffer"
 xxhash-rust = { version = "0.8", features = ["xxh3"] }
 memmap2 = "0.9"
 libc = "0.2"
+pco = "0.4"
 
 [dev-dependencies]
 
diff --git a/probing/memtable/src/discover.rs b/probing/memtable/src/discover.rs
index 43e06680..eec04e56 100644
--- a/probing/memtable/src/discover.rs
+++ b/probing/memtable/src/discover.rs
@@ -45,7 +45,7 @@ use crate::memh::layout::required_total_size as memh_required_size;
 use crate::memh::table::init_buf as memh_init_buf;
 use crate::memh::{MemhView, MemhWriter};
 use crate::memtable::{MemTable, MemTableView, MemTableWriter};
-use crate::raw::{init_buf, process_start_time, validate_buf};
+use crate::raw::{process_start_time, validate_buf};
 use crate::schema::{Schema, Value};
 
 use memmap2::{Mmap, MmapMut};
@@ -102,12 +102,12 @@ pub fn is_creator_alive(pid: u32, expected_start_time: u64) -> bool {
 
 /// A memtable backed by an mmap'd file, exposed for cross-process discovery.
 ///
-/// On [`Drop`], the file is removed. If the parent `<pid>/` directory is
-/// empty afterward, it is removed too.
+/// Thin wrapper around a **shared-memory** [`MemTable`] (see
+/// [`MemTable::shared`]); kept for API stability. On [`Drop`], the file is
+/// removed. If the parent `<pid>/` directory is empty afterward, it is
+/// removed too.
 pub struct ExposedTable {
-    mmap: MmapMut,
-    path: PathBuf,
-    dir: PathBuf,
+    inner: MemTable,
 }
 
 impl ExposedTable {
@@ -131,37 +131,22 @@ impl ExposedTable {
         chunk_size: u32,
         num_chunks: u32,
     ) -> io::Result<Self> {
-        let dir = base_dir.join(std::process::id().to_string());
-        fs::create_dir_all(&dir)?;
-
-        let path = dir.join(name);
-        let size = MemTable::required_size(schema, chunk_size as usize, num_chunks as usize);
-
-        let file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(true)
-            .open(&path)?;
-        file.set_len(size as u64)?;
-
-        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
-        init_buf(&mut mmap, schema, chunk_size, num_chunks);
-
-        Ok(Self { mmap, path, dir })
+        Ok(Self {
+            inner: MemTable::shared_in(base_dir, name, schema, chunk_size, num_chunks)?,
+        })
     }
 
     pub fn as_bytes(&self) -> &[u8] {
-        &self.mmap
+        self.inner.as_bytes()
     }
 
     pub fn as_bytes_mut(&mut self) -> &mut [u8] {
-        &mut self.mmap
+        self.inner.as_bytes_mut()
     }
 
     /// File path of this table.
     pub fn path(&self) -> &Path {
-        &self.path
+        self.inner.path().expect("ExposedTable is always shared")
     }
 
     /// Create a [`MemTableWriter`] backed by the mmap'd region.
@@ -169,51 +154,22 @@ impl ExposedTable {
     /// **Note**: this re-validates the entire buffer on every call.
     /// Prefer [`push_row`](Self::push_row) for hot-path writes.
     pub fn writer(&mut self) -> MemTableWriter<'_> {
-        MemTableWriter::new(&mut self.mmap).expect("mmap buffer validated at creation")
+        MemTableWriter::new(self.inner.as_bytes_mut()).expect("mmap buffer validated at creation")
     }
 
     /// Append a row without re-validating the buffer.
     ///
     /// This is the fast path for high-frequency writes — it skips the
     /// O(rows × chunks) `validate_buf` that `writer()` performs on every call.
-    /// Safe because the buffer was validated at `create()` time and only
-    /// mutated through well-formed write operations.
-    ///
-    /// # Panic safety
-    ///
-    /// The spinlock is released even if the write panics (e.g. row exceeds
-    /// chunk capacity), preventing a deadlocked mmap file.
+    /// The spinlock is released even if the write panics, preventing a
+    /// deadlocked mmap file (see [`MemTable::push_row`]).
     pub fn push_row(&mut self, values: &[Value]) {
-        use crate::layout::{acquire_write_lock, release_write_lock};
-        use crate::memtable::push_plain_row;
-        use crate::raw::validate_row_schema;
-
-        debug_assert!(
-            validate_row_schema(&self.mmap, values),
-            "value types do not match schema"
-        );
-
-        acquire_write_lock(&mut self.mmap);
-        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
-            push_plain_row(&mut self.mmap, values);
-        }));
-        release_write_lock(&mut self.mmap);
-
-        if let Err(payload) = result {
-            std::panic::resume_unwind(payload);
-        }
+        self.inner.push_row(values)
     }
 
     /// Create a read-only [`MemTableView`].
     pub fn view(&self) -> MemTableView<'_> {
-        MemTableView::new(&self.mmap).expect("mmap buffer validated at creation")
-    }
-}
-
-impl Drop for ExposedTable {
-    fn drop(&mut self) {
-        let _ = fs::remove_file(&self.path);
-        let _ = fs::remove_dir(&self.dir); // succeeds only if empty
+        self.inner.view()
     }
 }
 
@@ -296,6 +252,42 @@ impl Drop for ExposedHashTable {
     }
 }
 
+// ── MappedFile ────────────────────────────────────────────────────────
+
+/// Read-only mmap of a memtable file (MEMT ring or MEMH hash), without
+/// format validation.
+///
+/// This is the zero-copy read path for SQL/catalog integration: pages are
+/// faulted in on demand instead of copying the whole file to the heap
+/// (rings are sized for capacity, so most chunks may be untouched).
+/// Callers inspect the bytes with [`crate::detect_table`] and construct
+/// the appropriate view, which performs its own validation.
+///
+/// The mapping stays valid even if the creating process unlinks the file
+/// (e.g. [`ExposedTable`] drop) while this handle is alive.
+#[derive(Debug)]
+pub struct MappedFile {
+    mmap: Mmap,
+    path: PathBuf,
+}
+
+impl MappedFile {
+    pub fn open(path: impl AsRef<Path>) -> io::Result<Self> {
+        let path = path.as_ref().to_path_buf();
+        let file = File::open(&path)?;
+        let mmap = unsafe { Mmap::map(&file)? };
+        Ok(Self { mmap, path })
+    }
+
+    pub fn path(&self) -> &Path {
+        &self.path
+    }
+
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.mmap
+    }
+}
+
 // ── DiscoveredTable ───────────────────────────────────────────────────
 
 /// A memtable discovered on the filesystem (read-only mmap).
@@ -469,6 +461,7 @@ fn read_any_start_time(dir: &Path) -> u64 {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::raw::init_buf;
     use crate::schema::{DType, Value};
     use std::sync::atomic::{AtomicU32, Ordering as AtOrd};
 
diff --git a/probing/memtable/src/layout.rs b/probing/memtable/src/layout.rs
index 536f29ee..f737468d 100644
--- a/probing/memtable/src/layout.rs
+++ b/probing/memtable/src/layout.rs
@@ -1,15 +1,15 @@
 //! Low-level layout: header, column descriptors, chunk headers, byte helpers.
 //!
-//! ## Header v2 binary layout (64 bytes, 1 cache line)
+//! ## Header v3 binary layout (64 bytes, 1 cache line)
 //!
 //! ```text
 //! offset  size  field               notes
 //! ──────────────────────────────────────────────────────────
 //!  0       4    magic               0x4D454D54 ("MEMT" in LE)
-//!  4       2    version             2
+//!  4       2    version             3
 //!  6       2    header_size         64 (validation only)
 //!  8       2    byte_order          BOM: written as [0x01, 0x02]
-//! 10       2    _pad0               0
+//! 10       2    ts_col              timestamp column index + 1 (0 = none)
 //! 12       4    flags               feature bits (see FLAG_*)
 //! 16       4    num_cols
 //! 20       4    num_chunks
@@ -17,11 +17,11 @@
 //! 28       4    data_offset         (64-aligned)
 //! ─── 32 byte boundary (cold/hot split) ─────────────────
 //! 32       4    write_chunk         AtomicU32
-//! 36       4    write_lock          AtomicU32
+//! 36       4    write_lock          AtomicU32: 0 = unlocked, else holder PID
 //! 40       4    refcount            AtomicU32
 //! 44       4    creator_pid         PID of creating process
 //! 48       8    creator_start_time  process start time (platform-specific)
-//! 56       8    _reserved           0
+//! 56       8    lock_owner_start    AtomicU64: lock holder's start time
 //! ──────────────────────────────────────────────────────────
 //! ```
 //!
@@ -29,7 +29,8 @@
 //! allows readers to detect endianness mismatch without guessing.
 
 use std::mem;
-use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::sync::atomic::{AtomicI64, AtomicU32, AtomicU64, Ordering};
+use std::time::{Duration, Instant};
 
 // ── C-style layout structs ──────────────────────────────────────────
 
@@ -38,7 +39,11 @@ pub const MAGIC_MEMT: u32 = 0x4D45_4D54;
 pub(crate) const MAGIC: u32 = MAGIC_MEMT;
 
 /// Header format version for MEMT.
-pub(crate) const VERSION: u16 = 2;
+///
+/// v3: `_pad0` became `ts_col`, `_reserved` became `lock_owner_start`,
+/// `write_lock` stores the holder PID (was 0/1), and `ChunkHeader` grew
+/// `min_ts`/`max_ts` (24 → 40 bytes).
+pub(crate) const VERSION: u16 = 3;
 
 /// Byte-order mark: written as raw bytes `[0x01, 0x02]`.
 /// On a LE host, `u16::from_ne_bytes([0x01, 0x02])` == `0x0201`.
@@ -80,7 +85,12 @@ pub(crate) struct Header {
     pub header_size: u16,
     /// Byte-order mark, written as `BYTE_ORDER_MARK`.
     pub byte_order: u16,
-    pub _pad0: u16,
+    /// Designated timestamp column **index + 1** (0 = no timestamp column).
+    ///
+    /// Set at init when the schema contains an `I64` column named
+    /// `"timestamp"`. The writer maintains per-chunk `min_ts`/`max_ts`
+    /// from this column so readers can prune chunks by time range.
+    pub ts_col: u16,
     /// Feature flags (see `FLAG_*` constants).
     pub flags: u32,
     pub num_cols: u32,
@@ -92,7 +102,10 @@ pub(crate) struct Header {
     // ── hot zone (atomically mutated) ────────────────────
     /// Ring buffer: index of the chunk currently being written.
     pub write_chunk: AtomicU32,
-    /// Spinlock for writer serialization: 0 = unlocked, 1 = locked.
+    /// Robust writer spinlock: 0 = unlocked, otherwise the **PID** of the
+    /// holding process. A waiter that has spun past
+    /// [`LOCK_STEAL_TIMEOUT`] checks the holder's liveness and steals the
+    /// lock from a dead process (see [`acquire_write_lock`]).
     pub write_lock: AtomicU32,
     /// Reference count for shared lifetime management.
     pub refcount: AtomicU32,
@@ -103,7 +116,10 @@ pub(crate) struct Header {
     /// macOS: microseconds since epoch (via `sysctl`).
     /// Other: 0 (falls back to PID-only liveness check).
     pub creator_start_time: u64,
-    pub _reserved: [u32; 2],
+    /// Start time of the current lock holder (0 = unknown / not written
+    /// yet). Written by the holder right after acquiring; lets waiters
+    /// detect PID recycling before stealing. Advisory only.
+    pub lock_owner_start: AtomicU64,
 }
 
 /// Per-column descriptor, immediately following the Header.
@@ -135,7 +151,12 @@ impl ColumnDesc {
     }
 }
 
-/// Per-chunk metadata, at the start of every chunk's byte region.
+/// Sentinel for `ChunkHeader.min_ts` when the chunk holds no rows.
+pub(crate) const TS_MIN_INIT: i64 = i64::MAX;
+/// Sentinel for `ChunkHeader.max_ts` when the chunk holds no rows.
+pub(crate) const TS_MAX_INIT: i64 = i64::MIN;
+
+/// Per-chunk metadata, at the start of every chunk's byte region (40 bytes).
 #[repr(C)]
 pub(crate) struct ChunkHeader {
     /// Incremented each time the chunk is recycled (ring wrap).
@@ -148,6 +169,12 @@ pub(crate) struct ChunkHeader {
     /// Chunk lifecycle state (see `ChunkState`).
     pub state: AtomicU32,
     pub _reserved: u32,
+    /// Smallest value of the designated timestamp column in this chunk
+    /// ([`TS_MIN_INIT`] when empty or no `Header::ts_col`). Maintained by
+    /// the writer; readers must validate against `generation` snapshots.
+    pub min_ts: AtomicI64,
+    /// Largest timestamp in this chunk ([`TS_MAX_INIT`] when empty).
+    pub max_ts: AtomicI64,
 }
 
 /// Chunk lifecycle state.
@@ -164,7 +191,7 @@ pub(crate) const CHUNK_HEADER_SIZE: usize = mem::size_of::<ChunkHeader>();
 const _: () = {
     assert!(mem::size_of::<Header>() == 64);
     assert!(mem::size_of::<ColumnDesc>() == 64);
-    assert!(mem::size_of::<ChunkHeader>() == 24);
+    assert!(mem::size_of::<ChunkHeader>() == 40);
 };
 // ── struct accessors ────────────────────────────────────────────────
 
@@ -197,10 +224,101 @@ pub(crate) fn chunk_header(buf: &[u8], cs: usize) -> &ChunkHeader {
     unsafe { &*(buf[cs..].as_ptr() as *const ChunkHeader) }
 }
 
-/// Acquire the writer spinlock with exponential back-off.
+/// How long a waiter spins before checking whether the lock holder is
+/// still alive (and stealing the lock from a dead process).
+///
+/// Writers hold the lock for nanoseconds–microseconds; even a descheduled
+/// holder resumes within milliseconds. Reaching this timeout in practice
+/// means the holder crashed while holding the lock.
+pub(crate) const LOCK_STEAL_TIMEOUT: Duration = Duration::from_millis(500);
+
+/// `true` when a process with `pid` exists (it may belong to another user).
+fn process_alive(pid: u32) -> bool {
+    if pid == std::process::id() {
+        return true;
+    }
+    if unsafe { libc::kill(pid as libc::pid_t, 0) } == 0 {
+        return true;
+    }
+    // EPERM: the process exists but we may not signal it.
+    std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM)
+}
+
+/// This process's kernel start time, cached per PID (reads `/proc` on Linux).
+///
+/// **Fork safety:** the cache is keyed on the live PID, not a one-shot
+/// `OnceLock`. A child inheriting a parent's cached value would otherwise
+/// record the *parent's* start time in `lock_owner_start`, and a waiter
+/// comparing against the child's real start time would mistake the live child
+/// for a recycled PID and steal its lock — exactly the hazard fork-heavy
+/// workloads (e.g. PyTorch DataLoader) trigger. Re-reading whenever the PID
+/// changes makes every post-fork caller observe its own start time.
+fn my_start_time() -> u64 {
+    static MY_PID: AtomicU32 = AtomicU32::new(0);
+    static MY_START: AtomicU64 = AtomicU64::new(0);
+
+    let pid = std::process::id();
+    if MY_PID.load(Ordering::Acquire) == pid {
+        let cached = MY_START.load(Ordering::Acquire);
+        if cached != 0 {
+            return cached;
+        }
+    }
+    let start = crate::raw::process_start_time(pid);
+    // Publish start before PID: a reader that observes the matching PID is then
+    // guaranteed to also observe the start written for it.
+    MY_START.store(start, Ordering::Release);
+    MY_PID.store(pid, Ordering::Release);
+    start
+}
+
+/// Decide whether the lock can be stolen from `holder`, and try to.
+///
+/// Steal conditions (either):
+/// - `holder` no longer exists (crashed / killed while holding the lock);
+/// - `holder` exists but its kernel start time does not match the one the
+///   real holder recorded in `lock_owner_start` — the PID was recycled by
+///   an unrelated process. Re-checked after a grace period to rule out
+///   the transient window where a fresh holder has not yet recorded its
+///   start time.
+///
+/// Stealing is safe with respect to data: rows only become visible via the
+/// `used`/`row_count` Release stores at the end of a write, so a row half
+/// written by the dead holder stays uncommitted and is simply overwritten.
+#[cold]
+#[inline(never)]
+fn try_steal_lock(h: &Header, holder: u32, me: u32) -> bool {
+    if process_alive(holder) {
+        let owner_start = h.lock_owner_start.load(Ordering::Relaxed);
+        let actual_start = crate::raw::process_start_time(holder);
+        if owner_start == 0 || actual_start == 0 || actual_start == owner_start {
+            return false; // genuinely alive (or cannot tell) — keep waiting
+        }
+        std::thread::sleep(Duration::from_millis(10));
+        if h.write_lock.load(Ordering::Relaxed) != holder
+            || h.lock_owner_start.load(Ordering::Relaxed) != owner_start
+        {
+            return false; // lock changed hands meanwhile — not stale
+        }
+    }
+    if h.write_lock
+        .compare_exchange(holder, me, Ordering::Acquire, Ordering::Relaxed)
+        .is_ok()
+    {
+        h.lock_owner_start.store(my_start_time(), Ordering::Relaxed);
+        return true;
+    }
+    false
+}
+
+/// Acquire the **robust** writer spinlock with exponential back-off.
 ///
-/// First few failures use `spin_loop()` (pause instruction), then
-/// escalate to `yield_now()` to avoid burning CPU under contention.
+/// The lock word holds the owner's PID (0 = unlocked). First few failures
+/// use `spin_loop()` (pause instruction), then escalate to `yield_now()`.
+/// A waiter stuck past [`LOCK_STEAL_TIMEOUT`] verifies the holder's
+/// liveness and steals the lock from a dead process (see
+/// [`try_steal_lock`]), so a writer crashing inside the critical section
+/// cannot deadlock other writer processes forever.
 ///
 /// SAFETY NOTE: the buffer parameter is `&mut [u8]` (not `&[u8]`) so that
 /// LLVM does **not** mark the pointer `readonly`. With `&[u8]` LLVM may
@@ -208,12 +326,30 @@ pub(crate) fn chunk_header(buf: &[u8], cs: usize) -> &ChunkHeader {
 /// the spin loop into an infinite loop in optimised (release) builds.
 pub(crate) fn acquire_write_lock(buf: &mut [u8]) {
     let ptr = buf.as_mut_ptr() as *const Header;
-    let lock = unsafe { &(*ptr).write_lock };
+    let h = unsafe { &*ptr };
+    let me = std::process::id();
     let mut spins = 0u32;
-    while lock
-        .compare_exchange_weak(0, 1, Ordering::Acquire, Ordering::Relaxed)
-        .is_err()
-    {
+    let mut waiting_since: Option<Instant> = None;
+    loop {
+        match h
+            .write_lock
+            .compare_exchange_weak(0, me, Ordering::Acquire, Ordering::Relaxed)
+        {
+            Ok(_) => {
+                h.lock_owner_start.store(my_start_time(), Ordering::Relaxed);
+                return;
+            }
+            Err(holder) if holder != 0 => {
+                let since = *waiting_since.get_or_insert_with(Instant::now);
+                if spins >= 16 && since.elapsed() >= LOCK_STEAL_TIMEOUT {
+                    if try_steal_lock(h, holder, me) {
+                        return;
+                    }
+                    waiting_since = Some(Instant::now());
+                }
+            }
+            Err(_) => {} // spurious failure with lock free — retry CAS
+        }
         if spins < 16 {
             for _ in 0..1 << spins.min(4) {
                 std::hint::spin_loop();
@@ -226,9 +362,15 @@ pub(crate) fn acquire_write_lock(buf: &mut [u8]) {
 }
 
 /// Release the writer spinlock. See [`acquire_write_lock`] for why `&mut`.
+///
+/// Clears `lock_owner_start` *before* the lock word so that waiters never
+/// pair the next holder's PID with this holder's start time.
 pub(crate) fn release_write_lock(buf: &mut [u8]) {
     let ptr = buf.as_mut_ptr() as *const Header;
-    unsafe { (*ptr).write_lock.store(0, Ordering::Release) };
+    unsafe {
+        (*ptr).lock_owner_start.store(0, Ordering::Relaxed);
+        (*ptr).write_lock.store(0, Ordering::Release);
+    }
 }
 pub(crate) fn r32(buf: &[u8], off: usize) -> u32 {
     u32::from_le_bytes(buf[off..off + 4].try_into().unwrap())
@@ -262,7 +404,7 @@ mod tests {
     fn struct_sizes() {
         assert_eq!(mem::size_of::<Header>(), 64);
         assert_eq!(mem::size_of::<ColumnDesc>(), 64);
-        assert_eq!(mem::size_of::<ChunkHeader>(), 24);
+        assert_eq!(mem::size_of::<ChunkHeader>(), 40);
     }
 
     #[test]
@@ -271,4 +413,40 @@ mod tests {
         let expected_le = u16::from_le_bytes(BYTE_ORDER_MARK);
         assert_eq!(bom, expected_le);
     }
+
+    /// Fork safety: after `fork()`, `my_start_time()` must return the *child's*
+    /// own kernel start time, not a value cached for the parent before the
+    /// fork. With the old `OnceLock` cache the child returned the parent's
+    /// start time; a waiter then compared it against the child's real start
+    /// time and stole the lock from a live holder. The test process has run
+    /// long enough that its start tick differs from a freshly-forked child's,
+    /// so the stale value would be observably wrong.
+    ///
+    /// Linux-only: kernel start times come from `/proc`. On platforms without
+    /// it `process_start_time` returns 0, the PID-recycle steal path is inert,
+    /// and there is no fork hazard to guard against.
+    #[cfg(target_os = "linux")]
+    #[test]
+    fn my_start_time_refreshes_after_fork() {
+        // Warm the per-PID cache for the parent (mimics the leaked OnceLock).
+        let parent = my_start_time();
+        assert_ne!(parent, 0, "parent start time should be readable");
+
+        unsafe {
+            let pid = libc::fork();
+            assert!(pid >= 0, "fork failed");
+            if pid == 0 {
+                // Child: the cached value must equal a fresh read for THIS pid.
+                let cached = my_start_time();
+                let fresh = crate::raw::process_start_time(std::process::id());
+                libc::_exit(if cached == fresh && cached != 0 { 0 } else { 1 });
+            }
+            let mut status = 0;
+            libc::waitpid(pid, &mut status, 0);
+            assert!(
+                libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0,
+                "child my_start_time() must reflect its own process, not the parent's cache",
+            );
+        }
+    }
 }
diff --git a/probing/memtable/src/lib.rs b/probing/memtable/src/lib.rs
index 3d16ff9e..2cbd5c04 100644
--- a/probing/memtable/src/lib.rs
+++ b/probing/memtable/src/lib.rs
@@ -93,6 +93,7 @@ mod cache;
 mod dedup;
 pub mod discover;
 mod layout;
+pub mod memc;
 pub mod memh;
 mod memtable;
 mod raw;
@@ -108,7 +109,7 @@ pub use memh::{
     MemhValidateError, MemhView, MemhWriter, SharedMemhWriter, TypedValue, MAGIC_MEMH,
     VERSION_MEMH,
 };
-pub use memtable::{MemTable, MemTableView, MemTableWriter};
+pub use memtable::{BackingKind, MemTable, MemTableView, MemTableWriter};
 pub use raw::validate_buf;
 pub use refcount::{acquire_ref, refcount, release_ref};
 pub use row::{Row, RowCursor, RowIter};
diff --git a/probing/memtable/src/memc/codec.rs b/probing/memtable/src/memc/codec.rs
new file mode 100644
index 00000000..bdeb1e6c
--- /dev/null
+++ b/probing/memtable/src/memc/codec.rs
@@ -0,0 +1,281 @@
+//! Columnar encode/decode for MEMC page payloads.
+//!
+//! A page payload is the concatenation of per-column sub-blocks, each:
+//!
+//! ```text
+//! [u8 encoding][u8 dtype][u16 _pad][u32 byte_len][payload bytes]
+//! ```
+//!
+//! Numeric columns use Pco (`simpler_compress`); `U8` and variable-length
+//! `Str`/`Bytes` columns are stored raw (Pco has no `u8`/string support).
+
+use pco::standalone::{simple_decompress, simpler_compress};
+
+use super::layout::{get_u32, ColEncoding, PCO_LEVEL};
+use crate::schema::{DType, Value};
+
+/// One column's worth of values, type-tagged.
+#[derive(Debug, Clone, PartialEq)]
+pub enum ColumnData {
+    U8(Vec<u8>),
+    U32(Vec<u32>),
+    I32(Vec<i32>),
+    I64(Vec<i64>),
+    F32(Vec<f32>),
+    F64(Vec<f64>),
+    U64(Vec<u64>),
+    Str(Vec<String>),
+    Bytes(Vec<Vec<u8>>),
+}
+
+impl ColumnData {
+    pub fn dtype(&self) -> DType {
+        match self {
+            ColumnData::U8(_) => DType::U8,
+            ColumnData::U32(_) => DType::U32,
+            ColumnData::I32(_) => DType::I32,
+            ColumnData::I64(_) => DType::I64,
+            ColumnData::F32(_) => DType::F32,
+            ColumnData::F64(_) => DType::F64,
+            ColumnData::U64(_) => DType::U64,
+            ColumnData::Str(_) => DType::Str,
+            ColumnData::Bytes(_) => DType::Bytes,
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        match self {
+            ColumnData::U8(v) => v.len(),
+            ColumnData::U32(v) => v.len(),
+            ColumnData::I32(v) => v.len(),
+            ColumnData::I64(v) => v.len(),
+            ColumnData::F32(v) => v.len(),
+            ColumnData::F64(v) => v.len(),
+            ColumnData::U64(v) => v.len(),
+            ColumnData::Str(v) => v.len(),
+            ColumnData::Bytes(v) => v.len(),
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+/// Builds one [`ColumnData`] of a fixed [`DType`] by pushing [`Value`]s.
+pub struct ColumnBuilder {
+    data: ColumnData,
+}
+
+impl ColumnBuilder {
+    pub fn new(dtype: DType) -> Self {
+        let data = match dtype {
+            DType::U8 => ColumnData::U8(Vec::new()),
+            DType::U32 => ColumnData::U32(Vec::new()),
+            DType::I32 => ColumnData::I32(Vec::new()),
+            DType::I64 => ColumnData::I64(Vec::new()),
+            DType::F32 => ColumnData::F32(Vec::new()),
+            DType::F64 => ColumnData::F64(Vec::new()),
+            DType::U64 => ColumnData::U64(Vec::new()),
+            DType::Str => ColumnData::Str(Vec::new()),
+            DType::Bytes => ColumnData::Bytes(Vec::new()),
+        };
+        Self { data }
+    }
+
+    /// Append a value. A mismatched type trips a `debug_assert!` in debug
+    /// builds and is silently skipped (nothing is pushed) in release — callers
+    /// validate the row schema up front, so this only guards against logic errors.
+    pub fn push(&mut self, v: &Value) {
+        match (&mut self.data, v) {
+            (ColumnData::U8(d), Value::U8(x)) => d.push(*x),
+            (ColumnData::U32(d), Value::U32(x)) => d.push(*x),
+            (ColumnData::I32(d), Value::I32(x)) => d.push(*x),
+            (ColumnData::I64(d), Value::I64(x)) => d.push(*x),
+            (ColumnData::F32(d), Value::F32(x)) => d.push(*x),
+            (ColumnData::F64(d), Value::F64(x)) => d.push(*x),
+            (ColumnData::U64(d), Value::U64(x)) => d.push(*x),
+            (ColumnData::Str(d), Value::Str(x)) => d.push((*x).to_string()),
+            (ColumnData::Bytes(d), Value::Bytes(x)) => d.push(x.to_vec()),
+            _ => debug_assert!(false, "ColumnBuilder type mismatch"),
+        }
+    }
+
+    pub fn finish(self) -> ColumnData {
+        self.data
+    }
+}
+
+fn pco_compress<T: pco::data_types::Number>(nums: &[T]) -> Result<Vec<u8>, String> {
+    simpler_compress(nums, PCO_LEVEL).map_err(|e| e.to_string())
+}
+
+fn pco_decompress<T: pco::data_types::Number>(data: &[u8]) -> Result<Vec<T>, String> {
+    simple_decompress::<T>(data).map_err(|e| e.to_string())
+}
+
+/// Serialise entries as `[u32 LE len][bytes]…`, borrowing each slice (no per-entry copy).
+fn encode_varlen<'a>(entries: impl Iterator<Item = &'a [u8]> + Clone) -> Vec<u8> {
+    let total: usize = entries.clone().map(|b| 4 + b.len()).sum();
+    let mut out = Vec::with_capacity(total);
+    for bytes in entries {
+        out.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
+        out.extend_from_slice(bytes);
+    }
+    out
+}
+
+/// Encode one column into its sub-block (header + payload).
+pub fn encode_column(col: &ColumnData) -> Result<Vec<u8>, String> {
+    let (encoding, payload): (ColEncoding, Vec<u8>) = match col {
+        ColumnData::U8(v) => (ColEncoding::RawFixed, v.clone()),
+        ColumnData::I32(v) => (ColEncoding::Pco, pco_compress(v)?),
+        ColumnData::I64(v) => (ColEncoding::Pco, pco_compress(v)?),
+        ColumnData::F32(v) => (ColEncoding::Pco, pco_compress(v)?),
+        ColumnData::F64(v) => (ColEncoding::Pco, pco_compress(v)?),
+        ColumnData::U32(v) => (ColEncoding::Pco, pco_compress(v)?),
+        ColumnData::U64(v) => (ColEncoding::Pco, pco_compress(v)?),
+        ColumnData::Str(v) => {
+            let payload = encode_varlen(v.iter().map(|s| s.as_bytes()));
+            (ColEncoding::RawVarLen, payload)
+        }
+        ColumnData::Bytes(v) => {
+            let payload = encode_varlen(v.iter().map(|b| b.as_slice()));
+            (ColEncoding::RawVarLen, payload)
+        }
+    };
+
+    let mut out = Vec::with_capacity(8 + payload.len());
+    out.push(encoding as u8);
+    out.push(col.dtype() as u32 as u8);
+    out.extend_from_slice(&[0u8, 0u8]);
+    out.extend_from_slice(&(payload.len() as u32).to_le_bytes());
+    out.extend_from_slice(&payload);
+    Ok(out)
+}
+
+/// Decode one sub-block into (column, bytes consumed); `row_count` is an unvalidated size hint.
+pub fn decode_column(buf: &[u8], row_count: usize) -> Result<(ColumnData, usize), String> {
+    if buf.len() < 8 {
+        return Err("column sub-block too small".into());
+    }
+    let encoding = ColEncoding::from_u8(buf[0]).ok_or("invalid column encoding")?;
+    let dtype = DType::from_u32(buf[1] as u32).ok_or("invalid column dtype")?;
+    let payload_len = get_u32(buf, 4) as usize;
+    let start = 8;
+    let end = start + payload_len;
+    if buf.len() < end {
+        return Err("column payload out of bounds".into());
+    }
+    let payload = &buf[start..end];
+
+    let col = match (encoding, dtype) {
+        (ColEncoding::RawFixed, DType::U8) => ColumnData::U8(payload.to_vec()),
+        (ColEncoding::Pco, DType::I32) => ColumnData::I32(pco_decompress(payload)?),
+        (ColEncoding::Pco, DType::I64) => ColumnData::I64(pco_decompress(payload)?),
+        (ColEncoding::Pco, DType::F32) => ColumnData::F32(pco_decompress(payload)?),
+        (ColEncoding::Pco, DType::F64) => ColumnData::F64(pco_decompress(payload)?),
+        (ColEncoding::Pco, DType::U32) => ColumnData::U32(pco_decompress(payload)?),
+        (ColEncoding::Pco, DType::U64) => ColumnData::U64(pco_decompress(payload)?),
+        (ColEncoding::RawVarLen, DType::Str) => {
+            ColumnData::Str(decode_varlen_str(payload, row_count)?)
+        }
+        (ColEncoding::RawVarLen, DType::Bytes) => {
+            ColumnData::Bytes(decode_varlen_bytes(payload, row_count)?)
+        }
+        _ => return Err("encoding/dtype mismatch".into()),
+    };
+    Ok((col, end))
+}
+
+fn decode_varlen_entries(payload: &[u8], row_count: usize) -> Result<Vec<Vec<u8>>, String> {
+    let mut out = Vec::with_capacity(row_count.min(payload.len() / 4)); // >= 4 bytes per entry
+    let mut off = 0usize;
+    while off + 4 <= payload.len() {
+        let len = get_u32(payload, off) as usize;
+        off += 4;
+        if len > payload.len() - off {
+            return Err("varlen entry out of bounds".into());
+        }
+        out.push(payload[off..off + len].to_vec());
+        off += len;
+    }
+    Ok(out)
+}
+
+fn decode_varlen_str(payload: &[u8], row_count: usize) -> Result<Vec<String>, String> {
+    decode_varlen_entries(payload, row_count)?
+        .into_iter()
+        .map(|b| String::from_utf8(b).map_err(|_| "varlen str not utf-8".to_string()))
+        .collect()
+}
+
+fn decode_varlen_bytes(payload: &[u8], row_count: usize) -> Result<Vec<Vec<u8>>, String> {
+    decode_varlen_entries(payload, row_count)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn roundtrip(col: ColumnData) {
+        let rc = col.len();
+        let encoded = encode_column(&col).unwrap();
+        let (decoded, consumed) = decode_column(&encoded, rc).unwrap();
+        assert_eq!(consumed, encoded.len());
+        assert_eq!(decoded, col);
+    }
+
+    #[test]
+    fn numeric_columns_roundtrip() {
+        roundtrip(ColumnData::I64((0..1000).map(|i| i * 7 - 3).collect()));
+        roundtrip(ColumnData::I32(vec![-5, 0, 5, 100, -100]));
+        roundtrip(ColumnData::F64(vec![1.5, 2.5, 3.14, -9.0]));
+        roundtrip(ColumnData::F32(vec![0.1, 0.2, 0.3]));
+        roundtrip(ColumnData::U32(vec![1, 2, 3, u32::MAX]));
+        roundtrip(ColumnData::U64(vec![1, 2, 3, u64::MAX]));
+        roundtrip(ColumnData::U8(vec![0, 1, 2, 255]));
+    }
+
+    #[test]
+    fn varlen_columns_roundtrip() {
+        roundtrip(ColumnData::Str(vec![
+            "alpha".into(),
+            "".into(),
+            "δοκιμή".into(),
+        ]));
+        roundtrip(ColumnData::Bytes(vec![vec![1, 2, 3], vec![], vec![0xFF; 10]]));
+    }
+
+    #[test]
+    fn pco_actually_compresses_monotonic_i64() {
+        // A monotonic timestamp column should shrink dramatically under Pco.
+        let col = ColumnData::I64((0..10_000).map(|i| 1_700_000_000_000 + i * 1000).collect());
+        let encoded = encode_column(&col).unwrap();
+        let raw = 10_000 * 8;
+        assert!(
+            encoded.len() < raw / 4,
+            "expected >4x compression, got {} vs {raw}",
+            encoded.len()
+        );
+    }
+
+    #[test]
+    fn column_builder_from_values() {
+        let mut b = ColumnBuilder::new(DType::I64);
+        for v in [Value::I64(10), Value::I64(20), Value::I64(30)] {
+            b.push(&v);
+        }
+        assert_eq!(b.finish(), ColumnData::I64(vec![10, 20, 30]));
+    }
+
+    #[test]
+    fn corrupt_payload_len_is_rejected() {
+        let col = ColumnData::I64(vec![1, 2, 3]);
+        let mut encoded = encode_column(&col).unwrap();
+        // Overstate payload_len → decode must refuse rather than panic.
+        let bad = (encoded.len() as u32 + 100).to_le_bytes();
+        encoded[4..8].copy_from_slice(&bad);
+        assert!(decode_column(&encoded, 3).is_err());
+    }
+}
diff --git a/probing/memtable/src/memc/compactor.rs b/probing/memtable/src/memc/compactor.rs
new file mode 100644
index 00000000..d93e4ad7
--- /dev/null
+++ b/probing/memtable/src/memc/compactor.rs
@@ -0,0 +1,421 @@
+//! [`Compactor`]: the **roller** that drains sealed hot-ring chunks into
+//! cold MEMC segments, bounding segment size to prevent fragmentation.
+//!
+//! ## Why a roller
+//!
+//! The MEMC format and [`ColdStore`] give us immutable segments and
+//! oldest-first eviction, but *nothing decides when to seal a segment and
+//! start a fresh one*. Without that policy you either seal every page
+//! (a blizzard of tiny files) or never seal (one unbounded file). The
+//! compactor closes that gap with a size-or-time roll policy:
+//!
+//! ```text
+//! after each appended page:
+//!     size_bytes() >= target_segment_bytes   → seal + roll
+//! on every poll tick:
+//!     open segment older than max_segment_age → seal + roll (low-rate tables)
+//! on shutdown / flush:
+//!     seal the open segment unconditionally   → bounded tail file
+//! ```
+//!
+//! A busy process emits a steady stream of ~`target`-sized files; an idle
+//! one keeps appending to a single open segment until the age window or
+//! shutdown, so neither extreme fragments the directory.
+//!
+//! ## Multi-table
+//!
+//! One [`Compactor`] feeds **one** [`ColdStore`] from **many** hot tables.
+//! Pages from every table share the same segment files (each carries its
+//! `table_id`), so adding tables grows pages, not files or directories.
+//!
+//! ## Concurrency
+//!
+//! The hot table is written by the application; the compactor only ever
+//! *reads* it. For shared/file-backed tables the compactor opens its own
+//! read handle to the same mapping and relies on the ring's lock-free
+//! `Acquire`/`Release` chunk protocol: it drains only `Sealed` chunks and
+//! re-checks the chunk generation after transposing, discarding a page if
+//! the ring recycled the chunk mid-read. The still-open `Writing` chunk is
+//! left to the hot tier until it seals.
+
+use std::collections::HashMap;
+use std::io;
+use std::path::PathBuf;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+use std::thread::JoinHandle;
+use std::time::{Duration, Instant};
+
+use super::codec::{ColumnBuilder, ColumnData};
+use super::layout::SOURCE_CHUNK_NONE;
+use super::reader::SegmentReader;
+use super::store::ColdStore;
+use super::writer::SegmentWriter;
+use crate::layout::ChunkState;
+use crate::memtable::{MemTable, MemTableView};
+use crate::schema::{DType, Value};
+
+/// Roll/retention policy for a [`Compactor`].
+#[derive(Debug, Clone)]
+pub struct CompactorConfig {
+    /// Seal the open segment and start a new one once it reaches this many
+    /// bytes. Bounds individual file size; the main fragmentation knob.
+    pub target_segment_bytes: u64,
+    /// Also seal an open segment this old, so low-rate tables don't sit
+    /// unsealed (and unqueryable through the cold footer) indefinitely.
+    pub max_segment_age: Duration,
+    /// How long the background thread sleeps between drain passes.
+    pub poll_interval: Duration,
+    /// Cold-store byte budget; oldest segments are evicted past it.
+    pub max_total_bytes: Option<u64>,
+    /// Cold-store TTL; segments older than this are evicted.
+    pub ttl: Option<Duration>,
+}
+
+impl Default for CompactorConfig {
+    fn default() -> Self {
+        Self {
+            target_segment_bytes: 64 * 1024 * 1024,
+            max_segment_age: Duration::from_secs(300),
+            poll_interval: Duration::from_millis(500),
+            max_total_bytes: None,
+            ttl: None,
+        }
+    }
+}
+
+/// Per-table draining bookkeeping.
+struct TableProgress {
+    /// Last drained generation per chunk index (parallel to the hot ring).
+    /// A chunk is re-drained only when its generation advances past this.
+    drained_gen: Vec<u64>,
+    /// This table's id inside the *current* open segment, if registered.
+    /// Reset to `None` on every roll (table ids are segment-local).
+    seg_table_id: Option<u32>,
+}
+
+/// Drains sealed hot chunks into size-bounded cold segments.
+///
+/// Usable synchronously (call [`drain_view`](Self::drain_view) yourself) or
+/// as a background thread via [`spawn`](Self::spawn).
+pub struct Compactor {
+    store: ColdStore,
+    config: CompactorConfig,
+    current: Option<SegmentWriter>,
+    opened_at: Instant,
+    tables: HashMap<String, TableProgress>,
+    /// Per-(table, chunk) drain watermark recovered from existing cold
+    /// segments by [`prime_from_cold`](Self::prime_from_cold); merged into a
+    /// table's `drained_gen` the first time it is seen, so a restart over a
+    /// persistent cold dir does not re-compact already-persisted chunks.
+    seed: HashMap<String, HashMap<usize, u64>>,
+}
+
+impl Compactor {
+    pub fn new(store: ColdStore, config: CompactorConfig) -> Self {
+        Self {
+            store,
+            config,
+            current: None,
+            opened_at: Instant::now(),
+            tables: HashMap::new(),
+            seed: HashMap::new(),
+        }
+    }
+
+    /// Rebuild per-(table, chunk) drain watermarks from the cold segments
+    /// already on disk, so draining is **exactly-once across restarts** when
+    /// the cold dir persists. Call once after [`new`](Self::new), before any
+    /// `drain_view`. Each cold page records the hot-ring `(source_gen,
+    /// source_chunk)` it came from; we keep the max generation per chunk.
+    pub fn prime_from_cold(&mut self) -> io::Result<()> {
+        for path in self.store.segment_paths() {
+            let Ok(reader) = SegmentReader::open(&path) else {
+                continue; // unreadable/foreign file: skip, never fail priming
+            };
+            for page in reader.pages() {
+                if page.source_chunk == SOURCE_CHUNK_NONE {
+                    continue;
+                }
+                let Some(def) = reader.table_def(page.table_id) else {
+                    continue;
+                };
+                let slot = self
+                    .seed
+                    .entry(def.name.clone())
+                    .or_default()
+                    .entry(page.source_chunk as usize)
+                    .or_insert(0);
+                *slot = (*slot).max(page.source_gen);
+            }
+        }
+        Ok(())
+    }
+
+    pub fn config(&self) -> &CompactorConfig {
+        &self.config
+    }
+
+    /// Bytes written to the currently open segment (0 if none).
+    pub fn current_segment_bytes(&self) -> u64 {
+        self.current.as_ref().map(|w| w.size_bytes()).unwrap_or(0)
+    }
+
+    /// Cold-store capacity snapshot.
+    pub fn stats(&self) -> super::store::ColdStats {
+        self.store.stats()
+    }
+
+    /// Drain every newly-sealed chunk of `view` (a read handle to a hot
+    /// table named `name`) into cold pages, rolling segments by size as it
+    /// goes. Returns the number of rows compacted this call.
+    pub fn drain_view(&mut self, name: &str, view: &MemTableView) -> io::Result<usize> {
+        let cols: Vec<(String, DType)> = view
+            .schema()
+            .cols
+            .iter()
+            .map(|c| (c.name.clone(), c.dtype))
+            .collect();
+        let num_chunks = view.num_chunks();
+
+        if !self.tables.contains_key(name) { // first sighting: build watermarks
+            let mut drained_gen = vec![0u64; num_chunks];
+            if let Some(seeds) = self.seed.get(name) {
+                for (&chunk, &gen) in seeds {
+                    if chunk < drained_gen.len() { // seeds past the ring are dropped
+                        drained_gen[chunk] = drained_gen[chunk].max(gen);
+                    }
+                }
+            }
+            self.tables.insert(
+                name.to_string(),
+                TableProgress {
+                    drained_gen,
+                    seg_table_id: None,
+                },
+            );
+        }
+        let prog = self.tables.get_mut(name).unwrap();
+        if prog.drained_gen.len() != num_chunks {
+            prog.drained_gen.resize(num_chunks, 0); // chunk count differs from last call
+        }
+
+        let sealed = ChunkState::Sealed as u32;
+        let mut total_rows = 0usize;
+
+        for chunk in view.chunks_logical() {
+            if view.chunk_state(chunk) != sealed {
+                continue;
+            }
+            let gen = view.chunk_generation(chunk);
+            let already = self.tables[name].drained_gen[chunk];
+            if gen == 0 || gen <= already { // gen 0, or not newer than the watermark
+                continue;
+            }
+
+            let (gen_read, columns) = match transpose_chunk(view, chunk, &cols) {
+                Some(x) => x,
+                None => continue, // recycled mid-read; try again next pass
+            };
+            let rows = columns.first().map(|c| c.len()).unwrap_or(0);
+            if rows == 0 { // nothing to persist, but still advance the watermark
+                self.tables.get_mut(name).unwrap().drained_gen[chunk] = gen_read;
+                continue;
+            }
+
+            self.ensure_segment()?;
+            let table_id = self.register_if_needed(name, &cols)?;
+            self.current
+                .as_mut()
+                .expect("segment open")
+                .append_page(table_id, &columns, gen_read, chunk as u32)?;
+            // Watermark moves only after the append succeeded (`?` above).
+            self.tables.get_mut(name).unwrap().drained_gen[chunk] = gen_read;
+            total_rows += rows;
+
+            self.maybe_roll_on_size()?;
+        }
+        Ok(total_rows)
+    }
+
+    /// Seal the open segment if it has grown past `target_segment_bytes`.
+    fn maybe_roll_on_size(&mut self) -> io::Result<Option<PathBuf>> {
+        let over = self
+            .current
+            .as_ref()
+            .is_some_and(|w| w.size_bytes() >= self.config.target_segment_bytes);
+        if over {
+            self.roll()
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Seal the open segment if it is older than `max_segment_age` and holds
+    /// at least one page. Call this periodically (the background loop does).
+    pub fn maybe_roll_on_age(&mut self) -> io::Result<Option<PathBuf>> {
+        let aged = self.current.as_ref().is_some_and(|w| w.page_count() > 0)
+            && self.opened_at.elapsed() >= self.config.max_segment_age;
+        if aged {
+            self.roll()
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Seal the current segment and clear the open slot. An open segment
+    /// with no pages is deleted instead of sealed, so a roll or flush never
+    /// leaves a stub file. Returns the sealed path, or `None` if none sealed.
+    pub fn roll(&mut self) -> io::Result<Option<PathBuf>> {
+        let Some(w) = self.current.take() else {
+            return Ok(None);
+        };
+        for p in self.tables.values_mut() {
+            p.seg_table_id = None; // table ids are segment-local
+        }
+        if w.page_count() == 0 {
+            let path = w.path().to_path_buf();
+            drop(w);
+            let _ = std::fs::remove_file(&path); // best-effort: failure is ignored
+            return Ok(None);
+        }
+        Ok(Some(w.seal()?))
+    }
+
+    /// Seal whatever is open (shutdown / explicit checkpoint).
+    pub fn flush(&mut self) -> io::Result<Option<PathBuf>> {
+        self.roll()
+    }
+
+    /// Apply the cold-store byte/TTL budget, deleting oldest segments.
+    pub fn enforce(&self) -> Vec<PathBuf> {
+        self.store
+            .enforce_limits(self.config.max_total_bytes, self.config.ttl)
+    }
+
+    fn ensure_segment(&mut self) -> io::Result<()> {
+        if self.current.is_none() {
+            self.current = Some(self.store.create_segment()?);
+            self.opened_at = Instant::now();
+            for p in self.tables.values_mut() {
+                p.seg_table_id = None;
+            }
+        }
+        Ok(())
+    }
+
+    fn register_if_needed(&mut self, name: &str, cols: &[(String, DType)]) -> io::Result<u32> {
+        if let Some(id) = self.tables[name].seg_table_id {
+            return Ok(id);
+        }
+        let id = self
+            .current
+            .as_mut()
+            .expect("segment open")
+            .register_table(name, cols)?;
+        self.tables.get_mut(name).unwrap().seg_table_id = Some(id);
+        Ok(id)
+    }
+
+    /// Move this compactor onto a background thread that drains `sources`
+    /// every `poll_interval`, rolls by size/age, and enforces the budget.
+    /// Each source is `(table_name, read_handle)`; the handle must be a
+    /// shared/file-backed [`MemTable`] the application is writing elsewhere.
+    /// Dropping (or [`stop`](CompactorHandle::stop)ping) the returned handle
+    /// does a final drain + flush so no sealed chunk is left behind.
+    pub fn spawn(mut self, sources: Vec<(String, MemTable)>) -> CompactorHandle {
+        let stop = Arc::new(AtomicBool::new(false));
+        let stop_thread = stop.clone();
+        let poll = self.config.poll_interval;
+        let thread = std::thread::Builder::new()
+            .name("memc-compactor".into())
+            .spawn(move || {
+                while !stop_thread.load(Ordering::Relaxed) {
+                    for (name, table) in &sources {
+                        let view = table.view();
+                        let _ = self.drain_view(name, &view); // error dropped; retried next tick
+                    }
+                    let _ = self.maybe_roll_on_age();
+                    let _ = self.enforce();
+                    std::thread::park_timeout(poll); // shutdown's `unpark` ends this early
+                }
+                for (name, table) in &sources { // final drain once `stop` is set
+                    let view = table.view();
+                    let _ = self.drain_view(name, &view);
+                }
+                let _ = self.flush();
+                let _ = self.enforce();
+            })
+            .expect("spawn memc-compactor thread");
+        CompactorHandle {
+            stop,
+            thread: Some(thread),
+        }
+    }
+}
+
+/// Handle to a background [`Compactor`] thread. Stops and joins on drop.
+pub struct CompactorHandle {
+    stop: Arc<AtomicBool>,
+    thread: Option<JoinHandle<()>>,
+}
+
+impl CompactorHandle {
+    /// Signal the thread to do a final drain + flush, then join it.
+    pub fn stop(mut self) {
+        self.shutdown();
+    }
+
+    fn shutdown(&mut self) {
+        self.stop.store(true, Ordering::Relaxed);
+        if let Some(t) = self.thread.take() { // `None` on a second call: nothing to join
+            t.thread().unpark(); // wake the thread out of its `park_timeout`
+            let _ = t.join();
+        }
+    }
+}
+
+impl Drop for CompactorHandle {
+    fn drop(&mut self) {
+        self.shutdown();
+    }
+}
+
+/// Transpose one chunk's rows into per-column [`ColumnData`], returned with
+/// the generation they were read under. A chunk with no rows yields `Some`
+/// with empty columns. Returns `None` only if the generation is 0 or the
+/// ring recycled the chunk mid-read (detected by a generation change), so
+/// the caller can skip and retry later without persisting torn data.
+fn transpose_chunk(
+    view: &MemTableView,
+    chunk: usize,
+    cols: &[(String, DType)],
+) -> Option<(u64, Vec<ColumnData>)> {
+    let gen_before = view.chunk_generation(chunk);
+    if gen_before == 0 {
+        return None;
+    }
+    let mut builders: Vec<ColumnBuilder> =
+        cols.iter().map(|(_, dt)| ColumnBuilder::new(*dt)).collect();
+
+    for row in view.rows(chunk) {
+        let mut cur = row.cursor(); // fields are pulled in `cols` order below
+        for (ci, (_, dt)) in cols.iter().enumerate() {
+            match dt {
+                DType::U8 => builders[ci].push(&Value::U8(cur.next_u8())),
+                DType::U32 => builders[ci].push(&Value::U32(cur.next_u32())),
+                DType::I32 => builders[ci].push(&Value::I32(cur.next_i32())),
+                DType::I64 => builders[ci].push(&Value::I64(cur.next_i64())),
+                DType::F32 => builders[ci].push(&Value::F32(cur.next_f32())),
+                DType::F64 => builders[ci].push(&Value::F64(cur.next_f64())),
+                DType::U64 => builders[ci].push(&Value::U64(cur.next_u64())),
+                DType::Str => builders[ci].push(&Value::Str(cur.next_str())),
+                DType::Bytes => builders[ci].push(&Value::Bytes(cur.next_bytes())),
+            }
+        }
+    }
+
+    if view.chunk_generation(chunk) != gen_before {
+        return None; // ring overwrote the chunk mid-transpose
+    }
+    Some((gen_before, builders.into_iter().map(|b| b.finish()).collect()))
+}
diff --git a/probing/memtable/src/memc/layout.rs b/probing/memtable/src/memc/layout.rs
new file mode 100644
index 00000000..ddb3eaf1
--- /dev/null
+++ b/probing/memtable/src/memc/layout.rs
@@ -0,0 +1,434 @@
+//! MEMC v1 binary layout: segment header, block headers, footer.
+//!
+//! All multi-byte fields are little-endian. See [`super`] (module docs)
+//! for the full format walkthrough.
+
+use crate::schema::DType;
+use xxhash_rust::xxh3::xxh3_64;
+
+/// Segment file magic: ASCII bytes `M E M C` in little-endian order.
+pub const MAGIC_MEMC: u32 = u32::from_le_bytes(*b"MEMC");
+/// Table-definition block magic.
+pub const MAGIC_TABLE_BLOCK: u32 = u32::from_le_bytes(*b"MCTB");
+/// Page (data) block magic.
+pub const MAGIC_PAGE_BLOCK: u32 = u32::from_le_bytes(*b"MCPG");
+/// Footer magic.
+pub const MAGIC_FOOTER: u32 = u32::from_le_bytes(*b"MCFT");
+
+/// MEMC format version.
+pub const VERSION_MEMC: u16 = 1;
+
+/// Segment header size (one cache line, mirrors MEMT/MEMH).
+pub const SEGMENT_HEADER_SIZE: usize = 64;
+/// Block header size; blocks start 64-aligned.
+pub const BLOCK_HEADER_SIZE: usize = 64;
+/// Fixed size of one page-directory entry in the footer.
+pub const PAGE_DIR_ENTRY_SIZE: usize = 56;
+
+/// `flags` bit: segment is sealed (footer present, file immutable).
+pub const FLAG_SEALED: u16 = 1 << 0;
+
+/// Sentinels for "no timestamp column / no rows yet" (match the hot ring).
+pub const TS_MIN_INIT: i64 = i64::MAX;
+pub const TS_MAX_INIT: i64 = i64::MIN;
+
+/// Pco compression level for numeric columns (pco default).
+pub const PCO_LEVEL: usize = 8;
+
+/// Column encoding inside a page payload.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(u8)]
+pub enum ColEncoding {
+    /// Plain little-endian array of the fixed-size type.
+    RawFixed = 0,
+    /// Pco-compressed numeric column.
+    Pco = 1,
+    /// Concatenated `[u32 len][bytes]` entries (Str / Bytes).
+    RawVarLen = 2,
+}
+
+impl ColEncoding {
+    pub fn from_u8(v: u8) -> Option<Self> {
+        match v {
+            0 => Some(Self::RawFixed),
+            1 => Some(Self::Pco),
+            2 => Some(Self::RawVarLen),
+            _ => None,
+        }
+    }
+}
+
+/// Low 32 bits of xxh3-64 — the integrity check used throughout MEMC.
+#[inline]
+pub fn xxh32(bytes: &[u8]) -> u32 {
+    xxh3_64(bytes) as u32
+}
+
+#[inline]
+pub fn align64(n: usize) -> usize {
+    (n + 63) & !63
+}
+
+// ── byte helpers (encode into Vec / decode from slice) ───────────────
+
+#[inline]
+pub fn get_u16(buf: &[u8], off: usize) -> u16 {
+    u16::from_le_bytes(buf[off..off + 2].try_into().unwrap())
+}
+#[inline]
+pub fn get_u32(buf: &[u8], off: usize) -> u32 {
+    u32::from_le_bytes(buf[off..off + 4].try_into().unwrap())
+}
+#[inline]
+pub fn get_u64(buf: &[u8], off: usize) -> u64 {
+    u64::from_le_bytes(buf[off..off + 8].try_into().unwrap())
+}
+#[inline]
+pub fn get_i64(buf: &[u8], off: usize) -> i64 {
+    i64::from_le_bytes(buf[off..off + 8].try_into().unwrap())
+}
+#[inline]
+pub fn put_u16(buf: &mut [u8], off: usize, v: u16) {
+    buf[off..off + 2].copy_from_slice(&v.to_le_bytes());
+}
+#[inline]
+pub fn put_u32(buf: &mut [u8], off: usize, v: u32) {
+    buf[off..off + 4].copy_from_slice(&v.to_le_bytes());
+}
+#[inline]
+pub fn put_u64(buf: &mut [u8], off: usize, v: u64) {
+    buf[off..off + 8].copy_from_slice(&v.to_le_bytes());
+}
+#[inline]
+pub fn put_i64(buf: &mut [u8], off: usize, v: i64) {
+    buf[off..off + 8].copy_from_slice(&v.to_le_bytes());
+}
+
+// ── segment header ────────────────────────────────────────────────────
+
+/// Parsed segment header.
+///
+/// ```text
+/// offset size field
+///  0      4   magic            "MEMC"
+///  4      2   version          1
+///  6      2   header_size      64
+///  8      2   byte_order       BOM [0x01, 0x02]
+/// 10      2   flags            bit0 = SEALED
+/// 12      4   writer_pid
+/// 16      8   writer_start     creator process start time
+/// 24      8   created_unix_ms
+/// 32      8   footer_off       0 until sealed
+/// 40      8   ts_min           segment-wide (valid when sealed)
+/// 48      8   ts_max
+/// 56      4   page_count       valid when sealed
+/// 60      4   header_xxh       xxh32 of bytes 0..60
+/// ```
+#[derive(Debug, Clone)]
+pub struct SegmentHeader {
+    pub flags: u16,
+    pub writer_pid: u32,
+    pub writer_start: u64,
+    pub created_unix_ms: u64,
+    pub footer_off: u64,
+    pub ts_min: i64,
+    pub ts_max: i64,
+    pub page_count: u32,
+}
+
+impl SegmentHeader {
+    pub fn is_sealed(&self) -> bool {
+        self.flags & FLAG_SEALED != 0
+    }
+
+    pub fn encode(&self) -> [u8; SEGMENT_HEADER_SIZE] {
+        let mut b = [0u8; SEGMENT_HEADER_SIZE];
+        put_u32(&mut b, 0, MAGIC_MEMC);
+        put_u16(&mut b, 4, VERSION_MEMC);
+        put_u16(&mut b, 6, SEGMENT_HEADER_SIZE as u16);
+        b[8..10].copy_from_slice(&[0x01, 0x02]);
+        put_u16(&mut b, 10, self.flags);
+        put_u32(&mut b, 12, self.writer_pid);
+        put_u64(&mut b, 16, self.writer_start);
+        put_u64(&mut b, 24, self.created_unix_ms);
+        put_u64(&mut b, 32, self.footer_off);
+        put_i64(&mut b, 40, self.ts_min);
+        put_i64(&mut b, 48, self.ts_max);
+        put_u32(&mut b, 56, self.page_count);
+        let h = xxh32(&b[..60]);
+        put_u32(&mut b, 60, h);
+        b
+    }
+
+    pub fn decode(buf: &[u8]) -> Result<Self, &'static str> {
+        if buf.len() < SEGMENT_HEADER_SIZE {
+            return Err("buffer too small for MEMC header");
+        }
+        if get_u32(buf, 0) != MAGIC_MEMC {
+            return Err("invalid MEMC magic");
+        }
+        if get_u16(buf, 4) != VERSION_MEMC {
+            return Err("unsupported MEMC version");
+        }
+        if get_u16(buf, 6) as usize != SEGMENT_HEADER_SIZE {
+            return Err("invalid MEMC header size");
+        }
+        if buf[8..10] != [0x01, 0x02] {
+            return Err("byte order mismatch");
+        }
+        if get_u32(buf, 60) != xxh32(&buf[..60]) {
+            return Err("MEMC header checksum mismatch");
+        }
+        Ok(Self {
+            flags: get_u16(buf, 10),
+            writer_pid: get_u32(buf, 12),
+            writer_start: get_u64(buf, 16),
+            created_unix_ms: get_u64(buf, 24),
+            footer_off: get_u64(buf, 32),
+            ts_min: get_i64(buf, 40),
+            ts_max: get_i64(buf, 48),
+            page_count: get_u32(buf, 56),
+        })
+    }
+}
+
+// ── block header (table-definition and page blocks) ──────────────────
+
+/// Header shared by `MCTB` (table definition) and `MCPG` (page) blocks.
+///
+/// ```text
+/// offset size field
+///  0      4   block magic      "MCTB" / "MCPG"
+///  4      4   table_id
+///  8      4   row_count        (MCTB: 0)
+/// 12      4   col_count
+/// 16      8   ts_min           (MCTB: TS_MIN_INIT)
+/// 24      8   ts_max           (MCTB: TS_MAX_INIT)
+/// 32      8   source_gen       hot-ring chunk generation this page drained (0 = n/a)
+/// 40      4   payload_len
+/// 44      4   payload_xxh      xxh32 of payload bytes
+/// 48      4   source_chunk     hot-ring chunk index this page drained (u32::MAX = n/a)
+/// 52      4   header_xxh       xxh32 of bytes 0..52
+/// 56      8   reserved (zero)
+/// ```
+///
+/// `source_gen` + `source_chunk` together identify the hot-ring chunk a page
+/// was compacted from, letting a restarting compactor rebuild its per-chunk
+/// drain watermark from existing cold pages (exactly-once across restarts).
+///
+/// The payload follows the header and is padded to the next 64-byte
+/// boundary; the padding is excluded from `payload_xxh`.
+#[derive(Debug, Clone)]
+pub struct BlockHeader {
+    pub magic: u32,
+    pub table_id: u32,
+    pub row_count: u32,
+    pub col_count: u32,
+    pub ts_min: i64,
+    pub ts_max: i64,
+    pub source_gen: u64,
+    pub payload_len: u32,
+    pub payload_xxh: u32,
+    pub source_chunk: u32,
+}
+
+/// Sentinel for "this page did not originate from a specific hot-ring chunk".
+pub const SOURCE_CHUNK_NONE: u32 = u32::MAX;
+
+impl BlockHeader {
+    pub fn encode(&self) -> [u8; BLOCK_HEADER_SIZE] {
+        let mut b = [0u8; BLOCK_HEADER_SIZE];
+        put_u32(&mut b, 0, self.magic);
+        put_u32(&mut b, 4, self.table_id);
+        put_u32(&mut b, 8, self.row_count);
+        put_u32(&mut b, 12, self.col_count);
+        put_i64(&mut b, 16, self.ts_min);
+        put_i64(&mut b, 24, self.ts_max);
+        put_u64(&mut b, 32, self.source_gen);
+        put_u32(&mut b, 40, self.payload_len);
+        put_u32(&mut b, 44, self.payload_xxh);
+        put_u32(&mut b, 48, self.source_chunk);
+        let h = xxh32(&b[..52]);
+        put_u32(&mut b, 52, h);
+        b
+    }
+
+    /// Decode and verify the header checksum. The payload checksum is
+    /// verified separately, against the actual payload bytes.
+    pub fn decode(buf: &[u8]) -> Option<Self> {
+        if buf.len() < BLOCK_HEADER_SIZE {
+            return None;
+        }
+        let magic = get_u32(buf, 0);
+        if magic != MAGIC_TABLE_BLOCK && magic != MAGIC_PAGE_BLOCK {
+            return None;
+        }
+        if get_u32(buf, 52) != xxh32(&buf[..52]) {
+            return None;
+        }
+        Some(Self {
+            magic,
+            table_id: get_u32(buf, 4),
+            row_count: get_u32(buf, 8),
+            col_count: get_u32(buf, 12),
+            ts_min: get_i64(buf, 16),
+            ts_max: get_i64(buf, 24),
+            source_gen: get_u64(buf, 32),
+            payload_len: get_u32(buf, 40),
+            payload_xxh: get_u32(buf, 44),
+            source_chunk: get_u32(buf, 48),
+        })
+    }
+}
+
+// ── table-definition payload ──────────────────────────────────────────
+
+/// In-memory table definition (parsed from an `MCTB` payload).
+#[derive(Debug, Clone)]
+pub struct TableDef {
+    pub id: u32,
+    pub name: String,
+    pub cols: Vec<(String, DType)>,
+    /// Index of the designated timestamp column, per the hot-ring
+    /// convention (`I64` column named `timestamp` / `ts`).
+    pub ts_col: Option<usize>,
+}
+
+/// Encode a table definition payload: `[u16 name_len][u16 col_count][name]`
+/// then per column `[u8 dtype][u8 0][u16 name_len][name]`. NOTE(review):
+/// lengths/counts are cast with `as u16`, which truncates above 65535.
+pub fn encode_table_payload(name: &str, cols: &[(String, DType)]) -> Vec<u8> {
+    let mut out = Vec::with_capacity(8 + name.len() + cols.len() * 16);
+    out.extend_from_slice(&(name.len() as u16).to_le_bytes());
+    out.extend_from_slice(&(cols.len() as u16).to_le_bytes());
+    out.extend_from_slice(name.as_bytes());
+    for (cname, dtype) in cols {
+        out.push(*dtype as u32 as u8); // dtype tag narrowed to one byte
+        out.push(0); // reserved byte
+        out.extend_from_slice(&(cname.len() as u16).to_le_bytes());
+        out.extend_from_slice(cname.as_bytes());
+    }
+    out
+}
+
+pub fn decode_table_payload(id: u32, payload: &[u8]) -> Result<TableDef, &'static str> {
+    if payload.len() < 4 {
+        return Err("table payload too small");
+    }
+    let name_len = get_u16(payload, 0) as usize;
+    let col_count = get_u16(payload, 2) as usize;
+    let mut off = 4; // past [u16 name_len][u16 col_count]
+    if payload.len() < off + name_len {
+        return Err("table name out of bounds");
+    }
+    let name = std::str::from_utf8(&payload[off..off + name_len])
+        .map_err(|_| "table name not utf-8")?
+        .to_string();
+    off += name_len;
+
+    let mut cols = Vec::with_capacity(col_count);
+    for _ in 0..col_count {
+        if payload.len() < off + 4 { // fixed 4-byte column entry prefix
+            return Err("column entry out of bounds");
+        }
+        let dtype = DType::from_u32(payload[off] as u32).ok_or("invalid column dtype")?;
+        let cname_len = get_u16(payload, off + 2) as usize; // byte at off + 1 is not read
+        off += 4;
+        if payload.len() < off + cname_len {
+            return Err("column name out of bounds");
+        }
+        let cname = std::str::from_utf8(&payload[off..off + cname_len])
+            .map_err(|_| "column name not utf-8")?
+            .to_string();
+        off += cname_len;
+        cols.push((cname, dtype));
+    }
+
+    let ts_col = cols // first I64 column whose name is listed in TS_COL_NAMES
+        .iter()
+        .position(|(n, dt)| *dt == DType::I64 && crate::raw::TS_COL_NAMES.contains(&n.as_str()));
+    Ok(TableDef {
+        id,
+        name,
+        cols,
+        ts_col,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn magics_are_distinct_from_hot_formats() {
+        assert_ne!(MAGIC_MEMC, crate::MAGIC_MEMT);
+        assert_ne!(MAGIC_MEMC, crate::MAGIC_MEMH);
+        assert_ne!(MAGIC_TABLE_BLOCK, MAGIC_PAGE_BLOCK);
+    }
+
+    #[test]
+    fn segment_header_roundtrip() {
+        let h = SegmentHeader {
+            flags: FLAG_SEALED,
+            writer_pid: 1234,
+            writer_start: 99,
+            created_unix_ms: 1_700_000_000_000,
+            footer_off: 4096,
+            ts_min: -5,
+            ts_max: 500,
+            page_count: 7,
+        };
+        let bytes = h.encode();
+        let d = SegmentHeader::decode(&bytes).unwrap();
+        assert!(d.is_sealed());
+        assert_eq!(d.writer_pid, 1234);
+        assert_eq!(d.footer_off, 4096);
+        assert_eq!((d.ts_min, d.ts_max), (-5, 500));
+        assert_eq!(d.page_count, 7);
+
+        // Corruption is detected
+        let mut bad = bytes;
+        bad[12] ^= 0xFF;
+        assert!(SegmentHeader::decode(&bad).is_err());
+    }
+
+    #[test]
+    fn block_header_roundtrip_and_corruption() {
+        let h = BlockHeader {
+            magic: MAGIC_PAGE_BLOCK,
+            table_id: 3,
+            row_count: 100,
+            col_count: 2,
+            ts_min: 10,
+            ts_max: 20,
+            source_gen: 42,
+            payload_len: 512,
+            payload_xxh: 0xDEAD,
+            source_chunk: 6,
+        };
+        let bytes = h.encode();
+        let d = BlockHeader::decode(&bytes).unwrap();
+        assert_eq!(d.table_id, 3);
+        assert_eq!(d.source_gen, 42);
+        assert_eq!(d.source_chunk, 6);
+
+        let mut bad = bytes;
+        bad[8] ^= 1;
+        assert!(BlockHeader::decode(&bad).is_none());
+    }
+
+    #[test]
+    fn table_payload_roundtrip() {
+        let cols = vec![
+            ("timestamp".to_string(), DType::I64),
+            ("value".to_string(), DType::F64),
+            ("tag".to_string(), DType::Str),
+        ];
+        let payload = encode_table_payload("metrics", &cols);
+        let def = decode_table_payload(5, &payload).unwrap();
+        assert_eq!(def.name, "metrics");
+        assert_eq!(def.cols.len(), 3);
+        assert_eq!(def.cols[2].1, DType::Str);
+        assert_eq!(def.ts_col, Some(0), "timestamp I64 column detected");
+    }
+}
diff --git a/probing/memtable/src/memc/mod.rs b/probing/memtable/src/memc/mod.rs
new file mode 100644
index 00000000..93e33f8d
--- /dev/null
+++ b/probing/memtable/src/memc/mod.rs
@@ -0,0 +1,64 @@
+//! MEMC: **cold** columnar segment files — the on-disk second tier below
+//! the hot MEMT ring.
+//!
+//! A background compactor drains sealed chunks from a hot [`MemTable`] and
+//! appends them, transposed to columns and Pco-compressed, as immutable
+//! **pages** inside append-only **segment** files. Segments live in a
+//! [`ColdStore`] directory and are evicted oldest-first by byte budget or
+//! TTL — a second-level ring that gives the system a time-retention axis
+//! the fixed-capacity hot ring cannot provide on its own.
+//!
+//! [`MemTable`]: crate::MemTable
+//!
+//! ## File format (one `.memc` segment)
+//!
+//! ```text
+//! ┌────────────────────────────────────────────┐ 0
+//! │ SegmentHeader (64 B)                         │
+//! │   magic "MEMC", version, BOM, flags          │
+//! │   writer pid/start, created_unix_ms          │
+//! │   footer_off, ts_min/ts_max, page_count      │
+//! │   header_xxh                                 │
+//! ├────────────────────────────────────────────┤ 64
+//! │ MCTB table-def block(s) — one per table      │
+//! │   [BlockHeader 64B][name+columns payload]    │
+//! ├────────────────────────────────────────────┤
+//! │ MCPG page block(s) — columnar, multi-table   │
+//! │   [BlockHeader 64B]                          │
+//! │   per column: [enc][dtype][len][bytes]       │
+//! │     numeric → Pco · u8/str/bytes → raw       │
+//! ├────────────────────────────────────────────┤ footer_off
+//! │ Footer: [MAGIC][count][len][xxh]             │
+//! │   page directory: N × 48B                    │
+//! │     (table_id, ts_min/max, block_off/len, …) │
+//! └────────────────────────────────────────────┘
+//! ```
+//!
+//! Every block header and payload carries an xxh3 checksum. Sealed
+//! segments are read through the footer directory; if the writer crashed
+//! before sealing, [`SegmentReader`] forward-scans the checksummed blocks
+//! and drops the torn tail.
+//!
+//! ## Query path (two-level time pruning)
+//!
+//! Segment header `ts_min/ts_max` prunes whole files (no mmap), then the
+//! page directory's per-page `(table_id, ts_min, ts_max)` prunes pages
+//! before decode — mirroring the hot ring's chunk-level pruning so a query
+//! planner can span hot chunks and cold pages with one time predicate.
+
+mod codec;
+mod compactor;
+mod layout;
+mod reader;
+mod store;
+mod writer;
+
+pub use codec::{ColumnBuilder, ColumnData};
+pub use compactor::{Compactor, CompactorConfig, CompactorHandle};
+pub use layout::{ColEncoding, TableDef, MAGIC_MEMC, SOURCE_CHUNK_NONE, VERSION_MEMC};
+pub use reader::{PageMeta, SegmentReader};
+pub use store::{default_cold_dir, writer_id, ColdStats, ColdStore};
+pub use writer::SegmentWriter;
+
+#[cfg(test)]
+mod tests;
diff --git a/probing/memtable/src/memc/reader.rs b/probing/memtable/src/memc/reader.rs
new file mode 100644
index 00000000..f684336e
--- /dev/null
+++ b/probing/memtable/src/memc/reader.rs
@@ -0,0 +1,255 @@
+//! [`SegmentReader`]: mmap a `.memc` file and read its tables and pages.
+//!
+//! A sealed segment is read via its footer page directory. An unsealed or
+//! torn segment (writer crashed before `seal`) falls back to a forward
+//! scan of checksummed blocks, stopping at the first damaged/partial block
+//! — so a half-written tail is silently dropped rather than surfaced.
+
+use std::collections::HashMap;
+use std::io;
+use std::path::{Path, PathBuf};
+
+use memmap2::Mmap;
+
+use super::codec::{decode_column, ColumnData};
+use super::layout::{
+    align64, get_u32, xxh32, BlockHeader, SegmentHeader, TableDef, BLOCK_HEADER_SIZE, MAGIC_FOOTER,
+    MAGIC_PAGE_BLOCK, MAGIC_TABLE_BLOCK, PAGE_DIR_ENTRY_SIZE, SEGMENT_HEADER_SIZE,
+};
+
+/// Metadata for one page, enough to prune before decoding.
+///
+/// Populated either from a footer page-directory entry (sealed segments)
+/// or from the page's block header during a forward scan (recovery).
+#[derive(Debug, Clone)]
+pub struct PageMeta {
+    /// Id of the table this page belongs to.
+    pub table_id: u32,
+    /// Number of rows in the page.
+    pub row_count: u32,
+    /// Number of columns encoded in the page.
+    pub col_count: u32,
+    /// Lower timestamp bound; `ts_min > ts_max` means the page has no
+    /// timestamp range and cannot be pruned by time.
+    pub ts_min: i64,
+    /// Upper timestamp bound (see `ts_min`).
+    pub ts_max: i64,
+    /// Byte offset of the page's block header within the segment file.
+    pub block_off: u64,
+    /// Length of the block; the forward scan stores the 64-byte-aligned
+    /// size of header + payload here.
+    pub block_len: u32,
+    /// Copied from the block header / directory entry.
+    /// NOTE(review): presumably the hot-ring generation the page was
+    /// compacted from — confirm against the writer.
+    pub source_gen: u64,
+    /// Copied from the block header / directory entry.
+    /// NOTE(review): presumably the hot-ring chunk index — confirm
+    /// against the writer.
+    pub source_chunk: u32,
+}
+
+/// Read-only view over a memory-mapped MEMC segment.
+pub struct SegmentReader {
+    /// Mapping of the whole segment file; pages are decoded from it on demand.
+    mmap: Mmap,
+    /// Path the segment was opened from.
+    path: PathBuf,
+    /// Decoded segment header.
+    header: SegmentHeader,
+    /// Table definitions found in the segment, keyed by table id.
+    tables: HashMap<u32, TableDef>,
+    /// Page metadata, from the footer directory or the recovery scan.
+    pages: Vec<PageMeta>,
+}
+
+impl SegmentReader {
+    /// Open the segment file at `path`, memory-map it and parse its
+    /// header, table definitions and page metadata.
+    ///
+    /// # Errors
+    /// Propagates the I/O error from opening or mapping the file, and
+    /// whatever `from_mmap` reports for an undecodable segment header.
+    pub fn open(path: impl AsRef<Path>) -> io::Result<Self> {
+        let owned_path = path.as_ref().to_path_buf();
+        let handle = std::fs::File::open(&owned_path)?;
+        // SAFETY: NOTE(review) — a file mapping is only sound while no one
+        // truncates or rewrites the file underneath it. This presumably
+        // relies on segments being append-only / immutable once written;
+        // confirm against the writer and eviction code.
+        let mapping = unsafe { Mmap::map(&handle) }?;
+        Self::from_mmap(mapping, owned_path)
+    }
+
+    /// Build a reader from an existing mapping.
+    ///
+    /// Decodes the segment header (failure becomes `InvalidData`), tries
+    /// the footer directory when the header says the segment is sealed,
+    /// then forward-scans the blocks for table definitions — and for page
+    /// metadata too when the footer could not be used.
+    fn from_mmap(mmap: Mmap, path: PathBuf) -> io::Result<Self> {
+        let header = match SegmentHeader::decode(&mmap) {
+            Ok(decoded) => decoded,
+            Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidData, e)),
+        };
+
+        let mut pages = Vec::new();
+        let mut tables = HashMap::new();
+
+        // Only a sealed segment with a recorded footer offset has a
+        // directory worth trying.
+        let has_footer = header.is_sealed() && header.footer_off != 0;
+        let footer_ok = has_footer && Self::load_footer(&mmap, &header, &mut pages);
+
+        // Table definitions are only available from the blocks themselves,
+        // so the scan always runs; with `footer_ok == false` it doubles as
+        // the page-recovery path.
+        Self::scan_blocks(&mmap, &header, &mut tables, footer_ok, &mut pages);
+
+        Ok(Self {
+            mmap,
+            path,
+            header,
+            tables,
+            pages,
+        })
+    }
+
+    /// Parse the footer page directory into `pages`.
+    ///
+    /// Returns `true` and replaces `*pages` when the footer magic, the
+    /// entry count (must equal `header.page_count`), the directory length
+    /// and the directory checksum all check out. Returns `false` and
+    /// leaves `pages` untouched otherwise, so the caller can fall back to
+    /// a forward scan.
+    ///
+    /// All offsets derived from the header and footer are computed with
+    /// checked arithmetic: an out-of-range `footer_off` or directory
+    /// length yields `false` instead of an overflow or an out-of-bounds
+    /// panic.
+    fn load_footer(mmap: &[u8], header: &SegmentHeader, pages: &mut Vec<PageMeta>) -> bool {
+        // Fixed footer prefix: magic, entry count, directory length and
+        // directory checksum, four u32 fields.
+        const FOOTER_PREFIX_LEN: usize = 16;
+
+        let Ok(foff) = usize::try_from(header.footer_off) else {
+            return false;
+        };
+        let Some(entries_start) = foff.checked_add(FOOTER_PREFIX_LEN) else {
+            return false;
+        };
+        if entries_start > mmap.len() || get_u32(mmap, foff) != MAGIC_FOOTER {
+            return false;
+        }
+
+        let count = get_u32(mmap, foff + 4) as usize;
+        let entries_len = get_u32(mmap, foff + 8) as usize;
+        let checksum = get_u32(mmap, foff + 12);
+        if count != header.page_count as usize
+            || count.checked_mul(PAGE_DIR_ENTRY_SIZE) != Some(entries_len)
+        {
+            return false;
+        }
+
+        let Some(entries_end) = entries_start.checked_add(entries_len) else {
+            return false;
+        };
+        if entries_end > mmap.len() || xxh32(&mmap[entries_start..entries_end]) != checksum {
+            return false;
+        }
+
+        // NOTE(review): `source_chunk` is read at entry offset 48, so an
+        // entry spans at least 52 bytes, while the module-level format
+        // diagram says "N × 48B". Confirm `PAGE_DIR_ENTRY_SIZE` covers
+        // this field.
+        let mut out = Vec::with_capacity(count);
+        for i in 0..count {
+            let o = entries_start + i * PAGE_DIR_ENTRY_SIZE;
+            out.push(PageMeta {
+                table_id: get_u32(mmap, o),
+                row_count: get_u32(mmap, o + 4),
+                ts_min: super::layout::get_i64(mmap, o + 8),
+                ts_max: super::layout::get_i64(mmap, o + 16),
+                block_off: super::layout::get_u64(mmap, o + 24),
+                block_len: get_u32(mmap, o + 32),
+                col_count: get_u32(mmap, o + 36),
+                source_gen: super::layout::get_u64(mmap, o + 40),
+                source_chunk: get_u32(mmap, o + 48),
+            });
+        }
+        *pages = out;
+        true
+    }
+
+    /// Walk the blocks that follow the segment header, in file order.
+    ///
+    /// The walk ends at `footer_off` (clamped to the mapping length) when
+    /// the header records one, otherwise at the end of the mapping. Every
+    /// table-definition block that decodes is added to `tables`. Page
+    /// blocks are appended to `pages` only when `footer_ok` is false, i.e.
+    /// when the footer directory could not supply them. The walk stops at
+    /// the first block whose header does not decode, whose payload runs
+    /// past the limit, or whose payload checksum does not match.
+    fn scan_blocks(
+        mmap: &[u8],
+        header: &SegmentHeader,
+        tables: &mut HashMap<u32, TableDef>,
+        footer_ok: bool,
+        pages: &mut Vec<PageMeta>,
+    ) {
+        let limit = match header.footer_off {
+            0 => mmap.len(),
+            foff => (foff as usize).min(mmap.len()),
+        };
+
+        let mut off = SEGMENT_HEADER_SIZE;
+        while off + BLOCK_HEADER_SIZE <= limit {
+            let Some(bh) = BlockHeader::decode(&mmap[off..]) else {
+                break;
+            };
+
+            let payload_len = bh.payload_len as usize;
+            let payload_start = off + BLOCK_HEADER_SIZE;
+            let payload_end = payload_start + payload_len;
+            if payload_end > limit {
+                // Torn tail: the payload was only partially written.
+                break;
+            }
+            let payload = &mmap[payload_start..payload_end];
+            if xxh32(payload) != bh.payload_xxh {
+                // Corrupt payload: nothing after it can be trusted.
+                break;
+            }
+
+            // Blocks are laid out on 64-byte boundaries.
+            let block_len = align64(BLOCK_HEADER_SIZE + payload_len);
+
+            if bh.magic == MAGIC_TABLE_BLOCK {
+                if let Ok(def) = super::layout::decode_table_payload(bh.table_id, payload) {
+                    tables.insert(bh.table_id, def);
+                }
+            } else if bh.magic == MAGIC_PAGE_BLOCK && !footer_ok {
+                pages.push(PageMeta {
+                    table_id: bh.table_id,
+                    row_count: bh.row_count,
+                    col_count: bh.col_count,
+                    ts_min: bh.ts_min,
+                    ts_max: bh.ts_max,
+                    block_off: off as u64,
+                    block_len: block_len as u32,
+                    source_gen: bh.source_gen,
+                    source_chunk: bh.source_chunk,
+                });
+            }
+
+            off += block_len;
+        }
+    }
+
+    /// Path this segment was opened from.
+    pub fn path(&self) -> &Path {
+        &self.path
+    }
+
+    /// Whether the segment header marks this segment as sealed.
+    pub fn is_sealed(&self) -> bool {
+        self.header.is_sealed()
+    }
+
+    /// Timestamp range recorded in the segment header.
+    ///
+    /// Returns `Some((ts_min, ts_max))` only for a sealed segment whose
+    /// header has `ts_min <= ts_max`; an unsealed segment, or one with no
+    /// timestamped rows, yields `None`.
+    pub fn ts_range(&self) -> Option<(i64, i64)> {
+        let (lo, hi) = (self.header.ts_min, self.header.ts_max);
+        (self.header.is_sealed() && lo <= hi).then_some((lo, hi))
+    }
+
+    /// All table definitions found in this segment, in unspecified order
+    /// (they are held in a `HashMap`).
+    pub fn table_defs(&self) -> Vec<&TableDef> {
+        let mut defs = Vec::with_capacity(self.tables.len());
+        defs.extend(self.tables.values());
+        defs
+    }
+
+    /// Definition of the table with the given `id`, if the segment has one.
+    pub fn table_def(&self, id: u32) -> Option<&TableDef> {
+        self.tables.get(&id)
+    }
+
+    /// Id of a table whose definition is named `name`, or `None` when the
+    /// segment defines no such table. Linear in the number of tables.
+    pub fn table_id_by_name(&self, name: &str) -> Option<u32> {
+        self.tables
+            .values()
+            .find_map(|def| (def.name == name).then_some(def.id))
+    }
+
+    /// Metadata for every page known to this reader; indices into this
+    /// slice are what `read_page` and `pages_in_range` use.
+    pub fn pages(&self) -> &[PageMeta] {
+        &self.pages
+    }
+
+    /// Indices (into `pages()`) of the pages of `table_id` that may hold
+    /// rows inside the closed window `[lo, hi]`.
+    ///
+    /// A `None` bound is unbounded on that side. A page is kept when its
+    /// `[ts_min, ts_max]` overlaps the window; a page with no timestamp
+    /// range (`ts_min > ts_max`) is always kept because it cannot be
+    /// pruned.
+    pub fn pages_in_range(
+        &self,
+        table_id: u32,
+        lo: Option<i64>,
+        hi: Option<i64>,
+    ) -> Vec<usize> {
+        let keep = |page: &PageMeta| -> bool {
+            if page.table_id != table_id {
+                return false;
+            }
+            if page.ts_min > page.ts_max {
+                // No ts metadata on this page: it must be read.
+                return true;
+            }
+            let ends_before_window = lo.is_some_and(|l| page.ts_max < l);
+            let starts_after_window = hi.is_some_and(|h| page.ts_min > h);
+            !ends_before_window && !starts_after_window
+        };
+
+        self.pages
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, page)| keep(page).then_some(idx))
+            .collect()
+    }
+
+    /// Decode page `index` into its columns (in schema order).
+    ///
+    /// The block header is re-decoded and the payload checksum re-checked
+    /// before any column is decoded.
+    ///
+    /// # Errors
+    /// Returns a message when `index` is out of range, when the page's
+    /// recorded offset or payload does not fit inside the mapping, when
+    /// the block header does not decode, when the payload checksum does
+    /// not match, or when a column fails to decode. Out-of-range offsets
+    /// are reported as errors rather than panicking on a slice index.
+    pub fn read_page(&self, index: usize) -> Result<Vec<ColumnData>, String> {
+        let p = self.pages.get(index).ok_or("page index out of range")?;
+
+        // Locate the header with checked arithmetic: `block_off` comes
+        // from file contents and must not be trusted to be in range.
+        let hstart =
+            usize::try_from(p.block_off).map_err(|_| "page block offset out of bounds")?;
+        let payload_start = hstart
+            .checked_add(BLOCK_HEADER_SIZE)
+            .filter(|&end| end <= self.mmap.len())
+            .ok_or("page block header out of bounds")?;
+        let bh = BlockHeader::decode(&self.mmap[hstart..]).ok_or("page block header invalid")?;
+
+        let payload_end = payload_start
+            .checked_add(bh.payload_len as usize)
+            .filter(|&end| end <= self.mmap.len())
+            .ok_or("page payload out of bounds")?;
+        if xxh32(&self.mmap[payload_start..payload_end]) != bh.payload_xxh {
+            return Err("page payload checksum mismatch".into());
+        }
+
+        let rc = bh.row_count as usize;
+        let mut cols = Vec::with_capacity(bh.col_count as usize);
+        let mut off = payload_start;
+        for _ in 0..bh.col_count {
+            // Guard against a decoder reporting more bytes than it was given.
+            let remaining = self
+                .mmap
+                .get(off..payload_end)
+                .ok_or("page column data out of bounds")?;
+            let (col, used) = decode_column(remaining, rc)?;
+            cols.push(col);
+            off = off
+                .checked_add(used)
+                .ok_or("page column data out of bounds")?;
+        }
+        Ok(cols)
+    }
+}
diff --git a/probing/memtable/src/memc/store.rs b/probing/memtable/src/memc/store.rs
new file mode 100644
index 00000000..5f0fbd28
--- /dev/null
+++ b/probing/memtable/src/memc/store.rs
@@ -0,0 +1,247 @@
+//! [`ColdStore`]: directory of MEMC segment files with capacity management.
+//!
+//! Layout (one directory per host, segments shared across all of a
+//! writer's tables):
+//!
+//! ```text
+//! <base>/
+//!     a3f2c1-000001.memc   ← writer "a3f2c1", sequence 1 (sealed)
+//!     a3f2c1-000002.memc   ← sequence 2 (current, may be unsealed)
+//!     9c81b0-000001.memc   ← another writer/process on the same host
+//! ```
+//!
+//! The store is a **second-level ring**: the hot MEMT buffer wraps by
+//! bytes, the cold store wraps by whole segment files. Eviction deletes
+//! the oldest segments once a byte budget or TTL is exceeded; because
+//! segments are immutable whole files, eviction is atomic and O(1) per
+//! file, and `unlink`ing a segment that a query still has mmap'd is safe
+//! under POSIX (the inode survives until the last mapping drops).
+
+use std::io;
+use std::path::{Path, PathBuf};
+use std::time::{Duration, SystemTime};
+
+use super::layout::xxh32;
+use super::writer::SegmentWriter;
+use crate::raw::process_start_time;
+
+const SEGMENT_EXT: &str = "memc";
+
+/// Derive the six-hex-digit writer id used in segment file names.
+///
+/// The id is the low 24 bits of `xxh32` over the little-endian bytes of
+/// `pid` followed by `start_time`, so the same (pid, start time) pair
+/// always maps to the same id while a restarted process gets a new start
+/// time and therefore, barring a hash collision, a new id.
+pub fn writer_id(pid: u32, start_time: u64) -> String {
+    let mut key = [0u8; 12];
+    let (pid_part, start_part) = key.split_at_mut(4);
+    pid_part.copy_from_slice(&pid.to_le_bytes());
+    start_part.copy_from_slice(&start_time.to_le_bytes());
+
+    let low24 = xxh32(&key) & 0x00FF_FFFF;
+    format!("{:06x}", low24)
+}
+
+/// Capacity snapshot of a cold store.
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct ColdStats {
+    /// Number of segment files listed in the store directory.
+    pub segment_count: usize,
+    /// Sum of the on-disk lengths of those segment files, in bytes.
+    pub total_bytes: u64,
+    /// Modification time of the oldest segment, ms since epoch (0 if none).
+    pub oldest_unix_ms: u64,
+}
+
+/// A directory of MEMC segments owned by one writer process.
+pub struct ColdStore {
+    /// Directory holding the `.memc` segment files.
+    dir: PathBuf,
+    /// This process's writer id; the prefix of every file name it creates.
+    writer_id: String,
+    /// Sequence number the next call to `next_segment_path` will use.
+    next_seq: u32,
+}
+
+/// Base directory used for the cold store when none is given explicitly.
+///
+/// Uses the `PROBING_COLD_DIR` environment variable when it is set, and
+/// falls back to a `probing-cold` directory under the system temp dir.
+pub fn default_cold_dir() -> PathBuf {
+    match std::env::var_os("PROBING_COLD_DIR") {
+        Some(configured) => PathBuf::from(configured),
+        None => std::env::temp_dir().join("probing-cold"),
+    }
+}
+
+impl ColdStore {
+    /// Open the cold store rooted at `dir`, creating the directory (and
+    /// any missing parents) first.
+    ///
+    /// The writer id is derived from this process's pid and start time,
+    /// and numbering resumes one past the highest sequence number already
+    /// present in `dir` for that writer id.
+    ///
+    /// # Errors
+    /// Returns the I/O error from creating the directory.
+    pub fn open(dir: impl AsRef<Path>) -> io::Result<Self> {
+        let root = dir.as_ref().to_path_buf();
+        std::fs::create_dir_all(&root)?;
+
+        let pid = std::process::id();
+        let writer_id = writer_id(pid, process_start_time(pid));
+        let next_seq = Self::max_seq_for(&root, &writer_id) + 1;
+
+        Ok(Self {
+            dir: root,
+            writer_id,
+            next_seq,
+        })
+    }
+
+    /// Directory this store keeps its segment files in.
+    pub fn dir(&self) -> &Path {
+        &self.dir
+    }
+
+    /// Writer id used as the file-name prefix for this store's segments.
+    pub fn writer_id(&self) -> &str {
+        &self.writer_id
+    }
+
+    /// Largest sequence number among the segment files in `dir` that
+    /// belong to writer `wid`.
+    ///
+    /// Returns 0 when there are none, or when the directory cannot be
+    /// listed. Entries that fail to read, or whose names are not valid
+    /// segment names, are ignored.
+    fn max_seq_for(dir: &Path, wid: &str) -> u32 {
+        let Ok(entries) = std::fs::read_dir(dir) else {
+            return 0;
+        };
+        entries
+            .flatten()
+            .filter_map(|entry| parse_segment_name(&entry.file_name().to_string_lossy()))
+            .filter(|(owner, _)| owner.as_str() == wid)
+            .map(|(_, seq)| seq)
+            .max()
+            .unwrap_or(0)
+    }
+
+    /// Reserve the next sequence number and return the matching segment
+    /// path, `<dir>/<writer_id>-<seq, zero-padded to 6>.memc`.
+    ///
+    /// Only the counter is advanced; no file is created.
+    pub fn next_segment_path(&mut self) -> PathBuf {
+        let seq = self.next_seq;
+        self.next_seq += 1;
+
+        let file_name = format!("{}-{:06}.{}", self.writer_id, seq, SEGMENT_EXT);
+        self.dir.join(file_name)
+    }
+
+    /// Start a new segment: reserves the next sequence number and opens a
+    /// [`SegmentWriter`] on the corresponding path.
+    ///
+    /// # Errors
+    /// Returns whatever `SegmentWriter::create` reports. The sequence
+    /// number stays consumed even when creation fails.
+    pub fn create_segment(&mut self) -> io::Result<SegmentWriter> {
+        SegmentWriter::create(self.next_segment_path())
+    }
+
+    /// All segment files in the directory (any writer), sorted oldest →
+    /// newest by modification time.
+    ///
+    /// Segments whose modification times are equal (e.g. on a filesystem
+    /// with coarse timestamps) are ordered by path, so the result is
+    /// deterministic and, because sequence numbers are zero-padded, one
+    /// writer's segments keep their sequence order. A file whose mtime
+    /// cannot be read sorts as if it were from the Unix epoch. Returns an
+    /// empty list when the directory cannot be read.
+    pub fn segment_paths(&self) -> Vec<PathBuf> {
+        let Ok(entries) = std::fs::read_dir(&self.dir) else {
+            return Vec::new();
+        };
+
+        let mut segs: Vec<(SystemTime, PathBuf)> = entries
+            .flatten()
+            .filter_map(|e| {
+                let path = e.path();
+                if path.extension().and_then(|s| s.to_str()) != Some(SEGMENT_EXT) {
+                    return None;
+                }
+                let mtime = e
+                    .metadata()
+                    .and_then(|m| m.modified())
+                    .unwrap_or(SystemTime::UNIX_EPOCH);
+                Some((mtime, path))
+            })
+            .collect();
+
+        // Tuple ordering: mtime first, path as the tie-breaker. Paths are
+        // unique, so an unstable sort is fully deterministic here.
+        segs.sort_unstable();
+        segs.into_iter().map(|(_, p)| p).collect()
+    }
+
+    /// Snapshot of the store's size: number of segment files, their total
+    /// length in bytes, and the modification time of the oldest one.
+    ///
+    /// Files whose metadata cannot be read (for instance because they
+    /// were removed since the listing) still count towards
+    /// `segment_count` but add nothing to `total_bytes`.
+    /// `oldest_unix_ms` is 0 when no modification time could be read at
+    /// all, including the empty-store case.
+    pub fn stats(&self) -> ColdStats {
+        let paths = self.segment_paths();
+        let mut total = 0u64;
+        // `None` until at least one mtime has been read, so that "nothing
+        // readable" reports 0 rather than a sentinel maximum.
+        let mut oldest: Option<u64> = None;
+
+        for p in &paths {
+            let Ok(meta) = std::fs::metadata(p) else {
+                continue;
+            };
+            total += meta.len();
+            if let Ok(mtime) = meta.modified() {
+                let ms = mtime
+                    .duration_since(SystemTime::UNIX_EPOCH)
+                    .map(|d| d.as_millis() as u64)
+                    .unwrap_or(0);
+                oldest = Some(oldest.map_or(ms, |cur| cur.min(ms)));
+            }
+        }
+
+        ColdStats {
+            segment_count: paths.len(),
+            total_bytes: total,
+            oldest_unix_ms: oldest.unwrap_or(0),
+        }
+    }
+
+    /// Evict oldest segments until under `max_bytes` and within `ttl`.
+    ///
+    /// Either limit may be `None` to disable it. The newest segment is
+    /// never evicted (it may be the one currently being appended), though
+    /// its size still counts against `max_bytes`. Returns the paths
+    /// removed.
+    ///
+    /// The directory is listed once and each segment is stat'ed once, so
+    /// the byte total and the eviction candidates come from the same
+    /// snapshot. A segment whose metadata cannot be read counts as zero
+    /// bytes and is never considered expired. A segment that fails to
+    /// delete is skipped and does not reduce the running total.
+    pub fn enforce_limits(
+        &self,
+        max_bytes: Option<u64>,
+        ttl: Option<Duration>,
+    ) -> Vec<PathBuf> {
+        let mut paths = self.segment_paths();
+        if paths.len() <= 1 {
+            return Vec::new();
+        }
+
+        // Stat every listed segment (including the newest) exactly once.
+        let metas: Vec<Option<std::fs::Metadata>> =
+            paths.iter().map(|p| std::fs::metadata(p).ok()).collect();
+        let mut total: u64 = metas.iter().flatten().map(|m| m.len()).sum();
+
+        // Protect the newest segment (oldest-first order ⇒ it is last);
+        // it may be the one currently being appended. `zip` below stops at
+        // the shorter `paths`, so its metadata entry is simply unused.
+        paths.pop();
+
+        let now = SystemTime::now();
+        let mut removed = Vec::new();
+        for (path, meta) in paths.into_iter().zip(metas) {
+            let mtime = meta.as_ref().and_then(|m| m.modified().ok());
+            let too_old = match (ttl, mtime) {
+                // An mtime in the future gives `Err`, treated as not expired.
+                (Some(ttl), Some(mtime)) => now
+                    .duration_since(mtime)
+                    .map(|age| age > ttl)
+                    .unwrap_or(false),
+                _ => false,
+            };
+            let over_budget = max_bytes.is_some_and(|max| total > max);
+            if !(too_old || over_budget) {
+                break; // sorted oldest-first: nothing newer qualifies either
+            }
+
+            let sz = meta.map(|m| m.len()).unwrap_or(0);
+            if std::fs::remove_file(&path).is_ok() {
+                total = total.saturating_sub(sz);
+                removed.push(path);
+            }
+        }
+        removed
+    }
+}
+
+/// Split a segment file name of the form `"<writer_id>-<seq>.memc"` into
+/// its writer id and sequence number.
+///
+/// The split happens at the last `-`, so a writer id may itself contain
+/// dashes. Returns `None` when the `.memc` suffix or the dash is missing,
+/// or when the part after the dash does not parse as a `u32`.
+fn parse_segment_name(name: &str) -> Option<(String, u32)> {
+    let stem = name.strip_suffix(".memc")?;
+    let dash = stem.rfind('-')?;
+    let seq = stem[dash + 1..].parse::<u32>().ok()?;
+    Some((stem[..dash].to_owned(), seq))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// The id is a pure function of (pid, start time), changes when either
+    /// input changes, and is always six characters long.
+    #[test]
+    fn writer_id_is_stable_and_pid_sensitive() {
+        let base = writer_id(100, 5);
+        assert_eq!(base, writer_id(100, 5));
+        assert_ne!(base, writer_id(101, 5));
+        assert_ne!(base, writer_id(100, 6));
+        assert_eq!(base.len(), 6);
+    }
+
+    /// A well-formed name parses; a wrong extension or a missing sequence
+    /// part does not.
+    #[test]
+    fn parse_segment_name_roundtrip() {
+        let parsed = parse_segment_name("a3f2c1-000007.memc");
+        assert_eq!(parsed, Some(("a3f2c1".to_string(), 7)));
+
+        assert_eq!(parse_segment_name("notasegment.txt"), None);
+        assert_eq!(parse_segment_name("missingseq.memc"), None);
+    }
+
+    /// A store over an empty directory hands out sequence 1, then 2.
+    #[test]
+    fn sequence_numbers_increment_and_persist() {
+        let dir_name = format!("memc-store-test-{}", std::process::id());
+        let tmp = std::env::temp_dir().join(dir_name);
+        // Start from a clean slate; the directory may not exist yet.
+        let _ = std::fs::remove_dir_all(&tmp);
+
+        let mut store = ColdStore::open(&tmp).unwrap();
+        let first = store.next_segment_path();
+        let second = store.next_segment_path();
+
+        assert_ne!(first, second);
+        assert!(first.to_string_lossy().contains("-000001."));
+        assert!(second.to_string_lossy().contains("-000002."));
+
+        let _ = std::fs::remove_dir_all(&tmp);
+    }
+}
diff --git a/probing/memtable/src/memc/tests.rs b/probing/memtable/src/memc/tests.rs
new file mode 100644
index 00000000..556927fb
--- /dev/null
+++ b/probing/memtable/src/memc/tests.rs
@@ -0,0 +1,643 @@
+//! End-to-end tests for the MEMC cold segment format and store.
+
+use super::*;
+use crate::schema::{DType, Schema, Value};
+use crate::MemTable;
+use std::time::Duration;
+
+/// Create a fresh, empty scratch directory under the system temp dir.
+///
+/// The name combines `tag`, the process id and the current thread id, so
+/// tests running in parallel do not share a directory. Anything left by
+/// an earlier run is removed first.
+fn tmp_dir(tag: &str) -> std::path::PathBuf {
+    let pid = std::process::id();
+    let thread = std::thread::current().id();
+    let dir = std::env::temp_dir().join(format!("memc-test-{tag}-{}-{:?}", pid, thread));
+
+    // Best-effort cleanup: the directory usually does not exist.
+    let _ = std::fs::remove_dir_all(&dir);
+    std::fs::create_dir_all(&dir).unwrap();
+    dir
+}
+
+/// Column list shared by the "metrics" tests: an I64 `timestamp`, an F64
+/// `value` and a Str `tag`, in that order.
+fn metrics_cols() -> Vec<(String, DType)> {
+    let mut cols = Vec::with_capacity(3);
+    cols.push((String::from("timestamp"), DType::I64));
+    cols.push((String::from("value"), DType::F64));
+    cols.push((String::from("tag"), DType::Str));
+    cols
+}
+
+/// Write two pages, seal, and read everything back through the footer.
+#[test]
+fn segment_roundtrip_sealed() {
+    let dir = tmp_dir("roundtrip");
+    let path = dir.join("seg.memc");
+
+    let first_page = [
+        ColumnData::I64(vec![100, 200, 300]),
+        ColumnData::F64(vec![1.0, 2.0, 3.0]),
+        ColumnData::Str(vec!["a".into(), "b".into(), "c".into()]),
+    ];
+    let second_page = [
+        ColumnData::I64(vec![400, 500]),
+        ColumnData::F64(vec![4.0, 5.0]),
+        ColumnData::Str(vec!["d".into(), "e".into()]),
+    ];
+
+    let mut writer = SegmentWriter::create(&path).unwrap();
+    let tid = writer.register_table("metrics", &metrics_cols()).unwrap();
+    writer.append_page(tid, &first_page, 7, 0).unwrap();
+    writer.append_page(tid, &second_page, 8, 1).unwrap();
+    writer.seal().unwrap();
+
+    let reader = SegmentReader::open(&path).unwrap();
+    assert!(reader.is_sealed());
+    assert_eq!(reader.ts_range(), Some((100, 500)));
+    assert_eq!(reader.pages().len(), 2);
+
+    // The table definition is recoverable by name.
+    let id = reader.table_id_by_name("metrics").unwrap();
+    let def = reader.table_def(id).unwrap();
+    assert_eq!(def.cols.len(), 3);
+    assert_eq!(def.ts_col, Some(0));
+
+    // The first page decodes to exactly what was written.
+    let cols = reader.read_page(0).unwrap();
+    assert_eq!(cols[0], ColumnData::I64(vec![100, 200, 300]));
+    assert_eq!(cols[2], ColumnData::Str(vec!["a".into(), "b".into(), "c".into()]));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// `size_bytes` and `ts_span` advance as a table and then a page are
+/// written.
+#[test]
+fn size_bytes_tracks_growth_for_roll_decisions() {
+    let dir = tmp_dir("sizehint");
+    let path = dir.join("seg.memc");
+    let mut writer = SegmentWriter::create(&path).unwrap();
+
+    // Fresh segment: header only, no timestamps yet.
+    let header_only = writer.size_bytes();
+    assert_eq!(header_only, 64, "starts at the 64-byte header");
+    assert_eq!(writer.ts_span(), None);
+
+    // Registering a table writes its definition block.
+    let schema = [("timestamp".to_string(), DType::I64)];
+    let tid = writer.register_table("m", &schema).unwrap();
+    let with_table = writer.size_bytes();
+    assert!(with_table > header_only, "table block advances the offset");
+
+    // Appending a page grows the file and sets the ts span.
+    writer
+        .append_page(tid, &[ColumnData::I64(vec![10, 20, 30])], 0, 0)
+        .unwrap();
+    assert!(writer.size_bytes() > with_table, "page advances the offset");
+    assert_eq!(writer.ts_span(), Some((10, 30)));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Pages of two different tables can share one segment and are found
+/// separately by table id.
+#[test]
+fn multi_table_segment() {
+    let dir = tmp_dir("multitable");
+    let path = dir.join("seg.memc");
+    let mut writer = SegmentWriter::create(&path).unwrap();
+
+    let event_cols = [("ts".to_string(), DType::I64), ("code".to_string(), DType::I32)];
+    let metrics = writer.register_table("metrics", &metrics_cols()).unwrap();
+    let events = writer.register_table("events", &event_cols).unwrap();
+
+    let metrics_page = [
+        ColumnData::I64(vec![10, 20]),
+        ColumnData::F64(vec![0.1, 0.2]),
+        ColumnData::Str(vec!["x".into(), "y".into()]),
+    ];
+    let events_page = [ColumnData::I64(vec![15]), ColumnData::I32(vec![42])];
+    writer.append_page(metrics, &metrics_page, 1, 0).unwrap();
+    writer.append_page(events, &events_page, 1, 0).unwrap();
+    writer.seal().unwrap();
+
+    let reader = SegmentReader::open(&path).unwrap();
+    let metric_hits = reader.pages_in_range(metrics, None, None);
+    let event_hits = reader.pages_in_range(events, None, None);
+    assert_eq!(metric_hits.len(), 1);
+    assert_eq!(event_hits.len(), 1);
+
+    let event_cols_read = reader.read_page(event_hits[0]).unwrap();
+    assert_eq!(event_cols_read[1], ColumnData::I32(vec![42]));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// `pages_in_range` keeps only the pages whose ts range overlaps the
+/// requested window.
+#[test]
+fn page_pruning_by_time_range() {
+    let dir = tmp_dir("prune");
+    let path = dir.join("seg.memc");
+
+    let mut writer = SegmentWriter::create(&path).unwrap();
+    let tid = writer
+        .register_table("m", &[("timestamp".to_string(), DType::I64)])
+        .unwrap();
+    // Three pages covering [0, 20], [100, 120] and [200, 210].
+    let early = [ColumnData::I64(vec![0, 10, 20])];
+    let middle = [ColumnData::I64(vec![100, 110, 120])];
+    let late = [ColumnData::I64(vec![200, 210])];
+    writer.append_page(tid, &early, 0, 0).unwrap();
+    writer.append_page(tid, &middle, 0, 1).unwrap();
+    writer.append_page(tid, &late, 0, 2).unwrap();
+    writer.seal().unwrap();
+
+    let reader = SegmentReader::open(&path).unwrap();
+
+    // Window [105, 130] overlaps only the middle page.
+    let hits = reader.pages_in_range(tid, Some(105), Some(130));
+    assert_eq!(hits.len(), 1);
+    let decoded = reader.read_page(hits[0]).unwrap();
+    assert_eq!(decoded[0], ColumnData::I64(vec![100, 110, 120]));
+
+    // Lower bound past everything → no pages.
+    assert!(reader.pages_in_range(tid, Some(1000), None).is_empty());
+    // Unbounded → all three.
+    assert_eq!(reader.pages_in_range(tid, None, None).len(), 3);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// A segment whose writer was dropped without `seal` is still readable:
+/// the reader recovers its pages by scanning the blocks.
+#[test]
+fn unsealed_segment_recovers_via_forward_scan() {
+    let dir = tmp_dir("unsealed");
+    let path = dir.join("seg.memc");
+
+    // Scope the writer so it is dropped WITHOUT seal — simulates a crash
+    // before the footer is written.
+    {
+        let mut writer = SegmentWriter::create(&path).unwrap();
+        let tid = writer
+            .register_table("m", &[("timestamp".to_string(), DType::I64)])
+            .unwrap();
+        writer
+            .append_page(tid, &[ColumnData::I64(vec![1, 2, 3])], 0, 0)
+            .unwrap();
+        writer
+            .append_page(tid, &[ColumnData::I64(vec![4, 5, 6])], 0, 1)
+            .unwrap();
+    }
+
+    let reader = SegmentReader::open(&path).unwrap();
+    assert!(!reader.is_sealed());
+    assert_eq!(reader.pages().len(), 2, "forward scan must recover both pages");
+
+    let id = reader.table_id_by_name("m").unwrap();
+    let first = reader.read_page(0).unwrap();
+    assert_eq!(first[0], ColumnData::I64(vec![1, 2, 3]));
+    assert_eq!(reader.pages_in_range(id, None, None).len(), 2);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Truncating the file in the middle of the last page's payload makes the
+/// reader drop that page while keeping the intact one before it.
+#[test]
+fn torn_tail_block_is_dropped() {
+    let dir = tmp_dir("torn");
+    let path = dir.join("seg.memc");
+
+    // Two pages, never sealed.
+    {
+        let mut writer = SegmentWriter::create(&path).unwrap();
+        let tid = writer
+            .register_table("m", &[("timestamp".to_string(), DType::I64)])
+            .unwrap();
+        writer
+            .append_page(tid, &[ColumnData::I64(vec![1, 2, 3])], 0, 0)
+            .unwrap();
+        writer
+            .append_page(tid, &[ColumnData::I64(vec![4, 5, 6])], 0, 1)
+            .unwrap();
+    }
+
+    // Find the second page's block, then truncate into the middle of its
+    // payload (header intact) to mimic a partial write.
+    let cut = {
+        let intact = SegmentReader::open(&path).unwrap();
+        let second = &intact.pages()[1];
+        second.block_off + (super::layout::BLOCK_HEADER_SIZE as u64) + 8
+    };
+    {
+        let file = std::fs::OpenOptions::new().write(true).open(&path).unwrap();
+        file.set_len(cut).unwrap();
+    }
+
+    let reader = SegmentReader::open(&path).unwrap();
+    assert_eq!(
+        reader.pages().len(),
+        1,
+        "torn tail page must be dropped, first page survives"
+    );
+    let survivor = reader.read_page(0).unwrap();
+    assert_eq!(survivor[0], ColumnData::I64(vec![1, 2, 3]));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Segments created through the store show up in listings and stats, and
+/// reopening the store continues the sequence numbering.
+#[test]
+fn cold_store_segment_creation_and_listing() {
+    let dir = tmp_dir("store-create");
+    let mut store = ColdStore::open(&dir).unwrap();
+
+    for batch in 0..3 {
+        let mut writer = store.create_segment().unwrap();
+        let tid = writer
+            .register_table("m", &[("timestamp".to_string(), DType::I64)])
+            .unwrap();
+        let page = [ColumnData::I64(vec![batch, batch + 1])];
+        writer.append_page(tid, &page, 0, 0).unwrap();
+        writer.seal().unwrap();
+    }
+
+    assert_eq!(store.segment_paths().len(), 3);
+    let stats = store.stats();
+    assert_eq!(stats.segment_count, 3);
+    assert!(stats.total_bytes > 0);
+
+    // A fresh store over the same dir continues the sequence.
+    let mut reopened = ColdStore::open(&dir).unwrap();
+    let next = reopened.next_segment_path();
+    assert!(next.to_string_lossy().contains("-000004."));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// With a byte budget smaller than the store, the oldest segments are
+/// removed until the budget holds, and the newest segment is kept.
+#[test]
+fn eviction_respects_byte_budget_and_keeps_newest() {
+    let dir = tmp_dir("evict-bytes");
+    let mut store = ColdStore::open(&dir).unwrap();
+
+    let mut sizes = Vec::new();
+    for i in 0..5i64 {
+        let mut writer = store.create_segment().unwrap();
+        let tid = writer
+            .register_table("m", &[("timestamp".to_string(), DType::I64)])
+            .unwrap();
+        let timestamps: Vec<i64> = (0..100).map(|x| x + i * 1000).collect();
+        writer
+            .append_page(tid, &[ColumnData::I64(timestamps)], 0, 0)
+            .unwrap();
+        let sealed = writer.seal().unwrap();
+        sizes.push(std::fs::metadata(&sealed).unwrap().len());
+        // Ensure distinct mtimes for deterministic oldest-first ordering.
+        std::thread::sleep(Duration::from_millis(10));
+    }
+
+    // Budget that should force dropping the oldest couple of segments.
+    let total: u64 = sizes.iter().sum();
+    let budget = total - sizes[0] - sizes[1] + 1;
+
+    let removed = store.enforce_limits(Some(budget), None);
+    assert!(!removed.is_empty(), "expected some eviction");
+
+    let remaining = store.segment_paths();
+    assert!(remaining.len() < 5);
+    assert!(store.stats().total_bytes <= budget);
+
+    // Newest survives.
+    let newest = remaining.last().unwrap();
+    assert!(newest.to_string_lossy().contains("-000005."));
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// With a zero TTL every segment but the protected newest one expires.
+#[test]
+fn eviction_by_ttl() {
+    let dir = tmp_dir("evict-ttl");
+    let mut store = ColdStore::open(&dir).unwrap();
+
+    for _ in 0..3 {
+        let mut writer = store.create_segment().unwrap();
+        let tid = writer
+            .register_table("m", &[("timestamp".to_string(), DType::I64)])
+            .unwrap();
+        writer
+            .append_page(tid, &[ColumnData::I64(vec![1, 2])], 0, 0)
+            .unwrap();
+        writer.seal().unwrap();
+        // Space the mtimes out so the segments have a definite age order.
+        std::thread::sleep(Duration::from_millis(10));
+    }
+
+    // TTL of 0 → every segment except the protected newest is expired.
+    let zero_ttl = Duration::from_millis(0);
+    let removed = store.enforce_limits(None, Some(zero_ttl));
+    assert_eq!(removed.len(), 2);
+    assert_eq!(store.segment_paths().len(), 1);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// A large page of regular numeric data ends up more than 3x smaller on
+/// disk than its raw size and still reads back with every row.
+#[test]
+fn pco_compresses_large_numeric_segment() {
+    let dir = tmp_dir("compress");
+    let path = dir.join("seg.memc");
+    let n = 50_000i64;
+
+    // Evenly spaced timestamps and a linear value series.
+    let timestamps: Vec<i64> = (0..n).map(|i| 1_700_000_000_000 + i * 1000).collect();
+    let values: Vec<f64> = (0..n).map(|i| (i as f64) * 0.5).collect();
+
+    let mut writer = SegmentWriter::create(&path).unwrap();
+    let schema = [("timestamp".to_string(), DType::I64), ("value".to_string(), DType::F64)];
+    let tid = writer.register_table("metrics", &schema).unwrap();
+    writer
+        .append_page(tid, &[ColumnData::I64(timestamps), ColumnData::F64(values)], 0, 0)
+        .unwrap();
+    let sealed = writer.seal().unwrap();
+
+    // Raw size: two 8-byte columns per row.
+    let raw = (n as u64) * (8 + 8);
+    let on_disk = std::fs::metadata(&sealed).unwrap().len();
+    assert!(
+        on_disk < raw / 3,
+        "expected >3x compression: {on_disk} vs {raw}"
+    );
+
+    // And it still reads back exactly.
+    let reader = SegmentReader::open(&sealed).unwrap();
+    let cols = reader.read_page(0).unwrap();
+    assert_eq!(cols[0].len(), n as usize);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+// ── Compactor (the roller) ───────────────────────────────────────────
+
+/// Heap-backed hot table: `timestamp: I64`, `value: F64`, `tag: Str`.
+fn hot_metrics(chunk_size: u32, num_chunks: u32) -> MemTable {
+    let with_ts = Schema::new().col("timestamp", DType::I64);
+    let with_value = with_ts.col("value", DType::F64);
+    let schema = with_value.col("tag", DType::Str);
+    MemTable::new(&schema, chunk_size, num_chunks)
+}
+
+/// Sum of `row_count` over every page of every segment that
+/// `ColdStore::open(dir)` lists via `segment_paths()`.
+fn cold_row_count(dir: &std::path::Path) -> usize {
+    let store = ColdStore::open(dir).unwrap();
+    let mut total = 0usize;
+    for path in store.segment_paths().iter() {
+        let reader = SegmentReader::open(path).unwrap();
+        let in_segment: usize = reader.pages().iter().map(|pg| pg.row_count as usize).sum();
+        total += in_segment;
+    }
+    total
+}
+
+/// Only chunks sealed via `advance_chunk` are drained; the chunk still
+/// being written must stay in the hot table.
+#[test]
+fn compactor_drains_only_sealed_chunks() {
+    let dir = tmp_dir("compact-basic");
+    let mut hot = hot_metrics(512, 4);
+
+    for i in 0..3 {
+        hot.push_row(&[Value::I64(100 + i), Value::F64(i as f64), Value::Str("a")]);
+    }
+    hot.advance_chunk(); // chunk 0 sealed with 3 rows
+    for i in 0..2 {
+        hot.push_row(&[Value::I64(200 + i), Value::F64(i as f64), Value::Str("b")]);
+    }
+    hot.advance_chunk(); // chunk 1 sealed with 2 rows
+    // Chunk 2 is left in the Writing state and must NOT be drained.
+    hot.push_row(&[Value::I64(999), Value::F64(9.0), Value::Str("c")]);
+
+    let cfg = CompactorConfig {
+        target_segment_bytes: 1 << 30, // never roll on size
+        ..Default::default()
+    };
+    let mut compactor = Compactor::new(ColdStore::open(&dir).unwrap(), cfg);
+    let drained = compactor.drain_view("metrics", &hot.view()).unwrap();
+    assert_eq!(drained, 5, "only the two sealed chunks drain");
+    // A second drain finds nothing newly sealed.
+    assert_eq!(compactor.drain_view("metrics", &hot.view()).unwrap(), 0);
+
+    let sealed = compactor.flush().unwrap().expect("one segment sealed");
+    let reader = SegmentReader::open(&sealed).unwrap();
+    assert!(reader.is_sealed());
+    assert_eq!(reader.pages().len(), 2);
+    assert_eq!(reader.ts_range(), Some((100, 201)));
+
+    let table = reader.table_id_by_name("metrics").unwrap();
+    assert_eq!(reader.table_def(table).unwrap().ts_col, Some(0));
+    let first_page = reader.read_page(0).unwrap();
+    assert_eq!(first_page[0], ColumnData::I64(vec![100, 101, 102]));
+    assert_eq!(cold_row_count(&dir), 5);
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// With a 1-byte size target every drained page rolls into its own
+/// segment, and each segment carries its own copy of the table definition.
+#[test]
+fn compactor_rolls_by_size_and_reregisters_table() {
+    let dir = tmp_dir("compact-roll");
+    let mut hot = hot_metrics(512, 4);
+
+    for chunk in 0..3 {
+        for i in 0..2 {
+            let ts = Value::I64(1000 * chunk + i);
+            hot.push_row(&[ts, Value::F64(i as f64), Value::Str("x")]);
+        }
+        hot.advance_chunk(); // seal each chunk
+    }
+
+    let cfg = CompactorConfig {
+        target_segment_bytes: 1, // force a roll after every page
+        ..Default::default()
+    };
+    let mut compactor = Compactor::new(ColdStore::open(&dir).unwrap(), cfg);
+    let drained = compactor.drain_view("metrics", &hot.view()).unwrap();
+    assert_eq!(drained, 6);
+    let still_open = compactor.flush().unwrap();
+    assert!(still_open.is_none(), "no open segment after size rolls");
+
+    // Three sealed chunks → three one-page segments, each independently
+    // carrying the table definition (re-registered on every roll).
+    let store = ColdStore::open(&dir).unwrap();
+    let paths = store.segment_paths();
+    assert_eq!(paths.len(), 3);
+    for path in &paths {
+        let reader = SegmentReader::open(path).unwrap();
+        assert!(reader.is_sealed());
+        assert_eq!(reader.pages().len(), 1);
+        assert!(reader.table_id_by_name("metrics").is_some());
+    }
+
+    assert_eq!(cold_row_count(&dir), 6);
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Pages drained from two distinct hot tables end up in one shared segment.
+#[test]
+fn compactor_multi_table_shares_segments() {
+    let dir = tmp_dir("compact-multi");
+    let mut hot_a = hot_metrics(512, 4);
+    let mut hot_b = hot_metrics(512, 4);
+    for i in 0..2 {
+        hot_a.push_row(&[Value::I64(i), Value::F64(0.0), Value::Str("a")]);
+        hot_b.push_row(&[Value::I64(100 + i), Value::F64(1.0), Value::Str("b")]);
+    }
+    hot_a.advance_chunk();
+    hot_b.advance_chunk();
+
+    let cfg = CompactorConfig {
+        target_segment_bytes: 1 << 30,
+        ..Default::default()
+    };
+    let mut compactor = Compactor::new(ColdStore::open(&dir).unwrap(), cfg);
+    for (name, hot) in [("table_a", &hot_a), ("table_b", &hot_b)] {
+        compactor.drain_view(name, &hot.view()).unwrap();
+    }
+    compactor.flush().unwrap();
+
+    // Both tables land in a single shared segment file.
+    let store = ColdStore::open(&dir).unwrap();
+    let paths = store.segment_paths();
+    assert_eq!(paths.len(), 1);
+    let reader = SegmentReader::open(&paths[0]).unwrap();
+    assert_eq!(reader.table_defs().len(), 2);
+    for name in ["table_a", "table_b"] {
+        assert!(reader.table_id_by_name(name).is_some());
+    }
+    assert_eq!(reader.pages().len(), 2);
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// A spawned compactor polling a file-backed table must, by the time
+/// `stop()` returns, have drained both sealed chunks (4 + 3 rows).
+#[test]
+fn compactor_background_thread_drains_on_stop() {
+    let dir = tmp_dir("compact-spawn");
+    let hot_path = dir.join("hot.memt");
+    let schema = Schema::new()
+        .col("timestamp", DType::I64)
+        .col("value", DType::F64);
+
+    // The application keeps the writer handle; the compactor thread owns a
+    // second, independent read handle opened on the same file.
+    let mut writer = MemTable::file_at(&hot_path, &schema, 512, 4).unwrap();
+    let reader = MemTable::open_file(&hot_path).unwrap();
+
+    let cfg = CompactorConfig {
+        target_segment_bytes: 1 << 30,
+        poll_interval: Duration::from_millis(10),
+        ..Default::default()
+    };
+    let handle = Compactor::new(ColdStore::open(&dir).unwrap(), cfg)
+        .spawn(vec![("metrics".to_string(), reader)]);
+
+    for i in 0..4 {
+        writer.push_row(&[Value::I64(i), Value::F64(i as f64)]);
+    }
+    writer.advance_chunk();
+    // Let a few 10 ms poll intervals elapse before the second chunk.
+    std::thread::sleep(Duration::from_millis(40));
+    for i in 0..3 {
+        writer.push_row(&[Value::I64(100 + i), Value::F64(i as f64)]);
+    }
+    writer.advance_chunk();
+
+    // stop() performs a final drain + flush before joining.
+    handle.stop();
+
+    assert_eq!(cold_row_count(&dir), 7);
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// `enforce()` under a 1-byte total budget must evict some of the five
+/// segments while still leaving at least one behind.
+#[test]
+fn compactor_enforce_evicts_oldest_segments() {
+    let dir = tmp_dir("compact-evict");
+    let mut hot = hot_metrics(512, 8);
+    for chunk in 0..5 {
+        for i in 0..2 {
+            hot.push_row(&[Value::I64(chunk * 10 + i), Value::F64(0.0), Value::Str("x")]);
+        }
+        hot.advance_chunk();
+    }
+
+    let cfg = CompactorConfig {
+        target_segment_bytes: 1,  // one segment per page
+        max_total_bytes: Some(1), // keep only the protected newest
+        ..Default::default()
+    };
+    let mut compactor = Compactor::new(ColdStore::open(&dir).unwrap(), cfg);
+    compactor.drain_view("metrics", &hot.view()).unwrap();
+    compactor.flush().unwrap();
+    assert_eq!(compactor.stats().segment_count, 5);
+
+    let evicted = compactor.enforce();
+    assert!(!evicted.is_empty(), "over-budget segments evicted");
+    // enforce_limits never deletes the newest segment.
+    assert!(compactor.stats().segment_count >= 1);
+    assert!(compactor.stats().segment_count < 5);
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// `prime_from_cold` lets a restarted compactor skip already-persisted chunks.
+#[test]
+fn compactor_restart_dedup_via_prime() {
+    let dir = tmp_dir("compact-restart");
+    let mut hot = hot_metrics(512, 4);
+    for chunk in 0..2 {
+        for i in 0..2 {
+            hot.push_row(&[Value::I64(chunk * 10 + i), Value::F64(0.0), Value::Str("x")]);
+        }
+        hot.advance_chunk(); // seal chunks 0 and 1
+    }
+
+    let fresh_compactor = || {
+        let cfg = CompactorConfig {
+            target_segment_bytes: 1 << 30,
+            ..Default::default()
+        };
+        Compactor::new(ColdStore::open(&dir).unwrap(), cfg)
+    };
+    // First run: drain the two sealed chunks into cold.
+    let mut first = fresh_compactor();
+    assert_eq!(first.drain_view("metrics", &hot.view()).unwrap(), 4);
+    first.flush().unwrap();
+    drop(first);
+    assert_eq!(cold_row_count(&dir), 4);
+
+    // Simulated restart over the SAME cold dir: prime_from_cold rebuilds the
+    // per-chunk watermark from persisted source_gen/source_chunk, so the
+    // still-resident sealed chunks are recognised as already compacted.
+    let mut second = fresh_compactor();
+    second.prime_from_cold().unwrap();
+    assert_eq!(second.drain_view("metrics", &hot.view()).unwrap(), 0);
+    assert!(second.flush().unwrap().is_none(), "nothing new to seal");
+    drop(second);
+    assert_eq!(cold_row_count(&dir), 4, "exactly-once: no duplication");
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+/// Negative control for `prime_from_cold`: without priming, a fresh
+/// compactor drains the still-resident sealed chunk again, so its rows
+/// are duplicated in cold storage.
+#[test]
+fn compactor_without_prime_redrains_on_restart() {
+    let dir = tmp_dir("compact-noprime");
+    let mut hot = hot_metrics(512, 4);
+    for i in 0..2 {
+        hot.push_row(&[Value::I64(i), Value::F64(0.0), Value::Str("x")]);
+    }
+    hot.advance_chunk();
+
+    // Two compactor lifetimes over the same cold dir, neither primed.
+    for _run in 0..2 {
+        let cfg = CompactorConfig {
+            target_segment_bytes: 1 << 30,
+            ..Default::default()
+        };
+        let store = ColdStore::open(&dir).unwrap();
+        let mut compactor = Compactor::new(store, cfg);
+        // No prime_from_cold → the resident sealed chunk is drained each time.
+        let drained = compactor.drain_view("metrics", &hot.view()).unwrap();
+        assert_eq!(drained, 2);
+        compactor.flush().unwrap();
+    }
+
+    // 2 rows drained twice → 4 rows in cold storage.
+    assert_eq!(cold_row_count(&dir), 4, "duplicated without priming");
+
+    let _ = std::fs::remove_dir_all(&dir);
+}
diff --git a/probing/memtable/src/memc/writer.rs b/probing/memtable/src/memc/writer.rs
new file mode 100644
index 00000000..224d7304
--- /dev/null
+++ b/probing/memtable/src/memc/writer.rs
@@ -0,0 +1,337 @@
+//! [`SegmentWriter`]: build one `.memc` segment file incrementally.
+//!
+//! Lifecycle: create → `register_table`* → `append_page`* → `seal`.
+//! Blocks are flushed to the file as they are produced; the footer (page
+//! directory) and the sealed segment header are written last, so a crash
+//! before `seal` leaves a forward-scannable, checksummed prefix.
+
+use std::collections::HashMap;
+use std::fs::{File, OpenOptions};
+use std::io::{self, Seek, SeekFrom, Write};
+use std::path::{Path, PathBuf};
+
+use super::codec::{encode_column, ColumnData};
+use super::layout::{
+    align64, xxh32, BlockHeader, ColEncoding, SegmentHeader, BLOCK_HEADER_SIZE, FLAG_SEALED,
+    MAGIC_FOOTER, MAGIC_PAGE_BLOCK, MAGIC_TABLE_BLOCK, PAGE_DIR_ENTRY_SIZE, SEGMENT_HEADER_SIZE,
+    SOURCE_CHUNK_NONE, TS_MAX_INIT, TS_MIN_INIT,
+};
+use crate::raw::process_start_time;
+use crate::schema::DType;
+
+/// One page-directory entry, mirrored into the footer on seal.
+#[derive(Debug, Clone)]
+pub(crate) struct PageDirEntry {
+    pub table_id: u32,     // id of the registered table the page belongs to
+    pub row_count: u32,    // rows in the page
+    pub col_count: u32,    // columns in the page
+    pub ts_min: i64,       // smallest page timestamp (init sentinel when none)
+    pub ts_max: i64,       // largest page timestamp (init sentinel when none)
+    pub block_off: u64,    // writer offset at which the page block starts
+    pub block_len: u32,    // padded length of the page block in bytes
+    pub source_gen: u64,   // hot-ring generation the page was compacted from
+    pub source_chunk: u32, // hot-ring chunk index the page was compacted from
+}
+
+struct TableInfo {
+    cols: Vec<(String, DType)>, // registered (name, dtype) pairs, in schema order
+    ts_col: Option<usize>,      // index of the designated I64 timestamp column
+}
+
+/// Incremental writer for a single MEMC segment file.
+pub struct SegmentWriter {
+    file: File,                      // open handle to the segment being written
+    path: PathBuf,                   // where `file` lives; returned by `seal`
+    offset: u64,                     // bytes written so far (header + padded blocks)
+    tables: HashMap<u32, TableInfo>, // registered tables by id
+    next_table_id: u32,              // id the next `register_table` call hands out
+    pages: Vec<PageDirEntry>,        // directory entries, one per appended page
+    seg_ts_min: i64,                 // running min timestamp over all pages
+    seg_ts_max: i64,                 // running max timestamp over all pages
+    sealed: bool,                    // set once `seal` has completed
+}
+
+/// Wall-clock milliseconds since the Unix epoch, or 0 when the system
+/// clock reads earlier than the epoch.
+fn now_unix_ms() -> u64 {
+    let since_epoch = std::time::UNIX_EPOCH.elapsed();
+    since_epoch.map_or(0, |d| d.as_millis() as u64)
+}
+
+impl SegmentWriter {
+    /// Create the segment file at `path` (truncating any existing file and
+    /// creating missing parent directories), then write an unsealed header.
+    ///
+    /// # Errors
+    /// Returns the I/O error from directory creation, open, or the write.
+    pub fn create(path: impl AsRef<Path>) -> io::Result<Self> {
+        let path = path.as_ref().to_path_buf();
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent)?;
+        }
+        let mut opts = OpenOptions::new();
+        opts.read(true).write(true).create(true).truncate(true);
+        let mut file = opts.open(&path)?;
+        let writer_pid = std::process::id();
+        let unsealed = SegmentHeader {
+            flags: 0,
+            writer_pid,
+            writer_start: process_start_time(writer_pid),
+            created_unix_ms: now_unix_ms(),
+            footer_off: 0,
+            ts_min: TS_MIN_INIT,
+            ts_max: TS_MAX_INIT,
+            page_count: 0,
+        };
+        file.write_all(&unsealed.encode())?;
+
+        Ok(Self {
+            file,
+            path,
+            offset: SEGMENT_HEADER_SIZE as u64,
+            tables: HashMap::new(),
+            next_table_id: 1,
+            pages: Vec::new(),
+            seg_ts_min: TS_MIN_INIT,
+            seg_ts_max: TS_MAX_INIT,
+            sealed: false,
+        })
+    }
+
+    pub fn path(&self) -> &Path {
+        &self.path // the path this writer was created with
+    }
+
+    pub fn page_count(&self) -> usize {
+        self.pages.len() // pages appended so far; zero-row pages are not counted
+    }
+
+    /// Bytes written so far: header plus every padded block, before the footer.
+    /// A compactor polls this to decide when to seal and roll to a fresh
+    /// segment, bounding file size and preventing fragmentation.
+    pub fn size_bytes(&self) -> u64 {
+        self.offset
+    }
+
+    /// Timestamp span `(min, max)` covered so far. `None` until a
+    /// timestamped page lands, i.e. while the running min still exceeds the
+    /// running max. Lets a compactor also roll on a wall-clock window (e.g.
+    /// seal every 5 min) so low-rate tables don't sit unsealed
+    /// indefinitely.
+    pub fn ts_span(&self) -> Option<(i64, i64)> {
+        let (lo, hi) = (self.seg_ts_min, self.seg_ts_max);
+        // Only a real page makes min <= max; see `append_page`.
+        (lo <= hi).then_some((lo, hi))
+    }
+
+    /// Register a table, write its `MCTB` definition block, return its id.
+    /// Ids are sequential. The first `I64` column whose name is listed in
+    /// `crate::raw::TS_COL_NAMES` (if any) becomes the timestamp column.
+    pub fn register_table(
+        &mut self,
+        name: &str,
+        cols: &[(String, DType)],
+    ) -> io::Result<u32> {
+        let table_id = self.next_table_id;
+        self.next_table_id += 1;
+
+        let def_bytes = super::layout::encode_table_payload(name, cols);
+        let def_header = BlockHeader {
+            magic: MAGIC_TABLE_BLOCK,
+            table_id,
+            row_count: 0,
+            col_count: cols.len() as u32,
+            ts_min: TS_MIN_INIT,
+            ts_max: TS_MAX_INIT,
+            source_gen: 0,
+            payload_len: def_bytes.len() as u32,
+            payload_xxh: xxh32(&def_bytes),
+            source_chunk: SOURCE_CHUNK_NONE,
+        };
+        self.write_block(&def_header, &def_bytes)?;
+
+        let is_ts = |(col_name, dtype): &(String, DType)| {
+            *dtype == DType::I64 && crate::raw::TS_COL_NAMES.contains(&col_name.as_str())
+        };
+        let info = TableInfo {
+            cols: cols.to_vec(),
+            ts_col: cols.iter().position(is_ts),
+        };
+        self.tables.insert(table_id, info);
+        Ok(table_id)
+    }
+
+    /// Append a columnar page for `table_id`. `source_gen` / `source_chunk`
+    /// record the hot-ring chunk this page was compacted from (generation and
+    /// chunk index); pass `(0, SOURCE_CHUNK_NONE)` when not applicable. They
+    /// let a restarting compactor rebuild its per-chunk drain watermark.
+    ///
+    /// All columns must share the same length and match the registered
+    /// schema's dtypes in order. A zero-row page is accepted but nothing is
+    /// written for it.
+    ///
+    /// # Errors
+    /// `NotFound` for an unregistered `table_id`; `InvalidInput` for a column
+    /// count/dtype/length mismatch, or when the row count or block size would
+    /// not fit the format's `u32` fields (rejected before anything is
+    /// written instead of silently wrapping); `InvalidData` when a column
+    /// fails to encode; otherwise the I/O error from the write.
+    pub fn append_page(
+        &mut self,
+        table_id: u32,
+        columns: &[ColumnData],
+        source_gen: u64,
+        source_chunk: u32,
+    ) -> io::Result<()> {
+        let invalid = |msg: &'static str| io::Error::new(io::ErrorKind::InvalidInput, msg);
+        let info = self
+            .tables
+            .get(&table_id)
+            .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "unknown table_id"))?;
+        if columns.len() != info.cols.len() {
+            return Err(invalid("page column count mismatch"));
+        }
+        let row_count = columns.first().map_or(0, |c| c.len());
+        for (col, (_, dtype)) in columns.iter().zip(&info.cols) {
+            if col.dtype() != *dtype {
+                return Err(invalid("page column dtype mismatch"));
+            }
+            if col.len() != row_count {
+                return Err(invalid("page columns have unequal lengths"));
+            }
+        }
+        if row_count == 0 {
+            return Ok(()); // nothing to persist
+        }
+        let (ts_min, ts_max) = match info.ts_col.map(|ci| &columns[ci]) {
+            Some(ColumnData::I64(ts)) => ts
+                .iter()
+                .fold((TS_MIN_INIT, TS_MAX_INIT), |(lo, hi), &t| (lo.min(t), hi.max(t))),
+            _ => (TS_MIN_INIT, TS_MAX_INIT),
+        };
+        let mut payload = Vec::new();
+        for col in columns {
+            let encoded = encode_column(col).map_err(|e| {
+                io::Error::new(io::ErrorKind::InvalidData, format!("column encode: {e}"))
+            })?;
+            payload.extend_from_slice(&encoded);
+        }
+        // Header and directory store these as u32. `write_block` pads the
+        // block to exactly this `align64` length, so check it up front.
+        let block_len = align64(BLOCK_HEADER_SIZE + payload.len());
+        let limit = u32::MAX as usize;
+        if row_count > limit || payload.len() > limit || block_len > limit {
+            return Err(invalid("page too large for the segment's u32 size fields"));
+        }
+
+        let header = BlockHeader {
+            magic: MAGIC_PAGE_BLOCK,
+            table_id,
+            row_count: row_count as u32,
+            col_count: columns.len() as u32,
+            ts_min,
+            ts_max,
+            source_gen,
+            payload_len: payload.len() as u32,
+            payload_xxh: xxh32(&payload),
+            source_chunk,
+        };
+        let block_off = self.offset;
+        self.write_block(&header, &payload)?;
+
+        if ts_min <= ts_max {
+            self.seg_ts_min = self.seg_ts_min.min(ts_min);
+            self.seg_ts_max = self.seg_ts_max.max(ts_max);
+        }
+        self.pages.push(PageDirEntry {
+            table_id,
+            row_count: row_count as u32,
+            col_count: columns.len() as u32,
+            ts_min,
+            ts_max,
+            block_off,
+            block_len: block_len as u32,
+            source_gen,
+            source_chunk,
+        });
+        Ok(())
+    }
+
+    /// Append the footer (page directory), rewrite the header at offset 0
+    /// with `FLAG_SEALED` and the footer offset, then flush and `sync_data`.
+    /// Consumes the writer, so nothing more can be appended; returns the path.
+    pub fn seal(mut self) -> io::Result<PathBuf> {
+        let footer_off = self.offset; // footer goes right after the last block
+        let mut footer = Vec::with_capacity(16 + self.pages.len() * PAGE_DIR_ENTRY_SIZE);
+        footer.extend_from_slice(&MAGIC_FOOTER.to_le_bytes());
+        footer.extend_from_slice(&(self.pages.len() as u32).to_le_bytes());
+        let entries_len = (self.pages.len() * PAGE_DIR_ENTRY_SIZE) as u32;
+        footer.extend_from_slice(&entries_len.to_le_bytes());
+        footer.extend_from_slice(&[0u8; 4]); // checksum placeholder, patched below
+
+        let entries_start = footer.len();
+        for p in &self.pages {
+            footer.extend_from_slice(&p.table_id.to_le_bytes());
+            footer.extend_from_slice(&p.row_count.to_le_bytes());
+            footer.extend_from_slice(&p.ts_min.to_le_bytes());
+            footer.extend_from_slice(&p.ts_max.to_le_bytes());
+            footer.extend_from_slice(&p.block_off.to_le_bytes());
+            footer.extend_from_slice(&p.block_len.to_le_bytes());
+            footer.extend_from_slice(&p.col_count.to_le_bytes());
+            footer.extend_from_slice(&p.source_gen.to_le_bytes());
+            footer.extend_from_slice(&p.source_chunk.to_le_bytes());
+            footer.extend_from_slice(&[0u8; 4]); // pad the entry to 56 bytes
+        }
+        let checksum = xxh32(&footer[entries_start..]); // covers the entries only
+        footer[12..16].copy_from_slice(&checksum.to_le_bytes());
+
+        self.file.write_all(&footer)?;
+
+        // Rewrite the header with seal metadata. NOTE(review): footer is not synced first.
+        let pid = std::process::id();
+        let header = SegmentHeader {
+            flags: FLAG_SEALED,
+            writer_pid: pid,
+            writer_start: process_start_time(pid),
+            created_unix_ms: now_unix_ms(), // NOTE(review): overwrites the create-time value
+            footer_off,
+            ts_min: self.seg_ts_min,
+            ts_max: self.seg_ts_max,
+            page_count: self.pages.len() as u32,
+        };
+        self.file.seek(SeekFrom::Start(0))?;
+        self.file.write_all(&header.encode())?;
+        self.file.flush()?;
+        self.file.sync_data()?;
+        self.sealed = true; // checked by `Drop`
+        Ok(self.path.clone())
+    }
+
+    /// Write `header` + `payload`, zero-filled up to `align64` of their size.
+    /// Advances `self.offset` by the padded block length and returns it.
+    fn write_block(&mut self, header: &BlockHeader, payload: &[u8]) -> io::Result<u64> {
+        debug_assert!(matches!(
+            ColEncoding::from_u8(0),
+            Some(ColEncoding::RawFixed)
+        ));
+        let unpadded = BLOCK_HEADER_SIZE + payload.len();
+        let block_len = align64(unpadded);
+        self.file.write_all(&header.encode())?;
+        self.file.write_all(payload)?;
+        if let Some(pad) = block_len.checked_sub(unpadded).filter(|&n| n > 0) {
+            self.file.write_all(&vec![0u8; pad])?;
+        }
+        self.offset += block_len as u64;
+        Ok(block_len as u64)
+    }
+}
+
+impl Drop for SegmentWriter {
+    fn drop(&mut self) {
+        if self.sealed {
+            return;
+        }
+        // Unsealed: the checksummed block prefix stays on disk for recovery.
+        let _ = self.file.flush();
+    }
+}
diff --git a/probing/memtable/src/memtable.rs b/probing/memtable/src/memtable.rs
index 3c681412..6545df5d 100644
--- a/probing/memtable/src/memtable.rs
+++ b/probing/memtable/src/memtable.rs
@@ -4,13 +4,18 @@ use crate::layout::{
     header_mut, release_write_lock, w32, CHUNK_HEADER_SIZE, FLAG_DEDUP,
 };
 use crate::raw::{
-    advance_chunk_unlocked, init_buf, validate_buf, validate_row_schema, write_row_bytes,
+    advance_chunk_unlocked, init_buf, note_row_ts, row_ts, validate_buf, validate_row_schema,
+    write_row_bytes,
 };
 use crate::refcount::refcount;
 use crate::row::RowIter;
 use crate::schema::{Col, DType, Schema, Value};
 use crate::writer::RowWriter;
+use memmap2::MmapMut;
 use std::fmt;
+use std::fs::OpenOptions;
+use std::io;
+use std::path::{Path, PathBuf};
 use std::sync::atomic::Ordering;
 
 // ── Shared read-only accessor methods (expands inside each impl) ─────
@@ -59,6 +64,37 @@ macro_rules! impl_table_reader {
             let cs = chunk_start_off(buf, chunk);
             chunk_header(buf, cs).state.load(Ordering::Acquire)
         }
+        /// Zero-based index of the designated timestamp column, decoded from
+        /// the header's `ts_col` field (0 = none, otherwise index + 1).
+        /// [`None`] when the schema has no `I64` column named `timestamp` /
+        /// `ts`.
+        pub fn ts_col(&self) -> Option<usize> {
+            let stored = header(self.as_bytes()).ts_col as usize;
+            stored.checked_sub(1)
+        }
+        /// `(min, max)` of the designated timestamp column over the rows
+        /// committed in `chunk`. [`None`] when the table has no timestamp
+        /// column or the chunk's min/max are still inverted (no committed
+        /// rows).
+        ///
+        /// `used` is loaded with Acquire to pair with the writer's Release
+        /// store that publishes each row. The snapshot is still racy: callers
+        /// pruning by time must bracket it between two
+        /// [`chunk_generation`](Self::chunk_generation) reads.
+        pub fn chunk_ts_range(&self, chunk: usize) -> Option<(i64, i64)> {
+            self.ts_col()?;
+            let buf = self.as_bytes();
+            let ch = chunk_header(buf, chunk_start_off(buf, chunk));
+            // Acquire on `used` first, so the Relaxed min/max loads below see
+            // the values for every row published so far.
+            let _used = ch.used.load(Ordering::Acquire);
+            let (lo, hi) = (
+                ch.min_ts.load(Ordering::Relaxed),
+                ch.max_ts.load(Ordering::Relaxed),
+            );
+            // min > max means the sentinels are untouched: no committed rows.
+            (lo <= hi).then_some((lo, hi))
+        }
         pub fn rows(&self, chunk: usize) -> RowIter<'_> {
             let buf = self.as_bytes();
             let cs = chunk_start_off(buf, chunk);
@@ -76,6 +112,33 @@ macro_rules! impl_table_reader {
             let cs = chunk_start_off(buf, chunk);
             chunk_header(buf, cs).row_count.load(Ordering::Acquire) as usize
         }
+        /// Chunk indices in **logical (oldest → newest) write order**.
+        ///
+        /// The ring writes chunks in `(generation, index)` order: chunk 0 at
+        /// generation 1, then chunks 1..N-1 at generation 1, then wraps back
+        /// to chunk 0 at generation 2, and so on. Sorting by that pair
+        /// therefore recovers temporal order regardless of the current wrap
+        /// position.
+        ///
+        /// Chunks that were never written (generation 0) or hold no
+        /// committed rows are left out.
+        ///
+        /// The snapshot is racy by design: callers that read concurrently
+        /// with a writer must re-check
+        /// [`chunk_generation`](Self::chunk_generation) after consuming a
+        /// chunk and discard it on mismatch.
+        pub fn chunks_logical(&self) -> Vec<usize> {
+            let mut keyed: Vec<(u64, usize)> = Vec::new();
+            for idx in 0..self.num_chunks() {
+                let generation = self.chunk_generation(idx);
+                // Skip never-written chunks and chunks with no committed rows.
+                if generation != 0 && self.num_rows(idx) != 0 {
+                    keyed.push((generation, idx));
+                }
+            }
+            keyed.sort_unstable();
+            keyed.into_iter().map(|(_, idx)| idx).collect()
+        }
         pub fn creator_pid(&self) -> u32 {
             header(self.as_bytes()).creator_pid
         }
@@ -112,6 +175,7 @@ fn make_row_writer<'a>(
     let wc = h.write_chunk.load(Ordering::Relaxed) as usize;
     let csz = h.chunk_size as usize;
     let doff = h.data_offset as usize;
+    let ts_col = h.ts_col;
     let cs = doff + wc * csz;
     let used = chunk_header(buf, cs).used.load(Ordering::Relaxed) as usize;
     RowWriter {
@@ -125,6 +189,8 @@ fn make_row_writer<'a>(
         done: false,
         col_idx: 0,
         locked,
+        ts_col,
+        pending_ts: None,
     }
 }
 
@@ -155,12 +221,6 @@ fn locked_append(buf: &mut [u8], values: &[Value]) -> bool {
     ok
 }
 
-fn locked_push(buf: &mut [u8], values: &[Value]) {
-    acquire_write_lock(buf);
-    push_plain_row(buf, values);
-    release_write_lock(buf);
-}
-
 fn locked_advance(buf: &mut [u8]) {
     acquire_write_lock(buf);
     advance_chunk_unlocked(buf);
@@ -224,6 +284,9 @@ fn append_row_dedup_bytes(buf: &mut [u8], state: &mut DedupState, values: &[Valu
             off += v.encode(&mut buf[off..]);
         }
     }
+    if let Some(ts) = row_ts(header(buf), values) {
+        note_row_ts(chunk_header(buf, cs), ts);
+    }
     chunk_header(buf, cs)
         .used
         .store((used + total) as u32, Ordering::Release);
@@ -233,10 +296,110 @@ fn append_row_dedup_bytes(buf: &mut [u8], state: &mut DedupState, values: &[Valu
     true
 }
 
-// ── MemTable (owned buffer) ──────────────────────────────────────────
+// ── MemTable (owned buffer: heap or mmap'd shared memory) ───────────
+
+/// Public tag for the kind of storage behind a [`MemTable`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BackingKind {
+    /// Process-private heap allocation (not visible to other processes).
+    Heap,
+    /// POSIX shared memory object (`shm_open`) — memory-only.
+    Shm,
+    /// mmap'd regular file — disk-backed.
+    File,
+}
+
+/// Storage behind a [`MemTable`]; every variant owns its bytes.
+enum Backing {
+    /// Process-private heap allocation. Invisible to other processes;
+    /// freed on drop.
+    Heap(Vec<u8>),
+    /// POSIX shared memory object (`shm_open` + `mmap`). Memory-only:
+    /// never touches disk, gone after reboot. Other processes attach by
+    /// name. When `unlink_on_drop`, the creator removes the name on drop
+    /// (existing mappings stay valid until unmapped).
+    Shm {
+        mmap: MmapMut,
+        name: String,
+        unlink_on_drop: bool,
+    },
+    /// mmap'd regular file. Disk-backed: contents persist after drop /
+    /// reboot unless `unlink_on_drop` is set (used by the discoverable
+    /// `<data_dir>/<pid>/<name>` convention, where `dir` is the parent
+    /// `<pid>/` directory to remove once it has become empty).
+    File {
+        mmap: MmapMut,
+        path: PathBuf,
+        dir: Option<PathBuf>,
+        unlink_on_drop: bool,
+    },
+}
+
+impl Backing {
+    /// Borrow the table bytes, whichever storage holds them.
+    #[inline]
+    fn bytes(&self) -> &[u8] {
+        match self {
+            Backing::Heap(heap) => heap,
+            Backing::Shm { mmap, .. } | Backing::File { mmap, .. } => mmap,
+        }
+    }
 
+    /// Mutable view of the table bytes, whichever storage holds them.
+    #[inline]
+    fn bytes_mut(&mut self) -> &mut [u8] {
+        match self {
+            Backing::Heap(heap) => heap,
+            Backing::Shm { mmap, .. } | Backing::File { mmap, .. } => mmap,
+        }
+    }
+}
+
+/// Normalise a POSIX shm name to exactly one leading `/` and no other
+/// slashes. Keep names short — macOS limits them to 31 bytes (`PSHMNAMLEN`).
+///
+/// # Errors
+/// `InvalidInput` if the name is empty, contains an interior `/`, or has a NUL.
+fn shm_name_cstring(name: &str) -> io::Result<std::ffi::CString> {
+    let invalid = |msg: &'static str| io::Error::new(io::ErrorKind::InvalidInput, msg);
+    let bare = name.strip_prefix('/').unwrap_or(name);
+    if bare.is_empty() {
+        return Err(invalid("shm name must not be empty"));
+    }
+    if bare.contains('/') {
+        return Err(invalid("shm name must not contain '/' (apart from the leading one)"));
+    }
+    std::ffi::CString::new(format!("/{bare}"))
+        .map_err(|_| invalid("shm name contains NUL"))
+}
+
+/// `shm_open` wrapper returning an owned [`std::fs::File`]; mode is `0o600`.
+fn shm_open_file(name: &std::ffi::CString, oflag: libc::c_int) -> io::Result<std::fs::File> {
+    // SAFETY: `name` is a valid NUL-terminated C string for the whole call.
+    match unsafe { libc::shm_open(name.as_ptr(), oflag, 0o600 as libc::c_uint) } {
+        fd if fd < 0 => Err(io::Error::last_os_error()),
+        // SAFETY: `fd` is a fresh descriptor from `shm_open`, owned only here.
+        fd => Ok(unsafe { std::os::fd::FromRawFd::from_raw_fd(fd) }),
+    }
+}
+
+/// Ring-buffer table that owns its storage. Three backings, one API:
+///
+/// | Constructor | Backing | Cross-process | Survives crash | Survives reboot |
+/// |-------------|---------|--------------|----------------|-----------------|
+/// | [`new`](Self::new) / [`from_buf`](Self::from_buf) | heap | no | no | no |
+/// | [`shm`](Self::shm) / [`open_shm`](Self::open_shm) | POSIX shared memory | by name | yes¹ | no |
+/// | [`file_at`](Self::file_at) / [`open_file`](Self::open_file) | mmap'd file | by path | yes | yes |
+/// | [`shared`](Self::shared) / [`shared_in`](Self::shared_in) | mmap'd file under `<data_dir>/<pid>/` | discovery + SQL catalog | yes¹ | — |
+///
+/// ¹ until the name/file is unlinked (creator drop or stale-pid cleanup).
+///
+/// On Linux the discoverable `shared` flavour lives in `/dev/shm` (tmpfs),
+/// so it is effectively shared memory *with* a browsable path; `shm` is the
+/// portable memory-only variant (on macOS, shm objects have no filesystem
+/// path at all).
 pub struct MemTable {
-    buf: Vec<u8>,
+    backing: Backing,
 }
 
 impl MemTable {
@@ -244,57 +407,324 @@ impl MemTable {
         compute_data_offset(schema.cols.len()) + chunk_size * num_chunks
     }
 
+    /// Create a **heap-backed** (process-private) table.
     pub fn new(schema: &Schema, chunk_size: u32, num_chunks: u32) -> Self {
         let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize);
         let mut buf = vec![0u8; size];
         init_buf(&mut buf, schema, chunk_size, num_chunks);
-        Self { buf }
+        Self {
+            backing: Backing::Heap(buf),
+        }
     }
 
+    /// Adopt an existing heap buffer (validates the MEMT layout).
     pub fn from_buf(buf: Vec<u8>) -> Result<Self, &'static str> {
         validate_buf(&buf)?;
-        Ok(Self { buf })
+        Ok(Self {
+            backing: Backing::Heap(buf),
+        })
+    }
+
+    // ── POSIX shared memory (memory-only) ────────────────────────────
+
+    /// Create a **POSIX shared-memory** table (`shm_open`).
+    ///
+    /// Memory-only: never hits disk, vanishes on reboot. Other processes
+    /// attach with [`open_shm`](Self::open_shm) using the same `name`
+    /// (normalised to a leading `/`; keep it short — macOS caps shm names
+    /// at 31 bytes). The creator unlinks the name on drop; attached
+    /// processes keep a valid mapping until they unmap.
+    ///
+    /// Fails with `AlreadyExists` if the name is taken.
+    pub fn shm(name: &str, schema: &Schema, chunk_size: u32, num_chunks: u32) -> io::Result<Self> {
+        let cname = shm_name_cstring(name)?;
+        let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize);
+
+        let file = shm_open_file(&cname, libc::O_CREAT | libc::O_EXCL | libc::O_RDWR)?;
+        file.set_len(size as u64)?;
+
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        init_buf(&mut mmap, schema, chunk_size, num_chunks);
+
+        Ok(Self {
+            backing: Backing::Shm {
+                mmap,
+                name: cname.into_string().expect("validated utf-8"),
+                unlink_on_drop: true,
+            },
+        })
+    }
+
+    /// Attach to an existing POSIX shared-memory table created by
+    /// [`shm`](Self::shm) (validates the MEMT layout).
+    ///
+    /// The returned handle does **not** unlink the name on drop.
+    pub fn open_shm(name: &str) -> io::Result<Self> {
+        let cname = shm_name_cstring(name)?;
+        let file = shm_open_file(&cname, libc::O_RDWR)?;
+
+        let mmap = unsafe { MmapMut::map_mut(&file)? };
+        validate_buf(&mmap).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        Ok(Self {
+            backing: Backing::Shm {
+                mmap,
+                name: cname.into_string().expect("validated utf-8"),
+                unlink_on_drop: false,
+            },
+        })
+    }
+
+    // ── mmap'd file (disk-backed, persistent) ────────────────────────
+
+    /// Create a table backed by an **mmap'd regular file** at `path`.
+    ///
+    /// Disk-backed and persistent: the file is **kept** on drop and can be
+    /// reopened later with [`open_file`](Self::open_file) — including
+    /// after a process crash or reboot. Truncates any existing file.
+    pub fn file_at(
+        path: impl AsRef<Path>,
+        schema: &Schema,
+        chunk_size: u32,
+        num_chunks: u32,
+    ) -> io::Result<Self> {
+        let path = path.as_ref().to_path_buf();
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent)?;
+        }
+        let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize);
+
+        let file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(&path)?;
+        file.set_len(size as u64)?;
+
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        init_buf(&mut mmap, schema, chunk_size, num_chunks);
+
+        Ok(Self {
+            backing: Backing::File {
+                mmap,
+                path,
+                dir: None,
+                unlink_on_drop: false,
+            },
+        })
+    }
+
+    /// Reopen an existing mmap'd-file table read-write (validates the
+    /// MEMT layout). The file is kept on drop.
+    pub fn open_file(path: impl AsRef<Path>) -> io::Result<Self> {
+        let path = path.as_ref().to_path_buf();
+        let file = OpenOptions::new().read(true).write(true).open(&path)?;
+
+        let mmap = unsafe { MmapMut::map_mut(&file)? };
+        validate_buf(&mmap).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+        Ok(Self {
+            backing: Backing::File {
+                mmap,
+                path,
+                dir: None,
+                unlink_on_drop: false,
+            },
+        })
+    }
+
+    // ── discoverable file (data-dir convention) ──────────────────────
+
+    /// Create a **discoverable** mmap'd-file table in the
+    /// [`default_dir`](crate::discover::default_dir), at
+    /// `<data_dir>/<pid>/<name>`.
+    ///
+    /// This is the flavour the SQL catalog and cross-process discovery
+    /// scan for. On Linux the default dir is `/dev/shm` (tmpfs), making
+    /// this shared memory with a browsable path. The file is unlinked on
+    /// drop; after a crash it stays readable until stale-pid cleanup.
+    pub fn shared(
+        name: &str,
+        schema: &Schema,
+        chunk_size: u32,
+        num_chunks: u32,
+    ) -> io::Result<Self> {
+        Self::shared_in(
+            &crate::discover::default_dir(),
+            name,
+            schema,
+            chunk_size,
+            num_chunks,
+        )
+    }
+
+    /// Like [`shared`](Self::shared), under a custom base directory
+    /// (file at `<base_dir>/<pid>/<name>`).
+    pub fn shared_in(
+        base_dir: &Path,
+        name: &str,
+        schema: &Schema,
+        chunk_size: u32,
+        num_chunks: u32,
+    ) -> io::Result<Self> {
+        let dir = base_dir.join(std::process::id().to_string());
+        std::fs::create_dir_all(&dir)?;
+
+        let path = dir.join(name);
+        let size = Self::required_size(schema, chunk_size as usize, num_chunks as usize);
+
+        let file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(&path)?;
+        file.set_len(size as u64)?;
+
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        init_buf(&mut mmap, schema, chunk_size, num_chunks);
+
+        Ok(Self {
+            backing: Backing::File {
+                mmap,
+                path,
+                dir: Some(dir),
+                unlink_on_drop: true,
+            },
+        })
+    }
+
+    // ── backing introspection ─────────────────────────────────────────
+
+    /// Which backend stores this table.
+    pub fn backing_kind(&self) -> BackingKind {
+        match &self.backing {
+            Backing::Heap(_) => BackingKind::Heap,
+            Backing::Shm { .. } => BackingKind::Shm,
+            Backing::File { .. } => BackingKind::File,
+        }
+    }
+
+    /// `true` when other processes can attach (shm or mmap'd file).
+    pub fn is_shared(&self) -> bool {
+        !matches!(self.backing, Backing::Heap(_))
+    }
+
+    /// File path of the mapping; [`None`] for heap and shm backings
+    /// (POSIX shm objects have no portable filesystem path).
+    pub fn path(&self) -> Option<&Path> {
+        match &self.backing {
+            Backing::File { path, .. } => Some(path),
+            _ => None,
+        }
+    }
+
+    /// POSIX shm name (with leading `/`); [`None`] for other backings.
+    pub fn shm_name(&self) -> Option<&str> {
+        match &self.backing {
+            Backing::Shm { name, .. } => Some(name),
+            _ => None,
+        }
     }
 
     pub fn as_bytes(&self) -> &[u8] {
-        &self.buf
+        self.backing.bytes()
     }
+
+    pub fn as_bytes_mut(&mut self) -> &mut [u8] {
+        self.backing.bytes_mut()
+    }
+
     pub fn view(&self) -> MemTableView<'_> {
-        MemTableView { buf: &self.buf }
+        MemTableView {
+            buf: self.backing.bytes(),
+        }
     }
 
     impl_table_reader!();
 
     pub fn row_writer(&mut self) -> RowWriter<'_> {
-        begin_row_writer(&mut self.buf, None)
+        begin_row_writer(self.backing.bytes_mut(), None)
     }
     pub fn append_row(&mut self, values: &[Value]) -> bool {
         assert!(
-            validate_row_schema(&self.buf, values),
+            validate_row_schema(self.backing.bytes(), values),
             "value types do not match schema"
         );
-        locked_append(&mut self.buf, values)
+        locked_append(self.backing.bytes_mut(), values)
     }
     pub fn advance_chunk(&mut self) {
-        locked_advance(&mut self.buf)
+        locked_advance(self.backing.bytes_mut())
     }
+
+    /// Append a row, auto-advancing to the next chunk when full.
+    ///
+    /// # Panic safety
+    ///
+    /// The spinlock is released even if the write panics (e.g. row exceeds
+    /// chunk capacity) — for shared tables this prevents a deadlocked mmap
+    /// file that other processes may still be reading.
     pub fn push_row(&mut self, values: &[Value]) {
         assert!(
-            validate_row_schema(&self.buf, values),
+            validate_row_schema(self.backing.bytes(), values),
             "value types do not match schema"
         );
-        locked_push(&mut self.buf, values);
+        self.push_row_unchecked(values);
     }
     pub fn push_row_unchecked(&mut self, values: &[Value]) {
-        locked_push(&mut self.buf, values);
+        let buf = self.backing.bytes_mut();
+        acquire_write_lock(buf);
+        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            push_plain_row(buf, values);
+        }));
+        release_write_lock(buf);
+        if let Err(payload) = result {
+            std::panic::resume_unwind(payload);
+        }
+    }
+}
+
+impl Drop for MemTable {
+    fn drop(&mut self) {
+        match &self.backing {
+            Backing::Heap(_) => {}
+            Backing::Shm {
+                name,
+                unlink_on_drop: true,
+                ..
+            } => {
+                if let Ok(cname) = std::ffi::CString::new(name.as_str()) {
+                    unsafe { libc::shm_unlink(cname.as_ptr()) };
+                }
+            }
+            Backing::Shm { .. } => {}
+            Backing::File {
+                path,
+                dir,
+                unlink_on_drop: true,
+                ..
+            } => {
+                let _ = std::fs::remove_file(path);
+                if let Some(dir) = dir {
+                    let _ = std::fs::remove_dir(dir); // succeeds only if empty
+                }
+            }
+            Backing::File { .. } => {}
+        }
     }
 }
 
 impl fmt::Display for MemTable {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let kind = match self.backing_kind() {
+            BackingKind::Heap => "heap",
+            BackingKind::Shm => "shm",
+            BackingKind::File => "file",
+        };
         write!(
             f,
-            "MemTable({} cols, {} chunks × {} bytes)",
+            "MemTable({kind}, {} cols, {} chunks × {} bytes)",
             self.num_cols(),
             self.num_chunks(),
             self.chunk_size()
@@ -501,7 +931,7 @@ impl fmt::Display for MemTableWriter<'_> {
 
 #[cfg(test)]
 mod tests {
-    use super::{MemTable, MemTableView, MemTableWriter};
+    use super::{BackingKind, MemTable, MemTableView, MemTableWriter};
     use crate::layout::{col_desc, header, header_mut, MAGIC, VERSION};
     use crate::raw::init_buf;
     use crate::refcount::{acquire_ref, refcount, release_ref};
@@ -625,8 +1055,8 @@ mod tests {
     #[test]
     fn append_row_returns_false_when_full() {
         let schema = Schema::new().col("x", DType::I64);
-        // ChunkHeader=24, each I64 row=12 → 48-24=24 data bytes → 2 rows fit
-        let mut t = MemTable::new(&schema, 48, 1);
+        // ChunkHeader=40, each I64 row=12 → 64-40=24 data bytes → 2 rows fit
+        let mut t = MemTable::new(&schema, 64, 1);
         assert!(t.append_row(&[Value::I64(1)]));
         assert!(t.append_row(&[Value::I64(2)]));
         assert!(!t.append_row(&[Value::I64(3)]));
@@ -636,8 +1066,8 @@ mod tests {
     #[test]
     fn ring_buffer_wrap() {
         let schema = Schema::new().col("v", DType::I32);
-        // ChunkHeader=24, each I32 row=8 → 80-24=56 data bytes → 7 rows fit
-        let mut t = MemTable::new(&schema, 80, 3);
+        // ChunkHeader=40, each I32 row=8 → 96-40=56 data bytes → 7 rows fit
+        let mut t = MemTable::new(&schema, 96, 3);
         for i in 0..7 {
             t.push_row(&[Value::I32(i)]);
         }
@@ -652,6 +1082,197 @@ mod tests {
         assert_eq!(t.rows(0).next().unwrap().col_i32(0), 213);
     }
 
+    #[test]
+    fn heap_backing_is_private() {
+        let schema = Schema::new().col("x", DType::I32);
+        let mut t = MemTable::new(&schema, 1024, 2);
+        assert!(!t.is_shared());
+        assert_eq!(t.backing_kind(), BackingKind::Heap);
+        assert!(t.path().is_none());
+        assert!(t.shm_name().is_none());
+        t.push_row(&[Value::I32(7)]);
+        assert_eq!(t.rows(0).next().unwrap().col_i32(0), 7);
+    }
+
+    #[test]
+    fn shm_backing_roundtrip_and_unlink() {
+        // Short name: macOS caps shm names at 31 bytes.
+        let name = format!("/pbg_t{}", std::process::id() % 1_000_000);
+        // In case a previous failed run leaked the name.
+        if let Ok(c) = std::ffi::CString::new(name.as_str()) {
+            unsafe { libc::shm_unlink(c.as_ptr()) };
+        }
+
+        let schema = Schema::new().col("ts", DType::I64).col("msg", DType::Str);
+        let mut creator = MemTable::shm(&name, &schema, 4096, 2).unwrap();
+        assert_eq!(creator.backing_kind(), BackingKind::Shm);
+        assert!(creator.is_shared());
+        assert!(creator.path().is_none());
+        assert_eq!(creator.shm_name(), Some(name.as_str()));
+
+        creator.push_row(&[Value::I64(1), Value::Str("alpha")]);
+
+        // Second attachment (what another process would do) sees the data…
+        let mut attached = MemTable::open_shm(&name).unwrap();
+        assert_eq!(attached.num_rows(0), 1);
+        assert_eq!(attached.rows(0).next().unwrap().col_str(1), "alpha");
+
+        // …and writes through it are visible to the creator (same memory).
+        attached.push_row(&[Value::I64(2), Value::Str("beta")]);
+        assert_eq!(creator.num_rows(0), 2);
+
+        // Name collision is rejected.
+        assert!(MemTable::shm(&name, &schema, 4096, 2).is_err());
+
+        // Creator drop unlinks the name; the attached mapping stays valid.
+        drop(creator);
+        assert!(MemTable::open_shm(&name).is_err());
+        assert_eq!(attached.num_rows(0), 2);
+    }
+
+    #[test]
+    fn file_backing_persists_across_reopen() {
+        let dir = std::env::temp_dir().join(format!(
+            "probing_mt_file_test_{}_{}",
+            std::process::id(),
+            line!()
+        ));
+        let _ = std::fs::remove_dir_all(&dir);
+        let path = dir.join("persistent.mt");
+
+        let schema = Schema::new().col("v", DType::I64);
+        {
+            let mut t = MemTable::file_at(&path, &schema, 4096, 2).unwrap();
+            assert_eq!(t.backing_kind(), BackingKind::File);
+            assert_eq!(t.path(), Some(path.as_path()));
+            t.push_row(&[Value::I64(42)]);
+        }
+        // Unlike `shared`, the file survives drop…
+        assert!(path.is_file());
+
+        // …and can be reopened read-write with data intact.
+        let mut t = MemTable::open_file(&path).unwrap();
+        assert_eq!(t.num_rows(0), 1);
+        assert_eq!(t.rows(0).next().unwrap().col_i64(0), 42);
+        t.push_row(&[Value::I64(43)]);
+        assert_eq!(t.num_rows(0), 2);
+
+        // Reopening garbage fails validation.
+        let bad = dir.join("garbage.mt");
+        std::fs::write(&bad, vec![0u8; 256]).unwrap();
+        assert!(MemTable::open_file(&bad).is_err());
+
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn shared_backing_roundtrip_and_cleanup() {
+        let base = std::env::temp_dir().join(format!(
+            "probing_mt_shared_test_{}_{}",
+            std::process::id(),
+            line!()
+        ));
+        let _ = std::fs::remove_dir_all(&base);
+
+        let schema = Schema::new().col("ts", DType::I64).col("msg", DType::Str);
+        let path = {
+            let mut t = MemTable::shared_in(&base, "shm_tbl", &schema, 4096, 2).unwrap();
+            assert!(t.is_shared());
+            let path = t.path().unwrap().to_path_buf();
+            assert!(path.is_file());
+
+            t.push_row(&[Value::I64(1), Value::Str("alpha")]);
+            t.push_row(&[Value::I64(2), Value::Str("beta")]);
+
+            // Same write/read API as the heap backing
+            assert_eq!(t.num_rows(0), 2);
+            assert_eq!(t.chunks_logical(), vec![0]);
+
+            // A fresh read of the backing file sees the data — a stand-in for
+            // the separate mapping a cross-process reader would create.
+            let bytes = std::fs::read(&path).unwrap();
+            let view = MemTableView::new(&bytes).unwrap();
+            assert_eq!(view.num_rows(0), 2);
+            let row = view.rows(0).next().unwrap();
+            assert_eq!(row.col_i64(0), 1);
+            assert_eq!(row.col_str(1), "alpha");
+
+            path
+        };
+        // Drop unlinks the file and the (now empty) <pid>/ directory.
+        assert!(!path.exists());
+        assert!(!path.parent().unwrap().exists());
+
+        let _ = std::fs::remove_dir_all(&base);
+    }
+
+    #[test]
+    fn shared_and_heap_share_write_semantics_across_wrap() {
+        let base = std::env::temp_dir().join(format!(
+            "probing_mt_shared_test_{}_{}",
+            std::process::id(),
+            line!()
+        ));
+        let _ = std::fs::remove_dir_all(&base);
+
+        let schema = Schema::new().col("v", DType::I32);
+        let mut heap = MemTable::new(&schema, 80, 3);
+        let mut shm = MemTable::shared_in(&base, "wrap_tbl", &schema, 80, 3).unwrap();
+
+        for i in 0..20 {
+            heap.push_row(&[Value::I32(i)]);
+            shm.push_row(&[Value::I32(i)]);
+        }
+
+        let collect = |t: &MemTable| -> Vec<i32> {
+            t.chunks_logical()
+                .into_iter()
+                .flat_map(|c| t.rows(c).map(|r| r.col_i32(0)).collect::<Vec<_>>())
+                .collect()
+        };
+        assert_eq!(collect(&heap), collect(&shm));
+
+        drop(shm);
+        let _ = std::fs::remove_dir_all(&base);
+    }
+
+    #[test]
+    fn chunks_logical_pre_wrap() {
+        let schema = Schema::new().col("v", DType::I32);
+        let mut t = MemTable::new(&schema, 80, 3);
+        // No data yet: only chunk 0 is Writing (gen 1) but has no rows
+        assert!(t.chunks_logical().is_empty());
+
+        t.push_row(&[Value::I32(1)]);
+        assert_eq!(t.chunks_logical(), vec![0]);
+
+        t.advance_chunk();
+        t.push_row(&[Value::I32(2)]);
+        assert_eq!(t.chunks_logical(), vec![0, 1]);
+    }
+
+    #[test]
+    fn chunks_logical_post_wrap() {
+        let schema = Schema::new().col("v", DType::I32);
+        let mut t = MemTable::new(&schema, 80, 2);
+        t.push_row(&[Value::I32(10)]); // chunk 0, gen 1
+        t.advance_chunk();
+        t.push_row(&[Value::I32(20)]); // chunk 1, gen 1
+        t.advance_chunk(); // wraps: chunk 0 recycled → gen 2, zeroed
+        t.push_row(&[Value::I32(30)]); // chunk 0, gen 2
+
+        // Logical order: oldest surviving data (chunk 1, gen 1) first,
+        // then the recycled chunk 0 (gen 2).
+        let order = t.chunks_logical();
+        assert_eq!(order, vec![1, 0]);
+
+        let values: Vec<i32> = order
+            .iter()
+            .flat_map(|&c| t.rows(c).map(|r| r.col_i32(0)).collect::<Vec<_>>())
+            .collect();
+        assert_eq!(values, vec![20, 30]);
+    }
+
     #[test]
     fn ring_buffer_with_str() {
         let schema = Schema::new().col("msg", DType::Str);
@@ -707,7 +1328,7 @@ mod tests {
     fn display_format() {
         let schema = Schema::new().col("a", DType::I32);
         let t = MemTable::new(&schema, 1024, 2);
-        assert_eq!(format!("{t}"), "MemTable(1 cols, 2 chunks × 1024 bytes)");
+        assert_eq!(format!("{t}"), "MemTable(heap, 1 cols, 2 chunks × 1024 bytes)");
     }
 
     #[test]
@@ -1111,8 +1732,8 @@ mod tests {
     #[test]
     fn chunk_generation_increments_on_wrap() {
         let schema = Schema::new().col("v", DType::I32);
-        // 80 bytes per chunk → 7 I32 rows per chunk
-        let mut t = MemTable::new(&schema, 80, 2);
+        // 96 bytes per chunk → 7 I32 rows per chunk (ChunkHeader=40)
+        let mut t = MemTable::new(&schema, 96, 2);
 
         assert_eq!(t.chunk_generation(0), 1);
         assert_eq!(t.chunk_generation(1), 0);
@@ -1350,4 +1971,173 @@ mod tests {
         }
         assert!(total > 0, "should have rows across chunks");
     }
+
+    // ── designated timestamp column / chunk ts range ──────────────────
+
+    #[test]
+    fn ts_col_detection() {
+        let t = MemTable::new(
+            &Schema::new().col("v", DType::F64).col("timestamp", DType::I64),
+            1024,
+            1,
+        );
+        assert_eq!(t.ts_col(), Some(1));
+
+        let t = MemTable::new(&Schema::new().col("ts", DType::I64), 1024, 1);
+        assert_eq!(t.ts_col(), Some(0));
+
+        // Wrong dtype or name → no designated column
+        let t = MemTable::new(&Schema::new().col("timestamp", DType::F64), 1024, 1);
+        assert_eq!(t.ts_col(), None);
+        let t = MemTable::new(&Schema::new().col("when", DType::I64), 1024, 1);
+        assert_eq!(t.ts_col(), None);
+        assert_eq!(t.chunk_ts_range(0), None);
+    }
+
+    #[test]
+    fn chunk_ts_range_tracks_min_max() {
+        let schema = Schema::new().col("ts", DType::I64).col("v", DType::I32);
+        let mut t = MemTable::new(&schema, 1024, 2);
+        assert_eq!(t.chunk_ts_range(0), None, "empty chunk has no range");
+
+        t.push_row(&[Value::I64(500), Value::I32(1)]);
+        t.push_row(&[Value::I64(100), Value::I32(2)]); // out-of-order ts
+        t.push_row(&[Value::I64(900), Value::I32(3)]);
+        assert_eq!(t.chunk_ts_range(0), Some((100, 900)));
+
+        // Advance: new chunk starts with a fresh range
+        t.advance_chunk();
+        assert_eq!(t.chunk_ts_range(1), None);
+        t.push_row(&[Value::I64(1000), Value::I32(4)]);
+        assert_eq!(t.chunk_ts_range(1), Some((1000, 1000)));
+        assert_eq!(t.chunk_ts_range(0), Some((100, 900)), "old chunk keeps range");
+    }
+
+    #[test]
+    fn chunk_ts_range_resets_on_wrap() {
+        let schema = Schema::new().col("ts", DType::I64);
+        // ChunkHeader=40, I64 row=12 → 64-40=24 → 2 rows per chunk
+        let mut t = MemTable::new(&schema, 64, 2);
+        t.push_row(&[Value::I64(10)]);
+        t.push_row(&[Value::I64(20)]);
+        t.push_row(&[Value::I64(30)]); // → chunk 1
+        t.push_row(&[Value::I64(40)]);
+        t.push_row(&[Value::I64(50)]); // wrap → chunk 0 recycled
+        assert_eq!(t.write_chunk(), 0);
+        assert_eq!(t.chunk_ts_range(0), Some((50, 50)), "recycled range resets");
+        assert_eq!(t.chunk_ts_range(1), Some((30, 40)));
+    }
+
+    #[test]
+    fn row_writer_maintains_ts_range() {
+        let schema = Schema::new().col("timestamp", DType::I64).col("m", DType::Str);
+        let mut t = MemTable::new(&schema, 4096, 1);
+        t.row_writer().put_i64(300).put_str("a").finish();
+        t.row_writer().put_i64(100).put_str("b").finish();
+        assert_eq!(t.chunk_ts_range(0), Some((100, 300)));
+    }
+
+    #[test]
+    fn dedup_writer_maintains_ts_range() {
+        let schema = Schema::new().col("ts", DType::I64).col("tag", DType::Str);
+        let size = MemTable::required_size(&schema, 4096, 1);
+        let mut buf = vec![0u8; size];
+        let mut w = MemTableWriter::init(&mut buf, &schema, 4096, 1).dedup();
+        w.push_row(&[Value::I64(7), Value::Str("x")]);
+        w.push_row(&[Value::I64(3), Value::Str("x")]);
+        assert_eq!(w.chunk_ts_range(0), Some((3, 7)));
+    }
+
+    #[test]
+    fn validate_rejects_bad_ts_col() {
+        let schema = Schema::new().col("ts", DType::I64).col("v", DType::F64);
+        let mut t = MemTable::new(&schema, 1024, 1);
+        header_mut(t.as_bytes_mut()).ts_col = 3; // out of range (2 cols)
+        assert!(MemTableView::new(t.as_bytes()).is_err());
+        header_mut(t.as_bytes_mut()).ts_col = 2; // col 1 is F64, not I64
+        assert!(MemTableView::new(t.as_bytes()).is_err());
+        header_mut(t.as_bytes_mut()).ts_col = 1; // col 0 is I64 → ok
+        assert!(MemTableView::new(t.as_bytes()).is_ok());
+    }
+
+    // ── robust write lock ──────────────────────────────────────────────
+
+    /// PID of a process that no longer exists: spawn a short-lived child
+    /// and wait for it to exit.
+    fn dead_pid() -> u32 {
+        let mut child = std::process::Command::new("true")
+            .spawn()
+            .expect("spawn true");
+        let pid = child.id();
+        child.wait().expect("wait true");
+        pid
+    }
+
+    #[test]
+    fn lock_word_holds_pid_while_held() {
+        let schema = Schema::new().col("x", DType::I32);
+        let mut t = MemTable::new(&schema, 1024, 1);
+        let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize;
+        let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) };
+        {
+            let _w = t.row_writer(); // holds the lock
+            assert_eq!(
+                lock.load(Ordering::Relaxed),
+                std::process::id(),
+                "lock word must hold the owner PID"
+            );
+        }
+        assert_eq!(lock.load(Ordering::Relaxed), 0);
+    }
+
+    #[test]
+    fn stale_lock_from_dead_process_is_stolen() {
+        let schema = Schema::new().col("x", DType::I32);
+        let mut t = MemTable::new(&schema, 1024, 1);
+
+        // Simulate a writer that crashed inside the critical section.
+        header(t.as_bytes())
+            .write_lock
+            .store(dead_pid(), Ordering::SeqCst);
+
+        let start = std::time::Instant::now();
+        t.push_row(&[Value::I32(42)]); // must not deadlock
+        let took = start.elapsed();
+
+        assert_eq!(t.rows(0).next().unwrap().col_i32(0), 42);
+        assert_eq!(header(t.as_bytes()).write_lock.load(Ordering::Relaxed), 0);
+        assert!(
+            took >= crate::layout::LOCK_STEAL_TIMEOUT,
+            "steal must wait out the timeout first (took {took:?})"
+        );
+    }
+
+    #[test]
+    fn live_holder_is_not_preempted() {
+        let schema = Schema::new().col("x", DType::I32);
+        let mut t = MemTable::new(&schema, 1024, 1);
+
+        // Another thread of this (alive) process holds the lock and
+        // releases it well past the steal timeout.
+        let me = std::process::id();
+        header(t.as_bytes()).write_lock.store(me, Ordering::SeqCst);
+        let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize;
+        let hold = crate::layout::LOCK_STEAL_TIMEOUT + std::time::Duration::from_millis(200);
+        let releaser = std::thread::spawn(move || {
+            std::thread::sleep(hold);
+            let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) };
+            lock.store(0, Ordering::Release);
+        });
+
+        let start = std::time::Instant::now();
+        t.push_row(&[Value::I32(7)]);
+        let took = start.elapsed();
+        releaser.join().unwrap();
+
+        assert!(
+            took >= hold - std::time::Duration::from_millis(50),
+            "live holder must be waited on, not preempted (took {took:?})"
+        );
+        assert_eq!(t.rows(0).next().unwrap().col_i32(0), 7);
+    }
 }
diff --git a/probing/memtable/src/raw.rs b/probing/memtable/src/raw.rs
index 0c27050b..9eb61272 100644
--- a/probing/memtable/src/raw.rs
+++ b/probing/memtable/src/raw.rs
@@ -1,12 +1,42 @@
 use crate::layout::{
     chunk_header, col_desc, col_desc_mut, compute_data_offset, header, header_mut, r32, w32,
     ChunkHeader, ChunkState, Header, BYTE_ORDER_MARK, CHUNK_HEADER_SIZE, FLAGS_KNOWN, FLAG_DEDUP,
-    MAGIC, VERSION,
+    MAGIC, TS_MAX_INIT, TS_MIN_INIT, VERSION,
 };
 use crate::schema::{DType, Schema, Value};
 use std::mem;
 use std::sync::atomic::Ordering;
 
+/// Column names recognised as the designated timestamp column (must be
+/// `I64`). Matched at [`init_buf`] time and recorded in `Header::ts_col`.
+pub(crate) const TS_COL_NAMES: [&str; 2] = ["timestamp", "ts"];
+
+/// Fold a committed row's timestamp into the chunk's `min_ts`/`max_ts`.
+///
+/// Called by the (single, lock-holding) writer **before** the `used`
+/// Release store that publishes the row, so any reader that observes the
+/// row also observes a covering ts range.
+pub(crate) fn note_row_ts(ch: &ChunkHeader, ts: i64) {
+    if ts < ch.min_ts.load(Ordering::Relaxed) {
+        ch.min_ts.store(ts, Ordering::Relaxed);
+    }
+    if ts > ch.max_ts.load(Ordering::Relaxed) {
+        ch.max_ts.store(ts, Ordering::Relaxed);
+    }
+}
+
+/// Extract the designated timestamp from a row, per `Header::ts_col`.
+#[inline]
+pub(crate) fn row_ts(h: &Header, values: &[Value]) -> Option<i64> {
+    match h.ts_col as usize {
+        0 => None,
+        idx => match values.get(idx - 1) {
+            Some(Value::I64(ts)) => Some(*ts),
+            _ => None,
+        },
+    }
+}
+
 /// Returns the kernel-reported start time of a process.
 ///
 /// Used to populate [`Header::creator_start_time`] and to verify liveness
@@ -74,6 +104,9 @@ pub(crate) fn write_row_bytes(buf: &mut [u8], values: &[Value], row_data: usize)
     }
     unsafe {
         let ch = &*(ptr.add(cs) as *const ChunkHeader);
+        if let Some(ts) = row_ts(&*(ptr as *const Header), values) {
+            note_row_ts(ch, ts);
+        }
         ch.used.store((used + total) as u32, Ordering::Release);
         ch.row_count.fetch_add(1, Ordering::Release);
     }
@@ -104,6 +137,8 @@ pub(crate) fn advance_chunk_unlocked(buf: &mut [u8]) {
         let new_ch = &*(ptr.add(cs) as *const ChunkHeader);
         new_ch.used.store(0, Ordering::Relaxed);
         new_ch.row_count.store(0, Ordering::Relaxed);
+        new_ch.min_ts.store(TS_MIN_INIT, Ordering::Relaxed);
+        new_ch.max_ts.store(TS_MAX_INIT, Ordering::Relaxed);
         new_ch
             .state
             .store(ChunkState::Writing as u32, Ordering::Relaxed);
@@ -222,6 +257,15 @@ pub fn validate_buf(buf: &[u8]) -> Result<(), &'static str> {
             return Err("invalid column dtype");
         }
     }
+    let ts_col = h.ts_col as usize;
+    if ts_col != 0 {
+        if ts_col > nc {
+            return Err("ts_col out of range");
+        }
+        if DType::from_u32(col_desc(buf, ts_col - 1).dtype) != Some(DType::I64) {
+            return Err("ts_col must reference an I64 column");
+        }
+    }
     let payload_cap = csz - CHUNK_HEADER_SIZE;
     for i in 0..h.num_chunks as usize {
         let cs = expected_off + i * csz;
@@ -294,12 +338,21 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu
         CHUNK_HEADER_SIZE + 8
     );
 
+    // First I64 column with a recognised timestamp name becomes the
+    // designated time column (index + 1; 0 = none).
+    let ts_col = schema
+        .cols
+        .iter()
+        .position(|c| c.dtype == DType::I64 && TS_COL_NAMES.contains(&c.name.as_str()))
+        .map(|i| (i + 1) as u16)
+        .unwrap_or(0);
+
     let h = header_mut(buf);
     h.magic = MAGIC;
     h.version = VERSION;
     h.header_size = mem::size_of::<Header>() as u16;
     h.byte_order = u16::from_ne_bytes(BYTE_ORDER_MARK);
-    h._pad0 = 0;
+    h.ts_col = ts_col;
     h.flags = 0;
     h.num_cols = nc as u32;
     h.num_chunks = num_chunks;
@@ -310,7 +363,7 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu
     h.refcount.store(1, Ordering::Relaxed);
     h.creator_pid = std::process::id();
     h.creator_start_time = process_start_time(std::process::id());
-    h._reserved = [0; 2];
+    h.lock_owner_start.store(0, Ordering::Relaxed);
 
     for (i, col) in schema.cols.iter().enumerate() {
         let cd = col_desc_mut(buf, i);
@@ -326,6 +379,8 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu
         ch.generation.store(0, Ordering::Relaxed);
         ch.used.store(0, Ordering::Relaxed);
         ch.row_count.store(0, Ordering::Relaxed);
+        ch.min_ts.store(TS_MIN_INIT, Ordering::Relaxed);
+        ch.max_ts.store(TS_MAX_INIT, Ordering::Relaxed);
         ch.state.store(ChunkState::Empty as u32, Ordering::Relaxed);
     }
     // Chunk 0 is the initial write target
diff --git a/probing/memtable/src/writer.rs b/probing/memtable/src/writer.rs
index f514f6ae..1a4cb428 100644
--- a/probing/memtable/src/writer.rs
+++ b/probing/memtable/src/writer.rs
@@ -1,5 +1,6 @@
 use crate::dedup::DedupState;
 use crate::layout::{chunk_header, release_write_lock, w32, CHUNK_HEADER_SIZE};
+use crate::raw::note_row_ts;
 use std::sync::atomic::Ordering;
 
 /// Streaming row writer — **low-overhead, weak-contract** hot-path API.
@@ -25,6 +26,11 @@ pub struct RowWriter<'a> {
     pub(crate) done: bool,
     pub(crate) col_idx: usize,
     pub(crate) locked: bool,
+    /// `Header::ts_col` (timestamp column index + 1; 0 = none).
+    pub(crate) ts_col: u16,
+    /// Timestamp captured by `put_i64` on the designated column,
+    /// folded into the chunk's min/max on a successful `finish()`.
+    pub(crate) pending_ts: Option<i64>,
 }
 
 impl<'a> RowWriter<'a> {
@@ -84,6 +90,9 @@ impl<'a> RowWriter<'a> {
         self
     }
     pub fn put_i64(&mut self, v: i64) -> &mut Self {
+        if self.ts_col as usize == self.col_idx + 1 {
+            self.pending_ts = Some(v);
+        }
         self.write_raw(&v.to_le_bytes());
         self.col_idx += 1;
         self
@@ -135,6 +144,9 @@ impl<'a> RowWriter<'a> {
             let row_data = self.pos - self.row_start - 4;
             w32(self.buf, self.row_start, row_data as u32);
             let new_used = (self.pos - self.chunk_start - CHUNK_HEADER_SIZE) as u32;
+            if let Some(ts) = self.pending_ts {
+                note_row_ts(chunk_header(self.buf, self.chunk_start), ts);
+            }
             chunk_header(self.buf, self.chunk_start)
                 .used
                 .store(new_used, Ordering::Release);
@@ -197,8 +209,8 @@ mod tests {
     #[test]
     fn row_writer_overflow() {
         let schema = Schema::new().col("x", DType::I64);
-        // ChunkHeader=24, each I64 row=12 → 40-24=16 → 1 row fits, 2nd overflows
-        let mut t = MemTable::new(&schema, 40, 1);
+        // ChunkHeader=40, each I64 row=12 → 56-40=16 → 1 row fits, 2nd overflows
+        let mut t = MemTable::new(&schema, 56, 1);
         assert!(t.row_writer().put_i64(1).finish());
         assert!(!t.row_writer().put_i64(2).finish());
         assert_eq!(t.num_rows(0), 1);
diff --git a/probing/server/Cargo.toml b/probing/server/Cargo.toml
index e2f5fe85..314e3835 100644
--- a/probing/server/Cargo.toml
+++ b/probing/server/Cargo.toml
@@ -17,7 +17,7 @@ probing-python = { path = "../extensions/python", default-features = false }
 probing-proto = { path = "../proto" }
 probing-core = { path = "../core" }
 
-datafusion = { version = "47.0.0", default-features = false }
+datafusion = { workspace = true }
 
 anyhow = { workspace = true }
 log = { workspace = true }
diff --git a/probing/server/src/engine.rs b/probing/server/src/engine.rs
index 6f15752e..4c3749eb 100644
--- a/probing/server/src/engine.rs
+++ b/probing/server/src/engine.rs
@@ -30,7 +30,10 @@ pub async fn initialize_engine() -> Result<()> {
     #[cfg(target_os = "linux")]
     let builder = builder.with_extension(cc::TaskStatsExtension::default(), "rdma", Some("flow"));
 
-    probing_core::initialize_engine(builder).await
+    let result = probing_core::initialize_engine(builder).await;
+    // Opt-in background hot→cold compaction (PROBING_COLD=on / SET memtable.cold_compaction).
+    crate::memtable_ext::start_cold_compaction_from_env();
+    result
 }
 
 pub async fn handle_query(request: Query) -> Result<QueryDataFormat> {
diff --git a/probing/server/src/memtable_ext.rs b/probing/server/src/memtable_ext.rs
index 8c49efdf..fa27daa6 100644
--- a/probing/server/src/memtable_ext.rs
+++ b/probing/server/src/memtable_ext.rs
@@ -1,698 +1,8 @@
-//! Mmap memtable integration for DataFusion.
+//! Mmap memtable ↔ SQL integration.
 //!
-//! ## File → SQL mapping (no hard-coded product prefix)
-//!
-//! Each regular file under `<data_dir>/<pid>/` can be queried when its name is valid:
-//!
-//! - **First `.` splits schema vs table** — `acme.actors` → schema `acme`, table `actors`;
-//!   `foo.bar.baz` → schema `foo`, table `bar.baz` (on-disk name is the full filename).
-//! - **No `.`** — exposed as `memtable.<filename>` (e.g. `metrics` → `memtable.metrics`).
-//!
-//! Schema head and table tail must be non-empty; only ASCII letters, digits, `_`, and
-//! `.` inside the table tail are allowed (no `/`, `\\`). Leading-dot names are ignored.
-use std::any::Any;
-use std::collections::BTreeSet;
-use std::sync::Arc;
-
-use async_trait::async_trait;
-use datafusion::arrow::array::{
-    ArrayRef, BinaryBuilder, Float32Builder, Float64Builder, GenericStringBuilder, Int32Builder,
-    Int64Builder, RecordBatch, UInt32Builder, UInt64Builder, UInt8Builder,
-};
-use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion::catalog::CatalogProvider;
-use datafusion::catalog::SchemaProvider;
-use datafusion::datasource::TableProvider;
-use datafusion::error::DataFusionError;
-use datafusion::error::Result as DfResult;
-
-use probing_core::core::{
-    EngineCall, EngineDatasource, EngineError, EngineExtension, EngineExtensionOption,
-    LazyTableSource, Plugin, PluginType,
-};
-use probing_memtable::discover::default_dir;
-use probing_memtable::{detect_table, DType, MemTableView, MemhView, TableKind, TypedValue};
-
-/// SQL schema used for mmap files whose basename contains no `.`.
-pub const DEFAULT_UNDOTTED_SCHEMA: &str = "memtable";
-
-fn self_dir() -> std::path::PathBuf {
-    default_dir().join(std::process::id().to_string())
-}
-
-#[inline]
-fn valid_schema_head(s: &str) -> bool {
-    !s.is_empty() && s.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'_')
-}
-
-#[inline]
-fn valid_table_tail(s: &str) -> bool {
-    !s.is_empty()
-        && !s.contains('/')
-        && !s.contains('\\')
-        && s.bytes()
-            .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.')
-}
-
-/// Map basename `filename` → `(schema, table)` for routing; [`None`] if skipped.
-pub fn classify_mmap_basename(filename: &str) -> Option<(String, String)> {
-    if filename.starts_with('.') {
-        return None;
-    }
-    if let Some((head, tail)) = filename.split_once('.') {
-        if valid_schema_head(head) && valid_table_tail(tail) {
-            return Some((head.to_string(), tail.to_string()));
-        }
-        return None;
-    }
-    if valid_schema_head(filename) {
-        Some((DEFAULT_UNDOTTED_SCHEMA.to_string(), filename.to_string()))
-    } else {
-        None
-    }
-}
-
-/// On-disk filename for a `(schema, table)` pair.
-pub fn mmap_filename_for(schema: &str, table: &str) -> String {
-    if schema == DEFAULT_UNDOTTED_SCHEMA {
-        table.to_string()
-    } else {
-        format!("{schema}.{table}")
-    }
-}
-
-fn tables_in_schema(target_schema: &str) -> Vec<String> {
-    let dir = self_dir();
-    let Ok(entries) = std::fs::read_dir(&dir) else {
-        return vec![];
-    };
-    let mut out = Vec::new();
-    for e in entries.flatten() {
-        if !e.path().is_file() {
-            continue;
-        }
-        let n = e.file_name().to_string_lossy().to_string();
-        if let Some((sch, tbl)) = classify_mmap_basename(&n) {
-            if sch == target_schema {
-                out.push(tbl);
-            }
-        }
-    }
-    out.sort();
-    out.dedup();
-    out
-}
-
-fn discover_all_schemas() -> BTreeSet<String> {
-    let mut out = BTreeSet::new();
-    let dir = self_dir();
-    if let Ok(entries) = std::fs::read_dir(&dir) {
-        for e in entries.flatten() {
-            if !e.path().is_file() {
-                continue;
-            }
-            let n = e.file_name().to_string_lossy().to_string();
-            if let Some((sch, _)) = classify_mmap_basename(&n) {
-                out.insert(sch);
-            }
-        }
-    }
-    out.insert(DEFAULT_UNDOTTED_SCHEMA.to_string());
-    out
-}
-
-fn bytes_to_lazy_table(data: &[u8], logical_name: &str) -> Arc<LazyTableSource> {
-    match detect_table(data) {
-        Some(TableKind::Ring) => {
-            let view = match MemTableView::new(data) {
-                Ok(v) => v,
-                Err(_) => return Arc::new(LazyTableSource::default()),
-            };
-            Arc::new(LazyTableSource {
-                name: logical_name.to_string(),
-                schema: Some(view_to_arrow_schema(&view)),
-                data: view_to_recordbatch(&view),
-            })
-        }
-        Some(TableKind::Hash) => {
-            let view = match MemhView::new(data) {
-                Ok(v) => v,
-                Err(_) => return Arc::new(LazyTableSource::default()),
-            };
-            Arc::new(LazyTableSource {
-                name: logical_name.to_string(),
-                schema: Some(memh_kv_schema()),
-                data: memh_view_to_recordbatch(&view),
-            })
-        }
-        None => Arc::new(LazyTableSource::default()),
-    }
-}
-
-fn dtype_to_arrow(dt: DType) -> DataType {
-    match dt {
-        DType::U8 => DataType::UInt8,
-        DType::U32 => DataType::UInt32,
-        DType::I32 => DataType::Int32,
-        DType::I64 => DataType::Int64,
-        DType::F32 => DataType::Float32,
-        DType::F64 => DataType::Float64,
-        DType::U64 => DataType::UInt64,
-        DType::Str => DataType::Utf8,
-        DType::Bytes => DataType::Binary,
-    }
-}
-
-fn view_to_arrow_schema(view: &MemTableView) -> SchemaRef {
-    let s = view.schema();
-    let fields: Vec<Field> = s
-        .cols
-        .iter()
-        .map(|c| Field::new(&c.name, dtype_to_arrow(c.dtype), true))
-        .collect();
-    SchemaRef::new(Schema::new(fields))
-}
-
-enum ColBuilder {
-    U8(UInt8Builder),
-    U32(UInt32Builder),
-    I32(Int32Builder),
-    I64(Int64Builder),
-    F32(Float32Builder),
-    F64(Float64Builder),
-    U64(UInt64Builder),
-    Str(GenericStringBuilder<i32>),
-    Bytes(BinaryBuilder),
-}
-
-fn view_to_recordbatch(view: &MemTableView) -> Vec<RecordBatch> {
-    let schema = view.schema();
-    let arrow_schema = view_to_arrow_schema(view);
-
-    let mut builders: Vec<ColBuilder> = schema
-        .cols
-        .iter()
-        .map(|c| match c.dtype {
-            DType::U8 => ColBuilder::U8(UInt8Builder::new()),
-            DType::U32 => ColBuilder::U32(UInt32Builder::new()),
-            DType::I32 => ColBuilder::I32(Int32Builder::new()),
-            DType::I64 => ColBuilder::I64(Int64Builder::new()),
-            DType::F32 => ColBuilder::F32(Float32Builder::new()),
-            DType::F64 => ColBuilder::F64(Float64Builder::new()),
-            DType::U64 => ColBuilder::U64(UInt64Builder::new()),
-            DType::Str => ColBuilder::Str(GenericStringBuilder::new()),
-            DType::Bytes => ColBuilder::Bytes(BinaryBuilder::new()),
-        })
-        .collect();
-
-    for chunk in 0..view.num_chunks() {
-        for row in view.rows(chunk) {
-            let mut cursor = row.cursor();
-            for builder in builders.iter_mut() {
-                match builder {
-                    ColBuilder::U8(b) => b.append_value(cursor.next_u8()),
-                    ColBuilder::U32(b) => b.append_value(cursor.next_u32()),
-                    ColBuilder::I32(b) => b.append_value(cursor.next_i32()),
-                    ColBuilder::I64(b) => b.append_value(cursor.next_i64()),
-                    ColBuilder::F32(b) => b.append_value(cursor.next_f32()),
-                    ColBuilder::F64(b) => b.append_value(cursor.next_f64()),
-                    ColBuilder::U64(b) => b.append_value(cursor.next_u64()),
-                    ColBuilder::Str(b) => b.append_value(cursor.next_str()),
-                    ColBuilder::Bytes(b) => b.append_value(cursor.next_bytes()),
-                }
-            }
-        }
-    }
-
-    let arrays: Vec<ArrayRef> = builders
-        .into_iter()
-        .map(|b| -> ArrayRef {
-            match b {
-                ColBuilder::U8(mut b) => Arc::new(b.finish()),
-                ColBuilder::U32(mut b) => Arc::new(b.finish()),
-                ColBuilder::I32(mut b) => Arc::new(b.finish()),
-                ColBuilder::I64(mut b) => Arc::new(b.finish()),
-                ColBuilder::F32(mut b) => Arc::new(b.finish()),
-                ColBuilder::F64(mut b) => Arc::new(b.finish()),
-                ColBuilder::U64(mut b) => Arc::new(b.finish()),
-                ColBuilder::Str(mut b) => Arc::new(b.finish()),
-                ColBuilder::Bytes(mut b) => Arc::new(b.finish()),
-            }
-        })
-        .collect();
-
-    match RecordBatch::try_new(arrow_schema, arrays) {
-        Ok(batch) => vec![batch],
-        Err(e) => {
-            log::error!("memtable → RecordBatch failed: {e}");
-            vec![]
-        }
-    }
-}
-
-// ── MEMH: key-value table → two-column RecordBatch ────────────────────
-
-/// Fixed Arrow schema for MEMH tables: `key` (Utf8) + `value` (Utf8).
-///
-/// All MEMH values are serialised to strings so that heterogeneous value types
-/// (scalars, strings, bytes) can be represented in a single column and queried
-/// with SQL string predicates.
-fn memh_kv_schema() -> SchemaRef {
-    Arc::new(Schema::new(vec![
-        Field::new("key", DataType::Utf8, false),
-        Field::new("value", DataType::Utf8, true),
-    ]))
-}
-
-fn typed_value_to_str(v: &TypedValue<'_>) -> String {
-    match v {
-        TypedValue::U8(n) => n.to_string(),
-        TypedValue::I32(n) => n.to_string(),
-        TypedValue::I64(n) => n.to_string(),
-        TypedValue::F32(n) => n.to_string(),
-        TypedValue::F64(n) => n.to_string(),
-        TypedValue::U64(n) => n.to_string(),
-        TypedValue::U32(n) => n.to_string(),
-        TypedValue::Str(s) => s.to_string(),
-        TypedValue::Bytes(b) => {
-            // Hex-encode without adding a dep; e.g. "0xdeadbeef"
-            let mut out = String::with_capacity(2 + b.len() * 2);
-            out.push_str("0x");
-            for byte in *b {
-                use std::fmt::Write;
-                let _ = write!(out, "{byte:02x}");
-            }
-            out
-        }
-    }
-}
-
-fn memh_view_to_recordbatch(view: &MemhView<'_>) -> Vec<RecordBatch> {
-    let schema = memh_kv_schema();
-    let mut keys: GenericStringBuilder<i32> = GenericStringBuilder::new();
-    let mut values: GenericStringBuilder<i32> = GenericStringBuilder::new();
-
-    for (k, v) in view.iter() {
-        keys.append_value(k);
-        values.append_value(typed_value_to_str(&v));
-    }
-
-    match RecordBatch::try_new(
-        schema,
-        vec![Arc::new(keys.finish()), Arc::new(values.finish())],
-    ) {
-        Ok(batch) => vec![batch],
-        Err(e) => {
-            log::error!("memh → RecordBatch failed: {e}");
-            vec![]
-        }
-    }
-}
-
-// ── Dynamic schemas from mmap filenames ───────────────────────────────
-
-/// One DataFusion schema: tables are mmap files whose basename maps here via
-/// [`classify_mmap_basename`].
-#[derive(Debug)]
-pub struct MmapFileSchemaProvider {
-    schema: String,
-}
-
-impl MmapFileSchemaProvider {
-    pub fn new(schema: impl Into<String>) -> Self {
-        Self {
-            schema: schema.into(),
-        }
-    }
-}
-
-#[async_trait]
-impl SchemaProvider for MmapFileSchemaProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn table_names(&self) -> Vec<String> {
-        tables_in_schema(&self.schema)
-    }
-
-    async fn table(&self, name: &str) -> DfResult<Option<Arc<dyn TableProvider>>> {
-        let names = self.table_names();
-        if !names.iter().any(|n| n == name) {
-            return Ok(None);
-        }
-        let path = self_dir().join(mmap_filename_for(&self.schema, name));
-        let data = match std::fs::read(&path) {
-            Ok(d) => d,
-            Err(_) => return Ok(None),
-        };
-        Ok(Some(bytes_to_lazy_table(&data, name)))
-    }
-
-    fn register_table(
-        &self,
-        _name: String,
-        _table: Arc<dyn TableProvider>,
-    ) -> DfResult<Option<Arc<dyn TableProvider>>> {
-        Err(DataFusionError::NotImplemented(
-            "unable to create tables".to_string(),
-        ))
-    }
-
-    fn deregister_table(&self, _name: &str) -> DfResult<Option<Arc<dyn TableProvider>>> {
-        Err(DataFusionError::NotImplemented(
-            "unable to drop tables".to_string(),
-        ))
-    }
-
-    fn table_exist(&self, name: &str) -> bool {
-        self.table_names().iter().any(|n| n == name)
-    }
-}
-
-/// Wraps `probe` catalog; delegates static schemas (python, cluster, …)
-/// to inner, discovers mmap-backed schemas (e.g. `pulsing.*`) at query time.
-#[derive(Debug)]
-struct DynamicMmapCatalog {
-    inner: Arc<dyn CatalogProvider>,
-}
-
-impl CatalogProvider for DynamicMmapCatalog {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema_names(&self) -> Vec<String> {
-        let mut names: BTreeSet<String> = self.inner.schema_names().into_iter().collect();
-        for sch in discover_all_schemas() {
-            names.insert(sch);
-        }
-        names.into_iter().collect()
-    }
-
-    fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
-        if !tables_in_schema(name).is_empty() || name == DEFAULT_UNDOTTED_SCHEMA {
-            return Some(Arc::new(MmapFileSchemaProvider::new(name)));
-        }
-        self.inner.schema(name)
-    }
-
-    fn register_schema(
-        &self,
-        name: &str,
-        schema: Arc<dyn SchemaProvider>,
-    ) -> DfResult<Option<Arc<dyn SchemaProvider>>> {
-        self.inner.register_schema(name, schema)
-    }
-}
-
-/// Namespace plugin that wraps the `probe` catalog with [`DynamicMmapCatalog`]
-/// for dynamic schema discovery from mmap files at query time.
-#[derive(Debug, Default)]
-pub struct UnifiedMemtablePlugin;
-
-impl Plugin for UnifiedMemtablePlugin {
-    fn name(&self) -> String {
-        "mmap_memtables".into()
-    }
-    fn kind(&self) -> PluginType {
-        PluginType::Namespace
-    }
-    fn namespace(&self) -> String {
-        "memtable".into()
-    }
-
-    fn provide_catalog(&self, inner: Arc<dyn CatalogProvider>) -> Option<Arc<dyn CatalogProvider>> {
-        Some(Arc::new(DynamicMmapCatalog { inner }))
-    }
-}
-
-// ── EngineExtension ────────────────────────────────────────────────────
-
-#[derive(Debug, Default, EngineExtension)]
-pub struct MemTableExtension {}
-
-impl EngineCall for MemTableExtension {}
-
-impl EngineDatasource for MemTableExtension {
-    fn datasrc(
-        &self,
-        _namespace: &str,
-        _name: Option<&str>,
-    ) -> Option<Arc<dyn Plugin + Sync + Send>> {
-        Some(Arc::new(UnifiedMemtablePlugin::default()))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use datafusion::arrow::array::{AsArray, Float64Array, Int32Array, Int64Array, UInt8Array};
-    use probing_memtable::{MemTable, Schema as MtSchema, Value};
-    use std::sync::Mutex;
-
-    /// `PROBING_DATA_DIR` is process-global; serialize tests that mutate it.
-    static PROBING_DATA_DIR_LOCK: Mutex<()> = Mutex::new(());
-
-    #[test]
-    fn dtype_mapping_covers_all_variants() {
-        assert_eq!(dtype_to_arrow(DType::U8), DataType::UInt8);
-        assert_eq!(dtype_to_arrow(DType::U32), DataType::UInt32);
-        assert_eq!(dtype_to_arrow(DType::I32), DataType::Int32);
-        assert_eq!(dtype_to_arrow(DType::I64), DataType::Int64);
-        assert_eq!(dtype_to_arrow(DType::F32), DataType::Float32);
-        assert_eq!(dtype_to_arrow(DType::F64), DataType::Float64);
-        assert_eq!(dtype_to_arrow(DType::U64), DataType::UInt64);
-        assert_eq!(dtype_to_arrow(DType::Str), DataType::Utf8);
-        assert_eq!(dtype_to_arrow(DType::Bytes), DataType::Binary);
-    }
-
-    #[test]
-    fn recordbatch_from_mixed_types() {
-        let schema = MtSchema::new()
-            .col("id", DType::I32)
-            .col("value", DType::F64)
-            .col("tag", DType::Str);
-        let mut t = MemTable::new(&schema, 4096, 2);
-        t.push_row(&[Value::I32(1), Value::F64(3.14), Value::Str("hello")]);
-        t.push_row(&[Value::I32(2), Value::F64(2.72), Value::Str("world")]);
-
-        let view = t.view();
-        let batches = view_to_recordbatch(&view);
-        assert_eq!(batches.len(), 1);
-        let batch = &batches[0];
-        assert_eq!(batch.num_rows(), 2);
-        assert_eq!(batch.num_columns(), 3);
-
-        let ids = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int32Array>()
-            .unwrap();
-        assert_eq!(ids.value(0), 1);
-        assert_eq!(ids.value(1), 2);
-
-        let vals = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<Float64Array>()
-            .unwrap();
-        assert!((vals.value(0) - 3.14).abs() < 1e-10);
-        assert!((vals.value(1) - 2.72).abs() < 1e-10);
-
-        let tags: &datafusion::arrow::array::StringArray = batch.column(2).as_string();
-        assert_eq!(tags.value(0), "hello");
-        assert_eq!(tags.value(1), "world");
-    }
-
-    #[test]
-    fn recordbatch_multiple_chunks() {
-        let schema = MtSchema::new().col("v", DType::I64);
-        // Small chunk so rows spill across chunks
-        let mut t = MemTable::new(&schema, 128, 4);
-        for i in 0..20 {
-            t.push_row(&[Value::I64(i)]);
-        }
-
-        let view = t.view();
-        let batches = view_to_recordbatch(&view);
-        assert_eq!(batches.len(), 1);
-        let batch = &batches[0];
-        // Ring buffer may have overwritten old chunks, but total rows should be > 0
-        assert!(batch.num_rows() > 0);
-
-        let col = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        // Verify values are sequential (from whatever chunks survived)
-        for i in 1..col.len() {
-            assert!(col.value(i) > col.value(i - 1));
-        }
-    }
-
-    #[test]
-    fn recordbatch_empty_table() {
-        let schema = MtSchema::new().col("x", DType::U8);
-        let t = MemTable::new(&schema, 1024, 1);
-        let view = t.view();
-        let batches = view_to_recordbatch(&view);
-        assert_eq!(batches.len(), 1);
-        assert_eq!(batches[0].num_rows(), 0);
-    }
-
-    #[test]
-    fn arrow_schema_matches_memtable_schema() {
-        let schema = MtSchema::new()
-            .col("ts", DType::I64)
-            .col("cpu", DType::F64)
-            .col("name", DType::Str);
-        let t = MemTable::new(&schema, 1024, 1);
-        let view = t.view();
-        let arrow = view_to_arrow_schema(&view);
-
-        assert_eq!(arrow.fields().len(), 3);
-        assert_eq!(arrow.field(0).name(), "ts");
-        assert_eq!(*arrow.field(0).data_type(), DataType::Int64);
-        assert_eq!(arrow.field(1).name(), "cpu");
-        assert_eq!(*arrow.field(1).data_type(), DataType::Float64);
-        assert_eq!(arrow.field(2).name(), "name");
-        assert_eq!(*arrow.field(2).data_type(), DataType::Utf8);
-    }
-
-    #[test]
-    fn recordbatch_u8_column() {
-        let schema = MtSchema::new().col("flag", DType::U8);
-        let mut t = MemTable::new(&schema, 1024, 1);
-        t.push_row(&[Value::U8(0)]);
-        t.push_row(&[Value::U8(255)]);
-
-        let view = t.view();
-        let batches = view_to_recordbatch(&view);
-        let col = batches[0]
-            .column(0)
-            .as_any()
-            .downcast_ref::<UInt8Array>()
-            .unwrap();
-        assert_eq!(col.value(0), 0);
-        assert_eq!(col.value(1), 255);
-    }
-
-    fn read_lazy_from_mmap(schema: &str, table: &str) -> Arc<LazyTableSource> {
-        let path = self_dir().join(mmap_filename_for(schema, table));
-        let data = std::fs::read(path).unwrap();
-        bytes_to_lazy_table(&data, table)
-    }
-
-    #[test]
-    fn classify_and_mmap_roundtrip() {
-        assert_eq!(
-            classify_mmap_basename("pulsing.actors"),
-            Some(("pulsing".into(), "actors".into()))
-        );
-        assert_eq!(
-            classify_mmap_basename("foo.bar.baz"),
-            Some(("foo".into(), "bar.baz".into()))
-        );
-        assert_eq!(
-            classify_mmap_basename("metrics"),
-            Some((DEFAULT_UNDOTTED_SCHEMA.into(), "metrics".into()))
-        );
-        assert_eq!(
-            mmap_filename_for(DEFAULT_UNDOTTED_SCHEMA, "metrics"),
-            "metrics"
-        );
-        assert_eq!(mmap_filename_for("pulsing", "actors"), "pulsing.actors");
-        assert_eq!(mmap_filename_for("foo", "bar.baz"), "foo.bar.baz");
-    }
-
-    #[test]
-    fn namespace_list_and_make_lazy_via_exposed_table() {
-        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
-        use probing_memtable::discover::ExposedTable;
-
-        let tmp = tempfile::tempdir().unwrap();
-        // Override discovery dir via env var
-        let orig = std::env::var("PROBING_DATA_DIR").ok();
-        std::env::set_var("PROBING_DATA_DIR", tmp.path());
-
-        let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str);
-        let mut table = ExposedTable::create("test_metrics", &schema, 4096, 2).unwrap();
-        {
-            let mut w = table.writer();
-            w.push_row(&[Value::I64(100), Value::Str("alpha")]);
-            w.push_row(&[Value::I64(200), Value::Str("beta")]);
-        }
-
-        let names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA);
-        assert!(
-            names.contains(&"test_metrics".to_string()),
-            "got: {names:?}"
-        );
-
-        let lazy = read_lazy_from_mmap(DEFAULT_UNDOTTED_SCHEMA, "test_metrics");
-        assert_eq!(lazy.data.len(), 1);
-        let batch = &lazy.data[0];
-        assert_eq!(batch.num_rows(), 2);
-
-        let ts = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<Int64Array>()
-            .unwrap();
-        assert_eq!(ts.value(0), 100);
-        assert_eq!(ts.value(1), 200);
-
-        let msgs: &datafusion::arrow::array::StringArray = batch.column(1).as_string();
-        assert_eq!(msgs.value(0), "alpha");
-        assert_eq!(msgs.value(1), "beta");
-
-        // Cleanup
-        drop(table);
-        match orig {
-            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
-            None => std::env::remove_var("PROBING_DATA_DIR"),
-        }
-    }
-
-    #[test]
-    fn dotted_schema_isolated_from_memtable_list() {
-        let _lock = PROBING_DATA_DIR_LOCK.lock().unwrap();
-        use probing_memtable::discover::ExposedTable;
-
-        let tmp = tempfile::tempdir().unwrap();
-        let orig = std::env::var("PROBING_DATA_DIR").ok();
-        std::env::set_var("PROBING_DATA_DIR", tmp.path());
-
-        let schema = MtSchema::new().col("ts", DType::I64).col("msg", DType::Str);
-        let dotted = mmap_filename_for("acme", "metrics_demo");
-        let mut ring = ExposedTable::create(&dotted, &schema, 4096, 2).unwrap();
-        {
-            let mut w = ring.writer();
-            w.push_row(&[Value::I64(1), Value::Str("x")]);
-        }
-
-        let mem_names = tables_in_schema(DEFAULT_UNDOTTED_SCHEMA);
-        assert!(
-            !mem_names.contains(&"metrics_demo".to_string()),
-            "dotted file must not appear as memtable table: {mem_names:?}"
-        );
-
-        let acme_names = tables_in_schema("acme");
-        assert!(
-            acme_names.contains(&"metrics_demo".to_string()),
-            "got: {acme_names:?}"
-        );
-
-        let lazy = read_lazy_from_mmap("acme", "metrics_demo");
-        assert_eq!(lazy.data.len(), 1);
-        assert_eq!(lazy.data[0].num_rows(), 1);
+//! The implementation moved to `probing_core::core::memtable_sql` so that both
+//! the server and language extensions can expose mmap memtables to SQL through
+//! the same code path (logical chunk ordering, generation re-validation, and
+//! zero-copy mmap reads). This module re-exports it for backward compatibility.
 
-        drop(ring);
-        match orig {
-            Some(v) => std::env::set_var("PROBING_DATA_DIR", v),
-            None => std::env::remove_var("PROBING_DATA_DIR"),
-        }
-    }
-}
+pub use probing_core::core::memtable_sql::*;

From 1dd31ac8e4f40ce083d7fc25fc811f332c014fb8 Mon Sep 17 00:00:00 2001
From: Reiase <reiase@gmail.com>
Date: Sat, 13 Jun 2026 23:51:21 +0800
Subject: [PATCH 2/3] Update .gitignore, upgrade dioxus dependencies, and
 enhance documentation

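- Added `.claude/` to `.gitignore` so that directory is excluded from version control.
- Updated `dioxus` and related dependencies to version `0.7.9` in `web/Cargo.toml`, and bumped the CI `dioxus-cli` pin from `0.7.6` to `0.7.9` to match.
- Revised `data-layer.md` and its Chinese counterpart (`data-layer.zh.md`) to describe header v4 and the single-writer model.
- Also touches the `probing/memtable` sources (`layout.rs`, `memtable.rs`, `raw.rs`, `writer.rs`, `discover.rs`, `lib.rs`), its `memtable_report` bench, and the CLI `bench` runners; see the diffstat below.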
---
 .github/actions/setup-build-env/action.yml  |   2 +-
 .gitignore                                  |   1 +
 docs/src/design/data-layer.md               |  66 ++--
 docs/src/design/data-layer.zh.md            |  58 ++-
 probing/cli/src/cli/bench/args.rs           |  19 +-
 probing/cli/src/cli/bench/runners/mixed.rs  |  13 +-
 probing/cli/src/cli/bench/runners/mp.rs     |  14 +-
 probing/cli/src/cli/bench/runners/write.rs  |  19 +-
 probing/memtable/benches/memtable_report.rs |  48 ---
 probing/memtable/src/discover.rs            |   3 +-
 probing/memtable/src/layout.rs              | 232 ++---------
 probing/memtable/src/lib.rs                 |  20 +-
 probing/memtable/src/memtable.rs            | 402 +++-----------------
 probing/memtable/src/raw.rs                 |  13 +-
 probing/memtable/src/writer.rs              |  45 +--
 web/Cargo.toml                              |   9 +-
 16 files changed, 238 insertions(+), 726 deletions(-)

diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml
index 6f26a114..4cadba77 100644
--- a/.github/actions/setup-build-env/action.yml
+++ b/.github/actions/setup-build-env/action.yml
@@ -66,7 +66,7 @@ runs:
         test -e ~/.cargo/bin/rnr || cargo install rnr
         test -e ~/.cargo/bin/cargo-nextest || cargo install --locked cargo-nextest
         test -e ~/.cargo/bin/cargo-binstall || cargo install cargo-binstall
-        test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.6 -y
+        test -e ~/.cargo/bin/dx || cargo binstall dioxus-cli@0.7.9 -y
         test -e ~/.cargo/bin/trunk || cargo install trunk --locked
 
     - name: Install Python Build Dependencies
diff --git a/.gitignore b/.gitignore
index b7860735..a1dd4b73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ pkg/
 venv/
 python/probing/probing
 docs/site/
+.claude/
diff --git a/docs/src/design/data-layer.md b/docs/src/design/data-layer.md
index ddcf4156..2dd565dc 100644
--- a/docs/src/design/data-layer.md
+++ b/docs/src/design/data-layer.md
@@ -53,12 +53,12 @@ The hot tier is mapped read-only at query time; the cold tier is read via `Segme
 Every MEMT buffer (heap, shared memory, or mmap'd file) begins with a 64-byte header (one cache
 line), followed by per-column descriptors, then chunk data.
 
-**Header v3 (64 bytes):**
+**Header v4 (64 bytes):**
 
 | offset | size | field | notes |
 |---|---|---|---|
 | 0 | 4 | `magic` | `0x4D454D54` (`"MEMT"`) |
-| 4 | 2 | `version` | 3 |
+| 4 | 2 | `version` | 4 |
 | 6 | 2 | `header_size` | 64 (validation) |
 | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` |
 | 10 | 2 | `ts_col` | timestamp column index + 1 (0 = none) |
@@ -68,16 +68,19 @@ line), followed by per-column descriptors, then chunk data.
 | 24 | 4 | `chunk_size` | bytes per chunk |
 | 28 | 4 | `data_offset` | 64-aligned |
 | 32 | 4 | `write_chunk` | `AtomicU32` — current ring slot |
-| 36 | 4 | `write_lock` | `AtomicU32` — 0 = free, else holder PID |
-| 40 | 4 | `refcount` | `AtomicU32` |
-| 44 | 4 | `creator_pid` | |
-| 48 | 8 | `creator_start_time` | for PID-recycling detection |
-| 56 | 8 | `lock_owner_start` | `AtomicU64` — lock holder's start time |
+| 36 | 4 | `refcount` | `AtomicU32` |
+| 40 | 4 | `creator_pid` | |
+| 44 | 4 | `_pad0` | alignment padding (the 4 bytes freed by removing v3's `write_lock`, which sat at offset 36) |
+| 48 | 8 | `creator_start_time` | for PID-recycling detection during discovery |
+| 56 | 8 | `_reserved` | reserved (was `lock_owner_start` in v3) |
 
 Bytes 0–31 are the **cold zone** (immutable after init); bytes 32–63 are the **hot zone**
 (atomically mutated), split to avoid false sharing. Each chunk starts with a 40-byte
 `ChunkHeader` carrying a `generation` counter and per-chunk `min_ts`/`max_ts` (`AtomicI64`).
 
+> **v4** dropped the `write_lock` and `lock_owner_start` fields: MEMT is single-writer, so there is
+> no in-buffer write lock. `refcount` and `creator_pid` moved up 4 bytes; the freed bytes are now `_pad0` and `_reserved`.
+
 ### Backends
 
 The same API backs three storage kinds:
@@ -95,25 +98,42 @@ slot (wrapping), sealing the previous chunk. Each slot carries a monotonically i
 **logical (oldest → newest) order** and re-check the generation after reading — a chunk recycled
 mid-read is discarded rather than surfacing torn rows.
 
-### Robust Write Lock
+### Single-Writer Model (no lock)
+
+MEMT is **single-writer**: exactly one writer owns each buffer (the creator process; any in-process
+write is serialized by the caller). There is **no in-buffer write lock** — the writer appends rows
+without any CAS or fence on a lock word. Readers are lock-free and never coordinated with the writer
+except through the per-chunk `used` / `row_count` `Release` stores and `generation` re-validation.
+
+Why this is safe and sufficient:
+
+- Production uses one writer per table — the Python `ExternalTable` path writes one file per process
+  (named `<data_dir>/<pid>/…`); a process restart means a new PID and a fresh file.
+- Readers never wrote to the lock anyway; their correctness rides the `Release`/`Acquire` ordering on
+  `used`/`row_count` plus the `generation` check on each chunk.
+- Removing the lock also removes the fork-safety hazard the PID-stealing spinlock had to guard
+  against (a forked child inheriting a cached start time and being mistaken for a recycled PID).
+
+> The **cold tier (MEMC)** has a separate concurrency story — multiple compactor writers are
+> distinguished by `writer_id` and segment isolation — and is unaffected by the MEMT single-writer
+> model.
 
-`write_lock` holds **0 (free) or the holder's PID**. A waiter spins; if it spins past
-`LOCK_STEAL_TIMEOUT` (500 ms) it enters a steal decision:
+### Single-Writer Fast Path
 
-- if the holder process no longer exists (`kill(pid, 0)`), the lock is stolen;
-- if the holder exists but its kernel start time differs from `lock_owner_start`, the PID was
-  recycled by an unrelated process — stolen after a short re-check grace.
+Since data is generated **one row at a time**, the single-row commit path is tuned to be as cheap as
+possible:
 
-Stealing is data-safe: rows only become visible via the `Release` store of `row_count` at the end
-of a write, so a half-written row from a dead holder stays uncommitted and is simply overwritten.
+- **Zero per-row allocation.** The `RowWriter` streaming API encodes fields directly into the ring
+  chunk; no `Vec<Value>` is built per row. (The `push_row(&[Value])` convenience API still works but
+  asks the caller to materialize a value slice.)
+- **No lock, no per-row `catch_unwind`.** With a single writer there is nothing to lock and nothing
+  to release on panic, so neither a per-row CAS + `Release` fence nor a `catch_unwind`/`Drop` guard
+  is needed.
 
-!!! note "Fork safety"
-    The holder's start time is read via a per-PID cache, **not** a one-shot cache. A child that
-    inherited a parent's cached value would record the parent's start time and be mistaken for a
-    recycled PID by a waiter — exactly the hazard fork-heavy workloads (PyTorch DataLoader)
-    trigger. Re-reading whenever the live PID changes makes every post-fork caller observe its own
-    start time. (Start times come from `/proc` on Linux; on platforms without it the steal-on-recycle
-    path is inert.)
+Reader correctness is independent of the write path: row visibility always rides the `used` /
+`row_count` `Release` stores in `finish()`. Measured single-thread `metrics` throughput (M4,
+release): plain `push_row` + spinlock ≈ 18.8M rows/s → streaming, lock-free ≈ 29.9M rows/s
+(**+59%** end to end).
 
 ### Timestamp Metadata
 
@@ -273,7 +293,7 @@ re-validates).
 - No torn rows on reads (generation re-validation); cold torn-tail recovery.
 - Exactly-once across tiers (query dedup) and across restarts (`prime_from_cold`).
 - Bounded hot memory; bounded cold bytes/TTL.
-- Fork-safe locking.
+- Single-writer, lock-free hot path (MEMT); readers lock-free via generation re-validation.
 
 **Known trade-offs (P2 backlog):**
 
diff --git a/docs/src/design/data-layer.zh.md b/docs/src/design/data-layer.zh.md
index 718317fc..9303b605 100644
--- a/docs/src/design/data-layer.zh.md
+++ b/docs/src/design/data-layer.zh.md
@@ -47,12 +47,12 @@ graph LR
 每个 MEMT 缓冲区（堆、共享内存或 mmap 文件）都以 64 字节头部（一个 cache line）开始，随后是
 逐列描述符，再是 chunk 数据。
 
-**Header v3（64 字节）：**
+**Header v4（64 字节）：**
 
 | 偏移 | 大小 | 字段 | 说明 |
 |---|---|---|---|
 | 0 | 4 | `magic` | `0x4D454D54`（`"MEMT"`） |
-| 4 | 2 | `version` | 3 |
+| 4 | 2 | `version` | 4 |
 | 6 | 2 | `header_size` | 64（仅校验） |
 | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` |
 | 10 | 2 | `ts_col` | 时间戳列索引 + 1（0 = 无） |
@@ -62,16 +62,19 @@ graph LR
 | 24 | 4 | `chunk_size` | 每个 chunk 字节数 |
 | 28 | 4 | `data_offset` | 64 对齐 |
 | 32 | 4 | `write_chunk` | `AtomicU32`——当前环形槽位 |
-| 36 | 4 | `write_lock` | `AtomicU32`——0 = 空闲，否则为持有者 PID |
-| 40 | 4 | `refcount` | `AtomicU32` |
-| 44 | 4 | `creator_pid` | |
-| 48 | 8 | `creator_start_time` | 用于 PID 回收检测 |
-| 56 | 8 | `lock_owner_start` | `AtomicU64`——锁持有者的进程启动时间 |
+| 36 | 4 | `refcount` | `AtomicU32` |
+| 40 | 4 | `creator_pid` | |
+| 44 | 4 | `_pad0` | 对齐填充（移除 v3 的 `write_lock` 后腾出的 4 字节；该字段原位于偏移 36） |
+| 48 | 8 | `creator_start_time` | 用于发现期的 PID 回收检测 |
+| 56 | 8 | `_reserved` | 预留（v3 中为 `lock_owner_start`） |
 
 字节 0–31 是**冷区**（初始化后不可变），字节 32–63 是**热区**（运行时原子修改），二者分离以避免
 伪共享。每个 chunk 以 40 字节的 `ChunkHeader` 开头，携带 `generation` 计数器及逐 chunk 的
 `min_ts`/`max_ts`（`AtomicI64`）。
 
+> **v4** 移除了 `write_lock` 与 `lock_owner_start` 字段：MEMT 是单写者，缓冲区内不再有写锁。`refcount`
+> 与 `creator_pid` 前移 4 字节，腾出的字节现为 `_pad0` 与 `_reserved`。
+
 ### 三种后端
 
 同一套 API 支撑三种存储形态：
@@ -86,23 +89,36 @@ graph LR
 每个槽位携带单调递增的 `generation`（每次环形绕回到该槽位即自增）。读取者按**逻辑顺序（旧 → 新）**
 物化 chunk，并在读取后复核 generation——若某 chunk 在读取过程中被回收，则丢弃而非暴露半行数据。
 
-### Robust 写锁
+### 单写者模型（无锁）
+
+MEMT 是**单写者**：每个缓冲区恰好一个写者拥有（创建者进程；进程内的写由调用方自行串行化）。缓冲区
+内**没有写锁**——写者直接追加行，不在任何锁字上做 CAS 或屏障。读者免锁，与写者之间仅通过逐 chunk
+的 `used` / `row_count` 的 `Release` 存储以及 `generation` 复核来协调。
+
+为何安全且足够：
+
+- 生产中每表单写者——Python `ExternalTable` 路径为每个进程写一个文件（命名为 `<data_dir>/<pid>/…`）；
+  进程重启即换新 PID、换新文件；
+- 读者本就不写锁字，其正确性依赖 `used`/`row_count` 的 `Release`/`Acquire` 次序以及逐 chunk 的
+  `generation` 复核；
+- 去掉锁还顺带消除了 PID 抢占自旋锁必须防范的 fork 隐患（fork 出的子进程继承了缓存的启动时间，被误
+  判为 PID 回收）。
+
+> **冷层（MEMC）** 是另一套并发模型——多个压实写者由 `writer_id` 与段隔离区分——不受 MEMT 单写者
+> 模型影响。
 
-`write_lock` 存放 **0（空闲）或持有者的 PID**。等待者自旋；若自旋超过 `LOCK_STEAL_TIMEOUT`
-（500 ms），进入抢占判定：
+### 单写者快路径
 
-- 若持有者进程已不存在（`kill(pid, 0)`），抢占该锁；
-- 若持有者存在但其内核启动时间与 `lock_owner_start` 不符，说明 PID 已被无关进程回收——经短暂复核
-  宽限后抢占。
+由于数据是**单条生成**的，单行提交路径被尽量做轻：
 
-抢占是数据安全的：行只有在写入结束时通过 `row_count` 的 `Release` 存储才可见，因此已死持有者写到
-一半的行不会提交，会被直接覆盖。
+- **每行零分配。** `RowWriter` 流式 API 直接把各字段编码进 ring chunk，不再为每行构造
+  `Vec<Value>`。（`push_row(&[Value])` 便捷接口仍可用，但要求调用方先物化一个 value 切片。）
+- **无锁，也无每行 `catch_unwind`。** 单写者下既无需加锁，也无需在 panic 时释放锁，因此既不需要逐行
+  的 CAS + `Release` 屏障，也不需要 `catch_unwind`/`Drop` 守卫。
 
-!!! note "fork 安全"
-    持有者启动时间通过**按 PID 缓存**读取，而非一次性缓存。若子进程继承了父进程的缓存值，就会记录
-    父进程的启动时间，从而被等待者误判为 PID 回收——这正是大量 fork 的负载（PyTorch DataLoader）
-    会触发的隐患。每当存活 PID 变化即重新读取，可让每个 fork 后的调用者观察到自己的启动时间。
-    （启动时间在 Linux 上来自 `/proc`；在不具备该接口的平台上，回收抢占路径自动失效。）
+读者正确性与写路径无关：行可见性始终依赖 `finish()` 中 `used` / `row_count` 的 `Release` 存储。单线程
+`metrics` 实测吞吐（M4，release）：朴素 `push_row` + 自旋锁 ≈ 18.8M 行/s → 流式、免锁 ≈ 29.9M 行/s
+（端到端 **+59%**）。
 
 ### 时间戳元数据
 
@@ -244,7 +260,7 @@ chunk。每行恰好计数一次，且去重对环形回收免疫（generation 
 - 读取无半行数据（generation 复核）；冷层尾部撕裂可恢复；
 - 跨层精确一次（查询去重）与跨重启精确一次（`prime_from_cold`）；
 - 热层内存有界；冷层字节/TTL 有界；
-- fork 安全的锁。
+- 单写者、无锁热路径（MEMT）；读者通过 generation 复核免锁读取。
 
 **已知取舍（P2 待办）：**
 
diff --git a/probing/cli/src/cli/bench/args.rs b/probing/cli/src/cli/bench/args.rs
index ad9b7cff..a46b4ce3 100644
--- a/probing/cli/src/cli/bench/args.rs
+++ b/probing/cli/src/cli/bench/args.rs
@@ -22,9 +22,9 @@ pub enum Backend {
 /// Streaming row writer vs. value-vector `push_row`.
 #[derive(ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
 pub enum WriterMode {
-    /// `push_row` — concurrency-safe auto-advance; allocates a value row.
+    /// `push_row` — auto-advance on chunk full; allocates a value row.
     Push,
-    /// `RowWriter` streaming fast path (single-threaded only).
+    /// `RowWriter` streaming fast path (zero per-row allocation).
     Streaming,
 }
 
@@ -81,13 +81,14 @@ pub struct WriteArgs {
     #[arg(long, default_value_t = 1_000_000)]
     pub rows: u64,
 
-    /// Concurrent writer threads. >1 requires a shared backend
-    /// (shm/file/shared) to exercise the cross-handle write lock.
+    /// Concurrent writer threads. Values > 1 require `--backend heap` (each thread gets
+    /// its own independent table) and `--writer push`; shared backends are single-writer.
     #[arg(long, default_value_t = 1)]
     pub threads: usize,
 
-    /// Writer API to exercise.
-    #[arg(long, value_enum, default_value = "push")]
+    /// Writer API to exercise. `streaming` is the zero-allocation single-row
+    /// fast path (no per-row value vector); `push` allocates a value row.
+    #[arg(long, value_enum, default_value = "streaming")]
     pub writer: WriterMode,
 
     /// File path for `--backend file` (defaults to a temp file).
@@ -178,8 +179,8 @@ pub struct MixedArgs {
     #[arg(long, value_enum, default_value = "shared")]
     pub backend: Backend,
 
-    /// Concurrent writer threads.
-    #[arg(long, default_value_t = 2)]
+    /// Writer threads. MEMT is single-writer; must be 1.
+    #[arg(long, default_value_t = 1)]
     pub writers: usize,
 
     /// Concurrent reader (scan) threads.
@@ -218,7 +219,7 @@ pub struct MpArgs {
     #[arg(long, value_enum, default_value = "shared")]
     pub backend: Backend,
 
-    /// Number of writer processes.
+    /// Writer processes. MEMT is single-writer; must be 1.
     #[arg(long, default_value_t = 1)]
     pub writers: usize,
 
diff --git a/probing/cli/src/cli/bench/runners/mixed.rs b/probing/cli/src/cli/bench/runners/mixed.rs
index 4b92d666..ea0972d9 100644
--- a/probing/cli/src/cli/bench/runners/mixed.rs
+++ b/probing/cli/src/cli/bench/runners/mixed.rs
@@ -1,7 +1,7 @@
-//! `mixed` — end-to-end pipeline / soak: concurrent writers, optional
-//! background compactor, and concurrent readers over one shared table for a
-//! fixed duration. Reports per-role throughput plus the resulting cold-tier
-//! footprint.
+//! `mixed` — end-to-end pipeline / soak: a single writer, optional background
+//! compactor, and concurrent readers over one shared table for a fixed
+//! duration. Reports per-role throughput plus the resulting cold-tier
+//! footprint. MEMT is single-writer, so the writer count is fixed at 1.
 
 use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
 use std::sync::Arc;
@@ -19,7 +19,10 @@ use crate::cli::bench::workload::RowGen;
 pub fn run(args: &MixedArgs, json: bool, seed: u64) -> Result<()> {
     let spec = args.schema.spec();
     let row_bytes = spec.approx_row_bytes() as u64;
-    let writers = args.writers.max(1);
+    if args.writers > 1 {
+        bail!("mixed is single-writer (MEMT); --writers must be 1");
+    }
+    let writers = 1usize;
     let readers = args.readers;
 
     // Create the shared backing; keep the creator alive for the whole run.
diff --git a/probing/cli/src/cli/bench/runners/mp.rs b/probing/cli/src/cli/bench/runners/mp.rs
index b5ba246e..a25e9e3e 100644
--- a/probing/cli/src/cli/bench/runners/mp.rs
+++ b/probing/cli/src/cli/bench/runners/mp.rs
@@ -6,9 +6,9 @@
 //! wall-clock window (synchronised by a shared start instant) and prints a
 //! one-line JSON result; the orchestrator aggregates them.
 //!
-//! This is the scenario the data layer is built for: independent OS processes
-//! contending on the in-buffer robust write lock (writers) while others read
-//! lock-free (readers) — the cross-process path threads cannot exercise.
+//! This exercises the cross-process read path: a single writer process feeds
+//! the shared mapping while several reader processes read lock-free. MEMT is
+//! single-writer, so there is exactly one writer process.
 //!
 //! Worker vs. orchestrator is selected by the `PROBING_BENCH_MP_ROLE`
 //! environment variable, so the public surface stays a single `mp` command.
@@ -41,11 +41,11 @@ pub fn run(args: &MpArgs, json: bool, seed: u64) -> Result<()> {
 fn orchestrate(args: &MpArgs, json: bool, seed: u64) -> Result<()> {
     let spec = args.schema.spec();
     let row_bytes = spec.approx_row_bytes() as u64;
-    let writers = args.writers.max(1);
-    let readers = args.readers;
-    if writers + readers == 0 {
-        bail!("need at least one worker (--writers/--readers)");
+    if args.writers > 1 {
+        bail!("mp is single-writer (MEMT); --writers must be 1");
     }
+    let writers = 1usize;
+    let readers = args.readers;
 
     // Create the shared backing and keep it alive for the whole run.
     let (attach, _creator) = match args.backend {
diff --git a/probing/cli/src/cli/bench/runners/write.rs b/probing/cli/src/cli/bench/runners/write.rs
index 16fd4aa4..356b6fb8 100644
--- a/probing/cli/src/cli/bench/runners/write.rs
+++ b/probing/cli/src/cli/bench/runners/write.rs
@@ -1,10 +1,8 @@
 //! `write` — write throughput across backends, writer counts and APIs.
 //!
-//! With `--threads > 1` on a shared backend (`shm`/`file`/`shared`) every
-//! thread opens its own handle to the same mapping, so the run genuinely
-//! contends on the in-buffer robust write lock. The `heap` backend cannot be
-//! shared, so multi-threaded heap runs use independent per-thread tables
-//! (parallel throughput, no lock contention).
+//! MEMT is single-writer, so shared backends (`shm`/`file`/`shared`) run with
+//! one writer. `--threads > 1` is only valid on the `heap` backend, where each
+//! thread gets its own independent table (parallel throughput, one writer each).
 
 use std::sync::Barrier;
 use std::time::Instant;
@@ -37,10 +35,19 @@ pub fn run(args: &WriteArgs, json: bool, seed: u64) -> Result<()> {
     if args.writer == WriterMode::Streaming && threads > 1 {
         bail!("--writer streaming requires --threads 1 (advance-on-overflow is not concurrency-safe)");
     }
+    // MEMT is single-writer. Multiple threads writing the SAME mapping is
+    // unsupported, so shared backends are capped to one writer. The heap
+    // backend instead gives each thread its own independent table.
+    if threads > 1 && args.backend != Backend::Heap {
+        bail!(
+            "--threads > 1 requires --backend heap (independent per-thread tables); \
+             shared backends (shm/file/shared) are single-writer"
+        );
+    }
     if threads > 1 && args.backend == Backend::Heap {
         eprintln!(
             "note: heap backend cannot be shared; --threads {threads} uses independent \
-             per-thread tables (no lock contention)"
+             per-thread tables (parallel, single-writer each)"
         );
     }
 
diff --git a/probing/memtable/benches/memtable_report.rs b/probing/memtable/benches/memtable_report.rs
index 6de6ac0d..d8625c9b 100644
--- a/probing/memtable/benches/memtable_report.rs
+++ b/probing/memtable/benches/memtable_report.rs
@@ -280,40 +280,6 @@ fn bench_push_row_unchecked_fixed(rows: &[FixedInput]) -> u64 {
     rows.len() as u64
 }
 
-fn bench_solo_push_row_fixed(rows: &[FixedInput]) -> u64 {
-    let schema = fixed_schema();
-    let bytes_per_row = 4 + 8 + 8;
-    let chunk_size = 64 * 1024;
-    let total_bytes = rows.len() * bytes_per_row;
-    let num_chunks = ((total_bytes / chunk_size) + 2).max(2);
-    let size = MemTable::required_size(&schema, chunk_size, num_chunks);
-    let mut buf = vec![0u8; size];
-    let mut sw =
-        MemTableWriter::init(&mut buf, &schema, chunk_size as u32, num_chunks as u32).solo();
-    for &(ts, value) in rows {
-        sw.push_row_unchecked(&[Value::I64(ts), Value::I64(value)]);
-    }
-    black_box(sw.num_chunks());
-    rows.len() as u64
-}
-
-fn bench_solo_row_writer_fixed(rows: &[FixedInput]) -> u64 {
-    let schema = fixed_schema();
-    let bytes_per_row = 4 + 8 + 8;
-    let chunk_size = 64 * 1024;
-    let total_bytes = rows.len() * bytes_per_row;
-    let num_chunks = ((total_bytes / chunk_size) + 2).max(2);
-    let size = MemTable::required_size(&schema, chunk_size, num_chunks);
-    let mut buf = vec![0u8; size];
-    let mut sw =
-        MemTableWriter::init(&mut buf, &schema, chunk_size as u32, num_chunks as u32).solo();
-    for &(ts, value) in rows {
-        sw.row_writer().put_i64(ts).put_i64(value).finish();
-    }
-    black_box(sw.num_chunks());
-    rows.len() as u64
-}
-
 fn bench_dedup_push_row_strings(rows: &[StringInput]) -> u64 {
     let schema = string_schema();
     let approx_bytes_per_row = 4 + 8 + (4 + 5) + (4 + 20);
@@ -509,9 +475,7 @@ fn print_report(results: &[BenchResult]) {
             ("baseline_memcpy_fixed", None),
             ("baseline_raw_append", Some("baseline_memcpy_fixed")),
             ("baseline_flat_encode", Some("baseline_raw_append")),
-            ("solo_row_writer_fixed", Some("baseline_flat_encode")),
             ("row_writer_fixed", Some("baseline_flat_encode")),
-            ("solo_push_row_fixed", Some("solo_row_writer_fixed")),
             ("push_row_unchecked_fixed", Some("row_writer_fixed")),
             ("push_row_fixed", Some("row_writer_fixed")),
         ],
@@ -594,18 +558,6 @@ fn main() {
             FIXED_ROWS as u64 * fixed_bytes,
             || bench_push_row_unchecked_fixed(&fixed_write_inputs),
         ),
-        run_case(
-            "solo_push_row_fixed",
-            FIXED_ROWS as u64,
-            FIXED_ROWS as u64 * fixed_bytes,
-            || bench_solo_push_row_fixed(&fixed_write_inputs),
-        ),
-        run_case(
-            "solo_row_writer_fixed",
-            FIXED_ROWS as u64,
-            FIXED_ROWS as u64 * fixed_bytes,
-            || bench_solo_row_writer_fixed(&fixed_write_inputs),
-        ),
         run_case(
             "row_writer_fixed",
             FIXED_ROWS as u64,
diff --git a/probing/memtable/src/discover.rs b/probing/memtable/src/discover.rs
index eec04e56..4197eca0 100644
--- a/probing/memtable/src/discover.rs
+++ b/probing/memtable/src/discover.rs
@@ -161,8 +161,7 @@ impl ExposedTable {
     ///
     /// This is the fast path for high-frequency writes — it skips the
     /// O(rows × chunks) `validate_buf` that `writer()` performs on every call.
-    /// The spinlock is released even if the write panics, preventing a
-    /// deadlocked mmap file (see [`MemTable::push_row`]).
+    /// MEMT is single-writer, so no lock is taken (see [`MemTable::push_row`]).
     pub fn push_row(&mut self, values: &[Value]) {
         self.inner.push_row(values)
     }
diff --git a/probing/memtable/src/layout.rs b/probing/memtable/src/layout.rs
index f737468d..6729d7cb 100644
--- a/probing/memtable/src/layout.rs
+++ b/probing/memtable/src/layout.rs
@@ -1,12 +1,12 @@
 //! Low-level layout: header, column descriptors, chunk headers, byte helpers.
 //!
-//! ## Header v3 binary layout (64 bytes, 1 cache line)
+//! ## Header v4 binary layout (64 bytes, 1 cache line)
 //!
 //! ```text
 //! offset  size  field               notes
 //! ──────────────────────────────────────────────────────────
 //!  0       4    magic               0x4D454D54 ("MEMT" in LE)
-//!  4       2    version             3
+//!  4       2    version             4
 //!  6       2    header_size         64 (validation only)
 //!  8       2    byte_order          BOM: written as [0x01, 0x02]
 //! 10       2    ts_col              timestamp column index + 1 (0 = none)
@@ -17,20 +17,24 @@
 //! 28       4    data_offset         (64-aligned)
 //! ─── 32 byte boundary (cold/hot split) ─────────────────
 //! 32       4    write_chunk         AtomicU32
-//! 36       4    write_lock          AtomicU32: 0 = unlocked, else holder PID
-//! 40       4    refcount            AtomicU32
-//! 44       4    creator_pid         PID of creating process
+//! 36       4    refcount            AtomicU32
+//! 40       4    creator_pid         PID of creating process
+//! 44       4    _pad0               (alignment)
 //! 48       8    creator_start_time  process start time (platform-specific)
-//! 56       8    lock_owner_start    AtomicU64: lock holder's start time
+//! 56       8    _reserved           reserved for future use
 //! ──────────────────────────────────────────────────────────
 //! ```
 //!
 //! All multi-byte fields are little-endian.  The `byte_order` BOM
 //! allows readers to detect endianness mismatch without guessing.
+//!
+//! MEMT is **single-writer**: exactly one writer owns each buffer (the
+//! creator process; in-process writes are serialised by the caller). There
+//! is no in-buffer write lock. Readers stay lock-free via per-chunk
+//! `generation` re-validation.
 
 use std::mem;
-use std::sync::atomic::{AtomicI64, AtomicU32, AtomicU64, Ordering};
-use std::time::{Duration, Instant};
+use std::sync::atomic::{AtomicI64, AtomicU32, AtomicU64};
 
 // ── C-style layout structs ──────────────────────────────────────────
 
@@ -40,10 +44,11 @@ pub(crate) const MAGIC: u32 = MAGIC_MEMT;
 
 /// Header format version for MEMT.
 ///
-/// v3: `_pad0` became `ts_col`, `_reserved` became `lock_owner_start`,
-/// `write_lock` stores the holder PID (was 0/1), and `ChunkHeader` grew
-/// `min_ts`/`max_ts` (24 → 40 bytes).
-pub(crate) const VERSION: u16 = 3;
+/// v4: dropped the `write_lock` and `lock_owner_start` fields — MEMT is
+/// single-writer, so there is no in-buffer write lock. Their bytes are now
+/// `_pad0`/`_reserved`. v3 added per-chunk `min_ts`/`max_ts` (still present)
+/// and the PID write lock (removed in v4).
+pub(crate) const VERSION: u16 = 4;
 
 /// Byte-order mark: written as raw bytes `[0x01, 0x02]`.
 /// On a LE host, `u16::from_ne_bytes([0x01, 0x02])` == `0x0201`.
@@ -68,8 +73,8 @@ pub(crate) const FLAGS_KNOWN: u32 = FLAG_DEDUP;
 /// schema dimensions, layout offsets.
 ///
 /// **Hot zone** (bytes 32–63): atomically mutated at runtime —
-/// `write_chunk`, `write_lock`, `refcount`.  Separated from the cold
-/// zone to avoid false-sharing on different cache lines.
+/// `write_chunk`, `refcount`.  Separated from the cold zone to avoid
+/// false-sharing on different cache lines.
 #[repr(C)]
 pub(crate) struct Header {
     // ── cold zone (read-only after init) ─────────────────
@@ -102,24 +107,19 @@ pub(crate) struct Header {
     // ── hot zone (atomically mutated) ────────────────────
     /// Ring buffer: index of the chunk currently being written.
     pub write_chunk: AtomicU32,
-    /// Robust writer spinlock: 0 = unlocked, otherwise the **PID** of the
-    /// holding process. A waiter that has spun past
-    /// [`LOCK_STEAL_TIMEOUT`] checks the holder's liveness and steals the
-    /// lock from a dead process (see [`acquire_write_lock`]).
-    pub write_lock: AtomicU32,
     /// Reference count for shared lifetime management.
     pub refcount: AtomicU32,
     /// PID of the process that created this table (for cross-process discovery).
     pub creator_pid: u32,
-    /// Process start time — for PID-recycling detection.
+    /// Padding to 8-align `creator_start_time` (the 4 bytes freed by removing v3's `write_lock`).
+    pub _pad0: u32,
+    /// Process start time — for PID-recycling detection during discovery.
     /// Linux: clock ticks since boot (`/proc/<pid>/stat` field 22).
     /// macOS: microseconds since epoch (via `sysctl`).
     /// Other: 0 (falls back to PID-only liveness check).
     pub creator_start_time: u64,
-    /// Start time of the current lock holder (0 = unknown / not written
-    /// yet). Written by the holder right after acquiring; lets waiters
-    /// detect PID recycling before stealing. Advisory only.
-    pub lock_owner_start: AtomicU64,
+    /// Reserved for future use (was `lock_owner_start` in v3).
+    pub _reserved: u64,
 }
 
 /// Per-column descriptor, immediately following the Header.
@@ -224,154 +224,6 @@ pub(crate) fn chunk_header(buf: &[u8], cs: usize) -> &ChunkHeader {
     unsafe { &*(buf[cs..].as_ptr() as *const ChunkHeader) }
 }
 
-/// How long a waiter spins before checking whether the lock holder is
-/// still alive (and stealing the lock from a dead process).
-///
-/// Writers hold the lock for nanoseconds–microseconds; even a descheduled
-/// holder resumes within milliseconds. Reaching this timeout in practice
-/// means the holder crashed while holding the lock.
-pub(crate) const LOCK_STEAL_TIMEOUT: Duration = Duration::from_millis(500);
-
-/// `true` when a process with `pid` exists (it may belong to another user).
-fn process_alive(pid: u32) -> bool {
-    if pid == std::process::id() {
-        return true;
-    }
-    if unsafe { libc::kill(pid as libc::pid_t, 0) } == 0 {
-        return true;
-    }
-    // EPERM: the process exists but we may not signal it.
-    std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM)
-}
-
-/// This process's kernel start time, cached per PID (reads `/proc` on Linux).
-///
-/// **Fork safety:** the cache is keyed on the live PID, not a one-shot
-/// `OnceLock`. A child inheriting a parent's cached value would otherwise
-/// record the *parent's* start time in `lock_owner_start`, and a waiter
-/// comparing against the child's real start time would mistake the live child
-/// for a recycled PID and steal its lock — exactly the hazard fork-heavy
-/// workloads (e.g. PyTorch DataLoader) trigger. Re-reading whenever the PID
-/// changes makes every post-fork caller observe its own start time.
-fn my_start_time() -> u64 {
-    static MY_PID: AtomicU32 = AtomicU32::new(0);
-    static MY_START: AtomicU64 = AtomicU64::new(0);
-
-    let pid = std::process::id();
-    if MY_PID.load(Ordering::Acquire) == pid {
-        let cached = MY_START.load(Ordering::Acquire);
-        if cached != 0 {
-            return cached;
-        }
-    }
-    let start = crate::raw::process_start_time(pid);
-    // Publish start before PID: a reader that observes the matching PID is then
-    // guaranteed to also observe the start written for it.
-    MY_START.store(start, Ordering::Release);
-    MY_PID.store(pid, Ordering::Release);
-    start
-}
-
-/// Decide whether the lock can be stolen from `holder`, and try to.
-///
-/// Steal conditions (either):
-/// - `holder` no longer exists (crashed / killed while holding the lock);
-/// - `holder` exists but its kernel start time does not match the one the
-///   real holder recorded in `lock_owner_start` — the PID was recycled by
-///   an unrelated process. Re-checked after a grace period to rule out
-///   the transient window where a fresh holder has not yet recorded its
-///   start time.
-///
-/// Stealing is safe with respect to data: rows only become visible via the
-/// `used`/`row_count` Release stores at the end of a write, so a row half
-/// written by the dead holder stays uncommitted and is simply overwritten.
-#[cold]
-#[inline(never)]
-fn try_steal_lock(h: &Header, holder: u32, me: u32) -> bool {
-    if process_alive(holder) {
-        let owner_start = h.lock_owner_start.load(Ordering::Relaxed);
-        let actual_start = crate::raw::process_start_time(holder);
-        if owner_start == 0 || actual_start == 0 || actual_start == owner_start {
-            return false; // genuinely alive (or cannot tell) — keep waiting
-        }
-        std::thread::sleep(Duration::from_millis(10));
-        if h.write_lock.load(Ordering::Relaxed) != holder
-            || h.lock_owner_start.load(Ordering::Relaxed) != owner_start
-        {
-            return false; // lock changed hands meanwhile — not stale
-        }
-    }
-    if h.write_lock
-        .compare_exchange(holder, me, Ordering::Acquire, Ordering::Relaxed)
-        .is_ok()
-    {
-        h.lock_owner_start.store(my_start_time(), Ordering::Relaxed);
-        return true;
-    }
-    false
-}
-
-/// Acquire the **robust** writer spinlock with exponential back-off.
-///
-/// The lock word holds the owner's PID (0 = unlocked). First few failures
-/// use `spin_loop()` (pause instruction), then escalate to `yield_now()`.
-/// A waiter stuck past [`LOCK_STEAL_TIMEOUT`] verifies the holder's
-/// liveness and steals the lock from a dead process (see
-/// [`try_steal_lock`]), so a writer crashing inside the critical section
-/// cannot deadlock other writer processes forever.
-///
-/// SAFETY NOTE: the buffer parameter is `&mut [u8]` (not `&[u8]`) so that
-/// LLVM does **not** mark the pointer `readonly`. With `&[u8]` LLVM may
-/// legally eliminate the atomic store inside `release_write_lock`, turning
-/// the spin loop into an infinite loop in optimised (release) builds.
-pub(crate) fn acquire_write_lock(buf: &mut [u8]) {
-    let ptr = buf.as_mut_ptr() as *const Header;
-    let h = unsafe { &*ptr };
-    let me = std::process::id();
-    let mut spins = 0u32;
-    let mut waiting_since: Option<Instant> = None;
-    loop {
-        match h
-            .write_lock
-            .compare_exchange_weak(0, me, Ordering::Acquire, Ordering::Relaxed)
-        {
-            Ok(_) => {
-                h.lock_owner_start.store(my_start_time(), Ordering::Relaxed);
-                return;
-            }
-            Err(holder) if holder != 0 => {
-                let since = *waiting_since.get_or_insert_with(Instant::now);
-                if spins >= 16 && since.elapsed() >= LOCK_STEAL_TIMEOUT {
-                    if try_steal_lock(h, holder, me) {
-                        return;
-                    }
-                    waiting_since = Some(Instant::now());
-                }
-            }
-            Err(_) => {} // spurious failure with lock free — retry CAS
-        }
-        if spins < 16 {
-            for _ in 0..1 << spins.min(4) {
-                std::hint::spin_loop();
-            }
-        } else {
-            std::thread::yield_now();
-        }
-        spins += 1;
-    }
-}
-
-/// Release the writer spinlock. See [`acquire_write_lock`] for why `&mut`.
-///
-/// Clears `lock_owner_start` *before* the lock word so that waiters never
-/// pair the next holder's PID with this holder's start time.
-pub(crate) fn release_write_lock(buf: &mut [u8]) {
-    let ptr = buf.as_mut_ptr() as *const Header;
-    unsafe {
-        (*ptr).lock_owner_start.store(0, Ordering::Relaxed);
-        (*ptr).write_lock.store(0, Ordering::Release);
-    }
-}
 pub(crate) fn r32(buf: &[u8], off: usize) -> u32 {
     u32::from_le_bytes(buf[off..off + 4].try_into().unwrap())
 }
@@ -413,40 +265,4 @@ mod tests {
         let expected_le = u16::from_le_bytes(BYTE_ORDER_MARK);
         assert_eq!(bom, expected_le);
     }
-
-    /// Fork safety: after `fork()`, `my_start_time()` must return the *child's*
-    /// own kernel start time, not a value cached for the parent before the
-    /// fork. With the old `OnceLock` cache the child returned the parent's
-    /// start time; a waiter then compared it against the child's real start
-    /// time and stole the lock from a live holder. The test process has run
-    /// long enough that its start tick differs from a freshly-forked child's,
-    /// so the stale value would be observably wrong.
-    ///
-    /// Linux-only: kernel start times come from `/proc`. On platforms without
-    /// it `process_start_time` returns 0, the PID-recycle steal path is inert,
-    /// and there is no fork hazard to guard against.
-    #[cfg(target_os = "linux")]
-    #[test]
-    fn my_start_time_refreshes_after_fork() {
-        // Warm the per-PID cache for the parent (mimics the leaked OnceLock).
-        let parent = my_start_time();
-        assert_ne!(parent, 0, "parent start time should be readable");
-
-        unsafe {
-            let pid = libc::fork();
-            assert!(pid >= 0, "fork failed");
-            if pid == 0 {
-                // Child: the cached value must equal a fresh read for THIS pid.
-                let cached = my_start_time();
-                let fresh = crate::raw::process_start_time(std::process::id());
-                libc::_exit(if cached == fresh && cached != 0 { 0 } else { 1 });
-            }
-            let mut status = 0;
-            libc::waitpid(pid, &mut status, 0);
-            assert!(
-                libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0,
-                "child my_start_time() must reflect its own process, not the parent's cache",
-            );
-        }
-    }
 }
diff --git a/probing/memtable/src/lib.rs b/probing/memtable/src/lib.rs
index 2cbd5c04..87f02631 100644
--- a/probing/memtable/src/lib.rs
+++ b/probing/memtable/src/lib.rs
@@ -5,10 +5,14 @@
 //!
 //! ## Concurrency
 //!
-//! - **Writers** are serialized by a spinlock (`write_lock` in Header).
+//! MEMT is **single-writer**: exactly one writer owns each buffer, so there
+//! is no in-buffer write lock.
+//!
+//! - **Writer**: a single owner appends rows; the `&mut` borrow (or the
+//!   caller's own serialization) guarantees exclusivity. No lock is taken.
 //! - **Readers** are lock-free: per-chunk `used` is updated with `Release` ordering
-//!   by writers and loaded with `Acquire` by readers, ensuring row data visibility.
-//! - `RowWriter` holds the lock for its lifetime; released by `finish()` or `Drop`.
+//!   by the writer and loaded with `Acquire` by readers, ensuring row data visibility.
+//!   Readers re-validate the chunk `generation` to discard rows from a recycled chunk.
 //!
 //! # Memory Layout
 //!
@@ -16,13 +20,13 @@
 //!
 //! ```text
 //! ┌──────────────────────────────────┐ 0
-//! │ Header v2 (64 bytes, repr(C))    │
+//! │ Header v4 (64 bytes, repr(C))    │
 //! │  ── cold zone (read-only) ──     │
 //! │   magic: u32     (0x4D454D54)    │
-//! │   version: u16   (2)             │
+//! │   version: u16   (4)             │
 //! │   header_size: u16 (64)          │
 //! │   byte_order: u16 (BOM 0x0102)   │
-//! │   _pad0: u16                     │
+//! │   ts_col: u16                    │
 //! │   flags: u32     (feature bits)  │
 //! │   num_cols: u32                  │
 //! │   num_chunks: u32                │
@@ -30,11 +34,11 @@
 //! │   data_offset: u32               │
 //! │  ── hot zone (atomic) ────       │
 //! │   write_chunk: AtomicU32         │
-//! │   write_lock: AtomicU32          │
 //! │   refcount: AtomicU32            │
 //! │   creator_pid: u32                │
+//! │   _pad0: u32                     │
 //! │   creator_start_time: u64         │
-//! │   _reserved: [u32; 2]            │
+//! │   _reserved: u64                 │
 //! ├──────────────────────────────────┤ 64
 //! │ ColumnDesc × N (64 bytes each)   │
 //! │   name: [u8; 56]  (LP u16)      │
diff --git a/probing/memtable/src/memtable.rs b/probing/memtable/src/memtable.rs
index 6545df5d..66797556 100644
--- a/probing/memtable/src/memtable.rs
+++ b/probing/memtable/src/memtable.rs
@@ -1,10 +1,10 @@
 use crate::dedup::DedupState;
 use crate::layout::{
-    acquire_write_lock, chunk_header, chunk_start_off, col_desc, compute_data_offset, header,
-    header_mut, release_write_lock, w32, CHUNK_HEADER_SIZE, FLAG_DEDUP,
+    chunk_header, chunk_start_off, col_desc, compute_data_offset, header, header_mut, w32,
+    CHUNK_HEADER_SIZE, FLAG_DEDUP,
 };
 use crate::raw::{
-    advance_chunk_unlocked, init_buf, note_row_ts, row_ts, validate_buf, validate_row_schema,
+    advance_chunk_raw, init_buf, note_row_ts, row_ts, validate_buf, validate_row_schema,
     write_row_bytes,
 };
 use crate::refcount::refcount;
@@ -166,11 +166,7 @@ macro_rules! impl_table_reader {
 
 // ── Write helpers ────────────────────────────────────────────────────
 
-fn make_row_writer<'a>(
-    buf: &'a mut [u8],
-    dedup: Option<&'a mut DedupState>,
-    locked: bool,
-) -> RowWriter<'a> {
+fn make_row_writer<'a>(buf: &'a mut [u8], dedup: Option<&'a mut DedupState>) -> RowWriter<'a> {
     let h = header(buf);
     let wc = h.write_chunk.load(Ordering::Relaxed) as usize;
     let csz = h.chunk_size as usize;
@@ -188,17 +184,11 @@ fn make_row_writer<'a>(
         overflow: false,
         done: false,
         col_idx: 0,
-        locked,
         ts_col,
         pending_ts: None,
     }
 }
 
-fn begin_row_writer<'a>(buf: &'a mut [u8], dedup: Option<&'a mut DedupState>) -> RowWriter<'a> {
-    acquire_write_lock(buf);
-    make_row_writer(buf, dedup, true)
-}
-
 fn row_data_size(values: &[Value]) -> usize {
     values.iter().map(|v| v.encoded_size()).sum()
 }
@@ -206,7 +196,7 @@ fn row_data_size(values: &[Value]) -> usize {
 pub(crate) fn push_plain_row(buf: &mut [u8], values: &[Value]) {
     let row_data = row_data_size(values);
     if !write_row_bytes(buf, values, row_data) {
-        advance_chunk_unlocked(buf);
+        advance_chunk_raw(buf);
         assert!(
             write_row_bytes(buf, values, row_data),
             "row exceeds chunk capacity"
@@ -214,18 +204,6 @@ pub(crate) fn push_plain_row(buf: &mut [u8], values: &[Value]) {
     }
 }
 
-fn locked_append(buf: &mut [u8], values: &[Value]) -> bool {
-    acquire_write_lock(buf);
-    let ok = write_row_bytes(buf, values, row_data_size(values));
-    release_write_lock(buf);
-    ok
-}
-
-fn locked_advance(buf: &mut [u8]) {
-    acquire_write_lock(buf);
-    advance_chunk_unlocked(buf);
-    release_write_lock(buf);
-}
 
 const MAX_DEDUP_COLS: usize = 64;
 
@@ -413,16 +391,14 @@ impl MemTable {
         let mut buf = vec![0u8; size];
         init_buf(&mut buf, schema, chunk_size, num_chunks);
         Self {
-            backing: Backing::Heap(buf),
-        }
+            backing: Backing::Heap(buf),        }
     }
 
     /// Adopt an existing heap buffer (validates the MEMT layout).
     pub fn from_buf(buf: Vec<u8>) -> Result<Self, &'static str> {
         validate_buf(&buf)?;
         Ok(Self {
-            backing: Backing::Heap(buf),
-        })
+            backing: Backing::Heap(buf),        })
     }
 
     // ── POSIX shared memory (memory-only) ────────────────────────────
@@ -451,8 +427,7 @@ impl MemTable {
                 mmap,
                 name: cname.into_string().expect("validated utf-8"),
                 unlink_on_drop: true,
-            },
-        })
+            },        })
     }
 
     /// Attach to an existing POSIX shared-memory table created by
@@ -471,8 +446,7 @@ impl MemTable {
                 mmap,
                 name: cname.into_string().expect("validated utf-8"),
                 unlink_on_drop: false,
-            },
-        })
+            },        })
     }
 
     // ── mmap'd file (disk-backed, persistent) ────────────────────────
@@ -511,8 +485,7 @@ impl MemTable {
                 path,
                 dir: None,
                 unlink_on_drop: false,
-            },
-        })
+            },        })
     }
 
     /// Reopen an existing mmap'd-file table read-write (validates the
@@ -530,8 +503,7 @@ impl MemTable {
                 path,
                 dir: None,
                 unlink_on_drop: false,
-            },
-        })
+            },        })
     }
 
     // ── discoverable file (data-dir convention) ──────────────────────
@@ -591,8 +563,7 @@ impl MemTable {
                 path,
                 dir: Some(dir),
                 unlink_on_drop: true,
-            },
-        })
+            },        })
     }
 
     // ── backing introspection ─────────────────────────────────────────
@@ -645,26 +616,23 @@ impl MemTable {
     impl_table_reader!();
 
     pub fn row_writer(&mut self) -> RowWriter<'_> {
-        begin_row_writer(self.backing.bytes_mut(), None)
+        make_row_writer(self.backing.bytes_mut(), None)
     }
     pub fn append_row(&mut self, values: &[Value]) -> bool {
         assert!(
             validate_row_schema(self.backing.bytes(), values),
             "value types do not match schema"
         );
-        locked_append(self.backing.bytes_mut(), values)
+        write_row_bytes(self.backing.bytes_mut(), values, row_data_size(values))
     }
     pub fn advance_chunk(&mut self) {
-        locked_advance(self.backing.bytes_mut())
+        advance_chunk_raw(self.backing.bytes_mut())
     }
 
     /// Append a row, auto-advancing to the next chunk when full.
     ///
-    /// # Panic safety
-    ///
-    /// The spinlock is released even if the write panics (e.g. row exceeds
-    /// chunk capacity) — for shared tables this prevents a deadlocked mmap
-    /// file that other processes may still be reading.
+    /// MEMT is single-writer, so no lock is taken. `&mut self` only excludes
+    /// in-process writers; no other process may write a shared/file table.
     pub fn push_row(&mut self, values: &[Value]) {
         assert!(
             validate_row_schema(self.backing.bytes(), values),
@@ -673,15 +641,7 @@ impl MemTable {
         self.push_row_unchecked(values);
     }
     pub fn push_row_unchecked(&mut self, values: &[Value]) {
-        let buf = self.backing.bytes_mut();
-        acquire_write_lock(buf);
-        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
-            push_plain_row(buf, values);
-        }));
-        release_write_lock(buf);
-        if let Err(payload) = result {
-            std::panic::resume_unwind(payload);
-        }
+        push_plain_row(self.backing.bytes_mut(), values);
     }
 }
 
@@ -767,46 +727,30 @@ impl fmt::Display for MemTableView<'_> {
 
 /// Unified writer for external buffers (`&mut [u8]`).
 ///
-/// Supports four modes via builder methods:
+/// MEMT is single-writer, so no lock is taken: the `&mut [u8]` borrow must be
+/// the only writer of the underlying memory. Two modes via builder methods:
 ///
 /// | Mode | Construction |
 /// |------|-------------|
-/// | Locked, plain | `MemTableWriter::new(buf)?` |
-/// | Locked, dedup | `MemTableWriter::new(buf)?.dedup()` |
-/// | Solo, plain | `MemTableWriter::new(buf)?.solo()` |
-/// | Solo, dedup | `MemTableWriter::new(buf)?.solo().dedup()` |
-///
-/// **Locked** (default): writers are serialized via a spinlock — safe for
-/// multiple writer threads sharing the same buffer through raw pointers.
-///
-/// **Solo**: no spinlock — the `&mut [u8]` borrow guarantees exclusive
-/// access at compile time.  Saves ~5 ns/row of CAS overhead.
+/// | Plain | `MemTableWriter::new(buf)?` |
+/// | Dedup | `MemTableWriter::new(buf)?.dedup()` |
 ///
 /// **Dedup**: per-chunk, hash-based string/bytes dedup.  Repeated values
 /// are stored as 4-byte back-references within the same chunk.
 pub struct MemTableWriter<'a> {
     buf: &'a mut [u8],
     dedup: Option<DedupState>,
-    locked: bool,
 }
 
 impl<'a> MemTableWriter<'a> {
     pub fn new(buf: &'a mut [u8]) -> Result<Self, &'static str> {
         validate_buf(buf)?;
-        Ok(Self {
-            buf,
-            dedup: None,
-            locked: true,
-        })
+        Ok(Self { buf, dedup: None })
     }
 
     pub fn init(buf: &'a mut [u8], schema: &Schema, chunk_size: u32, num_chunks: u32) -> Self {
         init_buf(buf, schema, chunk_size, num_chunks);
-        Self {
-            buf,
-            dedup: None,
-            locked: true,
-        }
+        Self { buf, dedup: None }
     }
 
     /// Enable per-chunk string/bytes dedup.  Sets `FLAG_DEDUP` in header.
@@ -816,12 +760,6 @@ impl<'a> MemTableWriter<'a> {
         self
     }
 
-    /// Disable the spinlock (single-producer mode).
-    pub fn solo(mut self) -> Self {
-        self.locked = false;
-        self
-    }
-
     pub fn set_min_dedup_len(&mut self, len: usize) {
         if let Some(ref mut s) = self.dedup {
             s.set_min_dedup_len(len);
@@ -838,11 +776,7 @@ impl<'a> MemTableWriter<'a> {
     impl_table_reader!();
 
     pub fn row_writer(&mut self) -> RowWriter<'_> {
-        if self.locked {
-            begin_row_writer(self.buf, self.dedup.as_mut())
-        } else {
-            make_row_writer(self.buf, self.dedup.as_mut(), false)
-        }
+        make_row_writer(self.buf, self.dedup.as_mut())
     }
 
     pub fn push_row(&mut self, values: &[Value]) {
@@ -858,16 +792,10 @@ impl<'a> MemTableWriter<'a> {
     }
 
     pub fn advance_chunk(&mut self) {
-        if self.locked {
-            acquire_write_lock(self.buf);
-        }
-        advance_chunk_unlocked(self.buf);
+        advance_chunk_raw(self.buf);
         if let Some(ref mut s) = self.dedup {
             s.clear();
         }
-        if self.locked {
-            release_write_lock(self.buf);
-        }
     }
 
     pub fn append_row(&mut self, values: &[Value]) -> bool {
@@ -875,27 +803,17 @@ impl<'a> MemTableWriter<'a> {
             validate_row_schema(self.buf, values),
             "value types do not match schema"
         );
-        if self.locked {
-            acquire_write_lock(self.buf);
-        }
-        let ok = if let Some(ref mut state) = self.dedup {
+        if let Some(ref mut state) = self.dedup {
             append_row_dedup_bytes(self.buf, state, values)
         } else {
             write_row_bytes(self.buf, values, row_data_size(values))
-        };
-        if self.locked {
-            release_write_lock(self.buf);
         }
-        ok
     }
 
     fn push_inner(&mut self, values: &[Value]) {
-        if self.locked {
-            acquire_write_lock(self.buf);
-        }
         if let Some(ref mut state) = self.dedup {
             if !append_row_dedup_bytes(self.buf, state, values) {
-                advance_chunk_unlocked(self.buf);
+                advance_chunk_raw(self.buf);
                 state.clear();
                 assert!(
                     append_row_dedup_bytes(self.buf, state, values),
@@ -905,19 +823,15 @@ impl<'a> MemTableWriter<'a> {
         } else {
             push_plain_row(self.buf, values);
         }
-        if self.locked {
-            release_write_lock(self.buf);
-        }
     }
 }
 
 impl fmt::Display for MemTableWriter<'_> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mode = match (self.locked, self.dedup.is_some()) {
-            (true, false) => "locked",
-            (true, true) => "locked+dedup",
-            (false, false) => "solo",
-            (false, true) => "solo+dedup",
+        let mode = if self.dedup.is_some() {
+            "dedup"
+        } else {
+            "plain"
         };
         write!(
             f,
@@ -1349,7 +1263,6 @@ mod tests {
         assert_eq!(h.num_chunks, 4);
         assert_eq!(h.chunk_size, 1024);
         assert_eq!(h.write_chunk.load(Ordering::Relaxed), 0);
-        assert_eq!(h.write_lock.load(Ordering::Relaxed), 0);
         assert_eq!(h.refcount.load(Ordering::Relaxed), 1);
     }
 
@@ -1488,67 +1401,15 @@ mod tests {
     }
 
     #[test]
-    fn concurrent_multiple_writers() {
-        use std::alloc;
-        use std::thread;
-
-        let schema = Schema::new().col("tid", DType::I64).col("seq", DType::I64);
-        let chunk_size = 8192u32;
-        let num_chunks = 8u32;
-        let size = MemTable::required_size(&schema, chunk_size as usize, num_chunks as usize);
-        let layout = alloc::Layout::from_size_align(size, 64).unwrap();
-        let ptr = unsafe { alloc::alloc_zeroed(layout) };
-        assert!(!ptr.is_null());
-
-        unsafe {
-            let buf = std::slice::from_raw_parts_mut(ptr, size);
-            init_buf(buf, &schema, chunk_size, num_chunks);
-        }
-
-        let num_writers = 8;
-        let rows_per_writer = 50;
-        let addr = ptr as usize;
-
-        // 单写线程：多线程各自 `&mut` 同一块缓冲在语言层面是 UB，release 下易死锁/损坏元数据。
-        let writer = thread::spawn(move || {
-            let buf = unsafe { std::slice::from_raw_parts_mut(addr as *mut u8, size) };
-            let mut mt = MemTableWriter::new(buf).unwrap();
-            for tid in 0..num_writers {
-                for seq in 0..rows_per_writer as i64 {
-                    mt.push_row(&[Value::I64(tid as i64), Value::I64(seq)]);
-                }
-            }
-        });
-        writer.join().unwrap();
-
-        unsafe {
-            let buf = std::slice::from_raw_parts(ptr, size);
-            let view = MemTableView::new(buf).unwrap();
-            let total: usize = (0..view.num_chunks()).map(|c| view.num_rows(c)).sum();
-            assert_eq!(total, num_writers * rows_per_writer);
-
-            // every row should be a valid (tid, seq) pair
-            for chunk in 0..view.num_chunks() {
-                for row in view.rows(chunk) {
-                    let mut c = row.cursor();
-                    let tid = c.next_i64();
-                    let seq = c.next_i64();
-                    assert!((0..num_writers as i64).contains(&tid));
-                    assert!((0..rows_per_writer as i64).contains(&seq));
-                }
-            }
-
-            alloc::dealloc(ptr, layout);
-        }
-    }
-
-    #[test]
-    fn concurrent_writers_and_readers() {
+    fn single_writer_concurrent_readers() {
         use std::alloc;
         use std::sync::atomic::{AtomicBool, AtomicUsize};
         use std::sync::{Arc, Barrier};
         use std::thread;
 
+        // The production model: one writer feeds the ring while N lock-free
+        // readers continuously scan it. Readers must never observe a torn or
+        // corrupt row.
         let schema = Schema::new().col("val", DType::I64);
         let chunk_size = 4096u32;
         let num_chunks = 4u32;
@@ -1562,16 +1423,14 @@ mod tests {
             init_buf(buf, &schema, chunk_size, num_chunks);
         }
 
-        let num_writers = 4;
-        let rows_per_writer = 100;
+        let total_rows = 400i64;
         let num_readers = 4;
         let addr = ptr as usize;
         let done = Arc::new(AtomicBool::new(false));
         let total_reads = Arc::new(AtomicUsize::new(0));
-        // 1 个写线程 + num_readers 个读线程（不能多写线程同缓冲 &mut，见 concurrent_multiple_writers）
         let barrier = Arc::new(Barrier::new(1 + num_readers));
 
-        // spawn readers — continuously scan all chunks while writers are active
+        // Readers continuously scan all chunks while the writer is active.
         let reader_handles: Vec<_> = (0..num_readers)
             .map(|_| {
                 let done = done.clone();
@@ -1604,10 +1463,8 @@ mod tests {
                 barrier.wait();
                 let buf = unsafe { std::slice::from_raw_parts_mut(addr as *mut u8, size) };
                 let mut mt = MemTableWriter::new(buf).unwrap();
-                for tid in 0..num_writers {
-                    for seq in 0..rows_per_writer as i64 {
-                        mt.push_row(&[Value::I64(tid as i64 * 1000 + seq)]);
-                    }
+                for seq in 0..total_rows {
+                    mt.push_row(&[Value::I64(seq)]);
                 }
             })
         };
@@ -1619,72 +1476,17 @@ mod tests {
             h.join().unwrap();
         }
 
-        // readers actually read some rows
         assert!(
             total_reads.load(Ordering::Relaxed) > 0,
             "readers should have observed rows"
         );
 
-        // final consistency: total rows == writers × rows_per_writer
+        // Final consistency: every written row is present.
         unsafe {
             let buf = std::slice::from_raw_parts(ptr, size);
             let view = MemTableView::new(buf).unwrap();
             let total: usize = (0..view.num_chunks()).map(|c| view.num_rows(c)).sum();
-            assert_eq!(total, num_writers * rows_per_writer);
-            alloc::dealloc(ptr, layout);
-        }
-    }
-
-    #[test]
-    fn concurrent_row_writer_contention() {
-        use std::alloc;
-        use std::thread;
-
-        let schema = Schema::new().col("tid", DType::I32).col("msg", DType::Str);
-        let chunk_size = 16384u32;
-        let num_chunks = 4u32;
-        let size = MemTable::required_size(&schema, chunk_size as usize, num_chunks as usize);
-        let layout = alloc::Layout::from_size_align(size, 64).unwrap();
-        let ptr = unsafe { alloc::alloc_zeroed(layout) };
-        assert!(!ptr.is_null());
-
-        unsafe {
-            let buf = std::slice::from_raw_parts_mut(ptr, size);
-            init_buf(buf, &schema, chunk_size, num_chunks);
-        }
-
-        let num_writers = 8;
-        let rows_per_writer = 60;
-        let addr = ptr as usize;
-
-        let writer = thread::spawn(move || {
-            let buf = unsafe { std::slice::from_raw_parts_mut(addr as *mut u8, size) };
-            let mut mt = MemTableWriter::new(buf).unwrap();
-            for tid in 0..num_writers {
-                let tag = format!("t{tid}");
-                for _ in 0..rows_per_writer {
-                    mt.row_writer().put_i32(tid as i32).put_str(&tag).finish();
-                }
-            }
-        });
-        writer.join().unwrap();
-
-        unsafe {
-            let buf = std::slice::from_raw_parts(ptr, size);
-            let view = MemTableView::new(buf).unwrap();
-            let total: usize = (0..view.num_chunks()).map(|c| view.num_rows(c)).sum();
-            assert_eq!(total, num_writers * rows_per_writer);
-
-            for chunk in 0..view.num_chunks() {
-                for row in view.rows(chunk) {
-                    let mut c = row.cursor();
-                    let tid = c.next_i32();
-                    let msg = c.next_str();
-                    assert!((0..num_writers as i32).contains(&tid));
-                    assert_eq!(msg, format!("t{tid}"));
-                }
-            }
-
+            assert_eq!(total, total_rows as usize);
             alloc::dealloc(ptr, layout);
         }
     }
@@ -1868,14 +1670,14 @@ mod tests {
         t.push_row(&[Value::Str("oops")]); // Str instead of U32
     }
 
-    // ── MemTableWriter solo mode tests ──────────────────────────
+    // ── MemTableWriter tests ──────────────────────────
 
     #[test]
-    fn solo_writer_basic() {
+    fn mem_table_writer_basic() {
         let schema = Schema::new().col("ts", DType::I64).col("val", DType::F64);
         let size = MemTable::required_size(&schema, 4096, 2);
         let mut buf = vec![0u8; size];
-        let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 2).solo();
+        let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 2);
 
         sw.push_row(&[Value::I64(100), Value::F64(3.14)]);
         sw.push_row(&[Value::I64(200), Value::F64(2.72)]);
@@ -1888,11 +1690,11 @@ mod tests {
     }
 
     #[test]
-    fn solo_writer_row_writer() {
+    fn mem_table_writer_row_writer() {
         let schema = Schema::new().col("id", DType::I32).col("msg", DType::Str);
         let size = MemTable::required_size(&schema, 4096, 1);
         let mut buf = vec![0u8; size];
-        let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 1).solo();
+        let mut sw = MemTableWriter::init(&mut buf, &schema, 4096, 1);
 
         sw.row_writer().put_i32(1).put_str("hello").finish();
         sw.row_writer().put_i32(2).put_str("world").finish();
@@ -1905,28 +1707,11 @@ mod tests {
     }
 
     #[test]
-    fn solo_writer_no_lock_touched() {
-        let schema = Schema::new().col("x", DType::I32);
-        let size = MemTable::required_size(&schema, 1024, 1);
-        let mut buf = vec![0u8; size];
-        let mut sw = MemTableWriter::init(&mut buf, &schema, 1024, 1).solo();
-        sw.push_row(&[Value::I32(42)]);
-        sw.row_writer().put_i32(99).finish();
-        assert_eq!(
-            header(sw.as_bytes()).write_lock.load(Ordering::Relaxed),
-            0,
-            "solo mode must never touch the write_lock"
-        );
-    }
-
-    #[test]
-    fn solo_writer_dedup() {
+    fn mem_table_writer_dedup() {
         let schema = Schema::new().col("tag", DType::Str).col("seq", DType::I32);
         let size = MemTable::required_size(&schema, 8192, 1);
         let mut buf = vec![0u8; size];
-        let mut sw = MemTableWriter::init(&mut buf, &schema, 8192, 1)
-            .solo()
-            .dedup();
+        let mut sw = MemTableWriter::init(&mut buf, &schema, 8192, 1).dedup();
 
         for i in 0..20 {
             sw.push_row(&[Value::Str("repeat"), Value::I32(i)]);
@@ -1934,9 +1719,9 @@ mod tests {
 
         let used_dedup = sw.chunk_used(0);
 
-        // Compare with plain solo writer
+        // Compare with a plain writer
         let mut buf2 = vec![0u8; size];
-        let mut sw2 = MemTableWriter::init(&mut buf2, &schema, 8192, 1).solo();
+        let mut sw2 = MemTableWriter::init(&mut buf2, &schema, 8192, 1);
         for i in 0..20 {
             sw2.push_row(&[Value::Str("repeat"), Value::I32(i)]);
         }
@@ -1955,11 +1740,11 @@ mod tests {
     }
 
     #[test]
-    fn solo_writer_auto_advance() {
+    fn mem_table_writer_auto_advance() {
         let schema = Schema::new().col("v", DType::I64);
         let size = MemTable::required_size(&schema, 64, 4);
         let mut buf = vec![0u8; size];
-        let mut sw = MemTableWriter::init(&mut buf, &schema, 64, 4).solo();
+        let mut sw = MemTableWriter::init(&mut buf, &schema, 64, 4);
 
         for i in 0..50i64 {
             sw.push_row_unchecked(&[Value::I64(i)]);
@@ -2059,85 +1844,4 @@ mod tests {
         header_mut(t.as_bytes_mut()).ts_col = 1; // col 0 is I64 → ok
         assert!(MemTableView::new(t.as_bytes()).is_ok());
     }
-
-    // ── robust write lock ──────────────────────────────────────────────
-
-    /// PID of a process that no longer exists: spawn a short-lived child
-    /// and wait for it to exit.
-    fn dead_pid() -> u32 {
-        let mut child = std::process::Command::new("true")
-            .spawn()
-            .expect("spawn true");
-        let pid = child.id();
-        child.wait().expect("wait true");
-        pid
-    }
-
-    #[test]
-    fn lock_word_holds_pid_while_held() {
-        let schema = Schema::new().col("x", DType::I32);
-        let mut t = MemTable::new(&schema, 1024, 1);
-        let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize;
-        let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) };
-        {
-            let _w = t.row_writer(); // holds the lock
-            assert_eq!(
-                lock.load(Ordering::Relaxed),
-                std::process::id(),
-                "lock word must hold the owner PID"
-            );
-        }
-        assert_eq!(lock.load(Ordering::Relaxed), 0);
-    }
-
-    #[test]
-    fn stale_lock_from_dead_process_is_stolen() {
-        let schema = Schema::new().col("x", DType::I32);
-        let mut t = MemTable::new(&schema, 1024, 1);
-
-        // Simulate a writer that crashed inside the critical section.
-        header(t.as_bytes())
-            .write_lock
-            .store(dead_pid(), Ordering::SeqCst);
-
-        let start = std::time::Instant::now();
-        t.push_row(&[Value::I32(42)]); // must not deadlock
-        let took = start.elapsed();
-
-        assert_eq!(t.rows(0).next().unwrap().col_i32(0), 42);
-        assert_eq!(header(t.as_bytes()).write_lock.load(Ordering::Relaxed), 0);
-        assert!(
-            took >= crate::layout::LOCK_STEAL_TIMEOUT,
-            "steal must wait out the timeout first (took {took:?})"
-        );
-    }
-
-    #[test]
-    fn live_holder_is_not_preempted() {
-        let schema = Schema::new().col("x", DType::I32);
-        let mut t = MemTable::new(&schema, 1024, 1);
-
-        // Another thread of this (alive) process holds the lock and
-        // releases it well past the steal timeout.
-        let me = std::process::id();
-        header(t.as_bytes()).write_lock.store(me, Ordering::SeqCst);
-        let lock_ptr = header(t.as_bytes()).write_lock.as_ptr() as usize;
-        let hold = crate::layout::LOCK_STEAL_TIMEOUT + std::time::Duration::from_millis(200);
-        let releaser = std::thread::spawn(move || {
-            std::thread::sleep(hold);
-            let lock = unsafe { &*(lock_ptr as *const std::sync::atomic::AtomicU32) };
-            lock.store(0, Ordering::Release);
-        });
-
-        let start = std::time::Instant::now();
-        t.push_row(&[Value::I32(7)]);
-        let took = start.elapsed();
-        releaser.join().unwrap();
-
-        assert!(
-            took >= hold - std::time::Duration::from_millis(50),
-            "live holder must be waited on, not preempted (took {took:?})"
-        );
-        assert_eq!(t.rows(0).next().unwrap().col_i32(0), 7);
-    }
 }
diff --git a/probing/memtable/src/raw.rs b/probing/memtable/src/raw.rs
index 9eb61272..94948f71 100644
--- a/probing/memtable/src/raw.rs
+++ b/probing/memtable/src/raw.rs
@@ -113,11 +113,12 @@ pub(crate) fn write_row_bytes(buf: &mut [u8], values: &[Value], row_data: usize)
     true
 }
 
-/// Advance the ring buffer to the next chunk (caller must hold the write lock).
+/// Advance the ring buffer to the next chunk.
 ///
-/// Takes `&mut [u8]` so that LLVM does not mark the pointer `readonly`;
-/// see [`acquire_write_lock`](crate::layout::acquire_write_lock) for details.
-pub(crate) fn advance_chunk_unlocked(buf: &mut [u8]) {
+/// MEMT is single-writer, so no lock is taken. Takes `&mut [u8]` so that
+/// LLVM does not mark the pointer `readonly` (which would let it elide the
+/// atomic stores below in optimised builds).
+pub(crate) fn advance_chunk_raw(buf: &mut [u8]) {
     let ptr = buf.as_mut_ptr();
     unsafe {
         let h = &*(ptr as *const Header);
@@ -359,11 +360,11 @@ pub(crate) fn init_buf(buf: &mut [u8], schema: &Schema, chunk_size: u32, num_chu
     h.chunk_size = chunk_size;
     h.data_offset = data_off as u32;
     h.write_chunk.store(0, Ordering::Relaxed);
-    h.write_lock.store(0, Ordering::Relaxed);
     h.refcount.store(1, Ordering::Relaxed);
     h.creator_pid = std::process::id();
+    h._pad0 = 0;
     h.creator_start_time = process_start_time(std::process::id());
-    h.lock_owner_start.store(0, Ordering::Relaxed);
+    h._reserved = 0;
 
     for (i, col) in schema.cols.iter().enumerate() {
         let cd = col_desc_mut(buf, i);
diff --git a/probing/memtable/src/writer.rs b/probing/memtable/src/writer.rs
index 1a4cb428..468979d0 100644
--- a/probing/memtable/src/writer.rs
+++ b/probing/memtable/src/writer.rs
@@ -1,13 +1,12 @@
 use crate::dedup::DedupState;
-use crate::layout::{chunk_header, release_write_lock, w32, CHUNK_HEADER_SIZE};
+use crate::layout::{chunk_header, w32, CHUNK_HEADER_SIZE};
 use crate::raw::note_row_ts;
 use std::sync::atomic::Ordering;
 
 /// Streaming row writer — **low-overhead, weak-contract** hot-path API.
 ///
-/// When `locked` is true (default), holds the write lock from creation
-/// until [`finish()`](Self::finish) (or `Drop`).
-/// When `locked` is false (solo mode), no lock is touched.
+/// MEMT is single-writer, so no lock is taken; the `&mut` borrow guarantees
+/// exclusive access for the writer's lifetime.
 ///
 /// Callers must supply columns in schema order via the typed `put_*`
 /// methods; **no per-call schema validation is performed**.
@@ -25,7 +24,6 @@ pub struct RowWriter<'a> {
     pub(crate) overflow: bool,
     pub(crate) done: bool,
     pub(crate) col_idx: usize,
-    pub(crate) locked: bool,
     /// `Header::ts_col` (timestamp column index + 1; 0 = none).
     pub(crate) ts_col: u16,
     /// Timestamp captured by `put_i64` on the designated column,
@@ -132,7 +130,8 @@ impl<'a> RowWriter<'a> {
         self
     }
 
-    /// Commit the row and release the write lock (if held).
+    /// Commit the row. Returns `false` if the row overflowed the chunk (and
+    /// nothing was committed) or `finish` was already called.
     pub fn finish(&mut self) -> bool {
         if self.done {
             return false;
@@ -155,27 +154,14 @@ impl<'a> RowWriter<'a> {
                 .fetch_add(1, Ordering::Release);
             true
         };
-        if self.locked {
-            release_write_lock(self.buf);
-        }
         ok
     }
 }
 
-impl Drop for RowWriter<'_> {
-    fn drop(&mut self) {
-        if !self.done && self.locked {
-            release_write_lock(self.buf);
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
-    use crate::layout::header;
     use crate::memtable::MemTable;
     use crate::schema::{DType, Schema, Value};
-    use std::sync::atomic::Ordering;
 
     #[test]
     fn row_writer_basic() {
@@ -217,15 +203,17 @@ mod tests {
     }
 
     #[test]
-    fn row_writer_drop_releases_lock() {
+    fn row_writer_drop_without_finish_commits_nothing() {
         let schema = Schema::new().col("x", DType::I32);
         let mut t = MemTable::new(&schema, 1024, 1);
         {
-            let _w = t.row_writer(); // acquires lock
-                                     // dropped without finish() → lock released by Drop
+            let mut w = t.row_writer();
+            w.put_i32(99); // dropped without finish() → row not committed
         }
-        // lock should be free; this must not deadlock
+        assert_eq!(t.num_rows(0), 0, "uncommitted row must not be visible");
+        // A subsequent write still works and is the first visible row.
         t.push_row(&[Value::I32(42)]);
+        assert_eq!(t.num_rows(0), 1);
         assert_eq!(t.rows(0).next().unwrap().col_i32(0), 42);
     }
 
@@ -245,15 +233,14 @@ mod tests {
     }
 
     #[test]
-    fn write_lock_field_is_zero_after_operations() {
+    fn mixed_push_and_row_writer() {
         let schema = Schema::new().col("x", DType::I32);
         let mut t = MemTable::new(&schema, 1024, 1);
         t.push_row(&[Value::I32(1)]);
         t.row_writer().put_i32(2).finish();
-        assert_eq!(
-            header(t.as_bytes()).write_lock.load(Ordering::Relaxed),
-            0,
-            "write_lock must be 0 after all operations complete"
-        );
+        assert_eq!(t.num_rows(0), 2);
+        let rows: Vec<_> = t.rows(0).collect();
+        assert_eq!(rows[0].col_i32(0), 1);
+        assert_eq!(rows[1].col_i32(0), 2);
     }
 }
diff --git a/web/Cargo.toml b/web/Cargo.toml
index a17c9b45..99447114 100644
--- a/web/Cargo.toml
+++ b/web/Cargo.toml
@@ -7,9 +7,9 @@ edition = "2021"
 
 [dependencies]
 # Dioxus dependencies
-dioxus = { version = "0.7", features = ["web"] }
-dioxus-router = "0.7"
-dioxus-web = "0.7"
+dioxus = { version = "0.7.9", features = ["web"] }
+dioxus-router = "0.7.9"
+dioxus-web = "0.7.9"
 
 # Serialization
 serde = { version = "1.0", features = ["derive"] }
@@ -41,7 +41,8 @@ probing-proto = { path = "../probing/proto", default-features = false, features
 [profile.release]
 opt-level = "z"
 debug = false
-strip = true
+# Do not set strip here — dx strips after asset extraction via rust-objcopy,
+# which breaks on some nightly macOS toolchains (missing libLLVM.dylib).
 
 [profile.wasm-dev]
 inherits = "dev"

From dcced68b117a06b6b5b1f025ea791bc6d65f6f4d Mon Sep 17 00:00:00 2001
From: Reiase <reiase@gmail.com>
Date: Sun, 14 Jun 2026 00:11:08 +0800
Subject: [PATCH 3/3] Update version to 0.2.5 across multiple files

- Bumped the `probing` version from `0.2.4` to `0.2.5` in `Cargo.toml`, `pyproject.toml`, `python/probing/__init__.py`, and the workspace crate entries in `Cargo.lock`.
- Updated documentation in `installation.md` and its Chinese counterpart to reflect the new version.
- Changed the CLI version report to use `env!("CARGO_PKG_VERSION")`, resolved at compile time, instead of a hard-coded (and stale) version string.
---
 Cargo.lock                       | 20 ++++++++++----------
 Cargo.toml                       |  2 +-
 docs/src/design/data-layer.md    | 12 ++++++------
 docs/src/design/data-layer.zh.md | 12 ++++++------
 docs/src/installation.md         |  2 +-
 docs/src/installation.zh.md      |  2 +-
 probing/cli/src/cli/mod.rs       |  2 +-
 probing/memtable/src/layout.rs   | 18 ++++++++----------
 probing/memtable/src/lib.rs      |  4 ++--
 pyproject.toml                   |  2 +-
 python/probing/__init__.py       |  2 +-
 11 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cf4703ab..cca12cf7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3329,7 +3329,7 @@ dependencies = [
 
 [[package]]
 name = "probing"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "anyhow",
  "arrow",
@@ -3349,7 +3349,7 @@ dependencies = [
 
 [[package]]
 name = "probing-cc"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -3365,7 +3365,7 @@ dependencies = [
 
 [[package]]
 name = "probing-cli"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "anyhow",
  "clap 4.5.38",
@@ -3394,7 +3394,7 @@ dependencies = [
 
 [[package]]
 name = "probing-core"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "anyhow",
  "arrow",
@@ -3421,7 +3421,7 @@ dependencies = [
 
 [[package]]
 name = "probing-macros"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "probing-core",
  "quote",
@@ -3430,7 +3430,7 @@ dependencies = [
 
 [[package]]
 name = "probing-memtable"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "libc",
  "memmap2",
@@ -3440,7 +3440,7 @@ dependencies = [
 
 [[package]]
 name = "probing-proto"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "anyhow",
  "arrow",
@@ -3456,7 +3456,7 @@ dependencies = [
 
 [[package]]
 name = "probing-python"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -3488,7 +3488,7 @@ dependencies = [
 
 [[package]]
 name = "probing-server"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -3519,7 +3519,7 @@ dependencies = [
 
 [[package]]
 name = "probing-store"
-version = "0.2.4"
+version = "0.2.5"
 dependencies = [
  "thiserror 2.0.12",
  "tokio",
diff --git a/Cargo.toml b/Cargo.toml
index 56c8210b..faec3266 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ members = [
 ]
 
 [workspace.package]
-version = "0.2.4"
+version = "0.2.5"
 authors = ["reiase <reiase@gmail.com>"]
 edition = "2021"
 license = "Apache-2.0"
diff --git a/docs/src/design/data-layer.md b/docs/src/design/data-layer.md
index 2dd565dc..c51e67da 100644
--- a/docs/src/design/data-layer.md
+++ b/docs/src/design/data-layer.md
@@ -53,12 +53,12 @@ The hot tier is mapped read-only at query time; the cold tier is read via `Segme
 Every MEMT buffer (heap, shared memory, or mmap'd file) begins with a 64-byte header (one cache
 line), followed by per-column descriptors, then chunk data.
 
-**Header v4 (64 bytes):**
+**Header v3 (64 bytes):**
 
 | offset | size | field | notes |
 |---|---|---|---|
 | 0 | 4 | `magic` | `0x4D454D54` (`"MEMT"`) |
-| 4 | 2 | `version` | 4 |
+| 4 | 2 | `version` | 3 |
 | 6 | 2 | `header_size` | 64 (validation) |
 | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` |
 | 10 | 2 | `ts_col` | timestamp column index + 1 (0 = none) |
@@ -70,16 +70,16 @@ line), followed by per-column descriptors, then chunk data.
 | 32 | 4 | `write_chunk` | `AtomicU32` — current ring slot |
 | 36 | 4 | `refcount` | `AtomicU32` |
 | 40 | 4 | `creator_pid` | |
-| 44 | 4 | `_pad0` | alignment (was `write_lock` in v3) |
+| 44 | 4 | `_pad0` | alignment (was `write_lock` in v2) |
 | 48 | 8 | `creator_start_time` | for PID-recycling detection during discovery |
-| 56 | 8 | `_reserved` | reserved (was `lock_owner_start` in v3) |
+| 56 | 8 | `_reserved` | reserved |
 
 Bytes 0–31 are the **cold zone** (immutable after init); bytes 32–63 are the **hot zone**
 (atomically mutated), split to avoid false sharing. Each chunk starts with a 40-byte
 `ChunkHeader` carrying a `generation` counter and per-chunk `min_ts`/`max_ts` (`AtomicI64`).
 
-> **v4** dropped the `write_lock` and `lock_owner_start` fields: MEMT is single-writer, so there is
-> no in-buffer write lock. Their byte slots are now reserved.
+> **v3** vs v2: the v2 padding at offset 10 became `ts_col`; `write_lock` was dropped (single-writer
+> model; its slot is now `_pad0`); `ChunkHeader` gained `min_ts`/`max_ts` (24 → 40 bytes).
 
 ### Backends
 
diff --git a/docs/src/design/data-layer.zh.md b/docs/src/design/data-layer.zh.md
index 9303b605..d1e7a3a0 100644
--- a/docs/src/design/data-layer.zh.md
+++ b/docs/src/design/data-layer.zh.md
@@ -47,12 +47,12 @@ graph LR
 每个 MEMT 缓冲区（堆、共享内存或 mmap 文件）都以 64 字节头部（一个 cache line）开始，随后是
 逐列描述符，再是 chunk 数据。
 
-**Header v4（64 字节）：**
+**Header v3（64 字节）：**
 
 | 偏移 | 大小 | 字段 | 说明 |
 |---|---|---|---|
 | 0 | 4 | `magic` | `0x4D454D54`（`"MEMT"`） |
-| 4 | 2 | `version` | 4 |
+| 4 | 2 | `version` | 3 |
 | 6 | 2 | `header_size` | 64（仅校验） |
 | 8 | 2 | `byte_order` | BOM `[0x01,0x02]` |
 | 10 | 2 | `ts_col` | 时间戳列索引 + 1（0 = 无） |
@@ -64,16 +64,16 @@ graph LR
 | 32 | 4 | `write_chunk` | `AtomicU32`——当前环形槽位 |
 | 36 | 4 | `refcount` | `AtomicU32` |
 | 40 | 4 | `creator_pid` | |
-| 44 | 4 | `_pad0` | 对齐填充（v3 中为 `write_lock`） |
+| 44 | 4 | `_pad0` | 对齐填充（v2 中为 `write_lock`） |
 | 48 | 8 | `creator_start_time` | 用于发现期的 PID 回收检测 |
-| 56 | 8 | `_reserved` | 预留（v3 中为 `lock_owner_start`） |
+| 56 | 8 | `_reserved` | 预留 |
 
 字节 0–31 是**冷区**（初始化后不可变），字节 32–63 是**热区**（运行时原子修改），二者分离以避免
 伪共享。每个 chunk 以 40 字节的 `ChunkHeader` 开头，携带 `generation` 计数器及逐 chunk 的
 `min_ts`/`max_ts`（`AtomicI64`）。
 
-> **v4** 移除了 `write_lock` 与 `lock_owner_start` 字段：MEMT 是单写者，缓冲区内不再有写锁。其字节
-> 槽位现已预留。
+> **v3** 相对 v2：偏移 10 处的 v2 填充字段改为 `ts_col`；移除 `write_lock`（单写者模型，其槽位现为 `_pad0`）；
+> `ChunkHeader` 新增 `min_ts`/`max_ts`（24 → 40 字节）。
 
 ### 三种后端
 
diff --git a/docs/src/installation.md b/docs/src/installation.md
index 80035d42..05dff6c4 100644
--- a/docs/src/installation.md
+++ b/docs/src/installation.md
@@ -53,7 +53,7 @@ probing --version
 This should print the installed version of Probing, for example:
 
 ```
-probing 0.2.4
+probing 0.2.5
 ```
 
 You can also check if the `probing` command is available:
diff --git a/docs/src/installation.zh.md b/docs/src/installation.zh.md
index 9ac8c4a3..52672ad2 100644
--- a/docs/src/installation.zh.md
+++ b/docs/src/installation.zh.md
@@ -51,7 +51,7 @@ probing --version
 应该会输出已安装的 Probing 版本，例如：
 
 ```
-probing 0.2.4
+probing 0.2.5
 ```
 
 您也可以检查 `probing` 命令是否可用：
diff --git a/probing/cli/src/cli/mod.rs b/probing/cli/src/cli/mod.rs
index b900e672..519cd7fa 100644
--- a/probing/cli/src/cli/mod.rs
+++ b/probing/cli/src/cli/mod.rs
@@ -25,7 +25,7 @@ use commands::Commands;
 use once_cell::sync::Lazy;
 
 fn get_build_info() -> String {
-    let mut info = "0.2.1".to_string();
+    let mut info = env!("CARGO_PKG_VERSION").to_string();
 
     if let Some(timestamp) = option_env!("VERGEN_BUILD_TIMESTAMP") {
         info.push_str(&format!("\nBuild Timestamp: {timestamp}"));
diff --git a/probing/memtable/src/layout.rs b/probing/memtable/src/layout.rs
index 6729d7cb..b7bc52ae 100644
--- a/probing/memtable/src/layout.rs
+++ b/probing/memtable/src/layout.rs
@@ -1,12 +1,12 @@
 //! Low-level layout: header, column descriptors, chunk headers, byte helpers.
 //!
-//! ## Header v4 binary layout (64 bytes, 1 cache line)
+//! ## Header v3 binary layout (64 bytes, 1 cache line)
 //!
 //! ```text
 //! offset  size  field               notes
 //! ──────────────────────────────────────────────────────────
 //!  0       4    magic               0x4D454D54 ("MEMT" in LE)
-//!  4       2    version             4
+//!  4       2    version             3
 //!  6       2    header_size         64 (validation only)
 //!  8       2    byte_order          BOM: written as [0x01, 0x02]
 //! 10       2    ts_col              timestamp column index + 1 (0 = none)
@@ -44,11 +44,9 @@ pub(crate) const MAGIC: u32 = MAGIC_MEMT;
 
 /// Header format version for MEMT.
 ///
-/// v4: dropped the `write_lock` and `lock_owner_start` fields — MEMT is
-/// single-writer, so there is no in-buffer write lock. Their bytes are now
-/// `_pad0`/`_reserved`. v3 added per-chunk `min_ts`/`max_ts` and the PID
-/// write lock (both since superseded).
-pub(crate) const VERSION: u16 = 4;
+/// v3: the v2 padding at offset 10 became `ts_col`; `write_lock` was dropped
+/// (single-writer model); `ChunkHeader` grew `min_ts`/`max_ts` (24 → 40 bytes).
+pub(crate) const VERSION: u16 = 3;
 
 /// Byte-order mark: written as raw bytes `[0x01, 0x02]`.
 /// On a LE host, `u16::from_ne_bytes([0x01, 0x02])` == `0x0201`.
@@ -80,7 +78,7 @@ pub(crate) struct Header {
     // ── cold zone (read-only after init) ─────────────────
     pub magic: u32,
     pub version: u16,
-    /// Size of this header in bytes (always 64 in v2).
+    /// Size of this header in bytes (always 64).
     ///
     /// Used for validation only — column descriptors always start at
     /// offset `size_of::<Header>()` (compile-time constant).  If a
@@ -111,14 +109,14 @@ pub(crate) struct Header {
     pub refcount: AtomicU32,
     /// PID of the process that created this table (for cross-process discovery).
     pub creator_pid: u32,
-    /// Padding to 8-align `creator_start_time` (was `write_lock` in v3).
+    /// Padding to 8-align `creator_start_time` (was `write_lock` in v2).
     pub _pad0: u32,
     /// Process start time — for PID-recycling detection during discovery.
     /// Linux: clock ticks since boot (`/proc/<pid>/stat` field 22).
     /// macOS: microseconds since epoch (via `sysctl`).
     /// Other: 0 (falls back to PID-only liveness check).
     pub creator_start_time: u64,
-    /// Reserved for future use (was `lock_owner_start` in v3).
+    /// Reserved for future use (was part of `_reserved` in v2).
     pub _reserved: u64,
 }
 
diff --git a/probing/memtable/src/lib.rs b/probing/memtable/src/lib.rs
index 87f02631..0fa43526 100644
--- a/probing/memtable/src/lib.rs
+++ b/probing/memtable/src/lib.rs
@@ -20,10 +20,10 @@
 //!
 //! ```text
 //! ┌──────────────────────────────────┐ 0
-//! │ Header v4 (64 bytes, repr(C))    │
+//! │ Header v3 (64 bytes, repr(C))    │
 //! │  ── cold zone (read-only) ──     │
 //! │   magic: u32     (0x4D454D54)    │
-//! │   version: u16   (4)             │
+//! │   version: u16   (3)             │
 //! │   header_size: u16 (64)          │
 //! │   byte_order: u16 (BOM 0x0102)   │
 //! │   ts_col: u16                    │
diff --git a/pyproject.toml b/pyproject.toml
index 4edb2329..b352432c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "probing"
-version = "0.2.4"
+version = "0.2.5"
 description = "Dynamic Performance Profiler for Distributed AI"
 readme = "README.md"
 authors = [
diff --git a/python/probing/__init__.py b/python/probing/__init__.py
index 85499c26..ceca30e3 100644
--- a/python/probing/__init__.py
+++ b/python/probing/__init__.py
@@ -22,7 +22,7 @@
 import probing.config as config
 from probing import _core
 
-VERSION = "0.2.4"
+VERSION = "0.2.5"
 
 # Core Primitives
 ExternalTable = _core.ExternalTable