From 10f276ab4425682426550478c9158c18d1584fd3 Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Mon, 27 Apr 2026 18:22:26 +0200 Subject: [PATCH 01/22] Add SIMD and no-hint benchmark variants for PrefixHashMap Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Cargo.toml | 1 + crates/hashmap-bench/Cargo.toml | 24 + crates/hashmap-bench/OPTIMIZATIONS.md | 482 +++++++++++++++++ crates/hashmap-bench/hashmap_insert.rs | 175 ++++++ crates/hashmap-bench/lib.rs | 45 ++ crates/hashmap-bench/prefix_map.rs | 502 +++++++++++++++++ crates/hashmap-bench/prefix_map_simd.rs | 688 ++++++++++++++++++++++++ 7 files changed, 1917 insertions(+) create mode 100644 crates/hashmap-bench/Cargo.toml create mode 100644 crates/hashmap-bench/OPTIMIZATIONS.md create mode 100644 crates/hashmap-bench/hashmap_insert.rs create mode 100644 crates/hashmap-bench/lib.rs create mode 100644 crates/hashmap-bench/prefix_map.rs create mode 100644 crates/hashmap-bench/prefix_map_simd.rs diff --git a/Cargo.toml b/Cargo.toml index 312f46d..524e62a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "crates/*", "crates/bpe/benchmarks", "crates/bpe/tests", + "crates/hashmap-bench", ] resolver = "2" diff --git a/crates/hashmap-bench/Cargo.toml b/crates/hashmap-bench/Cargo.toml new file mode 100644 index 0000000..cae08a7 --- /dev/null +++ b/crates/hashmap-bench/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "hashmap-bench" +edition = "2021" + +[lib] +path = "lib.rs" +test = false + +[[bench]] +name = "hashmap_insert" +path = "hashmap_insert.rs" +harness = false +test = false + +[dependencies] +criterion = "0.7" +rand = "0.9" +rustc-hash = "2" +ahash = "0.8" +hashbrown = "0.15" +foldhash = "0.1" +gxhash = "3" +smallvec = "1" +fnv = "1" diff --git a/crates/hashmap-bench/OPTIMIZATIONS.md b/crates/hashmap-bench/OPTIMIZATIONS.md new file mode 100644 index 0000000..d113e60 --- /dev/null +++ b/crates/hashmap-bench/OPTIMIZATIONS.md @@ -0,0 +1,482 @@ +# Missing Optimizations in PrefixHashMap vs. Rust Swiss Table (hashbrown) + +## Executive Summary + +The `PrefixHashMap` in this repository is a minimal, insertion-only hash map specialized for pre-hashed `u32` keys. While it borrows the core Swiss table concept of control-byte-based group scanning, it omits a large number of optimizations present in the production [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) Swiss table implementation. The most impactful missing optimizations are: **SIMD-accelerated group scanning** (SSE2/NEON), **open-addressing with triangular probing** (instead of overflow chaining), **SoA memory layout** separating control bytes from data for cache efficiency, **in-place rehashing** to reclaim tombstones, **DELETED tombstone support** for element removal, and **over-allocation utilization**. This report catalogs every significant optimization gap across architecture, probing, memory layout, SIMD, resize strategy, and API completeness. + +--- + +## Architecture Overview + +``` +┌───────────────────────────────────────────────────────────────────┐ +│ hashbrown Swiss Table │ +│ │ +│ Single contiguous allocation: │ +│ [Padding] [T_n ... T_1 T_0] [CT_0 CT_1 ... 
CT_n] [CT_extra] │ +│ data (SoA) control bytes (mirrored) │ +│ │ +│ • Open addressing, triangular probing │ +│ • 16-byte groups (SSE2) or 8-byte groups (NEON/generic) │ +│ • SIMD parallel group scan │ +│ • EMPTY / DELETED / FULL tag states │ +└───────────────────────────────────────────────────────────────────┘ + +┌───────────────────────────────────────────────────────────────────┐ +│ PrefixHashMap │ +│ │ +│ Vec where each Group: │ +│ { ctrl: [u8; 8], keys: [u32; 8], values: [MaybeUninit; 8], │ +│ overflow: u32 } │ +│ │ +│ • Overflow chaining (linked Group structs) │ +│ • Fixed 8-byte groups, scalar bit-manipulation │ +│ • EMPTY / FULL tag states only (no DELETED) │ +└───────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 1. SIMD-Accelerated Group Scanning + +**Status: Missing from PrefixHashMap** + +This is arguably the most impactful optimization gap. + +### hashbrown + +hashbrown provides three SIMD backends selected at compile time[^1]: + +| Platform | Backend | Group Width | Instructions Used | +|----------|---------|-------------|-------------------| +| x86/x86_64 with SSE2 | `sse2.rs` | 16 bytes | `_mm_cmpeq_epi8`, `_mm_movemask_epi8` | +| AArch64 with NEON | `neon.rs` | 8 bytes | `vceq_u8`, `vcltz_s8`, `vreinterpret_u64_u8` | +| Fallback | `generic.rs` | 8 bytes (u64) | Scalar bit tricks | + +On x86_64, the SSE2 `match_tag` compiles to just 2 instructions: a `pcmpeqb` and a `pmovmskb`, producing a 16-bit mask where each bit directly indicates a matching slot[^2]. This means **16 slots are scanned in a single operation**. + +### PrefixHashMap + +PrefixHashMap uses only the scalar approach, operating on 8 control bytes packed into a `u64`[^3]: + +```rust +fn match_byte(ctrl: &[u8; GROUP_SIZE], byte: u8) -> u64 { + let word = u64::from_ne_bytes(*ctrl); + let broadcast = 0x0101010101010101u64 * (byte as u64); + let xor = word ^ broadcast; + (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080 +} +``` + +This is essentially the same algorithm as hashbrown's `generic.rs` fallback[^4], but: + +- **Fixed at 8-byte groups** — never benefits from the SSE2 16-byte group scan available on most modern x86 machines. +- **No platform-specific fast paths** — no NEON, no SSE2, no LoongArch LSX. + +**Impact**: On x86_64, hashbrown scans 2× more slots per group operation using native SIMD instructions that are lower latency than the scalar bit-manipulation chain. + +--- + +## 2. Probing Strategy: Triangular Probing vs. Overflow Chaining + +**Status: Missing from PrefixHashMap** + +### hashbrown + +hashbrown uses **triangular probing**, a variant of open addressing where each successive probe jumps by one more group width[^5]: + +```rust +struct ProbeSeq { pos: usize, stride: usize } +impl ProbeSeq { + fn move_next(&mut self, bucket_mask: usize) { + self.stride += Group::WIDTH; + self.pos += self.stride; + self.pos &= bucket_mask; + } +} +``` + +This is mathematically guaranteed to visit every group exactly once in a power-of-two-sized table[^6]. All probing occurs within a single contiguous allocation, enabling excellent spatial locality. 
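+
+As a quick sanity check of that guarantee, here is a small standalone sketch (in units of whole groups, with an illustrative table size; a sketch, not hashbrown code):
+
+```rust
+// Triangular probing in group units: the stride grows by one group per
+// probe, so the visited indices are the triangular numbers k(k+1)/2,
+// which cover every residue modulo a power of two exactly once.
+fn main() {
+    let num_groups = 1usize << 6; // table size in groups (any power of two)
+    let mask = num_groups - 1;
+    let mut group = 0; // starting group, normally derived from the hash
+    let mut stride = 0;
+    let mut seen = vec![false; num_groups];
+    for _ in 0..num_groups {
+        assert!(!seen[group], "a group was probed twice");
+        seen[group] = true;
+        stride += 1; // stride grows by one group width each probe
+        group = (group + stride) & mask;
+    }
+    assert!(seen.iter().all(|&v| v), "every group probed exactly once");
+}
+```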
+ +### PrefixHashMap + +PrefixHashMap uses **overflow chaining**: when a primary group is full, an overflow group is allocated at the end of the `Vec` and linked via an index pointer[^7]: + +```rust +overflow: u32, // index into groups vec, or NO_OVERFLOW +``` + +**Missing benefits of triangular probing**: + +- **Spatial locality**: Triangular probing accesses nearby memory regions (the next group is typically in the same or adjacent cache line). Overflow groups are appended at the end of the vector, potentially far from the primary group. +- **No pointer chasing**: Triangular probing computes the next position arithmetically; overflow chaining follows an indirection. +- **Probe termination guarantee**: Triangular probing terminates when it encounters an EMPTY slot. Overflow chaining must check the `overflow` field and follow links. + +--- + +## 3. Memory Layout: SoA vs. AoS + +**Status: Missing from PrefixHashMap** + +### hashbrown + +hashbrown uses a **Structure-of-Arrays (SoA)** layout within a single allocation[^8]: + +``` +[Padding] [T_n, ..., T_1, T_0] [CT_0, CT_1, ..., CT_n, CT_extra...] + ^^^ data part ^^^ ^^^ control bytes (contiguous) ^^^ +``` + +All control bytes are stored contiguously at the end of the allocation. When probing, the initial scan only touches control bytes — the data is only accessed after a tag match. This means: + +- **Control byte scans stay in L1 cache**: For a table with 1024 entries, all 1024 control bytes fit in ~1KB, likely fitting entirely in L1 cache. +- **Data is only accessed on hits**: Cache pollution from data access is minimized. + +### PrefixHashMap + +PrefixHashMap uses an **Array-of-Structures (AoS)** layout[^9]: + +```rust +struct Group { + ctrl: [u8; 8], // 8 bytes + keys: [u32; 8], // 32 bytes + values: [MaybeUninit; 8], // 8 * size_of::() bytes + overflow: u32, // 4 bytes +} +``` + +For a `V` of 8 bytes (e.g., `usize`), each Group is 8 + 32 + 64 + 4 = 108 bytes (plus alignment padding). Scanning the control bytes of sequential groups requires jumping over all the key/value data, degrading cache utilization when doing multi-group probing. + +--- + +## 4. Control Byte Mirroring for Wrap-Around + +**Status: Missing from PrefixHashMap (but less needed due to overflow chaining)** + +### hashbrown + +hashbrown allocates `num_buckets + Group::WIDTH` control bytes. The first `Group::WIDTH` control bytes are replicated at the end[^10]: + +```rust +fn set_ctrl(&mut self, index: usize, ctrl: Tag) { + let index2 = ((index.wrapping_sub(Group::WIDTH)) & self.bucket_mask) + Group::WIDTH; + *self.ctrl(index) = ctrl; + *self.ctrl(index2) = ctrl; // mirror +} +``` + +This ensures that a group load starting near the end of the table can safely wrap around without a branch or special case. + +### PrefixHashMap + +Not implemented — not needed because PrefixHashMap doesn't use open addressing. Each group is self-contained with its own control byte array. + +--- + +## 5. Tombstone / DELETED Support and In-Place Rehashing + +**Status: Missing from PrefixHashMap** + +### hashbrown + +hashbrown has three control byte states[^11]: + +| State | Encoding | Meaning | +|-------|----------|---------| +| `EMPTY` | `0xFF` (1111_1111) | Slot never occupied or fully reclaimed | +| `DELETED` | `0x80` (1000_0000) | Tombstone — element removed, probing must continue past | +| `FULL` | `0x00..0x7F` | Occupied — top 7 bits of hash | + +When elements are removed, the control byte is set to `DELETED` rather than `EMPTY`. This preserves the probe chain for other elements. 
When the ratio of deleted entries gets too high, hashbrown performs an **in-place rehash**[^12]: + +1. Convert all FULL → DELETED, DELETED → EMPTY via `convert_special_to_empty_and_full_to_deleted()` +2. Walk through each DELETED (originally FULL) entry and swap it into its ideal position +3. If both old and new positions are in the same probe group, just update the control byte in place + +This avoids a full reallocation when many deletes have fragmented the table. + +### PrefixHashMap + +PrefixHashMap only has two states[^13]: + +| State | Encoding | +|-------|----------| +| `EMPTY` | `0x00` | +| `FULL` | `key_byte \| 0x80` | + +There is **no deletion support at all** — the map is described as "insertion-only"[^14]. This means: +- No `remove()` method +- No tombstones +- No in-place rehash optimization +- If an entry needs to be removed, the entire map must be rebuilt + +--- + +## 6. Tag / Hash Encoding + +**Status: Different approach in PrefixHashMap (not necessarily worse, but different trade-offs)** + +### hashbrown + +Uses the **top 7 bits** of the 64-bit hash as the tag, stored with the high bit clear (range `0x00..0x7F`)[^15]: + +```rust +pub(crate) const fn full(hash: u64) -> Tag { + let top7 = hash >> (MIN_HASH_LEN * 8 - 7); + Tag((top7 & 0x7f) as u8) +} +``` + +The high bit is reserved for EMPTY/DELETED sentinel detection. This means `EMPTY`/`DELETED` can be distinguished from `FULL` with a single bit test. + +### PrefixHashMap + +Forces bit 7 high and uses the low 7 bits of the key[^16]: + +```rust +fn tag(key: u32) -> u8 { + (key as u8) | 0x80 +} +``` + +EMPTY is `0x00`. This inverts the hashbrown convention — FULL entries have bit 7 set, EMPTY has bit 7 clear. The `match_empty` function checks for zero bytes[^17]: + +```rust +fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> u64 { + let word = u64::from_ne_bytes(*ctrl); + !word & 0x8080808080808080 +} +``` + +**Key difference**: PrefixHashMap cannot distinguish DELETED from FULL because all non-zero control bytes have bit 7 set. This is a deliberate simplification for the insertion-only use case. + +--- + +## 7. Load Factor and Growth Strategy + +**Status: Different and less sophisticated in PrefixHashMap** + +### hashbrown + +Uses an **87.5% maximum load factor** (7/8) with a `growth_left` counter[^18]: + +```rust +fn bucket_mask_to_capacity(bucket_mask: usize) -> usize { + if bucket_mask < 8 { bucket_mask } + else { ((bucket_mask + 1) / 8) * 7 } +} +``` + +Growth is triggered when `growth_left` reaches 0, which tracks insertions minus the capacity. The `growth_left` field is decremented only when inserting into an EMPTY slot (not a DELETED one)[^19]. + +### PrefixHashMap + +Uses overflow group exhaustion as the growth trigger[^20]: + +```rust +let max_overflow = self.num_primary / 8 + 1; +let num_overflow = self.groups.len() as u32 - self.num_primary; +if num_overflow >= max_overflow { + self.grow(); + return self.insert(key, value); +} +``` + +This reserves 12.5% extra groups for overflow. Growth happens when the overflow area is full. This is a coarser signal than hashbrown's per-slot tracking and can lead to: +- **Premature growth** if unlucky hash distribution fills overflow disproportionately +- **Delayed growth** if hash distribution is uniform (overflow area may never fill even at high load) + +--- + +## 8. Resize Strategy + +**Status: Significantly less optimized in PrefixHashMap** + +### hashbrown + +hashbrown's resize has multiple optimizations: + +1. 
**Over-allocation utilization**[^21]: When the allocator returns more memory than requested, hashbrown uses the extra space for additional buckets: + ```rust + if block.len() != layout.size() { + let x = maximum_buckets_in(block.len(), table_layout, Group::WIDTH); + // Use larger capacity... + } + ``` + +2. **In-place rehashing** when fragmentation from deletions is high (described in §5). + +3. **Efficient element copying** using `ptr::copy_nonoverlapping` with layout-aware size calculations[^22]. + +4. **Panic-safe resize** using `ScopeGuard` to ensure the old table is freed even if the hasher panics[^23]. + +### PrefixHashMap + +PrefixHashMap's grow is simpler and less efficient[^24]: + +```rust +fn grow(&mut self) { + let old_groups = std::mem::take(&mut self.groups); + self.n_bits += 1; + // ... allocate new groups ... + for group in old_groups { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + self.insert(key, value); // full re-insertion + } + } + std::mem::forget(group); + } +} +``` + +Missing optimizations: +- **Always doubles** — no option for in-place rehash +- **Re-inserts via the public API** — each element goes through the full insert path including overflow chain traversal, whereas hashbrown uses a fast `prepare_insert_index` that skips duplicate checking +- **No over-allocation utilization** +- **Limited panic safety** — uses `mem::forget` on old groups but doesn't guard against panics during re-insertion + +--- + +## 9. Branch Prediction Hints + +**Status: Missing from PrefixHashMap** + +### hashbrown + +hashbrown extensively uses `likely()` and `unlikely()` hints to guide the CPU's branch predictor[^25]: + +```rust +if unlikely(self.table.growth_left == 0 && old_ctrl.special_is_empty()) { + self.reserve(1, hasher); +} +``` + +```rust +if likely(eq(index)) { + return Some(index); +} +``` + +### PrefixHashMap + +No branch hints are used anywhere in the implementation. On modern CPUs, this can affect branch prediction accuracy for cold paths like growth and overflow traversal. + +--- + +## 10. Slot Hint / Preferred Slot + +**Status: Present in PrefixHashMap but NOT in hashbrown (PrefixHashMap advantage)** + +PrefixHashMap has a unique optimization not present in hashbrown: a **preferred slot hint** derived from additional hash bits[^26]: + +```rust +fn slot_hint(key: u32) -> usize { + ((key >> 7) & 0x7) as usize +} +``` + +Before scanning the group, PrefixHashMap first checks the preferred slot directly[^27]: + +```rust +let c = group.ctrl[hint]; +if c == CTRL_EMPTY { + // Direct insert without scanning +} +if c == tag && group.keys[hint] == key { + // Direct hit without scanning +} +``` + +This is a fast path that avoids the scalar group scan entirely when the preferred slot is available. hashbrown does not have this optimization — it always does a full group scan via SIMD/scalar. + +--- + +## 11. 
Additional Missing Features and Optimizations + +| Feature | hashbrown | PrefixHashMap | +|---------|-----------|---------------| +| Custom allocator support | Yes (`Allocator` trait)[^28] | No (uses `Vec` with global allocator) | +| ZST (Zero-Sized Type) handling | Optimized special case[^29] | Not supported | +| `#[cold]` / `#[inline(never)]` on slow paths | Yes (e.g., `reserve_rehash`)[^30] | Not used | +| `Entry` API | Full entry API | Not provided | +| Iterator support | `RawIter`, `RawDrain`, `RawIntoIter` | Not provided | +| `shrink_to` / `shrink_to_fit` | Yes | Not provided | +| Generic over key type | Yes (any `K: Hash + Eq`) | Fixed `u32` keys only | +| `remove` / `erase` | Yes, with tombstones | Not supported | +| Monomorphization reduction | Uses `dyn Fn` for inner functions[^31] | Not applicable (simpler API) | +| Small table optimization | Min capacity thresholds based on layout/group width[^32] | Minimum 2 primary groups | + +--- + +## 12. Summary of Impact + +The missing optimizations can be categorized by their likely performance impact: + +### High Impact +1. **SIMD group scanning** — 2× more slots per scan on SSE2; lower-latency instructions +2. **SoA memory layout** — dramatically better cache behavior for control byte scanning +3. **Open addressing with triangular probing** — eliminates pointer chasing in overflow chains +4. **Resize without re-insertion** — hashbrown copies elements directly without re-probing + +### Medium Impact +5. **In-place rehashing** — avoids allocation when table is fragmented by deletions (N/A for insert-only) +6. **Over-allocation utilization** — free extra capacity from allocator rounding +7. **Branch hints** — guides CPU branch predictor for common vs. rare paths +8. **Load factor tracking** — precise growth triggering vs. overflow-area exhaustion + +### Lower Impact (or N/A for the use case) +9. **Control byte mirroring** — needed for open addressing wrap-around (not needed with chaining) +10. **Tombstone/DELETED support** — only matters if deletion is needed +11. **Custom allocators** — not needed for most use cases +12. **ZST handling** — irrelevant for `u32` keys + +### PrefixHashMap Advantages (Not in hashbrown) +- **Slot hint fast path** — direct preferred-slot check before group scan +- **No hashing overhead** — keys are pre-hashed `u32` values +- **Simpler implementation** — ~250 lines vs. ~5000+ lines, easier to reason about + +--- + +## Confidence Assessment + +- **High confidence**: All claims about both implementations are verified directly from source code. The hashbrown analysis is based on the current `main` branch (commit `420e83ba`), and the PrefixHashMap analysis is from the local `crates/hashmap-bench/prefix_map.rs`. +- **Moderate confidence**: Performance impact assessments are based on algorithmic analysis and known CPU architecture properties (cache line sizes, SIMD throughput) rather than measured benchmarks. Actual impact depends on workload, key distribution, and hardware. +- **Assumption**: The PrefixHashMap is intentionally minimal — many "missing" features are deliberate design choices for simplicity in a benchmarking context, not oversights. 
+
+---
+
+## Footnotes
+
+[^1]: `src/control/group/mod.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — compile-time cfg selection of SSE2, NEON, LSX, or generic backend
+[^2]: `src/control/group/sse2.rs:80-93` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `match_tag` using `_mm_cmpeq_epi8` + `_mm_movemask_epi8`
+[^3]: `crates/hashmap-bench/prefix_map.rs:50-56` — scalar `match_byte` function
+[^4]: `src/control/group/generic.rs:96-104` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — generic `match_tag` using same bit-trick
+[^5]: `src/raw.rs:80-97` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `ProbeSeq` struct and `move_next`
+[^6]: Blog post cited in hashbrown source: https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
+[^7]: `crates/hashmap-bench/prefix_map.rs:12` — `overflow: u32` field in Group struct
+[^8]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `TableLayout::calculate_layout_for` computes `ctrl_offset = size * buckets` (data then control bytes)
+[^9]: `crates/hashmap-bench/prefix_map.rs:8-13` — Group struct with interleaved ctrl/keys/values
+[^10]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `set_ctrl` method mirrors control bytes
+[^11]: `src/control/tag.rs:5-9` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — Tag EMPTY=0xFF, DELETED=0x80
+[^12]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `rehash_in_place` method
+[^13]: `crates/hashmap-bench/prefix_map.rs:4` — `CTRL_EMPTY: u8 = 0x00`
+[^14]: `crates/hashmap-bench/prefix_map.rs:26` — doc comment: "Insertion-only hash map"
+[^15]: `src/control/tag.rs:36-47` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `Tag::full` method
+[^16]: `crates/hashmap-bench/prefix_map.rs:40-42` — `tag` function
+[^17]: `crates/hashmap-bench/prefix_map.rs:59-62` — `match_empty` function
+[^18]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `bucket_mask_to_capacity` function
+[^19]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `record_item_insert_at` decrements `growth_left` only for EMPTY
+[^20]: `crates/hashmap-bench/prefix_map.rs:148-154` — overflow exhaustion check triggering `grow()`
+[^21]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `new_uninitialized` over-allocation handling
+[^22]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `resize_inner` uses `ptr::copy_nonoverlapping`
+[^23]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `prepare_resize` returns `ScopeGuard`
+[^24]: `crates/hashmap-bench/prefix_map.rs:216-241` — `grow` method
+[^25]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — uses `likely()` / `unlikely()` from `crate::util`
+[^26]: `crates/hashmap-bench/prefix_map.rs:45-47` — `slot_hint` function
+[^27]: `crates/hashmap-bench/prefix_map.rs:98-114` — fast path check in `insert`
+[^28]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `RawTable<T, A: Allocator>`
+[^29]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `IS_ZERO_SIZED` special cases throughout
+[^30]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `#[cold] #[inline(never)]` on `reserve_rehash`
+[^31]: `src/raw.rs` in 
[rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `find_inner` uses `&mut dyn FnMut(usize) -> bool` +[^32]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `capacity_to_buckets` with `min_cap` thresholds diff --git a/crates/hashmap-bench/hashmap_insert.rs b/crates/hashmap-bench/hashmap_insert.rs new file mode 100644 index 0000000..c05bc52 --- /dev/null +++ b/crates/hashmap-bench/hashmap_insert.rs @@ -0,0 +1,175 @@ +use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use hashmap_bench::random_trigram_hashes; + +fn bench_hashmap_insert(c: &mut Criterion) { + let trigrams = random_trigram_hashes(1000); + + let mut group = c.benchmark_group("hashmap_insert_1000_trigrams"); + + group.bench_function("std::HashMap", |b| { + b.iter_batched( + || std::collections::HashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("hashbrown::HashMap", |b| { + b.iter_batched( + || hashbrown::HashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("FxHashMap", |b| { + b.iter_batched( + || rustc_hash::FxHashMap::with_capacity_and_hasher(trigrams.len(), Default::default()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("AHashMap", |b| { + b.iter_batched( + || ahash::AHashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("FoldHashMap", |b| { + b.iter_batched( + || hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + foldhash::fast::FixedState::default(), + ), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("PrefixHashMap", |b| { + b.iter_batched( + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("SimdPrefixHashMap", |b| { + b.iter_batched( + || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("NoHintScalar", |b| { + b.iter_batched( + || hashmap_bench::prefix_map::NoHintScalarPrefixHashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("NoHintSimd", |b| { + b.iter_batched( + || hashmap_bench::prefix_map_simd::NoHintPrefixHashMap::with_capacity(trigrams.len()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("GxHashMap", |b| { + b.iter_batched( + || gxhash::HashMap::with_capacity_and_hasher(trigrams.len(), Default::default()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); 
+ }); + + group.bench_function("std::HashMap+FNV", |b| { + b.iter_batched( + || std::collections::HashMap::with_capacity_and_hasher(trigrams.len(), fnv::FnvBuildHasher::default()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("hashbrown+Identity", |b| { + b.iter_batched( + || hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +criterion_group!(benches, bench_hashmap_insert); +criterion_main!(benches); diff --git a/crates/hashmap-bench/lib.rs b/crates/hashmap-bench/lib.rs new file mode 100644 index 0000000..0714ffd --- /dev/null +++ b/crates/hashmap-bench/lib.rs @@ -0,0 +1,45 @@ +pub mod prefix_map; +pub mod prefix_map_simd; + +use rand::Rng; +use std::hash::{BuildHasherDefault, Hasher}; + +/// A hasher that returns the input unchanged. Only valid for u32 keys +/// that are already well-distributed hashes. +#[derive(Default)] +pub struct IdentityHasher(u64); + +impl Hasher for IdentityHasher { + fn write(&mut self, _bytes: &[u8]) { + unimplemented!("IdentityHasher only supports write_u32"); + } + fn write_u32(&mut self, i: u32) { + self.0 = i as u64; + } + fn finish(&self) -> u64 { + self.0 + } +} + +pub type IdentityBuildHasher = BuildHasherDefault; + +/// Generate `n` random trigrams as well-distributed u32 hashes. +/// Each trigram is packed into a u32, then scrambled with a murmur3 finalizer. +pub fn random_trigram_hashes(n: usize) -> Vec { + let mut rng = rand::rng(); + (0..n) + .map(|_| { + let a = rng.random_range(b'a'..=b'z') as u32; + let b = rng.random_range(b'a'..=b'z') as u32; + let c = rng.random_range(b'a'..=b'z') as u32; + let packed = a | (b << 8) | (c << 16); + let mut h = packed; + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h + }) + .collect() +} diff --git a/crates/hashmap-bench/prefix_map.rs b/crates/hashmap-bench/prefix_map.rs new file mode 100644 index 0000000..40c669e --- /dev/null +++ b/crates/hashmap-bench/prefix_map.rs @@ -0,0 +1,502 @@ +use core::mem::MaybeUninit; + +const GROUP_SIZE: usize = 8; +const CTRL_EMPTY: u8 = 0x00; +const NO_OVERFLOW: u32 = u32::MAX; + +/// A single group: 8 slots with control bytes, keys, values, and an overflow pointer. +struct Group { + ctrl: [u8; GROUP_SIZE], + keys: [u32; GROUP_SIZE], + values: [MaybeUninit; GROUP_SIZE], + overflow: u32, // index into groups vec, or NO_OVERFLOW +} + +impl Group { + fn new() -> Self { + Self { + ctrl: [CTRL_EMPTY; GROUP_SIZE], + keys: [0; GROUP_SIZE], + values: [const { MaybeUninit::uninit() }; GROUP_SIZE], + overflow: NO_OVERFLOW, + } + } +} + +/// Insertion-only hash map where the key IS a hash (`u32`). +/// +/// Groups are stored in a single `Vec`. The first `2^n_bits` groups +/// are primary buckets (addressed by key prefix). When a primary group is +/// full, an overflow group is allocated from the end of the vec and linked +/// via `overflow`. 
+pub struct PrefixHashMap<V> {
+    groups: Vec<Group<V>>,
+    n_bits: u32,
+    num_primary: u32,
+    len: usize,
+}
+
+#[inline]
+fn tag(key: u32) -> u8 {
+    (key as u8) | 0x80
+}
+
+#[inline]
+fn slot_hint(key: u32) -> usize {
+    ((key >> 7) & 0x7) as usize
+}
+
+#[inline]
+fn match_byte(ctrl: &[u8; GROUP_SIZE], byte: u8) -> u64 {
+    let word = u64::from_ne_bytes(*ctrl);
+    let broadcast = 0x0101010101010101u64 * (byte as u64);
+    let xor = word ^ broadcast;
+    // Zero bytes in xor → matches. Use: (v - 0x01..01) & !v & 0x80..80
+    (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080
+}
+
+#[inline]
+fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> u64 {
+    let word = u64::from_ne_bytes(*ctrl);
+    !word & 0x8080808080808080
+}
+
+impl<V> PrefixHashMap<V> {
+    #[inline]
+    fn group_index(&self, key: u32) -> usize {
+        (key >> (32 - self.n_bits)) as usize
+    }
+
+    pub fn new() -> Self {
+        Self::with_capacity(0)
+    }
+
+    pub fn with_capacity(capacity: usize) -> Self {
+        let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two();
+        let n_bits = min_groups.trailing_zeros().max(1);
+        let num_primary = 1usize << n_bits;
+        // Reserve ~12.5% extra groups for overflow.
+        let total = num_primary + num_primary / 8 + 1;
+        let mut groups = Vec::with_capacity(total);
+        groups.resize_with(num_primary, Group::new);
+        Self {
+            groups,
+            n_bits,
+            num_primary: num_primary as u32,
+            len: 0,
+        }
+    }
+
+    pub fn insert(&mut self, key: u32, value: V) -> Option<V> {
+        let tag = tag(key);
+        let hint = slot_hint(key);
+        let mut gi = self.group_index(key);
+
+        loop {
+            let group = &self.groups[gi];
+
+            // Fast path: check preferred slot.
+            let c = group.ctrl[hint];
+            if c == CTRL_EMPTY {
+                let group = &mut self.groups[gi];
+                group.ctrl[hint] = tag;
+                group.keys[hint] = key;
+                group.values[hint] = MaybeUninit::new(value);
+                self.len += 1;
+                return None;
+            }
+            if c == tag && group.keys[hint] == key {
+                let old = std::mem::replace(
+                    unsafe { self.groups[gi].values[hint].assume_init_mut() },
+                    value,
+                );
+                return Some(old);
+            }
+
+            // Slow path: scan group for tag match.
+            let mut tag_mask = match_byte(&group.ctrl, tag);
+            tag_mask &= !(0x80u64 << (hint * 8)); // clear hint slot
+            while tag_mask != 0 {
+                let i = (tag_mask.trailing_zeros() >> 3) as usize;
+                tag_mask &= tag_mask - 1;
+                if group.keys[i] == key {
+                    let old = std::mem::replace(
+                        unsafe { self.groups[gi].values[i].assume_init_mut() },
+                        value,
+                    );
+                    return Some(old);
+                }
+            }
+
+            // Check for empty slot in this group.
+            let empty_mask = match_empty(&group.ctrl);
+            if empty_mask != 0 {
+                let i = (empty_mask.trailing_zeros() >> 3) as usize;
+                let group = &mut self.groups[gi];
+                group.ctrl[i] = tag;
+                group.keys[i] = key;
+                group.values[i] = MaybeUninit::new(value);
+                self.len += 1;
+                return None;
+            }
+
+            // Group full — follow or create overflow chain.
+            let overflow = self.groups[gi].overflow;
+            if overflow != NO_OVERFLOW {
+                gi = overflow as usize;
+            } else {
+                let max_overflow = self.num_primary / 8 + 1;
+                let num_overflow = self.groups.len() as u32 - self.num_primary;
+                if num_overflow >= max_overflow {
+                    // Overflow exhausted — grow and retry.
+                    self.grow();
+                    return self.insert(key, value);
+                }
+                // Allocate a new overflow group.
+                let new_gi = self.groups.len();
+                self.groups.push(Group::new());
+                self.groups[gi].overflow = new_gi as u32;
+                // Insert into the new group's preferred slot.
+                let group = &mut self.groups[new_gi];
+                group.ctrl[hint] = tag;
+                group.keys[hint] = key;
+                group.values[hint] = MaybeUninit::new(value);
+                self.len += 1;
+                return None;
+            }
+        }
+    }
+
+    pub fn get(&self, key: u32) -> Option<&V> {
+        let tag = tag(key);
+        let hint = slot_hint(key);
+        let mut gi = self.group_index(key);
+
+        loop {
+            let group = &self.groups[gi];
+
+            // Fast path: preferred slot.
+            let c = group.ctrl[hint];
+            if c == tag && group.keys[hint] == key {
+                return Some(unsafe { group.values[hint].assume_init_ref() });
+            }
+            // An empty preferred slot is handled below: the empty-slot check
+            // reports the key as absent, and the overflow chain is only
+            // followed when the group is completely full.
+
+            // Slow path: scan group.
+            let mut tag_mask = match_byte(&group.ctrl, tag);
+            tag_mask &= !(0x80u64 << (hint * 8)); // clear hint slot
+            while tag_mask != 0 {
+                let i = (tag_mask.trailing_zeros() >> 3) as usize;
+                tag_mask &= tag_mask - 1;
+                if group.keys[i] == key {
+                    return Some(unsafe { group.values[i].assume_init_ref() });
+                }
+            }
+
+            // If group has empty slots, key is not present.
+            if match_empty(&group.ctrl) != 0 {
+                return None;
+            }
+
+            // Follow overflow chain.
+            if group.overflow == NO_OVERFLOW {
+                return None;
+            }
+            gi = group.overflow as usize;
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    fn grow(&mut self) {
+        let old_groups = std::mem::take(&mut self.groups);
+        let old_len = self.len;
+
+        self.n_bits += 1;
+        let num_primary = 1usize << self.n_bits;
+        let total = num_primary + num_primary / 8 + 1;
+        self.num_primary = num_primary as u32;
+        self.groups = Vec::with_capacity(total);
+        self.groups.resize_with(num_primary, Group::new);
+        self.len = 0;
+
+        for group in old_groups {
+            for i in 0..GROUP_SIZE {
+                if group.ctrl[i] != CTRL_EMPTY {
+                    let key = group.keys[i];
+                    let value = unsafe { group.values[i].assume_init_read() };
+                    self.insert(key, value);
+                }
+            }
+            // Don't drop values — we moved them out with assume_init_read.
+            std::mem::forget(group);
+        }
+
+        debug_assert_eq!(self.len, old_len);
+    }
+}
+
+impl<V> Drop for PrefixHashMap<V> {
+    fn drop(&mut self) {
+        for group in &mut self.groups {
+            for i in 0..GROUP_SIZE {
+                if group.ctrl[i] != CTRL_EMPTY {
+                    unsafe { group.values[i].assume_init_drop() };
+                }
+            }
+        }
+    }
+}
+
+// ── NoHintScalarPrefixHashMap ───────────────────────────────────────────────
+// Same scalar match functions, but no slot_hint fast path — always does a
+// full group scan. Used to isolate the impact of slot_hint vs SIMD.
+
+pub struct NoHintScalarPrefixHashMap<V> {
+    groups: Vec<Group<V>>,
+    n_bits: u32,
+    num_primary: u32,
+    len: usize,
+}
+
+impl<V> NoHintScalarPrefixHashMap<V> {
+    #[inline]
+    fn group_index(&self, key: u32) -> usize {
+        (key >> (32 - self.n_bits)) as usize
+    }
+
+    pub fn new() -> Self {
+        Self::with_capacity(0)
+    }
+
+    pub fn with_capacity(capacity: usize) -> Self {
+        let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two();
+        let n_bits = min_groups.trailing_zeros().max(1);
+        let num_primary = 1usize << n_bits;
+        let total = num_primary + num_primary / 8 + 1;
+        let mut groups = Vec::with_capacity(total);
+        groups.resize_with(num_primary, Group::new);
+        Self {
+            groups,
+            n_bits,
+            num_primary: num_primary as u32,
+            len: 0,
+        }
+    }
+
+    pub fn insert(&mut self, key: u32, value: V) -> Option<V> {
+        let tag = tag(key);
+        let mut gi = self.group_index(key);
+
+        loop {
+            let group = &self.groups[gi];
+
+            // Scan group for tag match (no slot_hint fast path).
+ let mut tag_mask = match_byte(&group.ctrl, tag); + while tag_mask != 0 { + let i = (tag_mask.trailing_zeros() >> 3) as usize; + tag_mask &= tag_mask - 1; + if group.keys[i] == key { + let old = std::mem::replace( + unsafe { self.groups[gi].values[i].assume_init_mut() }, + value, + ); + return Some(old); + } + } + + // Check for empty slot in this group. + let empty_mask = match_empty(&group.ctrl); + if empty_mask != 0 { + let i = (empty_mask.trailing_zeros() >> 3) as usize; + let group = &mut self.groups[gi]; + group.ctrl[i] = tag; + group.keys[i] = key; + group.values[i] = MaybeUninit::new(value); + self.len += 1; + return None; + } + + // Group full — follow or create overflow chain. + let overflow = self.groups[gi].overflow; + if overflow != NO_OVERFLOW { + gi = overflow as usize; + } else { + let max_overflow = self.num_primary / 8 + 1; + let num_overflow = self.groups.len() as u32 - self.num_primary; + if num_overflow >= max_overflow { + self.grow(); + return self.insert(key, value); + } + let new_gi = self.groups.len(); + self.groups.push(Group::new()); + self.groups[gi].overflow = new_gi as u32; + let group = &mut self.groups[new_gi]; + group.ctrl[0] = tag; + group.keys[0] = key; + group.values[0] = MaybeUninit::new(value); + self.len += 1; + return None; + } + } + } + + pub fn get(&self, key: u32) -> Option<&V> { + let tag = tag(key); + let mut gi = self.group_index(key); + + loop { + let group = &self.groups[gi]; + + // Scan group for tag match (no slot_hint fast path). + let mut tag_mask = match_byte(&group.ctrl, tag); + while tag_mask != 0 { + let i = (tag_mask.trailing_zeros() >> 3) as usize; + tag_mask &= tag_mask - 1; + if group.keys[i] == key { + return Some(unsafe { group.values[i].assume_init_ref() }); + } + } + + // If group has empty slots, key is not present. + if match_empty(&group.ctrl) != 0 { + return None; + } + + // Follow overflow chain. 
+ if group.overflow == NO_OVERFLOW { + return None; + } + gi = group.overflow as usize; + } + } + + pub fn len(&self) -> usize { + self.len + } + + fn grow(&mut self) { + let old_groups = std::mem::take(&mut self.groups); + let old_len = self.len; + + self.n_bits += 1; + let num_primary = 1usize << self.n_bits; + let total = num_primary + num_primary / 8 + 1; + self.num_primary = num_primary as u32; + self.groups = Vec::with_capacity(total); + self.groups.resize_with(num_primary, Group::new); + self.len = 0; + + for group in old_groups { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + let key = group.keys[i]; + let value = unsafe { group.values[i].assume_init_read() }; + self.insert(key, value); + } + } + std::mem::forget(group); + } + + debug_assert_eq!(self.len, old_len); + } +} + +impl Drop for NoHintScalarPrefixHashMap { + fn drop(&mut self) { + for group in &mut self.groups { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + unsafe { group.values[i].assume_init_drop() }; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn insert_and_get() { + let mut map = PrefixHashMap::new(); + map.insert(100, "hello"); + map.insert(200, "world"); + assert_eq!(map.get(100), Some(&"hello")); + assert_eq!(map.get(200), Some(&"world")); + assert_eq!(map.get(999), None); + assert_eq!(map.len(), 2); + } + + #[test] + fn insert_overwrite() { + let mut map = PrefixHashMap::new(); + map.insert(42, "a"); + assert_eq!(map.insert(42, "b"), Some("a")); + assert_eq!(map.get(42), Some(&"b")); + assert_eq!(map.len(), 1); + } + + #[test] + fn grow_preserves_entries() { + let mut map = PrefixHashMap::new(); + for i in 0..200u32 { + map.insert(i, i * 10); + } + assert_eq!(map.len(), 200); + for i in 0..200u32 { + assert_eq!(map.get(i), Some(&(i * 10)), "missing key {i}"); + } + } + + #[test] + fn many_entries() { + let mut map = PrefixHashMap::with_capacity(2000); + for i in 0..2000u32 { + map.insert(i.wrapping_mul(2654435761), i); + } + assert_eq!(map.len(), 2000); + for i in 0..2000u32 { + assert_eq!(map.get(i.wrapping_mul(2654435761)), Some(&i)); + } + } + + #[test] + fn overflow_chain() { + // Force overflow by inserting many keys with same prefix. + let mut map = PrefixHashMap::with_capacity(8); + for i in 0..20u32 { + // All keys have same top bits → same group → forces overflow. + let key = i | 0xAB000000; + map.insert(key, i); + } + assert_eq!(map.len(), 20); + for i in 0..20u32 { + let key = i | 0xAB000000; + assert_eq!(map.get(key), Some(&i), "missing key {key:#x}"); + } + } + + #[test] + fn grow_on_overflow_exhaustion() { + // Start tiny (2 primary groups), force enough collisions to exhaust overflow. + let mut map = PrefixHashMap::with_capacity(1); + let old_n_bits = map.n_bits; + for i in 0..100u32 { + let key = i | 0xFF000000; // all same prefix → single group chain + map.insert(key, i); + } + assert!(map.n_bits > old_n_bits, "should have grown"); + assert_eq!(map.len(), 100); + for i in 0..100u32 { + let key = i | 0xFF000000; + assert_eq!(map.get(key), Some(&i), "missing key {key:#x} after grow"); + } + } +} diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs new file mode 100644 index 0000000..1d6787e --- /dev/null +++ b/crates/hashmap-bench/prefix_map_simd.rs @@ -0,0 +1,688 @@ +use core::mem::MaybeUninit; + +// Platform-dependent group size: 16 on x86_64 (SSE2), 8 everywhere else. 
+#[cfg(target_arch = "x86_64")] +const GROUP_SIZE: usize = 16; +#[cfg(not(target_arch = "x86_64"))] +const GROUP_SIZE: usize = 8; + +const CTRL_EMPTY: u8 = 0x00; +const NO_OVERFLOW: u32 = u32::MAX; + +// ── Match‑mask abstraction ────────────────────────────────────────────────── +// Each platform returns a different mask type from group scans. We unify the +// interface via a Mask type alias and free functions. + +#[cfg(target_arch = "x86_64")] +type Mask = u32; // movemask: one bit per slot, bottom 16 used +#[cfg(not(target_arch = "x86_64"))] +type Mask = u64; // one byte per slot, high bit indicates match + +#[cfg(target_arch = "x86_64")] +mod group_ops { + #[cfg(target_arch = "x86")] + use core::arch::x86 as x86; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64 as x86; + + use super::{Mask, GROUP_SIZE}; + + #[inline(always)] + pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask { + unsafe { + let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i); + let cmp = x86::_mm_cmpeq_epi8(group, x86::_mm_set1_epi8(tag as i8)); + x86::_mm_movemask_epi8(cmp) as u32 + } + } + + #[inline(always)] + pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { + match_tag(ctrl, super::CTRL_EMPTY) + } + + /// Index of the lowest matching slot. + #[inline(always)] + pub fn lowest(mask: Mask) -> usize { + mask.trailing_zeros() as usize + } + + /// Clear a single slot from the mask. + #[inline(always)] + pub fn clear_slot(mask: Mask, slot: usize) -> Mask { + mask & !(1u32 << slot) + } + + /// Advance to next match, returning slot index. + #[inline(always)] + pub fn next_match(mask: &mut Mask) -> Option { + if *mask == 0 { + return None; + } + let i = lowest(*mask); + *mask &= *mask - 1; + Some(i) + } +} + +#[cfg(target_arch = "aarch64")] +mod group_ops { + use core::arch::aarch64 as neon; + + use super::{Mask, GROUP_SIZE}; + + #[inline(always)] + pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask { + unsafe { + let group = neon::vld1_u8(ctrl.as_ptr()); + let cmp = neon::vceq_u8(group, neon::vdup_n_u8(tag)); + neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080 + } + } + + #[inline(always)] + pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { + unsafe { + let group = neon::vld1_u8(ctrl.as_ptr()); + let cmp = neon::vceq_u8(group, neon::vdup_n_u8(0)); + neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080 + } + } + + #[inline(always)] + pub fn lowest(mask: Mask) -> usize { + (mask.trailing_zeros() >> 3) as usize + } + + #[inline(always)] + pub fn clear_slot(mask: Mask, slot: usize) -> Mask { + mask & !(0x80u64 << (slot * 8)) + } + + #[inline(always)] + pub fn next_match(mask: &mut Mask) -> Option { + if *mask == 0 { + return None; + } + let i = lowest(*mask); + *mask &= *mask - 1; + Some(i) + } +} + +#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] +mod group_ops { + use super::{Mask, GROUP_SIZE}; + + #[inline(always)] + pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask { + let word = u64::from_ne_bytes(*ctrl); + let broadcast = 0x0101010101010101u64 * (tag as u64); + let xor = word ^ broadcast; + (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080 + } + + #[inline(always)] + pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask { + let word = u64::from_ne_bytes(*ctrl); + !word & 0x8080808080808080 + } + + #[inline(always)] + pub fn lowest(mask: Mask) -> usize { + (mask.trailing_zeros() >> 3) as usize + } + + #[inline(always)] + pub fn clear_slot(mask: Mask, slot: usize) -> Mask { + mask 
& !(0x80u64 << (slot * 8)) + } + + #[inline(always)] + pub fn next_match(mask: &mut Mask) -> Option { + if *mask == 0 { + return None; + } + let i = lowest(*mask); + *mask &= *mask - 1; + Some(i) + } +} + +// ── Group struct ──────────────────────────────────────────────────────────── + +struct Group { + ctrl: [u8; GROUP_SIZE], + keys: [u32; GROUP_SIZE], + values: [MaybeUninit; GROUP_SIZE], + overflow: u32, +} + +impl Group { + fn new() -> Self { + Self { + ctrl: [CTRL_EMPTY; GROUP_SIZE], + keys: [0; GROUP_SIZE], + values: [const { MaybeUninit::uninit() }; GROUP_SIZE], + overflow: NO_OVERFLOW, + } + } +} + +// ── Helper functions ──────────────────────────────────────────────────────── + +#[inline] +fn tag(key: u32) -> u8 { + (key as u8) | 0x80 +} + +#[inline] +fn slot_hint(key: u32) -> usize { + ((key >> 7) & (GROUP_SIZE as u32 - 1)) as usize +} + +// ── SimdPrefixHashMap ─────────────────────────────────────────────────────── + +/// Insertion-only hash map where the key IS a hash (`u32`). +/// +/// Same algorithm as `PrefixHashMap` but with platform-specific SIMD +/// group scanning (SSE2 on x86_64, NEON on aarch64, scalar fallback elsewhere). +/// On x86_64 the group size is widened to 16 slots to exploit SSE2. +pub struct SimdPrefixHashMap { + groups: Vec>, + n_bits: u32, + num_primary: u32, + len: usize, +} + +impl SimdPrefixHashMap { + #[inline] + fn group_index(&self, key: u32) -> usize { + (key >> (32 - self.n_bits)) as usize + } + + pub fn new() -> Self { + Self::with_capacity(0) + } + + pub fn with_capacity(capacity: usize) -> Self { + let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two(); + let n_bits = min_groups.trailing_zeros().max(1); + let num_primary = 1usize << n_bits; + let total = num_primary + num_primary / 8 + 1; + let mut groups = Vec::with_capacity(total); + groups.resize_with(num_primary, Group::new); + Self { + groups, + n_bits, + num_primary: num_primary as u32, + len: 0, + } + } + + pub fn insert(&mut self, key: u32, value: V) -> Option { + let tag = tag(key); + let hint = slot_hint(key); + let mut gi = self.group_index(key); + + loop { + let group = &self.groups[gi]; + + // Fast path: check preferred slot. + let c = group.ctrl[hint]; + if c == CTRL_EMPTY { + let group = &mut self.groups[gi]; + group.ctrl[hint] = tag; + group.keys[hint] = key; + group.values[hint] = MaybeUninit::new(value); + self.len += 1; + return None; + } + if c == tag && group.keys[hint] == key { + let old = std::mem::replace( + unsafe { self.groups[gi].values[hint].assume_init_mut() }, + value, + ); + return Some(old); + } + + // Slow path: SIMD scan group for tag match. + let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); + tag_mask = group_ops::clear_slot(tag_mask, hint); + while let Some(i) = group_ops::next_match(&mut tag_mask) { + if group.keys[i] == key { + let old = std::mem::replace( + unsafe { self.groups[gi].values[i].assume_init_mut() }, + value, + ); + return Some(old); + } + } + + // Check for empty slot in this group. + let empty_mask = group_ops::match_empty(&group.ctrl); + if empty_mask != 0 { + let i = group_ops::lowest(empty_mask); + let group = &mut self.groups[gi]; + group.ctrl[i] = tag; + group.keys[i] = key; + group.values[i] = MaybeUninit::new(value); + self.len += 1; + return None; + } + + // Group full — follow or create overflow chain. 
+ let overflow = self.groups[gi].overflow; + if overflow != NO_OVERFLOW { + gi = overflow as usize; + } else { + let max_overflow = self.num_primary / 8 + 1; + let num_overflow = self.groups.len() as u32 - self.num_primary; + if num_overflow >= max_overflow { + self.grow(); + return self.insert(key, value); + } + let new_gi = self.groups.len(); + self.groups.push(Group::new()); + self.groups[gi].overflow = new_gi as u32; + let group = &mut self.groups[new_gi]; + group.ctrl[hint] = tag; + group.keys[hint] = key; + group.values[hint] = MaybeUninit::new(value); + self.len += 1; + return None; + } + } + } + + pub fn get(&self, key: u32) -> Option<&V> { + let tag = tag(key); + let hint = slot_hint(key); + let mut gi = self.group_index(key); + + loop { + let group = &self.groups[gi]; + + // Fast path: preferred slot. + let c = group.ctrl[hint]; + if c == tag && group.keys[hint] == key { + return Some(unsafe { group.values[hint].assume_init_ref() }); + } + + // Slow path: SIMD scan group. + let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); + tag_mask = group_ops::clear_slot(tag_mask, hint); + while let Some(i) = group_ops::next_match(&mut tag_mask) { + if group.keys[i] == key { + return Some(unsafe { group.values[i].assume_init_ref() }); + } + } + + // If group has empty slots, key is not present. + if group_ops::match_empty(&group.ctrl) != 0 { + return None; + } + + // Follow overflow chain. + if group.overflow == NO_OVERFLOW { + return None; + } + gi = group.overflow as usize; + } + } + + pub fn len(&self) -> usize { + self.len + } + + fn grow(&mut self) { + let old_groups = std::mem::take(&mut self.groups); + let old_len = self.len; + + self.n_bits += 1; + let num_primary = 1usize << self.n_bits; + let total = num_primary + num_primary / 8 + 1; + self.num_primary = num_primary as u32; + self.groups = Vec::with_capacity(total); + self.groups.resize_with(num_primary, Group::new); + self.len = 0; + + for group in old_groups { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + let key = group.keys[i]; + let value = unsafe { group.values[i].assume_init_read() }; + self.insert(key, value); + } + } + std::mem::forget(group); + } + + debug_assert_eq!(self.len, old_len); + } +} + +impl Drop for SimdPrefixHashMap { + fn drop(&mut self) { + for group in &mut self.groups { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + unsafe { group.values[i].assume_init_drop() }; + } + } + } + } +} + +// ── NoHintPrefixHashMap (SIMD, no slot_hint) ──────────────────────────────── +// Same as SimdPrefixHashMap but always does a full group scan — no preferred +// slot fast path. This isolates the pure SIMD scan cost. 
+ +pub struct NoHintPrefixHashMap { + groups: Vec>, + n_bits: u32, + num_primary: u32, + len: usize, +} + +impl NoHintPrefixHashMap { + #[inline] + fn group_index(&self, key: u32) -> usize { + (key >> (32 - self.n_bits)) as usize + } + + pub fn new() -> Self { + Self::with_capacity(0) + } + + pub fn with_capacity(capacity: usize) -> Self { + let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two(); + let n_bits = min_groups.trailing_zeros().max(1); + let num_primary = 1usize << n_bits; + let total = num_primary + num_primary / 8 + 1; + let mut groups = Vec::with_capacity(total); + groups.resize_with(num_primary, Group::new); + Self { + groups, + n_bits, + num_primary: num_primary as u32, + len: 0, + } + } + + pub fn insert(&mut self, key: u32, value: V) -> Option { + let tag = tag(key); + let mut gi = self.group_index(key); + + loop { + let group = &self.groups[gi]; + + // Scan group for tag match. + let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); + while let Some(i) = group_ops::next_match(&mut tag_mask) { + if group.keys[i] == key { + let old = std::mem::replace( + unsafe { self.groups[gi].values[i].assume_init_mut() }, + value, + ); + return Some(old); + } + } + + // Check for empty slot in this group. + let empty_mask = group_ops::match_empty(&group.ctrl); + if empty_mask != 0 { + let i = group_ops::lowest(empty_mask); + let group = &mut self.groups[gi]; + group.ctrl[i] = tag; + group.keys[i] = key; + group.values[i] = MaybeUninit::new(value); + self.len += 1; + return None; + } + + // Group full — follow or create overflow chain. + let overflow = self.groups[gi].overflow; + if overflow != NO_OVERFLOW { + gi = overflow as usize; + } else { + let max_overflow = self.num_primary / 8 + 1; + let num_overflow = self.groups.len() as u32 - self.num_primary; + if num_overflow >= max_overflow { + self.grow(); + return self.insert(key, value); + } + let new_gi = self.groups.len(); + self.groups.push(Group::new()); + self.groups[gi].overflow = new_gi as u32; + let group = &mut self.groups[new_gi]; + group.ctrl[0] = tag; + group.keys[0] = key; + group.values[0] = MaybeUninit::new(value); + self.len += 1; + return None; + } + } + } + + pub fn get(&self, key: u32) -> Option<&V> { + let tag = tag(key); + let mut gi = self.group_index(key); + + loop { + let group = &self.groups[gi]; + + // Scan group for tag match. + let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); + while let Some(i) = group_ops::next_match(&mut tag_mask) { + if group.keys[i] == key { + return Some(unsafe { group.values[i].assume_init_ref() }); + } + } + + // If group has empty slots, key is not present. + if group_ops::match_empty(&group.ctrl) != 0 { + return None; + } + + // Follow overflow chain. 
+ if group.overflow == NO_OVERFLOW { + return None; + } + gi = group.overflow as usize; + } + } + + pub fn len(&self) -> usize { + self.len + } + + fn grow(&mut self) { + let old_groups = std::mem::take(&mut self.groups); + let old_len = self.len; + + self.n_bits += 1; + let num_primary = 1usize << self.n_bits; + let total = num_primary + num_primary / 8 + 1; + self.num_primary = num_primary as u32; + self.groups = Vec::with_capacity(total); + self.groups.resize_with(num_primary, Group::new); + self.len = 0; + + for group in old_groups { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + let key = group.keys[i]; + let value = unsafe { group.values[i].assume_init_read() }; + self.insert(key, value); + } + } + std::mem::forget(group); + } + + debug_assert_eq!(self.len, old_len); + } +} + +impl Drop for NoHintPrefixHashMap { + fn drop(&mut self) { + for group in &mut self.groups { + for i in 0..GROUP_SIZE { + if group.ctrl[i] != CTRL_EMPTY { + unsafe { group.values[i].assume_init_drop() }; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn insert_and_get() { + let mut map = SimdPrefixHashMap::new(); + map.insert(100, "hello"); + map.insert(200, "world"); + assert_eq!(map.get(100), Some(&"hello")); + assert_eq!(map.get(200), Some(&"world")); + assert_eq!(map.get(999), None); + assert_eq!(map.len(), 2); + } + + #[test] + fn insert_overwrite() { + let mut map = SimdPrefixHashMap::new(); + map.insert(42, "a"); + assert_eq!(map.insert(42, "b"), Some("a")); + assert_eq!(map.get(42), Some(&"b")); + assert_eq!(map.len(), 1); + } + + #[test] + fn grow_preserves_entries() { + let mut map = SimdPrefixHashMap::new(); + for i in 0..200u32 { + map.insert(i, i * 10); + } + assert_eq!(map.len(), 200); + for i in 0..200u32 { + assert_eq!(map.get(i), Some(&(i * 10)), "missing key {i}"); + } + } + + #[test] + fn many_entries() { + let mut map = SimdPrefixHashMap::with_capacity(2000); + for i in 0..2000u32 { + map.insert(i.wrapping_mul(2654435761), i); + } + assert_eq!(map.len(), 2000); + for i in 0..2000u32 { + assert_eq!(map.get(i.wrapping_mul(2654435761)), Some(&i)); + } + } + + #[test] + fn overflow_chain() { + let mut map = SimdPrefixHashMap::with_capacity(8); + for i in 0..20u32 { + let key = i | 0xAB000000; + map.insert(key, i); + } + assert_eq!(map.len(), 20); + for i in 0..20u32 { + let key = i | 0xAB000000; + assert_eq!(map.get(key), Some(&i), "missing key {key:#x}"); + } + } + + #[test] + fn grow_on_overflow_exhaustion() { + let mut map = SimdPrefixHashMap::with_capacity(1); + let old_n_bits = map.n_bits; + for i in 0..100u32 { + let key = i | 0xFF000000; + map.insert(key, i); + } + assert!(map.n_bits > old_n_bits, "should have grown"); + assert_eq!(map.len(), 100); + for i in 0..100u32 { + let key = i | 0xFF000000; + assert_eq!(map.get(key), Some(&i), "missing key {key:#x} after grow"); + } + } + + /// Verify SIMD match functions produce identical results to the scalar versions. 
+    #[test]
+    fn simd_matches_scalar() {
+        // Scalar reference implementations
+        fn scalar_match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Vec<usize> {
+            ctrl.iter()
+                .enumerate()
+                .filter(|(_, &c)| c == tag)
+                .map(|(i, _)| i)
+                .collect()
+        }
+        fn scalar_match_empty(ctrl: &[u8; GROUP_SIZE]) -> Vec<usize> {
+            ctrl.iter()
+                .enumerate()
+                .filter(|(_, &c)| c == CTRL_EMPTY)
+                .map(|(i, _)| i)
+                .collect()
+        }
+
+        // Decode a SIMD mask into sorted slot indices
+        fn decode_mask(mut mask: Mask) -> Vec<usize> {
+            let mut out = vec![];
+            while let Some(i) = group_ops::next_match(&mut mask) {
+                out.push(i);
+            }
+            out
+        }
+
+        // Test with various control byte patterns
+        let patterns: Vec<[u8; GROUP_SIZE]> = vec![
+            [CTRL_EMPTY; GROUP_SIZE],
+            [0x80; GROUP_SIZE],
+            {
+                let mut p = [CTRL_EMPTY; GROUP_SIZE];
+                p[0] = 0xAB;
+                p[GROUP_SIZE - 1] = 0xAB;
+                p
+            },
+            {
+                let mut p = [CTRL_EMPTY; GROUP_SIZE];
+                for (i, b) in p.iter_mut().enumerate() {
+                    *b = if i % 2 == 0 { 0x80 | (i as u8) } else { CTRL_EMPTY };
+                }
+                p
+            },
+            {
+                let mut p = [0u8; GROUP_SIZE];
+                for (i, b) in p.iter_mut().enumerate() {
+                    *b = 0x80 | (i as u8);
+                }
+                p
+            },
+        ];
+
+        for ctrl in &patterns {
+            // Test match_empty
+            let simd_empty = decode_mask(group_ops::match_empty(ctrl));
+            let scalar_empty = scalar_match_empty(ctrl);
+            assert_eq!(simd_empty, scalar_empty, "match_empty mismatch for {ctrl:?}");
+
+            // Test match_tag with various tags
+            for &tag in &[0x80, 0x81, 0xAB, 0xFF] {
+                let simd_tag = decode_mask(group_ops::match_tag(ctrl, tag));
+                let scalar_tag = scalar_match_tag(ctrl, tag);
+                assert_eq!(
+                    simd_tag, scalar_tag,
+                    "match_tag mismatch for tag={tag:#x}, ctrl={ctrl:?}"
+                );
+            }
+        }
+    }
+}

From 77742a2925c416de1fe0b2f6dd92274e087f1e68 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 28 Apr 2026 11:09:20 +0200
Subject: [PATCH 02/22] Fast version

---
 crates/hashmap-bench/README.md          |  93 +++++++++++
 crates/hashmap-bench/hashmap_insert.rs  | 208 ++++++++++++++++++++++--
 crates/hashmap-bench/lib.rs             |  27 +--
 crates/hashmap-bench/prefix_map.rs      | 173 ++++++++++++------
 crates/hashmap-bench/prefix_map_simd.rs |  80 +++++++--
 5 files changed, 495 insertions(+), 86 deletions(-)
 create mode 100644 crates/hashmap-bench/README.md

diff --git a/crates/hashmap-bench/README.md b/crates/hashmap-bench/README.md
new file mode 100644
index 0000000..5b93b23
--- /dev/null
+++ b/crates/hashmap-bench/README.md
@@ -0,0 +1,93 @@
+# hashmap-bench
+
+Benchmarks comparing the custom `PrefixHashMap` (an insertion-only hash map for
+pre-hashed `u32` keys) against Rust's standard library and several third-party
+hash map implementations.
+
+## Design
+
+`PrefixHashMap` is a Swiss-table-inspired hash map optimized for the case where
+keys are already well-distributed `u32` hashes (e.g. trigram fingerprints). It
+skips the hash function entirely and uses the key bits directly for bucket
+selection and tag matching (see the routing sketch below).
+
+Key design choices:
+
+- **Overflow chaining** instead of open addressing — groups that fill up link
+  to overflow groups rather than probing into neighbours.
+- **Slot hint** — a preferred slot index derived from the key, checked before
+  scanning the group. Gives a direct hit on most inserts at low load.
+- **AoS group layout** — each group stores its control bytes, keys, and values
+  together, keeping a single insert's data within 1–2 cache lines.
+- **Optimized growth** — during resize, elements are re-inserted without
+  duplicate checking and copied via raw pointers.
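+
+For a concrete picture of the routing, here is a minimal, illustrative sketch
+(not code from this crate) of how a key decomposes into a group index, control
+tag, and preferred slot, mirroring the helpers in `prefix_map.rs`; `n_bits = 8`
+(256 primary groups) is an arbitrary example value:
+
+```rust
+/// Illustrative only: route a pre-hashed u32 key (GROUP_SIZE = 8).
+fn route(key: u32, n_bits: u32) -> (usize, u8, usize) {
+    // Primary group index: the top n_bits of the key.
+    let group_index = (key >> (32 - n_bits)) as usize;
+    // Control tag: the low byte with the high bit forced on, so a stored
+    // tag can never equal CTRL_EMPTY (0x00).
+    let tag = (key as u8) | 0x80;
+    // Preferred slot within the 8-slot group: the next 3 key bits.
+    let hint = ((key >> 7) & 0x7) as usize;
+    (group_index, tag, hint)
+}
+
+fn main() {
+    let (gi, tag, hint) = route(0xAB12_34CD, 8);
+    assert_eq!(gi, 0xAB);
+    println!("group={gi} tag={tag:#04x} hint={hint}");
+}
+```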
+ +`SimdPrefixHashMap` adds platform-specific SIMD for the control byte scan +(NEON on aarch64, SSE2 on x86\_64, scalar fallback elsewhere). + +## Benchmark results + +All benchmarks insert 1000 random trigram hashes (scrambled with +`folded_multiply`) into maps with various configurations. Measured on Apple +M-series (aarch64). + +### Insert 1000 trigrams — pre-sized, no growth + +| Rank | Map | Time (µs) | vs best | +|------|-----|-----------|---------| +| 🥇 | FoldHashMap | 2.31 | — | +| 🥈 | **SimdPrefixHashMap** | **2.51** | +9% | +| 🥉 | FxHashMap | 2.65 | +15% | +| 4 | hashbrown::HashMap | 2.67 | +16% | +| 5 | hashbrown+Identity | 2.72 | +18% | +| 6 | NoHintSimd | 2.76 | +19% | +| 7 | **PrefixHashMap** | **3.00** | +30% | +| 8 | std::HashMap+FNV | 3.10 | +34% | +| 9 | AHashMap | 3.33 | +44% | +| 10 | GxHashMap | 3.74 | +62% | +| 11 | std::HashMap | 8.52 | +269% | + +### Re-insert same keys (all overwrites) + +| Map | Time (µs) | +|-----|-----------| +| **SimdPrefixHashMap** | **2.15** ✅ | +| hashbrown+Identity | 2.33 | +| PrefixHashMap | 3.24 | + +### Growth from small (`with_capacity(128)`, 3 resize rounds) + +| Map | Time (µs) | Growth cost | +|-----|-----------|-------------| +| **SimdPrefixHashMap** | **7.21** | +4.70 | +| **PrefixHashMap** | **7.68** | +4.68 | +| hashbrown+Identity | 10.05 | +7.33 | + +### Overflow reserve sizing (from small, 3 resize rounds) + +| Reserve | Time (µs) | +|---------|-----------| +| 0 (grow immediately) | 6.96 | +| m/8 (12.5%, default) | 8.04 | +| m/4 (25%) | 8.33 | +| m/2 (50%) | 8.93 | +| m/1 (100%) | 10.31 | +| hashbrown+Identity | 9.86 | + +### Key takeaways + +- **SimdPrefixHashMap beats every hashbrown variant** except FoldHashMap on + first-time inserts, and is **the fastest** for overwrites. +- **Growth is ~40% cheaper** than hashbrown thanks to the optimized + `insert_for_grow` path that skips duplicate checking and uses raw copies. +- **Smaller overflow reserves are faster** — growing early is cheaper than + traversing overflow chains. +- The remaining ~9% gap to FoldHashMap comes from hashbrown's highly optimized + code generation (branch hints, `#[cold]` paths, monomorphization reduction) + and its SoA memory layout advantage for SIMD group scans. 
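+
+## Example
+
+A minimal usage sketch of the `PrefixHashMap` API as it stands at this point
+in the series (keys are pre-hashed `u32`s and `get` takes the key by value;
+the key below is an arbitrary example):
+
+```rust
+use hashmap_bench::prefix_map::PrefixHashMap;
+
+fn main() {
+    // Pre-size to the expected entry count so no growth occurs.
+    let mut map: PrefixHashMap<&str> = PrefixHashMap::with_capacity(1000);
+    assert_eq!(map.insert(0xDEAD_BEEF, "first"), None);
+    // insert returns the previous value when it overwrites.
+    assert_eq!(map.insert(0xDEAD_BEEF, "second"), Some("first"));
+    assert_eq!(map.get(0xDEAD_BEEF), Some(&"second"));
+    assert_eq!(map.len(), 1);
+}
+```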
+ +## Running + +```sh +cargo bench --bench hashmap_insert +``` diff --git a/crates/hashmap-bench/hashmap_insert.rs b/crates/hashmap-bench/hashmap_insert.rs index c05bc52..00508d7 100644 --- a/crates/hashmap-bench/hashmap_insert.rs +++ b/crates/hashmap-bench/hashmap_insert.rs @@ -4,6 +4,7 @@ use hashmap_bench::random_trigram_hashes; fn bench_hashmap_insert(c: &mut Criterion) { let trigrams = random_trigram_hashes(1000); + // ── Main comparison: insert 1000 trigrams ─────────────────────────── let mut group = c.benchmark_group("hashmap_insert_1000_trigrams"); group.bench_function("std::HashMap", |b| { @@ -74,6 +75,48 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); + group.bench_function("GxHashMap", |b| { + b.iter_batched( + || gxhash::HashMap::with_capacity_and_hasher(trigrams.len(), Default::default()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("std::HashMap+FNV", |b| { + b.iter_batched( + || std::collections::HashMap::with_capacity_and_hasher(trigrams.len(), fnv::FnvBuildHasher::default()), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("hashbrown+Identity", |b| { + b.iter_batched( + || hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + group.bench_function("PrefixHashMap", |b| { b.iter_batched( || hashmap_bench::prefix_map::PrefixHashMap::with_capacity(trigrams.len()), @@ -100,9 +143,9 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group.bench_function("NoHintScalar", |b| { + group.bench_function("NoHintSimd", |b| { b.iter_batched( - || hashmap_bench::prefix_map::NoHintScalarPrefixHashMap::with_capacity(trigrams.len()), + || hashmap_bench::prefix_map_simd::NoHintPrefixHashMap::with_capacity(trigrams.len()), |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -113,22 +156,82 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group.bench_function("NoHintSimd", |b| { + group.finish(); + + // ── Re-insert: insert same keys twice (second pass = all overwrites) ─ + let mut group2 = c.benchmark_group("reinsert_1000_trigrams"); + + group2.bench_function("hashbrown+Identity", |b| { b.iter_batched( - || hashmap_bench::prefix_map_simd::NoHintPrefixHashMap::with_capacity(trigrams.len()), + || { + let mut map = hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i + 1000); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group2.bench_function("PrefixHashMap", |b| { + b.iter_batched( + || { + let mut map = hashmap_bench::prefix_map::PrefixHashMap::with_capacity(trigrams.len()); for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); } map }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i + 1000); + } + map + }, BatchSize::SmallInput, ); }); - group.bench_function("GxHashMap", |b| { + group2.bench_function("SimdPrefixHashMap", |b| { b.iter_batched( - || gxhash::HashMap::with_capacity_and_hasher(trigrams.len(), Default::default()), + || { + let mut map = 
hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity(trigrams.len()); + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i + 1000); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group2.finish(); + + // ── Growth penalty: start small (128), force 3 growths ────────────── + let mut group3 = c.benchmark_group("grow_from_128_insert_1000_trigrams"); + + group3.bench_function("hashbrown+Identity", |b| { + b.iter_batched( + || hashbrown::HashMap::::with_capacity_and_hasher( + 128, + Default::default(), + ), |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -139,9 +242,9 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group.bench_function("std::HashMap+FNV", |b| { + group3.bench_function("PrefixHashMap", |b| { b.iter_batched( - || std::collections::HashMap::with_capacity_and_hasher(trigrams.len(), fnv::FnvBuildHasher::default()), + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity(128), |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -152,10 +255,28 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group.bench_function("hashbrown+Identity", |b| { + group3.bench_function("SimdPrefixHashMap", |b| { + b.iter_batched( + || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity(128), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group3.finish(); + + // ── Overflow reserve: start small so overflow is exercised ─────────── + let mut group4 = c.benchmark_group("overflow_reserve_insert_1000_trigrams"); + + group4.bench_function("hashbrown+Identity", |b| { b.iter_batched( || hashbrown::HashMap::::with_capacity_and_hasher( - trigrams.len(), + 128, Default::default(), ), |mut map| { @@ -168,7 +289,72 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group.finish(); + group4.bench_function("PrefixMap ovfl=m/8", |b| { + b.iter_batched( + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 8), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group4.bench_function("PrefixMap ovfl=m/4", |b| { + b.iter_batched( + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 4), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group4.bench_function("PrefixMap ovfl=m/2", |b| { + b.iter_batched( + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 2), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group4.bench_function("PrefixMap ovfl=m/1", |b| { + b.iter_batched( + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 1), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group4.bench_function("PrefixMap ovfl=0", |b| { + b.iter_batched( + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, usize::MAX), + |mut map| { + for (i, &key) in trigrams.iter().enumerate() { + map.insert(key, i); + } + map + }, + BatchSize::SmallInput, + ); + }); + + group4.finish(); } criterion_group!(benches, bench_hashmap_insert); diff 
--git a/crates/hashmap-bench/lib.rs b/crates/hashmap-bench/lib.rs
index 0714ffd..21dabfb 100644
--- a/crates/hashmap-bench/lib.rs
+++ b/crates/hashmap-bench/lib.rs
@@ -4,8 +4,19 @@ pub mod prefix_map_simd;
 use rand::Rng;
 use std::hash::{BuildHasherDefault, Hasher};
 
-/// A hasher that returns the input unchanged. Only valid for u32 keys
-/// that are already well-distributed hashes.
+/// Folded multiply: full u64×u64→u128, then XOR the two halves.
+/// Produces a u64 with good bit independence between high and low halves.
+#[inline(always)]
+pub fn folded_multiply(x: u64, y: u64) -> u64 {
+    let full = (x as u128).wrapping_mul(y as u128);
+    (full as u64) ^ ((full >> 64) as u64)
+}
+
+const ARBITRARY0: u64 = 0x243f6a8885a308d3;
+
+/// A hasher that expands a u32 key into a u64 by mirroring it into both
+/// halves, so that hashbrown's bucket index (low bits) and tag (top 7
+/// bits) both see the key's entropy.
 #[derive(Default)]
 pub struct IdentityHasher(u64);
 
@@ -14,7 +25,7 @@ impl Hasher for IdentityHasher {
         unimplemented!("IdentityHasher only supports write_u32");
     }
     fn write_u32(&mut self, i: u32) {
-        self.0 = i as u64;
+        self.0 = (i as u64) | ((i as u64) << 32);
     }
     fn finish(&self) -> u64 {
         self.0
@@ -24,7 +35,7 @@ impl Hasher for IdentityHasher {
 pub type IdentityBuildHasher = BuildHasherDefault<IdentityHasher>;
 
 /// Generate `n` random trigrams as well-distributed u32 hashes.
-/// Each trigram is packed into a u32, then scrambled with a murmur3 finalizer.
+/// Each trigram is packed into a u32, then scrambled with folded_multiply.
 pub fn random_trigram_hashes(n: usize) -> Vec<u32> {
     let mut rng = rand::rng();
     (0..n)
@@ -33,13 +44,7 @@ pub fn random_trigram_hashes(n: usize) -> Vec<u32> {
         let b = rng.random_range(b'a'..=b'z') as u32;
         let c = rng.random_range(b'a'..=b'z') as u32;
         let packed = a | (b << 8) | (c << 16);
-        let mut h = packed;
-        h ^= h >> 16;
-        h = h.wrapping_mul(0x85ebca6b);
-        h ^= h >> 13;
-        h = h.wrapping_mul(0xc2b2ae35);
-        h ^= h >> 16;
-        h
+        folded_multiply(packed as u64, ARBITRARY0) as u32
     })
     .collect()
 }
diff --git a/crates/hashmap-bench/prefix_map.rs b/crates/hashmap-bench/prefix_map.rs
index 40c669e..d18bf8d 100644
--- a/crates/hashmap-bench/prefix_map.rs
+++ b/crates/hashmap-bench/prefix_map.rs
@@ -4,6 +4,22 @@
 const GROUP_SIZE: usize = 8;
 const CTRL_EMPTY: u8 = 0x00;
 const NO_OVERFLOW: u32 = u32::MAX;
 
+#[inline(always)]
+fn likely(b: bool) -> bool {
+    if !b { cold_path() }
+    b
+}
+
+#[inline(always)]
+fn unlikely(b: bool) -> bool {
+    if b { cold_path() }
+    b
+}
+
+#[cold]
+#[inline(never)]
+fn cold_path() {}
+
 /// A single group: 8 slots with control bytes, keys, values, and an overflow pointer.
 struct Group<V> {
     ctrl: [u8; GROUP_SIZE],
     keys: [u32; GROUP_SIZE],
     values: [MaybeUninit<V>; GROUP_SIZE],
     overflow: u32, // index into groups vec, or NO_OVERFLOW
 }
@@ -72,11 +88,18 @@ impl<V> PrefixHashMap<V> {
     }
 
     pub fn with_capacity(capacity: usize) -> Self {
-        let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two();
+        Self::with_capacity_and_overflow(capacity, 8)
+    }
+
+    /// `overflow_denom`: reserve `num_primary / overflow_denom + 1` overflow groups.
+    /// Default is 8 (12.5%). Use 4 for 25%, 2 for 50%, etc.
+    pub fn with_capacity_and_overflow(capacity: usize, overflow_denom: usize) -> Self {
+        // Target ≤87.5% load (7/8), matching hashbrown's load factor.
+        let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7;
+        let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two();
         let n_bits = min_groups.trailing_zeros().max(1);
         let num_primary = 1usize << n_bits;
-        // Reserve ~12.5% extra groups for overflow.
- let total = num_primary + num_primary / 8 + 1; + let total = num_primary + num_primary / overflow_denom + 1; let mut groups = Vec::with_capacity(total); groups.resize_with(num_primary, Group::new); Self { @@ -97,7 +120,7 @@ impl PrefixHashMap { // Fast path: check preferred slot. let c = group.ctrl[hint]; - if c == CTRL_EMPTY { + if likely(c == CTRL_EMPTY) { let group = &mut self.groups[gi]; group.ctrl[hint] = tag; group.keys[hint] = key; @@ -119,7 +142,7 @@ impl PrefixHashMap { while tag_mask != 0 { let i = (tag_mask.trailing_zeros() >> 3) as usize; tag_mask &= tag_mask - 1; - if group.keys[i] == key { + if unlikely(group.keys[i] == key) { let old = std::mem::replace( unsafe { self.groups[gi].values[i].assume_init_mut() }, value, @@ -130,7 +153,7 @@ impl PrefixHashMap { // Check for empty slot in this group. let empty_mask = match_empty(&group.ctrl); - if empty_mask != 0 { + if likely(empty_mask != 0) { let i = (empty_mask.trailing_zeros() >> 3) as usize; let group = &mut self.groups[gi]; group.ctrl[i] = tag; @@ -142,29 +165,29 @@ impl PrefixHashMap { // Group full — follow or create overflow chain. let overflow = self.groups[gi].overflow; - if overflow != NO_OVERFLOW { - gi = overflow as usize; - } else { - let max_overflow = self.num_primary / 8 + 1; - let num_overflow = self.groups.len() as u32 - self.num_primary; - if num_overflow >= max_overflow { - // Overflow exhausted — grow and retry. - self.grow(); - return self.insert(key, value); - } - // Allocate a new overflow group. - let new_gi = self.groups.len(); - self.groups.push(Group::new()); - self.groups[gi].overflow = new_gi as u32; - // Insert into the new group's preferred slot. - let group = &mut self.groups[new_gi]; - group.ctrl[hint] = tag; - group.keys[hint] = key; - group.values[hint] = MaybeUninit::new(value); - self.len += 1; - return None; + if unlikely(overflow == NO_OVERFLOW) { + return self.insert_overflow(gi, hint, tag, key, value); } + gi = overflow as usize; + } + } + + #[cold] + #[inline(never)] + fn insert_overflow(&mut self, gi: usize, hint: usize, tag: u8, key: u32, value: V) -> Option { + if self.groups.len() == self.groups.capacity() { + self.grow(); + return self.insert(key, value); } + let new_gi = self.groups.len(); + self.groups.push(Group::new()); + self.groups[gi].overflow = new_gi as u32; + let group = &mut self.groups[new_gi]; + group.ctrl[hint] = tag; + group.keys[hint] = key; + group.values[hint] = MaybeUninit::new(value); + self.len += 1; + None } pub fn get(&self, key: u32) -> Option<&V> { @@ -177,13 +200,9 @@ impl PrefixHashMap { // Fast path: preferred slot. let c = group.ctrl[hint]; - if c == tag && group.keys[hint] == key { + if likely(c == tag) && group.keys[hint] == key { return Some(unsafe { group.values[hint].assume_init_ref() }); } - if c == CTRL_EMPTY { - // Preferred slot empty and no overflow means not found - // (only if no overflow — check below) - } // Slow path: scan group. let mut tag_mask = match_byte(&group.ctrl, tag); @@ -191,18 +210,18 @@ impl PrefixHashMap { while tag_mask != 0 { let i = (tag_mask.trailing_zeros() >> 3) as usize; tag_mask &= tag_mask - 1; - if group.keys[i] == key { + if likely(group.keys[i] == key) { return Some(unsafe { group.values[i].assume_init_ref() }); } } // If group has empty slots, key is not present. - if match_empty(&group.ctrl) != 0 { + if likely(match_empty(&group.ctrl) != 0) { return None; } // Follow overflow chain. 
- if group.overflow == NO_OVERFLOW { + if unlikely(group.overflow == NO_OVERFLOW) { return None; } gi = group.overflow as usize; @@ -225,20 +244,79 @@ impl PrefixHashMap { self.groups.resize_with(num_primary, Group::new); self.len = 0; - for group in old_groups { - for i in 0..GROUP_SIZE { - if group.ctrl[i] != CTRL_EMPTY { - let key = group.keys[i]; - let value = unsafe { group.values[i].assume_init_read() }; - self.insert(key, value); - } + for group in &old_groups { + // Skip groups with no entries using a quick check on the ctrl word. + let ctrl_word = u64::from_ne_bytes(group.ctrl); + if ctrl_word == 0 { + continue; + } + + // Iterate only occupied slots using the high-bit mask. + let mut full_mask = ctrl_word & 0x8080808080808080; + while full_mask != 0 { + let i = (full_mask.trailing_zeros() >> 3) as usize; + full_mask &= full_mask - 1; + + let key = group.keys[i]; + // No duplicate check — we know all keys are unique during grow. + self.insert_for_grow(key, group.values[i].as_ptr()); } - // Don't drop values — we moved them out with assume_init_read. - std::mem::forget(group); } + // Prevent double-drop — values were copied out via raw pointers. + std::mem::forget(old_groups); debug_assert_eq!(self.len, old_len); } + + /// Fast insert used only during `grow`. Skips duplicate checking and + /// copies the value via raw pointer instead of moving it. + fn insert_for_grow(&mut self, key: u32, value_src: *const V) { + let tag = tag(key); + let hint = slot_hint(key); + let mut gi = self.group_index(key); + + loop { + let group = &self.groups[gi]; + + // Try preferred slot first. + if group.ctrl[hint] == CTRL_EMPTY { + let group = &mut self.groups[gi]; + group.ctrl[hint] = tag; + group.keys[hint] = key; + unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; + self.len += 1; + return; + } + + // Find any empty slot in this group. + let empty_mask = match_empty(&group.ctrl); + if empty_mask != 0 { + let i = (empty_mask.trailing_zeros() >> 3) as usize; + let group = &mut self.groups[gi]; + group.ctrl[i] = tag; + group.keys[i] = key; + unsafe { group.values[i].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; + self.len += 1; + return; + } + + // Group full — follow or create overflow chain. 
+ let overflow = self.groups[gi].overflow; + if overflow != NO_OVERFLOW { + gi = overflow as usize; + } else { + let new_gi = self.groups.len(); + self.groups.push(Group::new()); + self.groups[gi].overflow = new_gi as u32; + let group = &mut self.groups[new_gi]; + group.ctrl[hint] = tag; + group.keys[hint] = key; + unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; + self.len += 1; + return; + } + } + } } impl Drop for PrefixHashMap { @@ -275,7 +353,8 @@ impl NoHintScalarPrefixHashMap { } pub fn with_capacity(capacity: usize) -> Self { - let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two(); + let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7; + let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two(); let n_bits = min_groups.trailing_zeros().max(1); let num_primary = 1usize << n_bits; let total = num_primary + num_primary / 8 + 1; @@ -327,9 +406,7 @@ impl NoHintScalarPrefixHashMap { if overflow != NO_OVERFLOW { gi = overflow as usize; } else { - let max_overflow = self.num_primary / 8 + 1; - let num_overflow = self.groups.len() as u32 - self.num_primary; - if num_overflow >= max_overflow { + if self.groups.len() == self.groups.capacity() { self.grow(); return self.insert(key, value); } diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs index 1d6787e..b0a138d 100644 --- a/crates/hashmap-bench/prefix_map_simd.rs +++ b/crates/hashmap-bench/prefix_map_simd.rs @@ -206,7 +206,8 @@ impl SimdPrefixHashMap { } pub fn with_capacity(capacity: usize) -> Self { - let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two(); + let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7; + let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two(); let n_bits = min_groups.trailing_zeros().max(1); let num_primary = 1usize << n_bits; let total = num_primary + num_primary / 8 + 1; @@ -276,9 +277,7 @@ impl SimdPrefixHashMap { if overflow != NO_OVERFLOW { gi = overflow as usize; } else { - let max_overflow = self.num_primary / 8 + 1; - let num_overflow = self.groups.len() as u32 - self.num_primary; - if num_overflow >= max_overflow { + if self.groups.len() == self.groups.capacity() { self.grow(); return self.insert(key, value); } @@ -347,19 +346,69 @@ impl SimdPrefixHashMap { self.groups.resize_with(num_primary, Group::new); self.len = 0; - for group in old_groups { - for i in 0..GROUP_SIZE { - if group.ctrl[i] != CTRL_EMPTY { - let key = group.keys[i]; - let value = unsafe { group.values[i].assume_init_read() }; - self.insert(key, value); - } + for group in &old_groups { + let ctrl_word = u64::from_ne_bytes(group.ctrl); + if ctrl_word == 0 { + continue; + } + let mut full_mask = ctrl_word & 0x8080808080808080; + while full_mask != 0 { + let i = (full_mask.trailing_zeros() >> 3) as usize; + full_mask &= full_mask - 1; + let key = group.keys[i]; + self.insert_for_grow(key, group.values[i].as_ptr()); } - std::mem::forget(group); } + std::mem::forget(old_groups); debug_assert_eq!(self.len, old_len); } + + /// Fast insert for grow: no duplicate check, raw pointer copy. 
+ fn insert_for_grow(&mut self, key: u32, value_src: *const V) { + let tag = tag(key); + let hint = slot_hint(key); + let mut gi = self.group_index(key); + + loop { + let group = &self.groups[gi]; + + if group.ctrl[hint] == CTRL_EMPTY { + let group = &mut self.groups[gi]; + group.ctrl[hint] = tag; + group.keys[hint] = key; + unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; + self.len += 1; + return; + } + + let empty_mask = group_ops::match_empty(&group.ctrl); + if empty_mask != 0 { + let i = group_ops::lowest(empty_mask); + let group = &mut self.groups[gi]; + group.ctrl[i] = tag; + group.keys[i] = key; + unsafe { group.values[i].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; + self.len += 1; + return; + } + + let overflow = self.groups[gi].overflow; + if overflow != NO_OVERFLOW { + gi = overflow as usize; + } else { + let new_gi = self.groups.len(); + self.groups.push(Group::new()); + self.groups[gi].overflow = new_gi as u32; + let group = &mut self.groups[new_gi]; + group.ctrl[hint] = tag; + group.keys[hint] = key; + unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; + self.len += 1; + return; + } + } + } } impl Drop for SimdPrefixHashMap { @@ -396,7 +445,8 @@ impl NoHintPrefixHashMap { } pub fn with_capacity(capacity: usize) -> Self { - let min_groups = (capacity / GROUP_SIZE).max(1).next_power_of_two(); + let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7; + let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two(); let n_bits = min_groups.trailing_zeros().max(1); let num_primary = 1usize << n_bits; let total = num_primary + num_primary / 8 + 1; @@ -446,9 +496,7 @@ impl NoHintPrefixHashMap { if overflow != NO_OVERFLOW { gi = overflow as usize; } else { - let max_overflow = self.num_primary / 8 + 1; - let num_overflow = self.groups.len() as u32 - self.num_primary; - if num_overflow >= max_overflow { + if self.groups.len() == self.groups.capacity() { self.grow(); return self.insert(key, value); } From 08e46dc28e4cf9cad4ff02ee30744329200239fd Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Tue, 28 Apr 2026 12:24:55 +0200 Subject: [PATCH 03/22] make it generic --- crates/hashmap-bench/hashmap_insert.rs | 127 ++----- crates/hashmap-bench/prefix_map.rs | 414 ++++++++-------------- crates/hashmap-bench/prefix_map_simd.rs | 436 +++++++----------------- 3 files changed, 288 insertions(+), 689 deletions(-) diff --git a/crates/hashmap-bench/hashmap_insert.rs b/crates/hashmap-bench/hashmap_insert.rs index 00508d7..a5a9717 100644 --- a/crates/hashmap-bench/hashmap_insert.rs +++ b/crates/hashmap-bench/hashmap_insert.rs @@ -119,7 +119,10 @@ fn bench_hashmap_insert(c: &mut Criterion) { group.bench_function("PrefixHashMap", |b| { b.iter_batched( - || hashmap_bench::prefix_map::PrefixHashMap::with_capacity(trigrams.len()), + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_hasher( + trigrams.len(), + hashmap_bench::IdentityBuildHasher::default(), + ), |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -132,20 +135,10 @@ fn bench_hashmap_insert(c: &mut Criterion) { group.bench_function("SimdPrefixHashMap", |b| { b.iter_batched( - || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity(trigrams.len()), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - - group.bench_function("NoHintSimd", |b| { - b.iter_batched( - || 
hashmap_bench::prefix_map_simd::NoHintPrefixHashMap::with_capacity(trigrams.len()), + || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher( + trigrams.len(), + hashmap_bench::IdentityBuildHasher::default(), + ), |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -186,7 +179,10 @@ fn bench_hashmap_insert(c: &mut Criterion) { group2.bench_function("PrefixHashMap", |b| { b.iter_batched( || { - let mut map = hashmap_bench::prefix_map::PrefixHashMap::with_capacity(trigrams.len()); + let mut map = hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_hasher( + trigrams.len(), + hashmap_bench::IdentityBuildHasher::default(), + ); for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); } @@ -205,7 +201,10 @@ fn bench_hashmap_insert(c: &mut Criterion) { group2.bench_function("SimdPrefixHashMap", |b| { b.iter_batched( || { - let mut map = hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity(trigrams.len()); + let mut map = hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher( + trigrams.len(), + hashmap_bench::IdentityBuildHasher::default(), + ); for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); } @@ -244,7 +243,10 @@ fn bench_hashmap_insert(c: &mut Criterion) { group3.bench_function("PrefixHashMap", |b| { b.iter_batched( - || hashmap_bench::prefix_map::PrefixHashMap::with_capacity(128), + || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_hasher( + 128, + hashmap_bench::IdentityBuildHasher::default(), + ), |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -257,27 +259,9 @@ fn bench_hashmap_insert(c: &mut Criterion) { group3.bench_function("SimdPrefixHashMap", |b| { b.iter_batched( - || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity(128), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - - group3.finish(); - - // ── Overflow reserve: start small so overflow is exercised ─────────── - let mut group4 = c.benchmark_group("overflow_reserve_insert_1000_trigrams"); - - group4.bench_function("hashbrown+Identity", |b| { - b.iter_batched( - || hashbrown::HashMap::::with_capacity_and_hasher( + || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher( 128, - Default::default(), + hashmap_bench::IdentityBuildHasher::default(), ), |mut map| { for (i, &key) in trigrams.iter().enumerate() { @@ -289,72 +273,7 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group4.bench_function("PrefixMap ovfl=m/8", |b| { - b.iter_batched( - || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 8), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - - group4.bench_function("PrefixMap ovfl=m/4", |b| { - b.iter_batched( - || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 4), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - - group4.bench_function("PrefixMap ovfl=m/2", |b| { - b.iter_batched( - || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 2), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - - group4.bench_function("PrefixMap ovfl=m/1", |b| { - b.iter_batched( - || 
hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, 1), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - - group4.bench_function("PrefixMap ovfl=0", |b| { - b.iter_batched( - || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_overflow(128, usize::MAX), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - - group4.finish(); + group3.finish(); } criterion_group!(benches, bench_hashmap_insert); diff --git a/crates/hashmap-bench/prefix_map.rs b/crates/hashmap-bench/prefix_map.rs index d18bf8d..2619aa5 100644 --- a/crates/hashmap-bench/prefix_map.rs +++ b/crates/hashmap-bench/prefix_map.rs @@ -1,4 +1,7 @@ use core::mem::MaybeUninit; +use std::borrow::Borrow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; const GROUP_SIZE: usize = 8; const CTRL_EMPTY: u8 = 0x00; @@ -20,46 +23,14 @@ fn unlikely(b: bool) -> bool { #[inline(never)] fn cold_path() {} -/// A single group: 8 slots with control bytes, keys, values, and an overflow pointer. -struct Group { - ctrl: [u8; GROUP_SIZE], - keys: [u32; GROUP_SIZE], - values: [MaybeUninit; GROUP_SIZE], - overflow: u32, // index into groups vec, or NO_OVERFLOW -} - -impl Group { - fn new() -> Self { - Self { - ctrl: [CTRL_EMPTY; GROUP_SIZE], - keys: [0; GROUP_SIZE], - values: [const { MaybeUninit::uninit() }; GROUP_SIZE], - overflow: NO_OVERFLOW, - } - } -} - -/// Insertion-only hash map where the key IS a hash (`u32`). -/// -/// Groups are stored in a single `Vec`. The first `2^n_bits` groups -/// are primary buckets (addressed by key prefix). When a primary group is -/// full, an overflow group is allocated from the end of the vec and linked -/// via `overflow`. -pub struct PrefixHashMap { - groups: Vec>, - n_bits: u32, - num_primary: u32, - len: usize, -} - #[inline] -fn tag(key: u32) -> u8 { - (key as u8) | 0x80 +fn tag(hash: u64) -> u8 { + (hash as u8) | 0x80 } #[inline] -fn slot_hint(key: u32) -> usize { - ((key >> 7) & 0x7) as usize +fn slot_hint(hash: u64) -> usize { + ((hash >> 7) & 0x7) as usize } #[inline] @@ -67,7 +38,6 @@ fn match_byte(ctrl: &[u8; GROUP_SIZE], byte: u8) -> u64 { let word = u64::from_ne_bytes(*ctrl); let broadcast = 0x0101010101010101u64 * (byte as u64); let xor = word ^ broadcast; - // Zero bytes in xor → matches. Use: (v - 0x01..01) & !v & 0x80..80 (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080 } @@ -77,43 +47,95 @@ fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> u64 { !word & 0x8080808080808080 } -impl PrefixHashMap { - #[inline] - fn group_index(&self, key: u32) -> usize { - (key >> (32 - self.n_bits)) as usize +struct Group { + ctrl: [u8; GROUP_SIZE], + keys: [MaybeUninit; GROUP_SIZE], + values: [MaybeUninit; GROUP_SIZE], + overflow: u32, +} + +impl Group { + fn new() -> Self { + Self { + ctrl: [CTRL_EMPTY; GROUP_SIZE], + keys: [const { MaybeUninit::uninit() }; GROUP_SIZE], + values: [const { MaybeUninit::uninit() }; GROUP_SIZE], + overflow: NO_OVERFLOW, + } } +} + +/// Insertion-only hash map with overflow chaining and slot-hint fast path. +/// +/// Generic over key type `K`, value type `V`, and hash builder `S`. 
+pub struct PrefixHashMap { + groups: Vec>, + n_bits: u32, + len: usize, + hash_builder: S, +} +impl PrefixHashMap { pub fn new() -> Self { - Self::with_capacity(0) + Self::with_capacity_and_hasher(0, RandomState::new()) } pub fn with_capacity(capacity: usize) -> Self { - Self::with_capacity_and_overflow(capacity, 8) + Self::with_capacity_and_hasher(capacity, RandomState::new()) + } +} + +impl PrefixHashMap { + pub fn with_hasher(hash_builder: S) -> Self { + Self::with_capacity_and_hasher(0, hash_builder) } - /// `overflow_denom`: reserve `num_primary / overflow_denom + 1` overflow groups. - /// Default is 8 (12.5%). Use 4 for 25%, 2 for 50%, etc. - pub fn with_capacity_and_overflow(capacity: usize, overflow_denom: usize) -> Self { + pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> Self { // Target ≤87.5% load (7/8), matching hashbrown's load factor. let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7; let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two(); let n_bits = min_groups.trailing_zeros().max(1); let num_primary = 1usize << n_bits; - let total = num_primary + num_primary / overflow_denom + 1; + let total = num_primary + num_primary / 8 + 1; let mut groups = Vec::with_capacity(total); groups.resize_with(num_primary, Group::new); Self { groups, n_bits, - num_primary: num_primary as u32, len: 0, + hash_builder, } } - pub fn insert(&mut self, key: u32, value: V) -> Option { - let tag = tag(key); - let hint = slot_hint(key); - let mut gi = self.group_index(key); + #[inline] + fn group_index(&self, hash: u64) -> usize { + (hash >> (64 - self.n_bits)) as usize + } + + pub fn len(&self) -> usize { + self.len + } +} + +impl PrefixHashMap { + pub fn insert(&mut self, key: K, value: V) -> Option { + let hash = self.hash_builder.hash_one(&key); + self.insert_hashed(hash, key, value) + } + + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let hash = self.hash_builder.hash_one(key); + self.get_hashed(hash, key) + } + + fn insert_hashed(&mut self, hash: u64, key: K, value: V) -> Option { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = self.group_index(hash); loop { let group = &self.groups[gi]; @@ -123,30 +145,33 @@ impl PrefixHashMap { if likely(c == CTRL_EMPTY) { let group = &mut self.groups[gi]; group.ctrl[hint] = tag; - group.keys[hint] = key; + group.keys[hint] = MaybeUninit::new(key); group.values[hint] = MaybeUninit::new(value); self.len += 1; return None; } - if c == tag && group.keys[hint] == key { + if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key { let old = std::mem::replace( unsafe { self.groups[gi].values[hint].assume_init_mut() }, value, ); + // Drop the incoming key since we're keeping the stored one. + drop(key); return Some(old); } // Slow path: scan group for tag match. 
let mut tag_mask = match_byte(&group.ctrl, tag); - tag_mask &= !(0x80u64 << (hint * 8)); // clear hint slot + tag_mask &= !(0x80u64 << (hint * 8)); while tag_mask != 0 { let i = (tag_mask.trailing_zeros() >> 3) as usize; tag_mask &= tag_mask - 1; - if unlikely(group.keys[i] == key) { + if unlikely(unsafe { group.keys[i].assume_init_ref() } == &key) { let old = std::mem::replace( unsafe { self.groups[gi].values[i].assume_init_mut() }, value, ); + drop(key); return Some(old); } } @@ -157,7 +182,7 @@ impl PrefixHashMap { let i = (empty_mask.trailing_zeros() >> 3) as usize; let group = &mut self.groups[gi]; group.ctrl[i] = tag; - group.keys[i] = key; + group.keys[i] = MaybeUninit::new(key); group.values[i] = MaybeUninit::new(value); self.len += 1; return None; @@ -166,7 +191,7 @@ impl PrefixHashMap { // Group full — follow or create overflow chain. let overflow = self.groups[gi].overflow; if unlikely(overflow == NO_OVERFLOW) { - return self.insert_overflow(gi, hint, tag, key, value); + return self.insert_overflow(gi, hash, key, value); } gi = overflow as usize; } @@ -174,43 +199,53 @@ impl PrefixHashMap { #[cold] #[inline(never)] - fn insert_overflow(&mut self, gi: usize, hint: usize, tag: u8, key: u32, value: V) -> Option { + fn insert_overflow(&mut self, gi: usize, hash: u64, key: K, value: V) -> Option { if self.groups.len() == self.groups.capacity() { self.grow(); - return self.insert(key, value); + return self.insert_hashed(hash, key, value); } + let hint = slot_hint(hash); + let tag = tag(hash); let new_gi = self.groups.len(); self.groups.push(Group::new()); self.groups[gi].overflow = new_gi as u32; let group = &mut self.groups[new_gi]; group.ctrl[hint] = tag; - group.keys[hint] = key; + group.keys[hint] = MaybeUninit::new(key); group.values[hint] = MaybeUninit::new(value); self.len += 1; None } - pub fn get(&self, key: u32) -> Option<&V> { - let tag = tag(key); - let hint = slot_hint(key); - let mut gi = self.group_index(key); + fn get_hashed(&self, hash: u64, key: &Q) -> Option<&V> + where + K: Borrow, + Q: Eq + ?Sized, + { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = self.group_index(hash); loop { let group = &self.groups[gi]; // Fast path: preferred slot. let c = group.ctrl[hint]; - if likely(c == tag) && group.keys[hint] == key { + if likely(c == tag) + && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key + { return Some(unsafe { group.values[hint].assume_init_ref() }); } // Slow path: scan group. let mut tag_mask = match_byte(&group.ctrl, tag); - tag_mask &= !(0x80u64 << (hint * 8)); // clear hint slot + tag_mask &= !(0x80u64 << (hint * 8)); while tag_mask != 0 { let i = (tag_mask.trailing_zeros() >> 3) as usize; tag_mask &= tag_mask - 1; - if likely(group.keys[i] == key) { + if likely( + unsafe { group.keys[i].assume_init_ref() }.borrow() == key, + ) { return Some(unsafe { group.values[i].assume_init_ref() }); } } @@ -228,10 +263,6 @@ impl PrefixHashMap { } } - pub fn len(&self) -> usize { - self.len - } - fn grow(&mut self) { let old_groups = std::mem::take(&mut self.groups); let old_len = self.len; @@ -239,68 +270,60 @@ impl PrefixHashMap { self.n_bits += 1; let num_primary = 1usize << self.n_bits; let total = num_primary + num_primary / 8 + 1; - self.num_primary = num_primary as u32; self.groups = Vec::with_capacity(total); self.groups.resize_with(num_primary, Group::new); self.len = 0; for group in &old_groups { - // Skip groups with no entries using a quick check on the ctrl word. 
let ctrl_word = u64::from_ne_bytes(group.ctrl); if ctrl_word == 0 { continue; } - - // Iterate only occupied slots using the high-bit mask. let mut full_mask = ctrl_word & 0x8080808080808080; while full_mask != 0 { let i = (full_mask.trailing_zeros() >> 3) as usize; full_mask &= full_mask - 1; - - let key = group.keys[i]; - // No duplicate check — we know all keys are unique during grow. - self.insert_for_grow(key, group.values[i].as_ptr()); + let hash = self.hash_builder.hash_one(unsafe { + group.keys[i].assume_init_ref() + }); + self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr()); } } - // Prevent double-drop — values were copied out via raw pointers. + // Prevent double-drop — keys/values were copied out via raw pointers. std::mem::forget(old_groups); debug_assert_eq!(self.len, old_len); } - /// Fast insert used only during `grow`. Skips duplicate checking and - /// copies the value via raw pointer instead of moving it. - fn insert_for_grow(&mut self, key: u32, value_src: *const V) { - let tag = tag(key); - let hint = slot_hint(key); - let mut gi = self.group_index(key); + /// Fast insert for grow: no duplicate check, raw pointer copy. + fn insert_for_grow(&mut self, hash: u64, key_src: *const K, value_src: *const V) { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = self.group_index(hash); loop { let group = &self.groups[gi]; - // Try preferred slot first. if group.ctrl[hint] == CTRL_EMPTY { let group = &mut self.groups[gi]; group.ctrl[hint] = tag; - group.keys[hint] = key; + unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) }; unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; self.len += 1; return; } - // Find any empty slot in this group. let empty_mask = match_empty(&group.ctrl); if empty_mask != 0 { let i = (empty_mask.trailing_zeros() >> 3) as usize; let group = &mut self.groups[gi]; group.ctrl[i] = tag; - group.keys[i] = key; + unsafe { group.keys[i].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) }; unsafe { group.values[i].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; self.len += 1; return; } - // Group full — follow or create overflow chain. let overflow = self.groups[gi].overflow; if overflow != NO_OVERFLOW { gi = overflow as usize; @@ -310,7 +333,7 @@ impl PrefixHashMap { self.groups[gi].overflow = new_gi as u32; let group = &mut self.groups[new_gi]; group.ctrl[hint] = tag; - group.keys[hint] = key; + unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) }; unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) }; self.len += 1; return; @@ -319,176 +342,12 @@ impl PrefixHashMap { } } -impl Drop for PrefixHashMap { - fn drop(&mut self) { - for group in &mut self.groups { - for i in 0..GROUP_SIZE { - if group.ctrl[i] != CTRL_EMPTY { - unsafe { group.values[i].assume_init_drop() }; - } - } - } - } -} - -// ── NoHintScalarPrefixHashMap ─────────────────────────────────────────────── -// Same scalar match functions, but no slot_hint fast path — always does a -// full group scan. Used to isolate the impact of slot_hint vs SIMD. 
- -pub struct NoHintScalarPrefixHashMap { - groups: Vec>, - n_bits: u32, - num_primary: u32, - len: usize, -} - -impl NoHintScalarPrefixHashMap { - #[inline] - fn group_index(&self, key: u32) -> usize { - (key >> (32 - self.n_bits)) as usize - } - - pub fn new() -> Self { - Self::with_capacity(0) - } - - pub fn with_capacity(capacity: usize) -> Self { - let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7; - let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two(); - let n_bits = min_groups.trailing_zeros().max(1); - let num_primary = 1usize << n_bits; - let total = num_primary + num_primary / 8 + 1; - let mut groups = Vec::with_capacity(total); - groups.resize_with(num_primary, Group::new); - Self { - groups, - n_bits, - num_primary: num_primary as u32, - len: 0, - } - } - - pub fn insert(&mut self, key: u32, value: V) -> Option { - let tag = tag(key); - let mut gi = self.group_index(key); - - loop { - let group = &self.groups[gi]; - - // Scan group for tag match (no slot_hint fast path). - let mut tag_mask = match_byte(&group.ctrl, tag); - while tag_mask != 0 { - let i = (tag_mask.trailing_zeros() >> 3) as usize; - tag_mask &= tag_mask - 1; - if group.keys[i] == key { - let old = std::mem::replace( - unsafe { self.groups[gi].values[i].assume_init_mut() }, - value, - ); - return Some(old); - } - } - - // Check for empty slot in this group. - let empty_mask = match_empty(&group.ctrl); - if empty_mask != 0 { - let i = (empty_mask.trailing_zeros() >> 3) as usize; - let group = &mut self.groups[gi]; - group.ctrl[i] = tag; - group.keys[i] = key; - group.values[i] = MaybeUninit::new(value); - self.len += 1; - return None; - } - - // Group full — follow or create overflow chain. - let overflow = self.groups[gi].overflow; - if overflow != NO_OVERFLOW { - gi = overflow as usize; - } else { - if self.groups.len() == self.groups.capacity() { - self.grow(); - return self.insert(key, value); - } - let new_gi = self.groups.len(); - self.groups.push(Group::new()); - self.groups[gi].overflow = new_gi as u32; - let group = &mut self.groups[new_gi]; - group.ctrl[0] = tag; - group.keys[0] = key; - group.values[0] = MaybeUninit::new(value); - self.len += 1; - return None; - } - } - } - - pub fn get(&self, key: u32) -> Option<&V> { - let tag = tag(key); - let mut gi = self.group_index(key); - - loop { - let group = &self.groups[gi]; - - // Scan group for tag match (no slot_hint fast path). - let mut tag_mask = match_byte(&group.ctrl, tag); - while tag_mask != 0 { - let i = (tag_mask.trailing_zeros() >> 3) as usize; - tag_mask &= tag_mask - 1; - if group.keys[i] == key { - return Some(unsafe { group.values[i].assume_init_ref() }); - } - } - - // If group has empty slots, key is not present. - if match_empty(&group.ctrl) != 0 { - return None; - } - - // Follow overflow chain. 
- if group.overflow == NO_OVERFLOW { - return None; - } - gi = group.overflow as usize; - } - } - - pub fn len(&self) -> usize { - self.len - } - - fn grow(&mut self) { - let old_groups = std::mem::take(&mut self.groups); - let old_len = self.len; - - self.n_bits += 1; - let num_primary = 1usize << self.n_bits; - let total = num_primary + num_primary / 8 + 1; - self.num_primary = num_primary as u32; - self.groups = Vec::with_capacity(total); - self.groups.resize_with(num_primary, Group::new); - self.len = 0; - - for group in old_groups { - for i in 0..GROUP_SIZE { - if group.ctrl[i] != CTRL_EMPTY { - let key = group.keys[i]; - let value = unsafe { group.values[i].assume_init_read() }; - self.insert(key, value); - } - } - std::mem::forget(group); - } - - debug_assert_eq!(self.len, old_len); - } -} - -impl Drop for NoHintScalarPrefixHashMap { +impl Drop for PrefixHashMap { fn drop(&mut self) { for group in &mut self.groups { for i in 0..GROUP_SIZE { if group.ctrl[i] != CTRL_EMPTY { + unsafe { group.keys[i].assume_init_drop() }; unsafe { group.values[i].assume_init_drop() }; } } @@ -505,9 +364,9 @@ mod tests { let mut map = PrefixHashMap::new(); map.insert(100, "hello"); map.insert(200, "world"); - assert_eq!(map.get(100), Some(&"hello")); - assert_eq!(map.get(200), Some(&"world")); - assert_eq!(map.get(999), None); + assert_eq!(map.get(&100), Some(&"hello")); + assert_eq!(map.get(&200), Some(&"world")); + assert_eq!(map.get(&999), None); assert_eq!(map.len(), 2); } @@ -516,7 +375,7 @@ mod tests { let mut map = PrefixHashMap::new(); map.insert(42, "a"); assert_eq!(map.insert(42, "b"), Some("a")); - assert_eq!(map.get(42), Some(&"b")); + assert_eq!(map.get(&42), Some(&"b")); assert_eq!(map.len(), 1); } @@ -528,7 +387,7 @@ mod tests { } assert_eq!(map.len(), 200); for i in 0..200u32 { - assert_eq!(map.get(i), Some(&(i * 10)), "missing key {i}"); + assert_eq!(map.get(&i), Some(&(i * 10)), "missing key {i}"); } } @@ -540,40 +399,53 @@ mod tests { } assert_eq!(map.len(), 2000); for i in 0..2000u32 { - assert_eq!(map.get(i.wrapping_mul(2654435761)), Some(&i)); + assert_eq!(map.get(&i.wrapping_mul(2654435761)), Some(&i)); } } #[test] fn overflow_chain() { - // Force overflow by inserting many keys with same prefix. let mut map = PrefixHashMap::with_capacity(8); for i in 0..20u32 { - // All keys have same top bits → same group → forces overflow. let key = i | 0xAB000000; map.insert(key, i); } assert_eq!(map.len(), 20); for i in 0..20u32 { let key = i | 0xAB000000; - assert_eq!(map.get(key), Some(&i), "missing key {key:#x}"); + assert_eq!(map.get(&key), Some(&i), "missing key {key:#x}"); } } #[test] fn grow_on_overflow_exhaustion() { - // Start tiny (2 primary groups), force enough collisions to exhaust overflow. 
let mut map = PrefixHashMap::with_capacity(1); let old_n_bits = map.n_bits; for i in 0..100u32 { - let key = i | 0xFF000000; // all same prefix → single group chain + let key = i | 0xFF000000; map.insert(key, i); } assert!(map.n_bits > old_n_bits, "should have grown"); assert_eq!(map.len(), 100); for i in 0..100u32 { let key = i | 0xFF000000; - assert_eq!(map.get(key), Some(&i), "missing key {key:#x} after grow"); + assert_eq!(map.get(&key), Some(&i), "missing key {key:#x} after grow"); } } + + #[test] + fn string_keys() { + let mut map = PrefixHashMap::new(); + map.insert("hello".to_string(), 1); + map.insert("world".to_string(), 2); + assert_eq!(map.get("hello"), Some(&1)); + assert_eq!(map.get("world"), Some(&2)); + assert_eq!(map.get("missing"), None); + assert_eq!(map.len(), 2); + + // Overwrite + assert_eq!(map.insert("hello".to_string(), 3), Some(1)); + assert_eq!(map.get("hello"), Some(&3)); + assert_eq!(map.len(), 2); + } } diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs index b0a138d..243819f 100644 --- a/crates/hashmap-bench/prefix_map_simd.rs +++ b/crates/hashmap-bench/prefix_map_simd.rs @@ -1,4 +1,7 @@ use core::mem::MaybeUninit; +use std::borrow::Borrow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; // Platform-dependent group size: 16 on x86_64 (SSE2), 8 everywhere else. #[cfg(target_arch = "x86_64")] @@ -9,14 +12,12 @@ const GROUP_SIZE: usize = 8; const CTRL_EMPTY: u8 = 0x00; const NO_OVERFLOW: u32 = u32::MAX; -// ── Match‑mask abstraction ────────────────────────────────────────────────── -// Each platform returns a different mask type from group scans. We unify the -// interface via a Mask type alias and free functions. - #[cfg(target_arch = "x86_64")] -type Mask = u32; // movemask: one bit per slot, bottom 16 used +type Mask = u32; #[cfg(not(target_arch = "x86_64"))] -type Mask = u64; // one byte per slot, high bit indicates match +type Mask = u64; + +// ── SIMD group operations ─────────────────────────────────────────────────── #[cfg(target_arch = "x86_64")] mod group_ops { @@ -41,19 +42,16 @@ mod group_ops { match_tag(ctrl, super::CTRL_EMPTY) } - /// Index of the lowest matching slot. #[inline(always)] pub fn lowest(mask: Mask) -> usize { mask.trailing_zeros() as usize } - /// Clear a single slot from the mask. #[inline(always)] pub fn clear_slot(mask: Mask, slot: usize) -> Mask { mask & !(1u32 << slot) } - /// Advance to next match, returning slot index. 
#[inline(always)] pub fn next_match(mask: &mut Mask) -> Option { if *mask == 0 { @@ -149,63 +147,63 @@ mod group_ops { } } -// ── Group struct ──────────────────────────────────────────────────────────── +// ── Helpers ───────────────────────────────────────────────────────────────── + +#[inline] +fn tag(hash: u64) -> u8 { + (hash as u8) | 0x80 +} + +#[inline] +fn slot_hint(hash: u64) -> usize { + ((hash >> 7) & (GROUP_SIZE as u64 - 1)) as usize +} -struct Group { +struct Group { ctrl: [u8; GROUP_SIZE], - keys: [u32; GROUP_SIZE], + keys: [MaybeUninit; GROUP_SIZE], values: [MaybeUninit; GROUP_SIZE], overflow: u32, } -impl Group { +impl Group { fn new() -> Self { Self { ctrl: [CTRL_EMPTY; GROUP_SIZE], - keys: [0; GROUP_SIZE], + keys: [const { MaybeUninit::uninit() }; GROUP_SIZE], values: [const { MaybeUninit::uninit() }; GROUP_SIZE], overflow: NO_OVERFLOW, } } } -// ── Helper functions ──────────────────────────────────────────────────────── - -#[inline] -fn tag(key: u32) -> u8 { - (key as u8) | 0x80 -} - -#[inline] -fn slot_hint(key: u32) -> usize { - ((key >> 7) & (GROUP_SIZE as u32 - 1)) as usize -} - -// ── SimdPrefixHashMap ─────────────────────────────────────────────────────── - -/// Insertion-only hash map where the key IS a hash (`u32`). +/// Insertion-only hash map with SIMD group scanning. /// -/// Same algorithm as `PrefixHashMap` but with platform-specific SIMD -/// group scanning (SSE2 on x86_64, NEON on aarch64, scalar fallback elsewhere). -/// On x86_64 the group size is widened to 16 slots to exploit SSE2. -pub struct SimdPrefixHashMap { - groups: Vec>, +/// Uses NEON on aarch64, SSE2 on x86_64, scalar fallback elsewhere. +/// Generic over key type `K`, value type `V`, and hash builder `S`. +pub struct SimdPrefixHashMap { + groups: Vec>, n_bits: u32, - num_primary: u32, len: usize, + hash_builder: S, } -impl SimdPrefixHashMap { - #[inline] - fn group_index(&self, key: u32) -> usize { - (key >> (32 - self.n_bits)) as usize - } - +impl SimdPrefixHashMap { pub fn new() -> Self { - Self::with_capacity(0) + Self::with_capacity_and_hasher(0, RandomState::new()) } pub fn with_capacity(capacity: usize) -> Self { + Self::with_capacity_and_hasher(capacity, RandomState::new()) + } +} + +impl SimdPrefixHashMap { + pub fn with_hasher(hash_builder: S) -> Self { + Self::with_capacity_and_hasher(0, hash_builder) + } + + pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> Self { let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7; let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two(); let n_bits = min_groups.trailing_zeros().max(1); @@ -216,15 +214,40 @@ impl SimdPrefixHashMap { Self { groups, n_bits, - num_primary: num_primary as u32, len: 0, + hash_builder, } } - pub fn insert(&mut self, key: u32, value: V) -> Option { - let tag = tag(key); - let hint = slot_hint(key); - let mut gi = self.group_index(key); + #[inline] + fn group_index(&self, hash: u64) -> usize { + (hash >> (64 - self.n_bits)) as usize + } + + pub fn len(&self) -> usize { + self.len + } +} + +impl SimdPrefixHashMap { + pub fn insert(&mut self, key: K, value: V) -> Option { + let hash = self.hash_builder.hash_one(&key); + self.insert_hashed(hash, key, value) + } + + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let hash = self.hash_builder.hash_one(key); + self.get_hashed(hash, key) + } + + fn insert_hashed(&mut self, hash: u64, key: K, value: V) -> Option { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = 
self.group_index(hash); loop { let group = &self.groups[gi]; @@ -234,16 +257,17 @@ impl SimdPrefixHashMap { if c == CTRL_EMPTY { let group = &mut self.groups[gi]; group.ctrl[hint] = tag; - group.keys[hint] = key; + group.keys[hint] = MaybeUninit::new(key); group.values[hint] = MaybeUninit::new(value); self.len += 1; return None; } - if c == tag && group.keys[hint] == key { + if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key { let old = std::mem::replace( unsafe { self.groups[gi].values[hint].assume_init_mut() }, value, ); + drop(key); return Some(old); } @@ -251,11 +275,12 @@ impl SimdPrefixHashMap { let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); tag_mask = group_ops::clear_slot(tag_mask, hint); while let Some(i) = group_ops::next_match(&mut tag_mask) { - if group.keys[i] == key { + if unsafe { group.keys[i].assume_init_ref() } == &key { let old = std::mem::replace( unsafe { self.groups[gi].values[i].assume_init_mut() }, value, ); + drop(key); return Some(old); } } @@ -266,7 +291,7 @@ impl SimdPrefixHashMap { let i = group_ops::lowest(empty_mask); let group = &mut self.groups[gi]; group.ctrl[i] = tag; - group.keys[i] = key; + group.keys[i] = MaybeUninit::new(key); group.values[i] = MaybeUninit::new(value); self.len += 1; return None; @@ -279,14 +304,14 @@ impl SimdPrefixHashMap { } else { if self.groups.len() == self.groups.capacity() { self.grow(); - return self.insert(key, value); + return self.insert_hashed(hash, key, value); } let new_gi = self.groups.len(); self.groups.push(Group::new()); self.groups[gi].overflow = new_gi as u32; let group = &mut self.groups[new_gi]; group.ctrl[hint] = tag; - group.keys[hint] = key; + group.keys[hint] = MaybeUninit::new(key); group.values[hint] = MaybeUninit::new(value); self.len += 1; return None; @@ -294,17 +319,23 @@ impl SimdPrefixHashMap { } } - pub fn get(&self, key: u32) -> Option<&V> { - let tag = tag(key); - let hint = slot_hint(key); - let mut gi = self.group_index(key); + fn get_hashed(&self, hash: u64, key: &Q) -> Option<&V> + where + K: Borrow, + Q: Eq + ?Sized, + { + let tag = tag(hash); + let hint = slot_hint(hash); + let mut gi = self.group_index(hash); loop { let group = &self.groups[gi]; // Fast path: preferred slot. let c = group.ctrl[hint]; - if c == tag && group.keys[hint] == key { + if c == tag + && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key + { return Some(unsafe { group.values[hint].assume_init_ref() }); } @@ -312,17 +343,15 @@ impl SimdPrefixHashMap { let mut tag_mask = group_ops::match_tag(&group.ctrl, tag); tag_mask = group_ops::clear_slot(tag_mask, hint); while let Some(i) = group_ops::next_match(&mut tag_mask) { - if group.keys[i] == key { + if unsafe { group.keys[i].assume_init_ref() }.borrow() == key { return Some(unsafe { group.values[i].assume_init_ref() }); } } - // If group has empty slots, key is not present. if group_ops::match_empty(&group.ctrl) != 0 { return None; } - // Follow overflow chain. 
             if group.overflow == NO_OVERFLOW {
                 return None;
             }
@@ -330,10 +359,6 @@ impl<V> SimdPrefixHashMap<V> {
         }
     }
 
-    pub fn len(&self) -> usize {
-        self.len
-    }
-
     fn grow(&mut self) {
         let old_groups = std::mem::take(&mut self.groups);
         let old_len = self.len;
@@ -341,7 +366,6 @@ impl<V> SimdPrefixHashMap<V> {
         self.n_bits += 1;
         let num_primary = 1usize << self.n_bits;
         let total = num_primary + num_primary / 8 + 1;
-        self.num_primary = num_primary as u32;
         self.groups = Vec::with_capacity(total);
         self.groups.resize_with(num_primary, Group::new);
         self.len = 0;
@@ -355,8 +379,10 @@ impl<V> SimdPrefixHashMap<V> {
             while full_mask != 0 {
                 let i = (full_mask.trailing_zeros() >> 3) as usize;
                 full_mask &= full_mask - 1;
-                let key = group.keys[i];
-                self.insert_for_grow(key, group.values[i].as_ptr());
+                let hash = self.hash_builder.hash_one(unsafe {
+                    group.keys[i].assume_init_ref()
+                });
+                self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr());
             }
         }
         std::mem::forget(old_groups);
@@ -364,11 +390,10 @@ impl<V> SimdPrefixHashMap<V> {
 
         debug_assert_eq!(self.len, old_len);
     }
 
-    /// Fast insert for grow: no duplicate check, raw pointer copy.
-    fn insert_for_grow(&mut self, key: u32, value_src: *const V) {
-        let tag = tag(key);
-        let hint = slot_hint(key);
-        let mut gi = self.group_index(key);
+    fn insert_for_grow(&mut self, hash: u64, key_src: *const K, value_src: *const V) {
+        let tag = tag(hash);
+        let hint = slot_hint(hash);
+        let mut gi = self.group_index(hash);
 
         loop {
             let group = &self.groups[gi];
 
@@ -376,7 +401,7 @@ impl<V> SimdPrefixHashMap<V> {
             if group.ctrl[hint] == CTRL_EMPTY {
                 let group = &mut self.groups[gi];
                 group.ctrl[hint] = tag;
-                group.keys[hint] = key;
+                unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
                 unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
                 self.len += 1;
                 return;
@@ -387,7 +412,7 @@ impl<V> SimdPrefixHashMap<V> {
             if empty_mask != 0 {
                 let i = group_ops::lowest(empty_mask);
                 let group = &mut self.groups[gi];
                 group.ctrl[i] = tag;
-                group.keys[i] = key;
+                unsafe { group.keys[i].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
                 unsafe { group.values[i].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
                 self.len += 1;
                 return;
@@ -402,7 +427,7 @@ impl<V> SimdPrefixHashMap<V> {
                 self.groups[gi].overflow = new_gi as u32;
                 let group = &mut self.groups[new_gi];
                 group.ctrl[hint] = tag;
-                group.keys[hint] = key;
+                unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
                 unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
                 self.len += 1;
                 return;
@@ -411,172 +436,12 @@ impl<V> SimdPrefixHashMap<V> {
         }
     }
 }
 
-impl<V> Drop for SimdPrefixHashMap<V> {
-    fn drop(&mut self) {
-        for group in &mut self.groups {
-            for i in 0..GROUP_SIZE {
-                if group.ctrl[i] != CTRL_EMPTY {
-                    unsafe { group.values[i].assume_init_drop() };
-                }
-            }
-        }
-    }
-}
-
-// ── NoHintPrefixHashMap (SIMD, no slot_hint) ────────────────────────────────
-// Same as SimdPrefixHashMap but always does a full group scan — no preferred
-// slot fast path. This isolates the pure SIMD scan cost.
-
-pub struct NoHintPrefixHashMap<V> {
-    groups: Vec<Group<V>>,
-    n_bits: u32,
-    num_primary: u32,
-    len: usize,
-}
-
-impl<V> NoHintPrefixHashMap<V> {
-    #[inline]
-    fn group_index(&self, key: u32) -> usize {
-        (key >> (32 - self.n_bits)) as usize
-    }
-
-    pub fn new() -> Self {
-        Self::with_capacity(0)
-    }
-
-    pub fn with_capacity(capacity: usize) -> Self {
-        let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7;
-        let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two();
-        let n_bits = min_groups.trailing_zeros().max(1);
-        let num_primary = 1usize << n_bits;
-        let total = num_primary + num_primary / 8 + 1;
-        let mut groups = Vec::with_capacity(total);
-        groups.resize_with(num_primary, Group::new);
-        Self {
-            groups,
-            n_bits,
-            num_primary: num_primary as u32,
-            len: 0,
-        }
-    }
-
-    pub fn insert(&mut self, key: u32, value: V) -> Option<V> {
-        let tag = tag(key);
-        let mut gi = self.group_index(key);
-
-        loop {
-            let group = &self.groups[gi];
-
-            // Scan group for tag match.
-            let mut tag_mask = group_ops::match_tag(&group.ctrl, tag);
-            while let Some(i) = group_ops::next_match(&mut tag_mask) {
-                if group.keys[i] == key {
-                    let old = std::mem::replace(
-                        unsafe { self.groups[gi].values[i].assume_init_mut() },
-                        value,
-                    );
-                    return Some(old);
-                }
-            }
-
-            // Check for empty slot in this group.
-            let empty_mask = group_ops::match_empty(&group.ctrl);
-            if empty_mask != 0 {
-                let i = group_ops::lowest(empty_mask);
-                let group = &mut self.groups[gi];
-                group.ctrl[i] = tag;
-                group.keys[i] = key;
-                group.values[i] = MaybeUninit::new(value);
-                self.len += 1;
-                return None;
-            }
-
-            // Group full — follow or create overflow chain.
-            let overflow = self.groups[gi].overflow;
-            if overflow != NO_OVERFLOW {
-                gi = overflow as usize;
-            } else {
-                if self.groups.len() == self.groups.capacity() {
-                    self.grow();
-                    return self.insert(key, value);
-                }
-                let new_gi = self.groups.len();
-                self.groups.push(Group::new());
-                self.groups[gi].overflow = new_gi as u32;
-                let group = &mut self.groups[new_gi];
-                group.ctrl[0] = tag;
-                group.keys[0] = key;
-                group.values[0] = MaybeUninit::new(value);
-                self.len += 1;
-                return None;
-            }
-        }
-    }
-
-    pub fn get(&self, key: u32) -> Option<&V> {
-        let tag = tag(key);
-        let mut gi = self.group_index(key);
-
-        loop {
-            let group = &self.groups[gi];
-
-            // Scan group for tag match.
-            let mut tag_mask = group_ops::match_tag(&group.ctrl, tag);
-            while let Some(i) = group_ops::next_match(&mut tag_mask) {
-                if group.keys[i] == key {
-                    return Some(unsafe { group.values[i].assume_init_ref() });
-                }
-            }
-
-            // If group has empty slots, key is not present.
-            if group_ops::match_empty(&group.ctrl) != 0 {
-                return None;
-            }
-
-            // Follow overflow chain.
-            if group.overflow == NO_OVERFLOW {
-                return None;
-            }
-            gi = group.overflow as usize;
-        }
-    }
-
-    pub fn len(&self) -> usize {
-        self.len
-    }
-
-    fn grow(&mut self) {
-        let old_groups = std::mem::take(&mut self.groups);
-        let old_len = self.len;
-
-        self.n_bits += 1;
-        let num_primary = 1usize << self.n_bits;
-        let total = num_primary + num_primary / 8 + 1;
-        self.num_primary = num_primary as u32;
-        self.groups = Vec::with_capacity(total);
-        self.groups.resize_with(num_primary, Group::new);
-        self.len = 0;
-
-        for group in old_groups {
-            for i in 0..GROUP_SIZE {
-                if group.ctrl[i] != CTRL_EMPTY {
-                    let key = group.keys[i];
-                    let value = unsafe { group.values[i].assume_init_read() };
-                    self.insert(key, value);
-                }
-            }
-            std::mem::forget(group);
-        }
-
-        debug_assert_eq!(self.len, old_len);
-    }
-}
-
-impl<V> Drop for NoHintPrefixHashMap<V> {
+impl<K, V, S> Drop for SimdPrefixHashMap<K, V, S> {
     fn drop(&mut self) {
         for group in &mut self.groups {
             for i in 0..GROUP_SIZE {
                 if group.ctrl[i] != CTRL_EMPTY {
+                    unsafe { group.keys[i].assume_init_drop() };
                     unsafe { group.values[i].assume_init_drop() };
                 }
             }
@@ -593,9 +458,9 @@ mod tests {
         let mut map = SimdPrefixHashMap::new();
         map.insert(100, "hello");
         map.insert(200, "world");
-        assert_eq!(map.get(100), Some(&"hello"));
-        assert_eq!(map.get(200), Some(&"world"));
-        assert_eq!(map.get(999), None);
+        assert_eq!(map.get(&100), Some(&"hello"));
+        assert_eq!(map.get(&200), Some(&"world"));
+        assert_eq!(map.get(&999), None);
         assert_eq!(map.len(), 2);
     }
 
@@ -604,7 +469,7 @@ mod tests {
         let mut map = SimdPrefixHashMap::new();
         map.insert(42, "a");
         assert_eq!(map.insert(42, "b"), Some("a"));
-        assert_eq!(map.get(42), Some(&"b"));
+        assert_eq!(map.get(&42), Some(&"b"));
         assert_eq!(map.len(), 1);
     }
 
@@ -616,7 +481,7 @@ mod tests {
         }
         assert_eq!(map.len(), 200);
         for i in 0..200u32 {
-            assert_eq!(map.get(i), Some(&(i * 10)), "missing key {i}");
+            assert_eq!(map.get(&i), Some(&(i * 10)), "missing key {i}");
         }
     }
 
@@ -628,7 +493,7 @@ mod tests {
         }
         assert_eq!(map.len(), 2000);
         for i in 0..2000u32 {
-            assert_eq!(map.get(i.wrapping_mul(2654435761)), Some(&i));
+            assert_eq!(map.get(&i.wrapping_mul(2654435761)), Some(&i));
         }
     }
 
@@ -642,7 +507,7 @@ mod tests {
         assert_eq!(map.len(), 20);
         for i in 0..20u32 {
             let key = i | 0xAB000000;
-            assert_eq!(map.get(key), Some(&i), "missing key {key:#x}");
+            assert_eq!(map.get(&key), Some(&i), "missing key {key:#x}");
         }
     }
 
@@ -658,79 +523,22 @@ mod tests {
         assert_eq!(map.len(), 100);
         for i in 0..100u32 {
             let key = i | 0xFF000000;
-            assert_eq!(map.get(key), Some(&i), "missing key {key:#x} after grow");
+            assert_eq!(map.get(&key), Some(&i), "missing key {key:#x} after grow");
         }
     }
 
-    /// Verify SIMD match functions produce identical results to the scalar versions.
     #[test]
-    fn simd_matches_scalar() {
-        // Scalar reference implementations
-        fn scalar_match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Vec<usize> {
-            ctrl.iter()
-                .enumerate()
-                .filter(|(_, &c)| c == tag)
-                .map(|(i, _)| i)
-                .collect()
-        }
-        fn scalar_match_empty(ctrl: &[u8; GROUP_SIZE]) -> Vec<usize> {
-            ctrl.iter()
-                .enumerate()
-                .filter(|(_, &c)| c == CTRL_EMPTY)
-                .map(|(i, _)| i)
-                .collect()
-        }
-
-        // Decode a SIMD mask into sorted slot indices
-        fn decode_mask(mut mask: Mask) -> Vec<usize> {
-            let mut out = vec![];
-            while let Some(i) = group_ops::next_match(&mut mask) {
-                out.push(i);
-            }
-            out
-        }
+    fn string_keys() {
+        let mut map = SimdPrefixHashMap::new();
+        map.insert("hello".to_string(), 1);
+        map.insert("world".to_string(), 2);
+        assert_eq!(map.get("hello"), Some(&1));
+        assert_eq!(map.get("world"), Some(&2));
+        assert_eq!(map.get("missing"), None);
+        assert_eq!(map.len(), 2);
 
-        // Test with various control byte patterns
-        let patterns: Vec<[u8; GROUP_SIZE]> = vec![
-            [CTRL_EMPTY; GROUP_SIZE],
-            [0x80; GROUP_SIZE],
-            {
-                let mut p = [CTRL_EMPTY; GROUP_SIZE];
-                p[0] = 0xAB;
-                p[GROUP_SIZE - 1] = 0xAB;
-                p
-            },
-            {
-                let mut p = [CTRL_EMPTY; GROUP_SIZE];
-                for (i, b) in p.iter_mut().enumerate() {
-                    *b = if i % 2 == 0 { 0x80 | (i as u8) } else { CTRL_EMPTY };
-                }
-                p
-            },
-            {
-                let mut p = [0u8; GROUP_SIZE];
-                for (i, b) in p.iter_mut().enumerate() {
-                    *b = 0x80 | (i as u8);
-                }
-                p
-            },
-        ];
-
-        for ctrl in &patterns {
-            // Test match_empty
-            let simd_empty = decode_mask(group_ops::match_empty(ctrl));
-            let scalar_empty = scalar_match_empty(ctrl);
-            assert_eq!(simd_empty, scalar_empty, "match_empty mismatch for {ctrl:?}");
-
-            // Test match_tag with various tags
-            for &tag in &[0x80, 0x81, 0xAB, 0xFF] {
-                let simd_tag = decode_mask(group_ops::match_tag(ctrl, tag));
-                let scalar_tag = scalar_match_tag(ctrl, tag);
-                assert_eq!(
-                    simd_tag, scalar_tag,
-                    "match_tag mismatch for tag={tag:#x}, ctrl={ctrl:?}"
-                );
-            }
-        }
+        assert_eq!(map.insert("hello".to_string(), 3), Some(1));
+        assert_eq!(map.get("hello"), Some(&3));
+        assert_eq!(map.len(), 2);
     }
 }

From 84644217f247504e0ea814179acc80b9825572c9 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 28 Apr 2026 10:37:36 +0000
Subject: [PATCH 04/22] fix sse version

---
 crates/hashmap-bench/prefix_map.rs      |  5 ++--
 crates/hashmap-bench/prefix_map_simd.rs | 40 +++++++++++++++++++------
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/crates/hashmap-bench/prefix_map.rs b/crates/hashmap-bench/prefix_map.rs
index 2619aa5..ed3cc36 100644
--- a/crates/hashmap-bench/prefix_map.rs
+++ b/crates/hashmap-bench/prefix_map.rs
@@ -289,8 +289,9 @@ impl<K: Hash + Eq, V, S: BuildHasher> PrefixHashMap<K, V, S> {
                 self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr());
             }
         }
-        // Prevent double-drop — keys/values were copied out via raw pointers.
-        std::mem::forget(old_groups);
+        // Group has no Drop (keys/values are MaybeUninit), so dropping
+        // old_groups runs no destructors but does free the backing buffer.
+        drop(old_groups);
 
         debug_assert_eq!(self.len, old_len);
     }
diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs
index 243819f..b9aa108 100644
--- a/crates/hashmap-bench/prefix_map_simd.rs
+++ b/crates/hashmap-bench/prefix_map_simd.rs
@@ -42,6 +42,16 @@ mod group_ops {
         match_tag(ctrl, super::CTRL_EMPTY)
     }
 
+    /// Mask of slots whose ctrl byte has the high bit set (occupied).
+    /// Uses SSE2 `_mm_movemask_epi8`, which extracts the top bit of each byte.
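+    /// (No byte comparison is needed here: CTRL_EMPTY is 0x00 and occupied
+    /// tags always have their high bit set, so the sign bits alone decide.)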
+    #[inline(always)]
+    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        unsafe {
+            let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i);
+            x86::_mm_movemask_epi8(group) as u32
+        }
+    }
+
     #[inline(always)]
     pub fn lowest(mask: Mask) -> usize {
         mask.trailing_zeros() as usize
@@ -87,6 +97,15 @@ mod group_ops {
         }
     }
 
+    /// Mask of slots whose ctrl byte has the high bit set (occupied).
+    #[inline(always)]
+    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        unsafe {
+            let group = neon::vld1_u8(ctrl.as_ptr());
+            neon::vget_lane_u64(neon::vreinterpret_u64_u8(group), 0) & 0x8080808080808080
+        }
+    }
+
     #[inline(always)]
     pub fn lowest(mask: Mask) -> usize {
         (mask.trailing_zeros() >> 3) as usize
@@ -126,6 +145,13 @@ mod group_ops {
         !word & 0x8080808080808080
     }
 
+    /// Mask of slots whose ctrl byte has the high bit set (occupied).
+    #[inline(always)]
+    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        let word = u64::from_ne_bytes(*ctrl);
+        word & 0x8080808080808080
+    }
+
     #[inline(always)]
     pub fn lowest(mask: Mask) -> usize {
         (mask.trailing_zeros() >> 3) as usize
@@ -371,21 +397,17 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
         self.len = 0;
 
         for group in &old_groups {
-            let ctrl_word = u64::from_ne_bytes(group.ctrl);
-            if ctrl_word == 0 {
-                continue;
-            }
-            let mut full_mask = ctrl_word & 0x8080808080808080;
-            while full_mask != 0 {
-                let i = (full_mask.trailing_zeros() >> 3) as usize;
-                full_mask &= full_mask - 1;
+            let mut full_mask = group_ops::match_full(&group.ctrl);
+            while let Some(i) = group_ops::next_match(&mut full_mask) {
                 let hash = self.hash_builder.hash_one(unsafe {
                     group.keys[i].assume_init_ref()
                 });
                 self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr());
             }
         }
-        std::mem::forget(old_groups);
+        // Group has no Drop (keys/values are MaybeUninit), so dropping
+        // old_groups runs no destructors but does free the backing buffer.
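+        // (A mem::forget here, by contrast, would leak the buffer itself.)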
+        drop(old_groups);
 
         debug_assert_eq!(self.len, old_len);
     }

From 0244f8f8b566d802c792b5a72edd36371e2c5730 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 28 Apr 2026 13:35:52 +0200
Subject: [PATCH 05/22] cleanup

---
 crates/hashmap-bench/hashmap_insert.rs |  54 ---
 crates/hashmap-bench/lib.rs            |   1 -
 crates/hashmap-bench/prefix_map.rs     | 452 ------------------------
 3 files changed, 507 deletions(-)
 delete mode 100644 crates/hashmap-bench/prefix_map.rs

diff --git a/crates/hashmap-bench/hashmap_insert.rs b/crates/hashmap-bench/hashmap_insert.rs
index a5a9717..fe7eafa 100644
--- a/crates/hashmap-bench/hashmap_insert.rs
+++ b/crates/hashmap-bench/hashmap_insert.rs
@@ -118,22 +118,6 @@ fn bench_hashmap_insert(c: &mut Criterion) {
     });
 
     group.bench_function("PrefixHashMap", |b| {
-        b.iter_batched(
-            || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_hasher(
-                trigrams.len(),
-                hashmap_bench::IdentityBuildHasher::default(),
-            ),
-            |mut map| {
-                for (i, &key) in trigrams.iter().enumerate() {
-                    map.insert(key, i);
-                }
-                map
-            },
-            BatchSize::SmallInput,
-        );
-    });
-
-    group.bench_function("SimdPrefixHashMap", |b| {
         b.iter_batched(
             || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher(
                 trigrams.len(),
@@ -177,28 +161,6 @@ fn bench_hashmap_insert(c: &mut Criterion) {
     });
 
     group2.bench_function("PrefixHashMap", |b| {
-        b.iter_batched(
-            || {
-                let mut map = hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_hasher(
-                    trigrams.len(),
-                    hashmap_bench::IdentityBuildHasher::default(),
-                );
-                for (i, &key) in trigrams.iter().enumerate() {
-                    map.insert(key, i);
-                }
-                map
-            },
-            |mut map| {
-                for (i, &key) in trigrams.iter().enumerate() {
-                    map.insert(key, i + 1000);
-                }
-                map
-            },
-            BatchSize::SmallInput,
-        );
-    });
-
-    group2.bench_function("SimdPrefixHashMap", |b| {
         b.iter_batched(
             || {
                 let mut map = hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher(
@@ -242,22 +204,6 @@ fn bench_hashmap_insert(c: &mut Criterion) {
     });
 
     group3.bench_function("PrefixHashMap", |b| {
-        b.iter_batched(
-            || hashmap_bench::prefix_map::PrefixHashMap::with_capacity_and_hasher(
-                128,
-                hashmap_bench::IdentityBuildHasher::default(),
-            ),
-            |mut map| {
-                for (i, &key) in trigrams.iter().enumerate() {
-                    map.insert(key, i);
-                }
-                map
-            },
-            BatchSize::SmallInput,
-        );
-    });
-
-    group3.bench_function("SimdPrefixHashMap", |b| {
         b.iter_batched(
             || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher(
                 128,
diff --git a/crates/hashmap-bench/lib.rs b/crates/hashmap-bench/lib.rs
index 21dabfb..d06be51 100644
--- a/crates/hashmap-bench/lib.rs
+++ b/crates/hashmap-bench/lib.rs
@@ -1,4 +1,3 @@
-pub mod prefix_map;
 pub mod prefix_map_simd;
 
 use rand::Rng;
diff --git a/crates/hashmap-bench/prefix_map.rs b/crates/hashmap-bench/prefix_map.rs
deleted file mode 100644
index ed3cc36..0000000
--- a/crates/hashmap-bench/prefix_map.rs
+++ /dev/null
@@ -1,452 +0,0 @@
-use core::mem::MaybeUninit;
-use std::borrow::Borrow;
-use std::collections::hash_map::RandomState;
-use std::hash::{BuildHasher, Hash};
-
-const GROUP_SIZE: usize = 8;
-const CTRL_EMPTY: u8 = 0x00;
-const NO_OVERFLOW: u32 = u32::MAX;
-
-#[inline(always)]
-fn likely(b: bool) -> bool {
-    if !b { cold_path() }
-    b
-}
-
-#[inline(always)]
-fn unlikely(b: bool) -> bool {
-    if b { cold_path() }
-    b
-}
-
-#[cold]
-#[inline(never)]
-fn cold_path() {}
-
-#[inline]
-fn tag(hash: u64) -> u8 {
-    (hash as u8) | 0x80
-}
-
-#[inline]
-fn slot_hint(hash: u64) -> usize {
-    ((hash >> 7) & 0x7) as usize
-}
-
-#[inline]
-fn match_byte(ctrl: &[u8; GROUP_SIZE], byte: u8) -> u64 {
-    let word = u64::from_ne_bytes(*ctrl);
-    let broadcast = 0x0101010101010101u64 * (byte as u64);
-    let xor = word ^ broadcast;
-    (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080
-}
-
-#[inline]
-fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> u64 {
-    let word = u64::from_ne_bytes(*ctrl);
-    !word & 0x8080808080808080
-}
-
-struct Group<K, V> {
-    ctrl: [u8; GROUP_SIZE],
-    keys: [MaybeUninit<K>; GROUP_SIZE],
-    values: [MaybeUninit<V>; GROUP_SIZE],
-    overflow: u32,
-}
-
-impl<K, V> Group<K, V> {
-    fn new() -> Self {
-        Self {
-            ctrl: [CTRL_EMPTY; GROUP_SIZE],
-            keys: [const { MaybeUninit::uninit() }; GROUP_SIZE],
-            values: [const { MaybeUninit::uninit() }; GROUP_SIZE],
-            overflow: NO_OVERFLOW,
-        }
-    }
-}
-
-/// Insertion-only hash map with overflow chaining and slot-hint fast path.
-///
-/// Generic over key type `K`, value type `V`, and hash builder `S`.
-pub struct PrefixHashMap<K, V, S = RandomState> {
-    groups: Vec<Group<K, V>>,
-    n_bits: u32,
-    len: usize,
-    hash_builder: S,
-}
-
-impl<K, V> PrefixHashMap<K, V> {
-    pub fn new() -> Self {
-        Self::with_capacity_and_hasher(0, RandomState::new())
-    }
-
-    pub fn with_capacity(capacity: usize) -> Self {
-        Self::with_capacity_and_hasher(capacity, RandomState::new())
-    }
-}
-
-impl<K, V, S: BuildHasher> PrefixHashMap<K, V, S> {
-    pub fn with_hasher(hash_builder: S) -> Self {
-        Self::with_capacity_and_hasher(0, hash_builder)
-    }
-
-    pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> Self {
-        // Target ≤87.5% load (7/8), matching hashbrown's load factor.
-        let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7;
-        let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two();
-        let n_bits = min_groups.trailing_zeros().max(1);
-        let num_primary = 1usize << n_bits;
-        let total = num_primary + num_primary / 8 + 1;
-        let mut groups = Vec::with_capacity(total);
-        groups.resize_with(num_primary, Group::new);
-        Self {
-            groups,
-            n_bits,
-            len: 0,
-            hash_builder,
-        }
-    }
-
-    #[inline]
-    fn group_index(&self, hash: u64) -> usize {
-        (hash >> (64 - self.n_bits)) as usize
-    }
-
-    pub fn len(&self) -> usize {
-        self.len
-    }
-}
-
-impl<K: Hash + Eq, V, S: BuildHasher> PrefixHashMap<K, V, S> {
-    pub fn insert(&mut self, key: K, value: V) -> Option<V> {
-        let hash = self.hash_builder.hash_one(&key);
-        self.insert_hashed(hash, key, value)
-    }
-
-    pub fn get<Q>(&self, key: &Q) -> Option<&V>
-    where
-        K: Borrow<Q>,
-        Q: Hash + Eq + ?Sized,
-    {
-        let hash = self.hash_builder.hash_one(key);
-        self.get_hashed(hash, key)
-    }
-
-    fn insert_hashed(&mut self, hash: u64, key: K, value: V) -> Option<V> {
-        let tag = tag(hash);
-        let hint = slot_hint(hash);
-        let mut gi = self.group_index(hash);
-
-        loop {
-            let group = &self.groups[gi];
-
-            // Fast path: check preferred slot.
-            let c = group.ctrl[hint];
-            if likely(c == CTRL_EMPTY) {
-                let group = &mut self.groups[gi];
-                group.ctrl[hint] = tag;
-                group.keys[hint] = MaybeUninit::new(key);
-                group.values[hint] = MaybeUninit::new(value);
-                self.len += 1;
-                return None;
-            }
-            if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key {
-                let old = std::mem::replace(
-                    unsafe { self.groups[gi].values[hint].assume_init_mut() },
-                    value,
-                );
-                // Drop the incoming key since we're keeping the stored one.
-                drop(key);
-                return Some(old);
-            }
-
-            // Slow path: scan group for tag match.
-            let mut tag_mask = match_byte(&group.ctrl, tag);
-            tag_mask &= !(0x80u64 << (hint * 8));
-            while tag_mask != 0 {
-                let i = (tag_mask.trailing_zeros() >> 3) as usize;
-                tag_mask &= tag_mask - 1;
-                if unlikely(unsafe { group.keys[i].assume_init_ref() } == &key) {
-                    let old = std::mem::replace(
-                        unsafe { self.groups[gi].values[i].assume_init_mut() },
-                        value,
-                    );
-                    drop(key);
-                    return Some(old);
-                }
-            }
-
-            // Check for empty slot in this group.
-            let empty_mask = match_empty(&group.ctrl);
-            if likely(empty_mask != 0) {
-                let i = (empty_mask.trailing_zeros() >> 3) as usize;
-                let group = &mut self.groups[gi];
-                group.ctrl[i] = tag;
-                group.keys[i] = MaybeUninit::new(key);
-                group.values[i] = MaybeUninit::new(value);
-                self.len += 1;
-                return None;
-            }
-
-            // Group full — follow or create overflow chain.
-            let overflow = self.groups[gi].overflow;
-            if unlikely(overflow == NO_OVERFLOW) {
-                return self.insert_overflow(gi, hash, key, value);
-            }
-            gi = overflow as usize;
-        }
-    }
-
-    #[cold]
-    #[inline(never)]
-    fn insert_overflow(&mut self, gi: usize, hash: u64, key: K, value: V) -> Option<V> {
-        if self.groups.len() == self.groups.capacity() {
-            self.grow();
-            return self.insert_hashed(hash, key, value);
-        }
-        let hint = slot_hint(hash);
-        let tag = tag(hash);
-        let new_gi = self.groups.len();
-        self.groups.push(Group::new());
-        self.groups[gi].overflow = new_gi as u32;
-        let group = &mut self.groups[new_gi];
-        group.ctrl[hint] = tag;
-        group.keys[hint] = MaybeUninit::new(key);
-        group.values[hint] = MaybeUninit::new(value);
-        self.len += 1;
-        None
-    }
-
-    fn get_hashed<Q>(&self, hash: u64, key: &Q) -> Option<&V>
-    where
-        K: Borrow<Q>,
-        Q: Eq + ?Sized,
-    {
-        let tag = tag(hash);
-        let hint = slot_hint(hash);
-        let mut gi = self.group_index(hash);
-
-        loop {
-            let group = &self.groups[gi];
-
-            // Fast path: preferred slot.
-            let c = group.ctrl[hint];
-            if likely(c == tag)
-                && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key
-            {
-                return Some(unsafe { group.values[hint].assume_init_ref() });
-            }
-
-            // Slow path: scan group.
-            let mut tag_mask = match_byte(&group.ctrl, tag);
-            tag_mask &= !(0x80u64 << (hint * 8));
-            while tag_mask != 0 {
-                let i = (tag_mask.trailing_zeros() >> 3) as usize;
-                tag_mask &= tag_mask - 1;
-                if likely(
-                    unsafe { group.keys[i].assume_init_ref() }.borrow() == key,
-                ) {
-                    return Some(unsafe { group.values[i].assume_init_ref() });
-                }
-            }
-
-            // If group has empty slots, key is not present.
-            if likely(match_empty(&group.ctrl) != 0) {
-                return None;
-            }
-
-            // Follow overflow chain.
-            if unlikely(group.overflow == NO_OVERFLOW) {
-                return None;
-            }
-            gi = group.overflow as usize;
-        }
-    }
-
-    fn grow(&mut self) {
-        let old_groups = std::mem::take(&mut self.groups);
-        let old_len = self.len;
-
-        self.n_bits += 1;
-        let num_primary = 1usize << self.n_bits;
-        let total = num_primary + num_primary / 8 + 1;
-        self.groups = Vec::with_capacity(total);
-        self.groups.resize_with(num_primary, Group::new);
-        self.len = 0;
-
-        for group in &old_groups {
-            let ctrl_word = u64::from_ne_bytes(group.ctrl);
-            if ctrl_word == 0 {
-                continue;
-            }
-            let mut full_mask = ctrl_word & 0x8080808080808080;
-            while full_mask != 0 {
-                let i = (full_mask.trailing_zeros() >> 3) as usize;
-                full_mask &= full_mask - 1;
-                let hash = self.hash_builder.hash_one(unsafe {
-                    group.keys[i].assume_init_ref()
-                });
-                self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr());
-            }
-        }
-        // Group has no Drop (keys/values are MaybeUninit), so dropping
-        // old_groups runs no destructors but does free the backing buffer.
-        drop(old_groups);
-
-        debug_assert_eq!(self.len, old_len);
-    }
-
-    /// Fast insert for grow: no duplicate check, raw pointer copy.
-    fn insert_for_grow(&mut self, hash: u64, key_src: *const K, value_src: *const V) {
-        let tag = tag(hash);
-        let hint = slot_hint(hash);
-        let mut gi = self.group_index(hash);
-
-        loop {
-            let group = &self.groups[gi];
-
-            if group.ctrl[hint] == CTRL_EMPTY {
-                let group = &mut self.groups[gi];
-                group.ctrl[hint] = tag;
-                unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
-                unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
-                self.len += 1;
-                return;
-            }
-
-            let empty_mask = match_empty(&group.ctrl);
-            if empty_mask != 0 {
-                let i = (empty_mask.trailing_zeros() >> 3) as usize;
-                let group = &mut self.groups[gi];
-                group.ctrl[i] = tag;
-                unsafe { group.keys[i].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
-                unsafe { group.values[i].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
-                self.len += 1;
-                return;
-            }
-
-            let overflow = self.groups[gi].overflow;
-            if overflow != NO_OVERFLOW {
-                gi = overflow as usize;
-            } else {
-                let new_gi = self.groups.len();
-                self.groups.push(Group::new());
-                self.groups[gi].overflow = new_gi as u32;
-                let group = &mut self.groups[new_gi];
-                group.ctrl[hint] = tag;
-                unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
-                unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
-                self.len += 1;
-                return;
-            }
-        }
-    }
-}
-
-impl<K, V, S> Drop for PrefixHashMap<K, V, S> {
-    fn drop(&mut self) {
-        for group in &mut self.groups {
-            for i in 0..GROUP_SIZE {
-                if group.ctrl[i] != CTRL_EMPTY {
-                    unsafe { group.keys[i].assume_init_drop() };
-                    unsafe { group.values[i].assume_init_drop() };
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn insert_and_get() {
-        let mut map = PrefixHashMap::new();
-        map.insert(100, "hello");
-        map.insert(200, "world");
-        assert_eq!(map.get(&100), Some(&"hello"));
-        assert_eq!(map.get(&200), Some(&"world"));
-        assert_eq!(map.get(&999), None);
-        assert_eq!(map.len(), 2);
-    }
-
-    #[test]
-    fn insert_overwrite() {
-        let mut map = PrefixHashMap::new();
-        map.insert(42, "a");
-        assert_eq!(map.insert(42, "b"), Some("a"));
-        assert_eq!(map.get(&42), Some(&"b"));
-        assert_eq!(map.len(), 1);
-    }
-
-    #[test]
-    fn grow_preserves_entries() {
-        let mut map = PrefixHashMap::new();
-        for i in 0..200u32 {
-            map.insert(i, i * 10);
-        }
-        assert_eq!(map.len(), 200);
-        for i in 0..200u32 {
-            assert_eq!(map.get(&i), Some(&(i * 10)), "missing key {i}");
-        }
-    }
-
-    #[test]
-    fn many_entries() {
-        let mut map = PrefixHashMap::with_capacity(2000);
-        for i in 0..2000u32 {
-            map.insert(i.wrapping_mul(2654435761), i);
-        }
-        assert_eq!(map.len(), 2000);
-        for i in 0..2000u32 {
-            assert_eq!(map.get(&i.wrapping_mul(2654435761)), Some(&i));
-        }
-    }
-
-    #[test]
-    fn overflow_chain() {
-        let mut map = PrefixHashMap::with_capacity(8);
-        for i in 0..20u32 {
-            let key = i | 0xAB000000;
-            map.insert(key, i);
-        }
-        assert_eq!(map.len(), 20);
-        for i in 0..20u32 {
-            let key = i | 0xAB000000;
-            assert_eq!(map.get(&key), Some(&i), "missing key {key:#x}");
-        }
-    }
-
-    #[test]
-    fn grow_on_overflow_exhaustion() {
-        let mut map = PrefixHashMap::with_capacity(1);
-        let old_n_bits = map.n_bits;
-        for i in 0..100u32 {
-            let key = i | 0xFF000000;
-            map.insert(key, i);
-        }
-        assert!(map.n_bits > old_n_bits, "should have grown");
-        assert_eq!(map.len(), 100);
-        for i in 0..100u32 {
-            let key = i | 0xFF000000;
-            assert_eq!(map.get(&key), Some(&i), "missing key {key:#x} after grow");
-        }
-    }
-
-    #[test]
-    fn string_keys() {
-        let mut map = PrefixHashMap::new();
-        map.insert("hello".to_string(), 1);
-        map.insert("world".to_string(), 2);
-        assert_eq!(map.get("hello"), Some(&1));
-        assert_eq!(map.get("world"), Some(&2));
-        assert_eq!(map.get("missing"), None);
-        assert_eq!(map.len(), 2);
-
-        // Overwrite
-        assert_eq!(map.insert("hello".to_string(), 3), Some(1));
-        assert_eq!(map.get("hello"), Some(&3));
-        assert_eq!(map.len(), 2);
-    }
-}

From 7d09f3f460e22d4c25b764a7ef84b55c097b9fa5 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 28 Apr 2026 13:16:36 +0000
Subject: [PATCH 06/22] replace vec with box

---
 crates/hashmap-bench/prefix_map_simd.rs | 98 ++++++++++++-------------
 1 file changed, 48 insertions(+), 50 deletions(-)

diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs
index b9aa108..ccc0160 100644
--- a/crates/hashmap-bench/prefix_map_simd.rs
+++ b/crates/hashmap-bench/prefix_map_simd.rs
@@ -208,7 +208,8 @@
 /// Uses NEON on aarch64, SSE2 on x86_64, scalar fallback elsewhere.
 /// Generic over key type `K`, value type `V`, and hash builder `S`.
 pub struct SimdPrefixHashMap<K, V, S = RandomState> {
-    groups: Vec<Group<K, V>>,
+    groups: Box<[Group<K, V>]>,
+    num_groups: u32,
     n_bits: u32,
     len: usize,
     hash_builder: S,
@@ -233,18 +234,27 @@ impl<K, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
         let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7;
         let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two();
         let n_bits = min_groups.trailing_zeros().max(1);
-        let num_primary = 1usize << n_bits;
-        let total = num_primary + num_primary / 8 + 1;
-        let mut groups = Vec::with_capacity(total);
-        groups.resize_with(num_primary, Group::new);
+        let (groups, num_primary) = Self::alloc_groups(n_bits);
         Self {
             groups,
+            num_groups: num_primary,
             n_bits,
             len: 0,
             hash_builder,
         }
     }
 
+    /// Allocate a fully default-initialized boxed slice sized for `n_bits` primary groups
+    /// plus the standard 12.5% overflow reserve. Returns the slice and the number of
+    /// primary groups (which is also the initial in-use count).
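+    /// (Worked example: `n_bits = 4` gives 16 primary groups plus
+    /// 16/8 + 1 = 3 reserve groups, i.e. 19 groups allocated in total.)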
+    fn alloc_groups(n_bits: u32) -> (Box<[Group<K, V>]>, u32) {
+        let num_primary = 1usize << n_bits;
+        let total = num_primary + num_primary / 8 + 1;
+        let mut groups: Vec<Group<K, V>> = Vec::with_capacity(total);
+        groups.resize_with(total, Group::new);
+        (groups.into_boxed_slice(), num_primary as u32)
+    }
+
     #[inline]
     fn group_index(&self, hash: u64) -> usize {
         (hash >> (64 - self.n_bits)) as usize
@@ -276,12 +286,11 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
         let mut gi = self.group_index(hash);
 
         loop {
-            let group = &self.groups[gi];
+            let group = &mut self.groups[gi];
 
             // Fast path: check preferred slot.
             let c = group.ctrl[hint];
             if c == CTRL_EMPTY {
-                let group = &mut self.groups[gi];
                 group.ctrl[hint] = tag;
                 group.keys[hint] = MaybeUninit::new(key);
                 group.values[hint] = MaybeUninit::new(value);
@@ -290,7 +299,7 @@
             }
             if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key {
                 let old = std::mem::replace(
-                    unsafe { self.groups[gi].values[hint].assume_init_mut() },
+                    unsafe { group.values[hint].assume_init_mut() },
                     value,
                 );
                 drop(key);
@@ -303,7 +312,7 @@
             while let Some(i) = group_ops::next_match(&mut tag_mask) {
                 if unsafe { group.keys[i].assume_init_ref() } == &key {
                     let old = std::mem::replace(
-                        unsafe { self.groups[gi].values[i].assume_init_mut() },
+                        unsafe { group.values[i].assume_init_mut() },
                         value,
                     );
                     drop(key);
@@ -315,7 +324,6 @@
             let empty_mask = group_ops::match_empty(&group.ctrl);
             if empty_mask != 0 {
                 let i = group_ops::lowest(empty_mask);
-                let group = &mut self.groups[gi];
                 group.ctrl[i] = tag;
                 group.keys[i] = MaybeUninit::new(key);
                 group.values[i] = MaybeUninit::new(value);
@@ -324,16 +332,16 @@
             }
 
             // Group full — follow or create overflow chain.
-            let overflow = self.groups[gi].overflow;
+            let overflow = group.overflow;
             if overflow != NO_OVERFLOW {
                 gi = overflow as usize;
             } else {
-                if self.groups.len() == self.groups.capacity() {
+                if self.num_groups as usize == self.groups.len() {
                     self.grow();
                     return self.insert_hashed(hash, key, value);
                 }
-                let new_gi = self.groups.len();
-                self.groups.push(Group::new());
+                let new_gi = self.num_groups as usize;
+                self.num_groups += 1;
                 self.groups[gi].overflow = new_gi as u32;
                 let group = &mut self.groups[new_gi];
                 group.ctrl[hint] = tag;
@@ -386,17 +394,20 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
     }
 
     fn grow(&mut self) {
-        let old_groups = std::mem::take(&mut self.groups);
+        let old_groups = std::mem::replace(
+            &mut self.groups,
+            Vec::<Group<K, V>>::new().into_boxed_slice(),
+        );
+        let old_num_groups = self.num_groups as usize;
         let old_len = self.len;
 
         self.n_bits += 1;
-        let num_primary = 1usize << self.n_bits;
-        let total = num_primary + num_primary / 8 + 1;
-        self.groups = Vec::with_capacity(total);
-        self.groups.resize_with(num_primary, Group::new);
+        let (new_groups, num_primary) = Self::alloc_groups(self.n_bits);
+        self.groups = new_groups;
+        self.num_groups = num_primary;
         self.len = 0;
 
-        for group in &old_groups {
+        for group in &old_groups[..old_num_groups] {
             let mut full_mask = group_ops::match_full(&group.ctrl);
             while let Some(i) = group_ops::next_match(&mut full_mask) {
                 let hash = self.hash_builder.hash_one(unsafe {
                     group.keys[i].assume_init_ref()
                 });
                 self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr());
             }
         }
@@ -414,53 +425,40 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
 
     fn insert_for_grow(&mut self, hash: u64, key_src: *const K, value_src: *const V) {
         let tag = tag(hash);
-        let hint = slot_hint(hash);
-        let mut gi = self.group_index(hash);
+        let mut hint = slot_hint(hash);
+        let gi = self.group_index(hash);
+        let mut group = &mut self.groups[gi];
 
         loop {
-            let group = &self.groups[gi];
-            if group.ctrl[hint] == CTRL_EMPTY {
-                let group = &mut self.groups[gi];
-                group.ctrl[hint] = tag;
-                unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
-                unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
-                self.len += 1;
-                return;
+            if group.ctrl[hint] == CTRL_EMPTY {
+                break;
             }
-
             let empty_mask = group_ops::match_empty(&group.ctrl);
             if empty_mask != 0 {
-                let i = group_ops::lowest(empty_mask);
-                let group = &mut self.groups[gi];
-                group.ctrl[i] = tag;
-                unsafe { group.keys[i].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
-                unsafe { group.values[i].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
-                self.len += 1;
-                return;
+                hint = group_ops::lowest(empty_mask);
+                break;
             }
-
-            let overflow = self.groups[gi].overflow;
+            let overflow = group.overflow;
             if overflow != NO_OVERFLOW {
-                gi = overflow as usize;
+                group = &mut self.groups[overflow as usize];
             } else {
-                let new_gi = self.groups.len();
-                self.groups.push(Group::new());
+                let new_gi = self.num_groups as usize;
                 self.groups[gi].overflow = new_gi as u32;
+                self.num_groups += 1;
+                group = &mut self.groups[new_gi];
+                break;
             }
         }
+        group.ctrl[hint] = tag;
+        unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
+        unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
+        self.len += 1;
     }
 }
 
 impl<K, V, S> Drop for SimdPrefixHashMap<K, V, S> {
     fn drop(&mut self) {
-        for group in &mut self.groups {
+        for group in &mut self.groups[..self.num_groups as usize] {
             for i in 0..GROUP_SIZE {
                 if group.ctrl[i] != CTRL_EMPTY {
                     unsafe { group.keys[i].assume_init_drop() };

From fba4bb268ad8d88008230a1da20fe6971985980b Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 28 Apr 2026 14:35:26 +0000
Subject: [PATCH 07/22] add entry function

---
 crates/hashmap-bench/hashmap_insert.rs  |  60 ++++
 crates/hashmap-bench/prefix_map_simd.rs | 382 +++++++++++++++++++++++-
 2 files changed, 437 insertions(+), 5 deletions(-)

diff --git a/crates/hashmap-bench/hashmap_insert.rs b/crates/hashmap-bench/hashmap_insert.rs
index fe7eafa..e13e3e6 100644
--- a/crates/hashmap-bench/hashmap_insert.rs
+++ b/crates/hashmap-bench/hashmap_insert.rs
@@ -220,6 +220,66 @@ fn bench_hashmap_insert(c: &mut Criterion) {
     });
 
     group3.finish();
+
+    // ── get_or_default: count trigram occurrences ──────────────────────
+    // Counting workload: most lookups hit existing keys, so this stresses
+    // the find-existing path of get_or_default / entry().or_insert().
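+    // Each key appears four times below, so roughly three of every four
+    // operations take the "found existing" branch instead of inserting.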
+    let mut counted_trigrams = Vec::with_capacity(trigrams.len() * 4);
+    for _ in 0..4 {
+        counted_trigrams.extend_from_slice(&trigrams);
+    }
+
+    let mut group4 = c.benchmark_group("count_4000_trigrams_get_or_default");
+
+    group4.bench_function("hashbrown+Identity entry()", |b| {
+        b.iter_batched(
+            || hashbrown::HashMap::<u32, u32, hashmap_bench::IdentityBuildHasher>::with_capacity_and_hasher(
+                trigrams.len(),
+                Default::default(),
+            ),
+            |mut map| {
+                for &key in &counted_trigrams {
+                    *map.entry(key).or_insert(0) += 1;
+                }
+                map
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group4.bench_function("PrefixHashMap get_or_default", |b| {
+        b.iter_batched(
+            || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::<u32, u32, _>::with_capacity_and_hasher(
+                trigrams.len(),
+                hashmap_bench::IdentityBuildHasher::default(),
+            ),
+            |mut map| {
+                for &key in &counted_trigrams {
+                    *map.get_or_default(key) += 1;
+                }
+                map
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group4.bench_function("PrefixHashMap entry().or_default()", |b| {
+        b.iter_batched(
+            || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::<u32, u32, _>::with_capacity_and_hasher(
+                trigrams.len(),
+                hashmap_bench::IdentityBuildHasher::default(),
+            ),
+            |mut map| {
+                for &key in &counted_trigrams {
+                    *map.entry(key).or_default() += 1;
+                }
+                map
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group4.finish();
 }
 
 criterion_group!(benches, bench_hashmap_insert);
diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs
index ccc0160..628e411 100644
--- a/crates/hashmap-bench/prefix_map_simd.rs
+++ b/crates/hashmap-bench/prefix_map_simd.rs
@@ -280,6 +280,46 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
         self.get_hashed(hash, key)
     }
 
+    /// Returns a mutable reference to the value for `key`, inserting `f()` if absent.
+    #[inline]
+    pub fn get_or_insert_with<F: FnOnce() -> V>(&mut self, key: K, f: F) -> &mut V {
+        self.entry(key).or_insert_with(f)
+    }
+
+    /// Returns a mutable reference to the value for `key`, inserting `V::default()` if absent.
+    pub fn get_or_default(&mut self, key: K) -> &mut V
+    where
+        V: Default,
+    {
+        self.get_or_insert_with(key, V::default)
+    }
+
+    /// Returns an [`Entry`] for `key`, providing in-place access to its value
+    /// (insertion, mutation, or read). The lookup chain is walked exactly once;
+    /// the resulting `VacantEntry` already knows where to write.
+    #[inline]
+    pub fn entry(&mut self, key: K) -> Entry<'_, K, V, S> {
+        let hash = self.hash_builder.hash_one(&key);
+        match self.find_or_insertion_slot(hash, &key) {
+            FindResult::Found(ptr) => Entry::Occupied(OccupiedEntry {
+                // SAFETY: pointer is valid for `'_` (bounded by `&mut self`).
+                value: unsafe { &mut *ptr },
+            }),
+            FindResult::Empty { group, slot } => Entry::Vacant(VacantEntry {
+                map: self,
+                hash,
+                key,
+                insertion: Insertion::Empty { group, slot },
+            }),
+            FindResult::NeedsOverflow { tail } => Entry::Vacant(VacantEntry {
+                map: self,
+                hash,
+                key,
+                insertion: Insertion::NeedsOverflow { tail },
+            }),
+        }
+    }
+
     fn insert_hashed(&mut self, hash: u64, key: K, value: V) -> Option<V> {
         let tag = tag(hash);
         let hint = slot_hint(hash);
@@ -316,7 +356,6 @@
                     unsafe { group.values[hint].assume_init_mut() },
                     value,
                 );
-                drop(key);
                 return Some(old);
             }
 
@@ -329,7 +368,6 @@
                         unsafe { group.values[i].assume_init_mut() },
                         value,
                     );
-                    drop(key);
                     return Some(old);
                 }
             }
@@ -353,7 +391,9 @@
             } else {
                 if self.num_groups as usize == self.groups.len() {
                     self.grow();
-                    return self.insert_hashed(hash, key, value);
+                    // n_bits changed; recompute the primary group and retry.
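+                    // (Looping avoids a recursive call on this cold path.)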
+                    gi = self.group_index(hash);
+                    continue;
                 }
                 let new_gi = self.num_groups as usize;
                 self.num_groups += 1;
@@ -394,6 +434,17 @@
         }
     }
 
     fn get_hashed<Q>(&self, hash: u64, key: &Q) -> Option<&V>
+    where
+        K: Borrow<Q>,
+        Q: Eq + ?Sized,
+    {
+        let (gi, slot) = self.find_slot(hash, key)?;
+        Some(unsafe { self.groups[gi].values[slot].assume_init_ref() })
+    }
+
+    /// Look up `key` and return its `(group_index, slot)` if present.
+    /// Pure read-only lookup — does not allocate or modify the table.
+    fn find_slot<Q>(&self, hash: u64, key: &Q) -> Option<(usize, usize)>
     where
         K: Borrow<Q>,
         Q: Eq + ?Sized,
@@ -410,7 +421,7 @@
             if c == tag
                 && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key
             {
-                return Some(unsafe { group.values[hint].assume_init_ref() });
+                return Some((gi, hint));
             }
 
             // Slow path: SIMD scan group.
@@ -418,7 +429,7 @@
             tag_mask = group_ops::clear_slot(tag_mask, hint);
             while let Some(i) = group_ops::next_match(&mut tag_mask) {
                 if unsafe { group.keys[i].assume_init_ref() }.borrow() == key {
-                    return Some(unsafe { group.values[i].assume_init_ref() });
+                    return Some((gi, i));
                 }
             }
 
+    /// Single-walk variant that returns either the found slot or precise
+    /// information about where to insert. Used by [`entry`].
+    ///
+    /// Returns raw pointers (instead of indices) so the caller can write
+    /// directly without re-indexing. Pointers remain valid for the lifetime
+    /// of `&mut self` until any reallocation (`grow`).
+    fn find_or_insertion_slot(&mut self, hash: u64, key: &K) -> FindResult<K, V> {
+        let tag = tag(hash);
+        let hint = slot_hint(hash);
+        let mut gi = self.group_index(hash);
+
+        loop {
+            let group = &mut self.groups[gi];
+
+            // Fast path: preferred slot.
+            let c = group.ctrl[hint];
+            if c == CTRL_EMPTY {
+                return FindResult::Empty { group: group as *mut _, slot: hint };
+            }
+            if c == tag && unsafe { group.keys[hint].assume_init_ref() } == key {
+                return FindResult::Found(group.values[hint].as_mut_ptr());
+            }
+
+            // Slow path: SIMD scan group for tag match.
+            let mut tag_mask = group_ops::match_tag(&group.ctrl, tag);
+            tag_mask = group_ops::clear_slot(tag_mask, hint);
+            while let Some(i) = group_ops::next_match(&mut tag_mask) {
+                if unsafe { group.keys[i].assume_init_ref() } == key {
+                    return FindResult::Found(group.values[i].as_mut_ptr());
+                }
+            }
+
+            // Check for empty slot in this group.
+            let empty_mask = group_ops::match_empty(&group.ctrl);
+            if empty_mask != 0 {
+                let i = group_ops::lowest(empty_mask);
+                return FindResult::Empty { group: group as *mut _, slot: i };
+            }
+
+            // Group full — follow or report end of chain.
+            if group.overflow == NO_OVERFLOW {
+                return FindResult::NeedsOverflow { tail: group as *mut _ };
+            }
+            gi = group.overflow as usize;
+        }
+    }
+
     fn grow(&mut self) {
         let old_groups = std::mem::replace(
             &mut self.groups,
@@ -544,6 +554,196 @@
     }
 }
 
+// ────────────────────────────────────────────────────────────────────────
+// Entry API
+// ────────────────────────────────────────────────────────────────────────
+
+/// Result of a single chain walk during `entry()`: either the existing slot
+/// for the key, an empty slot ready for insertion, or end-of-chain when no
+/// empty slot exists (and a new overflow group must be allocated).
+enum FindResult<K, V> {
+    /// Pointer to the existing value.
+    Found(*mut V),
+    /// Pointer to the group with an empty slot at index `slot`.
+    Empty { group: *mut Group<K, V>, slot: usize },
+    /// End of chain — the caller must allocate an overflow group and link it
+    /// via `tail`'s overflow field.
+    NeedsOverflow { tail: *mut Group<K, V> },
+}
+
+/// Pre-computed insertion location stashed inside [`VacantEntry`] so that
+/// `insert()` doesn't need to re-walk the chain. Pointers remain valid as
+/// long as no reallocation occurs (the grow path re-walks via the slow path).
+enum Insertion<K, V> {
+    /// An empty slot is waiting at `(group, slot)`.
+    Empty { group: *mut Group<K, V>, slot: usize },
+    /// The chain is full; allocate a new overflow group and link via `tail`.
+    NeedsOverflow { tail: *mut Group<K, V> },
+}
+
+/// View into a single entry in a [`SimdPrefixHashMap`], either occupied or vacant.
+pub enum Entry<'a, K, V, S> {
+    Occupied(OccupiedEntry<'a, V>),
+    Vacant(VacantEntry<'a, K, V, S>),
+}
+
+/// View into an occupied entry.
+pub struct OccupiedEntry<'a, V> {
+    value: &'a mut V,
+}
+
+/// View into a vacant entry. Holds the borrow of the map plus the hash, key,
+/// and pre-computed insertion slot.
+pub struct VacantEntry<'a, K, V, S> {
+    map: &'a mut SimdPrefixHashMap<K, V, S>,
+    hash: u64,
+    key: K,
+    insertion: Insertion<K, V>,
+}
+
+impl<'a, K: Hash + Eq, V, S: BuildHasher> Entry<'a, K, V, S> {
+    /// Insert `default` if vacant; return a mutable reference to the value either way.
+    #[inline]
+    pub fn or_insert(self, default: V) -> &'a mut V {
+        match self {
+            Entry::Occupied(o) => o.into_mut(),
+            Entry::Vacant(v) => v.insert(default),
+        }
+    }
+
+    /// Insert `f()` if vacant; `f` runs only on the vacant branch.
+    #[inline]
+    pub fn or_insert_with<F: FnOnce() -> V>(self, f: F) -> &'a mut V {
+        match self {
+            Entry::Occupied(o) => o.into_mut(),
+            Entry::Vacant(v) => v.insert(f()),
+        }
+    }
+
+    /// Insert `V::default()` if vacant.
+    #[inline]
+    pub fn or_default(self) -> &'a mut V
+    where
+        V: Default,
+    {
+        self.or_insert_with(V::default)
+    }
+
+    /// Apply `f` to the value if occupied; pass through unchanged otherwise.
+    #[inline]
+    pub fn and_modify<F: FnOnce(&mut V)>(self, f: F) -> Self {
+        match self {
+            Entry::Occupied(mut o) => {
+                f(o.get_mut());
+                Entry::Occupied(o)
+            }
+            v @ Entry::Vacant(_) => v,
+        }
+    }
+}
+
+impl<'a, V> OccupiedEntry<'a, V> {
+    /// Get a shared reference to the value.
+    #[inline]
+    pub fn get(&self) -> &V {
+        &*self.value
+    }
+
+    /// Get a mutable reference to the value.
+    #[inline]
+    pub fn get_mut(&mut self) -> &mut V {
+        self.value
+    }
+
+    /// Consume the entry, returning the mutable reference with the entry's lifetime.
+    #[inline]
+    pub fn into_mut(self) -> &'a mut V {
+        self.value
+    }
+}
+
+impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
+    /// Insert `value` and return a mutable reference to it.
+    /// Writes directly to the slot pre-computed during `entry()`; only re-walks
+    /// the chain on the rare grow path (where the pre-computed pointers become
+    /// stale because grow re-allocates the groups buffer).
+    #[inline]
+    pub fn insert(self, value: V) -> &'a mut V {
+        let map = self.map;
+        let hash = self.hash;
+        let key = self.key;
+
+        let (group_ptr, slot) = match self.insertion {
+            Insertion::Empty { group, slot } => (group, slot),
+            Insertion::NeedsOverflow { tail } => {
+                if map.num_groups as usize == map.groups.len() {
+                    return insert_after_grow(map, hash, key, value);
+                }
+                let new_gi = map.num_groups as usize;
+                map.num_groups += 1;
+                // SAFETY: `tail` was obtained from `&mut self.groups[..]` and
+                // remains valid because no reallocation occurred between
+                // `entry()` and now (we hold the only `&mut self`).
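+                // Bumping `num_groups` only claims a pre-allocated slot; the
+                // boxed slice never moves here (its size was fixed up front
+                // by `alloc_groups`), so `tail` cannot dangle.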
+                unsafe {
+                    (*tail).overflow = new_gi as u32;
+                }
+                let new_group: *mut Group<K, V> = &mut map.groups[new_gi];
+                (new_group, slot_hint(hash))
+            }
+        };
+
+        let tag = tag(hash);
+        // SAFETY: `group_ptr` points into `map.groups` and is valid for `'a`.
+        unsafe {
+            let group = &mut *group_ptr;
+            group.ctrl[slot] = tag;
+            group.keys[slot] = MaybeUninit::new(key);
+            group.values[slot] = MaybeUninit::new(value);
+            map.len += 1;
+            group.values[slot].assume_init_mut()
+        }
+    }
+}
+
+/// Cold path: the chain was full, the table is at capacity, and we need to
+/// grow before inserting. Re-walks via the slow path after grow.
+///
+/// After `grow()` doubles `num_primary` (`n_bits += 1`), our key's new
+/// primary group can have at most ~half the old chain's keys, so hitting
+/// `NeedsOverflow` again would require `GROUP_SIZE` keys to all collide on
+/// one extra bit of hash — essentially impossible for any reasonable hash.
+/// (`insert_for_grow` relies on the same assumption to skip its own
+/// capacity check.)
+#[cold]
+#[inline(never)]
+fn insert_after_grow<'a, K: Hash + Eq, V, S: BuildHasher>(
+    map: &'a mut SimdPrefixHashMap<K, V, S>,
+    hash: u64,
+    key: K,
+    value: V,
+) -> &'a mut V {
+    map.grow();
+    match map.find_or_insertion_slot(hash, &key) {
+        FindResult::Empty { group, slot } => {
+            let tag = tag(hash);
+            // SAFETY: `group` points into `map.groups` and is valid for `'a`.
+            unsafe {
+                let g = &mut *group;
+                g.ctrl[slot] = tag;
+                g.keys[slot] = MaybeUninit::new(key);
+                g.values[slot] = MaybeUninit::new(value);
+                map.len += 1;
+                g.values[slot].assume_init_mut()
+            }
+        }
+        // After grow, the new primary group for `key` cannot be full (see
+        // function docs), and the key wasn't in the table before grow.
+        FindResult::NeedsOverflow { .. } | FindResult::Found(_) => {
+            unreachable!("post-grow walk must hit an empty slot")
+        }
+    }
+}
+
 impl<K, V, S> Drop for SimdPrefixHashMap<K, V, S> {
     fn drop(&mut self) {
         for group in &mut self.groups[..self.num_groups as usize] {
@@ -561,4 +849,88 @@ mod tests {
         assert_eq!(map.get("hello"), Some(&3));
         assert_eq!(map.len(), 2);
     }
+
+    #[test]
+    fn get_or_default_basics() {
+        let mut map: SimdPrefixHashMap<&str, i32> = SimdPrefixHashMap::new();
+        // Inserts default (0), then mutates.
+        *map.get_or_default("a") += 5;
+        *map.get_or_default("b") += 7;
+        // Subsequent calls return the existing value.
+        *map.get_or_default("a") += 3;
+        assert_eq!(map.get(&"a"), Some(&8));
+        assert_eq!(map.get(&"b"), Some(&7));
+        assert_eq!(map.len(), 2);
+    }
+
+    #[test]
+    fn get_or_insert_with_lazy() {
+        let mut map: SimdPrefixHashMap<u32, String> = SimdPrefixHashMap::new();
+        let mut call_count = 0;
+        let mut make = |s: &str| {
+            call_count += 1;
+            s.to_string()
+        };
+        // First call: f runs, inserts "first".
+        assert_eq!(map.get_or_insert_with(1, || make("first")), &mut "first".to_string());
+        // Second call with same key: f does NOT run; returns existing.
+        assert_eq!(map.get_or_insert_with(1, || make("second")), &mut "first".to_string());
+        // New key: f runs.
+        assert_eq!(map.get_or_insert_with(2, || make("third")), &mut "third".to_string());
+        assert_eq!(call_count, 2);
+        assert_eq!(map.len(), 2);
+    }
+
+    #[test]
+    fn get_or_default_survives_grow() {
+        let mut map: SimdPrefixHashMap<u32, u32> = SimdPrefixHashMap::with_capacity(1);
+        for i in 0..500u32 {
+            *map.get_or_default(i) = i * 2;
+        }
+        assert_eq!(map.len(), 500);
+        for i in 0..500u32 {
+            assert_eq!(map.get(&i), Some(&(i * 2)), "missing key {i}");
+        }
+    }
+
+    #[test]
+    fn entry_or_default_counting() {
+        // Classic counting workload via Entry API.
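+        // ("a" appears three times, "b" twice, "c" once.)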
+        let mut map: SimdPrefixHashMap<&str, u32> = SimdPrefixHashMap::new();
+        for word in ["a", "b", "a", "c", "b", "a"] {
+            *map.entry(word).or_default() += 1;
+        }
+        assert_eq!(map.get(&"a"), Some(&3));
+        assert_eq!(map.get(&"b"), Some(&2));
+        assert_eq!(map.get(&"c"), Some(&1));
+        assert_eq!(map.len(), 3);
+    }
+
+    #[test]
+    fn entry_or_insert_lazy() {
+        let mut map: SimdPrefixHashMap<u32, String> = SimdPrefixHashMap::new();
+        let mut call_count = 0;
+        let mut make = |s: &str| {
+            call_count += 1;
+            s.to_string()
+        };
+        // First call: f runs, inserts.
+        let v = map.entry(1).or_insert_with(|| make("first"));
+        assert_eq!(v, "first");
+        // Second call with same key: f does NOT run.
+        let v = map.entry(1).or_insert_with(|| make("second"));
+        assert_eq!(v, "first");
+        assert_eq!(call_count, 1);
+    }
+
+    #[test]
+    fn entry_and_modify() {
+        let mut map: SimdPrefixHashMap<u32, u32> = SimdPrefixHashMap::new();
+        // Vacant: and_modify is a no-op, then or_insert(0) runs.
+        *map.entry(7).and_modify(|v| *v *= 10).or_insert(1) += 100;
+        assert_eq!(map.get(&7), Some(&101));
+        // Occupied: and_modify runs, or_insert is skipped.
+        *map.entry(7).and_modify(|v| *v *= 2).or_insert(99) += 1;
+        assert_eq!(map.get(&7), Some(&203));
+    }
 }

From 127798c13cc12a9a05022ef3d6490858d3a68cc9 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 28 Apr 2026 14:45:34 +0000
Subject: [PATCH 08/22] Revert unnecessary change.

---
 crates/hashmap-bench/prefix_map_simd.rs | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs
index 628e411..410d2e0 100644
--- a/crates/hashmap-bench/prefix_map_simd.rs
+++ b/crates/hashmap-bench/prefix_map_simd.rs
@@ -394,17 +394,6 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
     }
 
     fn get_hashed<Q>(&self, hash: u64, key: &Q) -> Option<&V>
-    where
-        K: Borrow<Q>,
-        Q: Eq + ?Sized,
-    {
-        let (gi, slot) = self.find_slot(hash, key)?;
-        Some(unsafe { self.groups[gi].values[slot].assume_init_ref() })
-    }
-
-    /// Look up `key` and return its `(group_index, slot)` if present.
-    /// Pure read-only lookup — does not allocate or modify the table.
-    fn find_slot<Q>(&self, hash: u64, key: &Q) -> Option<(usize, usize)>
     where
         K: Borrow<Q>,
         Q: Eq + ?Sized,
@@ -421,7 +410,7 @@
             if c == tag
                 && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key
             {
-                return Some((gi, hint));
+                return Some(unsafe { group.values[hint].assume_init_ref() });
             }
 
             // Slow path: SIMD scan group.
@@ -429,7 +418,7 @@
             tag_mask = group_ops::clear_slot(tag_mask, hint);
             while let Some(i) = group_ops::next_match(&mut tag_mask) {
                 if unsafe { group.keys[i].assume_init_ref() }.borrow() == key {
-                    return Some((gi, i));
+                    return Some(unsafe { group.values[i].assume_init_ref() });
                 }
             }
 

From 7eaf609b02c78cec73d7a93d9895e738f7982839 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 28 Apr 2026 15:24:11 +0000
Subject: [PATCH 09/22] Simplify enums

---
 crates/hashmap-bench/prefix_map_simd.rs | 43 +++++++++++--------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hashmap-bench/prefix_map_simd.rs
index 410d2e0..b709031 100644
--- a/crates/hashmap-bench/prefix_map_simd.rs
+++ b/crates/hashmap-bench/prefix_map_simd.rs
@@ -305,17 +305,11 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
             FindResult::Found(ptr) => Entry::Occupied(OccupiedEntry {
                 // SAFETY: pointer is valid for `'_` (bounded by `&mut self`).
                 value: unsafe { &mut *ptr },
             }),
-            FindResult::Empty { group, slot } => Entry::Vacant(VacantEntry {
+            FindResult::Vacant(insertion) => Entry::Vacant(VacantEntry {
                 map: self,
                 hash,
                 key,
-                insertion: Insertion::Empty { group, slot },
-            }),
-            FindResult::NeedsOverflow { tail } => Entry::Vacant(VacantEntry {
-                map: self,
-                hash,
-                key,
-                insertion: Insertion::NeedsOverflow { tail },
+                insertion,
             }),
         }
     }
@@ -324,10 +318,8 @@ impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
         let tag = tag(hash);
         let hint = slot_hint(hash);
         let mut gi = self.group_index(hash);
-
         loop {
             let group = &mut self.groups[gi];
-
             // Fast path: check preferred slot.
             let c = group.ctrl[hint];
             if c == CTRL_EMPTY {
@@ -337,7 +329,6 @@
                 );
                 return Some(old);
             }
-
             // Slow path: SIMD scan group for tag match.
             let mut tag_mask = group_ops::match_tag(&group.ctrl, tag);
             tag_mask = group_ops::clear_slot(tag_mask, hint);
@@ -350,7 +341,6 @@
                     return Some(old);
                 }
             }
-
             // Check for empty slot in this group.
             let empty_mask = group_ops::match_empty(&group.ctrl);
             if empty_mask != 0 {
@@ -361,7 +351,6 @@
                 self.len += 1;
                 return None;
             }
-
             // Group full — follow or create overflow chain.
             let overflow = group.overflow;
             if overflow != NO_OVERFLOW {
@@ -437,10 +428,13 @@
             // Fast path: preferred slot.
             let c = group.ctrl[hint];
             if c == CTRL_EMPTY {
-                return FindResult::Empty { group: group as *mut _, slot: hint };
+                return FindResult::Vacant(Insertion::Empty {
+                    group: group as *mut _,
+                    slot: hint,
+                });
             }
             if c == tag && unsafe { group.keys[hint].assume_init_ref() } == key {
                 return FindResult::Found(group.values[hint].as_mut_ptr());
@@ -459,12 +453,17 @@
             // Check for empty slot in this group.
             let empty_mask = group_ops::match_empty(&group.ctrl);
             if empty_mask != 0 {
                 let i = group_ops::lowest(empty_mask);
-                return FindResult::Empty { group: group as *mut _, slot: i };
+                return FindResult::Vacant(Insertion::Empty {
+                    group: group as *mut _,
+                    slot: i,
+                });
             }
 
             // Group full — follow or report end of chain.
             if group.overflow == NO_OVERFLOW {
-                return FindResult::NeedsOverflow { tail: group as *mut _ };
+                return FindResult::Vacant(Insertion::NeedsOverflow {
+                    tail: group as *mut _,
+                });
             }
             gi = group.overflow as usize;
         }
     }
@@ -545,16 +545,12 @@
 
 /// Result of a single chain walk during `entry()`: either the existing slot
-/// for the key, an empty slot ready for insertion, or end-of-chain when no
-/// empty slot exists (and a new overflow group must be allocated).
+/// for the key or a pre-computed insertion location for a vacant entry.
 enum FindResult<K, V> {
     /// Pointer to the existing value.
     Found(*mut V),
-    /// Pointer to the group with an empty slot at index `slot`.
-    Empty { group: *mut Group<K, V>, slot: usize },
-    /// End of chain — the caller must allocate an overflow group and link it
-    /// via `tail`'s overflow field.
-    NeedsOverflow { tail: *mut Group<K, V> },
+    /// Where to insert if the caller decides to add a new entry.
+    Vacant(Insertion<K, V>),
 }
 
 /// Pre-computed insertion location stashed inside [`VacantEntry`] so that
@@ -713,7 +706,7 @@
     map.grow();
     match map.find_or_insertion_slot(hash, &key) {
-        FindResult::Empty { group, slot } => {
+        FindResult::Vacant(Insertion::Empty { group, slot }) => {
             let tag = tag(hash);
             // SAFETY: `group` points into `map.groups` and is valid for `'a`.
unsafe { @@ -727,7 +720,7 @@ fn insert_after_grow<'a, K: Hash + Eq, V, S: BuildHasher>( } // After grow, the new primary group for `key` cannot be full (see // function docs), and the key wasn't in the table before grow. - FindResult::NeedsOverflow { .. } | FindResult::Found(_) => { + FindResult::Vacant(Insertion::NeedsOverflow { .. }) | FindResult::Found(_) => { unreachable!("post-grow walk must hit an empty slot") } } From 0ecf083d054baa9636e40ead46bc599bac634737 Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Wed, 29 Apr 2026 14:35:06 +0200 Subject: [PATCH 10/22] some documentation --- crates/hashmap-bench/OPTIMIZATIONS.md | 556 ++++++-------------------- crates/hashmap-bench/README.md | 80 ++-- 2 files changed, 154 insertions(+), 482 deletions(-) diff --git a/crates/hashmap-bench/OPTIMIZATIONS.md b/crates/hashmap-bench/OPTIMIZATIONS.md index d113e60..86dc4ce 100644 --- a/crates/hashmap-bench/OPTIMIZATIONS.md +++ b/crates/hashmap-bench/OPTIMIZATIONS.md @@ -1,482 +1,176 @@ -# Missing Optimizations in PrefixHashMap vs. Rust Swiss Table (hashbrown) +# PrefixHashMap vs. Rust Swiss Table (hashbrown): Optimization Analysis ## Executive Summary -The `PrefixHashMap` in this repository is a minimal, insertion-only hash map specialized for pre-hashed `u32` keys. While it borrows the core Swiss table concept of control-byte-based group scanning, it omits a large number of optimizations present in the production [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) Swiss table implementation. The most impactful missing optimizations are: **SIMD-accelerated group scanning** (SSE2/NEON), **open-addressing with triangular probing** (instead of overflow chaining), **SoA memory layout** separating control bytes from data for cache efficiency, **in-place rehashing** to reclaim tombstones, **DELETED tombstone support** for element removal, and **over-allocation utilization**. This report catalogs every significant optimization gap across architecture, probing, memory layout, SIMD, resize strategy, and API completeness. +`PrefixHashMap` is a Swiss-table-inspired hash map that uses **overflow +chaining** (instead of open addressing), **SIMD group scanning** (NEON/SSE2), +a **slot-hint fast path**, and an **optimized growth strategy**. It is generic +over key type, value type, and hash builder. ---- - -## Architecture Overview - -``` -┌───────────────────────────────────────────────────────────────────┐ -│ hashbrown Swiss Table │ -│ │ -│ Single contiguous allocation: │ -│ [Padding] [T_n ... T_1 T_0] [CT_0 CT_1 ... CT_n] [CT_extra] │ -│ data (SoA) control bytes (mirrored) │ -│ │ -│ • Open addressing, triangular probing │ -│ • 16-byte groups (SSE2) or 8-byte groups (NEON/generic) │ -│ • SIMD parallel group scan │ -│ • EMPTY / DELETED / FULL tag states │ -└───────────────────────────────────────────────────────────────────┘ - -┌───────────────────────────────────────────────────────────────────┐ -│ PrefixHashMap │ -│ │ -│ Vec where each Group: │ -│ { ctrl: [u8; 8], keys: [u32; 8], values: [MaybeUninit; 8], │ -│ overflow: u32 } │ -│ │ -│ • Overflow chaining (linked Group structs) │ -│ • Fixed 8-byte groups, scalar bit-manipulation │ -│ • EMPTY / FULL tag states only (no DELETED) │ -└───────────────────────────────────────────────────────────────────┘ -``` - ---- - -## 1. SIMD-Accelerated Group Scanning - -**Status: Missing from PrefixHashMap** - -This is arguably the most impactful optimization gap. 
- -### hashbrown - -hashbrown provides three SIMD backends selected at compile time[^1]: - -| Platform | Backend | Group Width | Instructions Used | -|----------|---------|-------------|-------------------| -| x86/x86_64 with SSE2 | `sse2.rs` | 16 bytes | `_mm_cmpeq_epi8`, `_mm_movemask_epi8` | -| AArch64 with NEON | `neon.rs` | 8 bytes | `vceq_u8`, `vcltz_s8`, `vreinterpret_u64_u8` | -| Fallback | `generic.rs` | 8 bytes (u64) | Scalar bit tricks | - -On x86_64, the SSE2 `match_tag` compiles to just 2 instructions: a `pcmpeqb` and a `pmovmskb`, producing a 16-bit mask where each bit directly indicates a matching slot[^2]. This means **16 slots are scanned in a single operation**. - -### PrefixHashMap - -PrefixHashMap uses only the scalar approach, operating on 8 control bytes packed into a `u64`[^3]: - -```rust -fn match_byte(ctrl: &[u8; GROUP_SIZE], byte: u8) -> u64 { - let word = u64::from_ne_bytes(*ctrl); - let broadcast = 0x0101010101010101u64 * (byte as u64); - let xor = word ^ broadcast; - (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080 -} -``` - -This is essentially the same algorithm as hashbrown's `generic.rs` fallback[^4], but: - -- **Fixed at 8-byte groups** — never benefits from the SSE2 16-byte group scan available on most modern x86 machines. -- **No platform-specific fast paths** — no NEON, no SSE2, no LoongArch LSX. - -**Impact**: On x86_64, hashbrown scans 2× more slots per group operation using native SIMD instructions that are lower latency than the scalar bit-manipulation chain. - ---- - -## 2. Probing Strategy: Triangular Probing vs. Overflow Chaining - -**Status: Missing from PrefixHashMap** - -### hashbrown - -hashbrown uses **triangular probing**, a variant of open addressing where each successive probe jumps by one more group width[^5]: - -```rust -struct ProbeSeq { pos: usize, stride: usize } -impl ProbeSeq { - fn move_next(&mut self, bucket_mask: usize) { - self.stride += Group::WIDTH; - self.pos += self.stride; - self.pos &= bucket_mask; - } -} -``` - -This is mathematically guaranteed to visit every group exactly once in a power-of-two-sized table[^6]. All probing occurs within a single contiguous allocation, enabling excellent spatial locality. - -### PrefixHashMap - -PrefixHashMap uses **overflow chaining**: when a primary group is full, an overflow group is allocated at the end of the `Vec` and linked via an index pointer[^7]: - -```rust -overflow: u32, // index into groups vec, or NO_OVERFLOW -``` - -**Missing benefits of triangular probing**: - -- **Spatial locality**: Triangular probing accesses nearby memory regions (the next group is typically in the same or adjacent cache line). Overflow groups are appended at the end of the vector, potentially far from the primary group. -- **No pointer chasing**: Triangular probing computes the next position arithmetically; overflow chaining follows an indirection. -- **Probe termination guarantee**: Triangular probing terminates when it encounters an EMPTY slot. Overflow chaining must check the `overflow` field and follow links. - ---- - -## 3. Memory Layout: SoA vs. AoS - -**Status: Missing from PrefixHashMap** - -### hashbrown - -hashbrown uses a **Structure-of-Arrays (SoA)** layout within a single allocation[^8]: - -``` -[Padding] [T_n, ..., T_1, T_0] [CT_0, CT_1, ..., CT_n, CT_extra...] - ^^^ data part ^^^ ^^^ control bytes (contiguous) ^^^ -``` - -All control bytes are stored contiguously at the end of the allocation. 
When probing, the initial scan only touches control bytes — the data is only accessed after a tag match. This means: - -- **Control byte scans stay in L1 cache**: For a table with 1024 entries, all 1024 control bytes fit in ~1KB, likely fitting entirely in L1 cache. -- **Data is only accessed on hits**: Cache pollution from data access is minimized. - -### PrefixHashMap - -PrefixHashMap uses an **Array-of-Structures (AoS)** layout[^9]: - -```rust -struct Group { - ctrl: [u8; 8], // 8 bytes - keys: [u32; 8], // 32 bytes - values: [MaybeUninit; 8], // 8 * size_of::() bytes - overflow: u32, // 4 bytes -} -``` - -For a `V` of 8 bytes (e.g., `usize`), each Group is 8 + 32 + 64 + 4 = 108 bytes (plus alignment padding). Scanning the control bytes of sequential groups requires jumping over all the key/value data, degrading cache utilization when doing multi-group probing. - ---- - -## 4. Control Byte Mirroring for Wrap-Around - -**Status: Missing from PrefixHashMap (but less needed due to overflow chaining)** - -### hashbrown - -hashbrown allocates `num_buckets + Group::WIDTH` control bytes. The first `Group::WIDTH` control bytes are replicated at the end[^10]: - -```rust -fn set_ctrl(&mut self, index: usize, ctrl: Tag) { - let index2 = ((index.wrapping_sub(Group::WIDTH)) & self.bucket_mask) + Group::WIDTH; - *self.ctrl(index) = ctrl; - *self.ctrl(index2) = ctrl; // mirror -} -``` - -This ensures that a group load starting near the end of the table can safely wrap around without a branch or special case. - -### PrefixHashMap - -Not implemented — not needed because PrefixHashMap doesn't use open addressing. Each group is self-contained with its own control byte array. +This document analyzes the design trade-offs versus +[hashbrown](https://github.com/rust-lang/hashbrown) and records the +experimental results that guided the current design. --- -## 5. Tombstone / DELETED Support and In-Place Rehashing - -**Status: Missing from PrefixHashMap** - -### hashbrown - -hashbrown has three control byte states[^11]: - -| State | Encoding | Meaning | -|-------|----------|---------| -| `EMPTY` | `0xFF` (1111_1111) | Slot never occupied or fully reclaimed | -| `DELETED` | `0x80` (1000_0000) | Tombstone — element removed, probing must continue past | -| `FULL` | `0x00..0x7F` | Occupied — top 7 bits of hash | - -When elements are removed, the control byte is set to `DELETED` rather than `EMPTY`. This preserves the probe chain for other elements. When the ratio of deleted entries gets too high, hashbrown performs an **in-place rehash**[^12]: - -1. Convert all FULL → DELETED, DELETED → EMPTY via `convert_special_to_empty_and_full_to_deleted()` -2. Walk through each DELETED (originally FULL) entry and swap it into its ideal position -3. If both old and new positions are in the same probe group, just update the control byte in place - -This avoids a full reallocation when many deletes have fragmented the table. - -### PrefixHashMap - -PrefixHashMap only has two states[^13]: - -| State | Encoding | -|-------|----------| -| `EMPTY` | `0x00` | -| `FULL` | `key_byte \| 0x80` | - -There is **no deletion support at all** — the map is described as "insertion-only"[^14]. This means: -- No `remove()` method -- No tombstones -- No in-place rehash optimization -- If an entry needs to be removed, the entire map must be rebuilt - ---- - -## 6. 
Tag / Hash Encoding - -**Status: Different approach in PrefixHashMap (not necessarily worse, but different trade-offs)** - -### hashbrown - -Uses the **top 7 bits** of the 64-bit hash as the tag, stored with the high bit clear (range `0x00..0x7F`)[^15]: - -```rust -pub(crate) const fn full(hash: u64) -> Tag { - let top7 = hash >> (MIN_HASH_LEN * 8 - 7); - Tag((top7 & 0x7f) as u8) -} -``` - -The high bit is reserved for EMPTY/DELETED sentinel detection. This means `EMPTY`/`DELETED` can be distinguished from `FULL` with a single bit test. - -### PrefixHashMap - -Forces bit 7 high and uses the low 7 bits of the key[^16]: - -```rust -fn tag(key: u32) -> u8 { - (key as u8) | 0x80 -} -``` - -EMPTY is `0x00`. This inverts the hashbrown convention — FULL entries have bit 7 set, EMPTY has bit 7 clear. The `match_empty` function checks for zero bytes[^17]: - -```rust -fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> u64 { - let word = u64::from_ne_bytes(*ctrl); - !word & 0x8080808080808080 -} +## Architecture Comparison + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ hashbrown Swiss Table │ +│ │ +│ Single contiguous allocation (SoA): │ +│ [Padding] [T_n ... T_1 T_0] [CT_0 CT_1 ... CT_n] [CT_extra] │ +│ data control bytes (mirrored) │ +│ │ +│ • Open addressing, triangular probing │ +│ • 16-byte groups (SSE2) or 8-byte groups (NEON/generic) │ +│ • EMPTY / DELETED / FULL tag states │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ PrefixHashMap │ +│ │ +│ Vec> where each Group (AoS): │ +│ { ctrl: [u8; 8], keys: [MaybeUninit; 8], │ +│ values: [MaybeUninit; 8], overflow: u32 } │ +│ │ +│ • Overflow chaining (linked groups) │ +│ • 8-byte groups with NEON/SSE2/scalar SIMD scan │ +│ • EMPTY / FULL tag states only (insertion-only, no deletion) │ +│ • Slot-hint fast path │ +└──────────────────────────────────────────────────────────────────┘ ``` -**Key difference**: PrefixHashMap cannot distinguish DELETED from FULL because all non-zero control bytes have bit 7 set. This is a deliberate simplification for the insertion-only use case. - --- -## 7. Load Factor and Growth Strategy - -**Status: Different and less sophisticated in PrefixHashMap** +## Optimizations Investigated -### hashbrown +### 1. SIMD Group Scanning ✅ Implemented -Uses an **87.5% maximum load factor** (7/8) with a `growth_left` counter[^18]: +Platform-specific SIMD for control byte matching: +- **aarch64**: NEON `vceq_u8` + `vreinterpret_u64_u8` (8-byte groups) +- **x86_64**: SSE2 `_mm_cmpeq_epi8` + `_mm_movemask_epi8` (16-byte groups) +- **Fallback**: Scalar u64 zero-byte detection trick -```rust -fn bucket_mask_to_capacity(bucket_mask: usize) -> usize { - if bucket_mask < 8 { bucket_mask } - else { ((bucket_mask + 1) / 8) * 7 } -} -``` +**Benchmark result**: ~5% faster than scalar on Apple M-series. The gain is +modest because the slot-hint fast path often skips the group scan entirely. -Growth is triggered when `growth_left` reaches 0, which tracks insertions minus the capacity. The `growth_left` field is decremented only when inserting into an EMPTY slot (not a DELETED one)[^19]. +### 2. Open Addressing with Triangular Probing ❌ Rejected -### PrefixHashMap +Tested an open-addressing variant (`OpenPrefixHashMap`) with triangular +probing over AoS groups. -Uses overflow group exhaustion as the growth trigger[^20]: +**Benchmark result**: **40% slower** than overflow chaining. 
With the AoS +layout, each group is ~112 bytes, so probing to the next group jumps over +large memory regions. Overflow chaining with the slot-hint fast path is +faster because most inserts land in the first group. -```rust -let max_overflow = self.num_primary / 8 + 1; -let num_overflow = self.groups.len() as u32 - self.num_primary; -if num_overflow >= max_overflow { - self.grow(); - return self.insert(key, value); -} -``` +### 3. SoA Memory Layout ❌ Rejected -This reserves 12.5% extra groups for overflow. Growth happens when the overflow area is full. This is a coarser signal than hashbrown's per-slot tracking and can lead to: -- **Premature growth** if unlucky hash distribution fills overflow disproportionately -- **Delayed growth** if hash distribution is uniform (overflow area may never fill even at high load) +Tested a SoA variant (`SoaPrefixHashMap`) with separate control byte and +key/value arrays, combined with triangular probing. ---- +**Benchmark result**: **Slowest variant** — even slower than AoS open +addressing. The two-Vec SoA layout doubles TLB/cache pressure versus +hashbrown's single-allocation layout. Without the single-allocation trick, +SoA is worse than AoS for this use case. -## 8. Resize Strategy +### 4. Capacity Sizing ✅ Implemented -**Status: Significantly less optimized in PrefixHashMap** +The original `with_capacity` allocated `capacity / 8` groups, giving ~100% +slot utilization. hashbrown uses `capacity * 8 / 7`, giving ~50% load. -### hashbrown +**Fix**: Changed to `capacity * 8 / 7` (87.5% max load factor), matching +hashbrown. This was the **single biggest improvement** — PrefixHashMap went +from 2× slower to matching hashbrown. -hashbrown's resize has multiple optimizations: +### 5. Optimized Growth ✅ Implemented -1. **Over-allocation utilization**[^21]: When the allocator returns more memory than requested, hashbrown uses the extra space for additional buckets: - ```rust - if block.len() != layout.size() { - let x = maximum_buckets_in(block.len(), table_layout, Group::WIDTH); - // Use larger capacity... - } - ``` +The original `grow()` called the full `insert()` for each element (including +duplicate checking and overflow traversal). hashbrown uses: +- `find_insert_index` (skip duplicate check) +- `ptr::copy_nonoverlapping` (raw memory copy) +- Bulk counter updates -2. **In-place rehashing** when fragmentation from deletions is high (described in §5). +**Fix**: Added `insert_for_grow()` that skips duplicate checking, uses raw +pointer copies, and iterates occupied slots via bitmask. -3. **Efficient element copying** using `ptr::copy_nonoverlapping` with layout-aware size calculations[^22]. +**Benchmark result**: Growth is now **2× faster** than hashbrown (4.8 µs vs +9.8 µs for 3 resize rounds). -4. **Panic-safe resize** using `ScopeGuard` to ensure the old table is freed even if the hasher panics[^23]. +### 6. Branch Prediction Hints ⚠️ Mixed Results -### PrefixHashMap +Added `likely()`/`unlikely()` annotations and `#[cold] #[inline(never)]` on +the overflow path. -PrefixHashMap's grow is simpler and less efficient[^24]: +**Benchmark result**: Helped the scalar version (~2–6% faster) but **hurt the +SIMD version** by pessimizing NEON code generation. Removed from the SIMD +implementation, kept in the scalar version. -```rust -fn grow(&mut self) { - let old_groups = std::mem::take(&mut self.groups); - self.n_bits += 1; - // ... allocate new groups ... 
- for group in old_groups { - for i in 0..GROUP_SIZE { - if group.ctrl[i] != CTRL_EMPTY { - self.insert(key, value); // full re-insertion - } - } - std::mem::forget(group); - } -} -``` - -Missing optimizations: -- **Always doubles** — no option for in-place rehash -- **Re-inserts via the public API** — each element goes through the full insert path including overflow chain traversal, whereas hashbrown uses a fast `prepare_insert_index` that skips duplicate checking -- **No over-allocation utilization** -- **Limited panic safety** — uses `mem::forget` on old groups but doesn't guard against panics during re-insertion - ---- - -## 9. Branch Prediction Hints - -**Status: Missing from PrefixHashMap** - -### hashbrown - -hashbrown extensively uses `likely()` and `unlikely()` hints to guide the CPU's branch predictor[^25]: +### 7. Slot Hint Fast Path (Unique to PrefixHashMap) +PrefixHashMap checks a preferred slot before scanning the group: ```rust -if unlikely(self.table.growth_left == 0 && old_ctrl.special_is_empty()) { - self.reserve(1, hasher); -} +let hint = slot_hint(hash); // 3 bits from hash → slot index +if ctrl[hint] == EMPTY { /* direct insert */ } +if ctrl[hint] == tag && keys[hint] == key { /* direct hit */ } ``` -```rust -if likely(eq(index)) { - return Some(index); -} -``` +hashbrown does **not** have this optimization — it always does a full SIMD +group scan. At ~50% load, the hint hits ~58% of the time, avoiding the scan +entirely. -### PrefixHashMap +### 8. Overflow Reserve Sizing ✅ Validated -No branch hints are used anywhere in the implementation. On modern CPUs, this can affect branch prediction accuracy for cold paths like growth and overflow traversal. +Tested overflow reserves from 0% to 100% of primary groups: ---- +| Reserve | Growth scenario (µs) | +|---------|---------------------| +| m/8 (12.5%, default) | 8.04 | +| m/4 (25%) | 8.33 | +| m/2 (50%) | 8.93 | +| m/1 (100%) | 10.31 | +| 0 (grow immediately) | 6.96 | -## 10. Slot Hint / Preferred Slot +**Conclusion**: Smaller reserves are faster — growing early is cheaper than +traversing overflow chains. The `m/8` default implicitly enforces ~62.5% max +load, which aligns with the mathematical analysis (Poisson model, 3σ +confidence). -**Status: Present in PrefixHashMap but NOT in hashbrown (PrefixHashMap advantage)** +### 9. IdentityHasher Fix ✅ Implemented -PrefixHashMap has a unique optimization not present in hashbrown: a **preferred slot hint** derived from additional hash bits[^26]: +The original `IdentityHasher` zero-extended u32 to u64, putting zeros in the +top 32 bits. Since hashbrown derives the 7-bit tag from `hash >> 57`, every +entry got the same tag — completely defeating control byte filtering. -```rust -fn slot_hint(key: u32) -> usize { - ((key >> 7) & 0x7) as usize -} -``` - -Before scanning the group, PrefixHashMap first checks the preferred slot directly[^27]: - -```rust -let c = group.ctrl[hint]; -if c == CTRL_EMPTY { - // Direct insert without scanning -} -if c == tag && group.keys[hint] == key { - // Direct hit without scanning -} -``` - -This is a fast path that avoids the scalar group scan entirely when the preferred slot is available. hashbrown does not have this optimization — it always does a full group scan via SIMD/scalar. - ---- - -## 11. 
Additional Missing Features and Optimizations - -| Feature | hashbrown | PrefixHashMap | -|---------|-----------|---------------| -| Custom allocator support | Yes (`Allocator` trait)[^28] | No (uses `Vec` with global allocator) | -| ZST (Zero-Sized Type) handling | Optimized special case[^29] | Not supported | -| `#[cold]` / `#[inline(never)]` on slow paths | Yes (e.g., `reserve_rehash`)[^30] | Not used | -| `Entry` API | Full entry API | Not provided | -| Iterator support | `RawIter`, `RawDrain`, `RawIntoIter` | Not provided | -| `shrink_to` / `shrink_to_fit` | Yes | Not provided | -| Generic over key type | Yes (any `K: Hash + Eq`) | Fixed `u32` keys only | -| `remove` / `erase` | Yes, with tombstones | Not supported | -| Monomorphization reduction | Uses `dyn Fn` for inner functions[^31] | Not applicable (simpler API) | -| Small table optimization | Min capacity thresholds based on layout/group width[^32] | Minimum 2 primary groups | +**Fix**: Use `folded_multiply` to expand u32 keys to u64 with independent +entropy in both halves. Also changed trigram generation to use +`folded_multiply` instead of murmur3. --- -## 12. Summary of Impact - -The missing optimizations can be categorized by their likely performance impact: - -### High Impact -1. **SIMD group scanning** — 2× more slots per scan on SSE2; lower-latency instructions -2. **SoA memory layout** — dramatically better cache behavior for control byte scanning -3. **Open addressing with triangular probing** — eliminates pointer chasing in overflow chains -4. **Resize without re-insertion** — hashbrown copies elements directly without re-probing +## Optimizations Not Implemented (and Why) -### Medium Impact -5. **In-place rehashing** — avoids allocation when table is fragmented by deletions (N/A for insert-only) -6. **Over-allocation utilization** — free extra capacity from allocator rounding -7. **Branch hints** — guides CPU branch predictor for common vs. rare paths -8. **Load factor tracking** — precise growth triggering vs. overflow-area exhaustion - -### Lower Impact (or N/A for the use case) -9. **Control byte mirroring** — needed for open addressing wrap-around (not needed with chaining) -10. **Tombstone/DELETED support** — only matters if deletion is needed -11. **Custom allocators** — not needed for most use cases -12. **ZST handling** — irrelevant for `u32` keys - -### PrefixHashMap Advantages (Not in hashbrown) -- **Slot hint fast path** — direct preferred-slot check before group scan -- **No hashing overhead** — keys are pre-hashed `u32` values -- **Simpler implementation** — ~250 lines vs. ~5000+ lines, easier to reason about +| Optimization | Reason | +|---|---| +| **Tombstone / DELETED support** | Insertion-only map — no deletions needed | +| **In-place rehashing** | No tombstones to reclaim | +| **Control byte mirroring** | Not needed with overflow chaining (no wrap-around) | +| **Custom allocator support** | Out of scope for benchmarking | +| **Over-allocation utilization** | Uses `Vec` (no raw allocator control) | --- -## Confidence Assessment +## Summary of Impact -- **High confidence**: All claims about both implementations are verified directly from source code. The hashbrown analysis is based on the current `main` branch (commit `420e83ba`), and the PrefixHashMap analysis is from the local `crates/hashmap-bench/prefix_map.rs`. 
-- **Moderate confidence**: Performance impact assessments are based on algorithmic analysis and known CPU architecture properties (cache line sizes, SIMD throughput) rather than measured benchmarks. Actual impact depends on workload, key distribution, and hardware. -- **Assumption**: The PrefixHashMap is intentionally minimal — many "missing" features are deliberate design choices for simplicity in a benchmarking context, not oversights. - ---- +| Change | Effect on insert time | +|---|---| +| Capacity sizing fix (`*8/7`) | **−50%** (biggest win) | +| Optimized growth path | **−10%** on growth scenarios | +| SIMD group scanning | **−5%** | +| Branch hints (scalar only) | **−2–6%** | +| IdentityHasher fix | Enabled fair comparison | -## Footnotes - -[^1]: `src/control/group/mod.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — compile-time cfg selection of SSE2, NEON, LSX, or generic backend -[^2]: `src/control/group/sse2.rs:80-93` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `match_tag` using `_mm_cmpeq_epi8` + `_mm_movemask_epi8` -[^3]: `crates/hashmap-bench/prefix_map.rs:50-56` — scalar `match_byte` function -[^4]: `src/control/group/generic.rs:96-104` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — generic `match_tag` using same bit-trick -[^5]: `src/raw.rs:80-97` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `ProbeSeq` struct and `move_next` -[^6]: Blog post cited in hashbrown source: https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ -[^7]: `crates/hashmap-bench/prefix_map.rs:12` — `overflow: u32` field in Group struct -[^8]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `TableLayout::calculate_layout_for` computes `ctrl_offset = size * buckets` (data then control bytes) -[^9]: `crates/hashmap-bench/prefix_map.rs:8-13` — Group struct with interleaved ctrl/keys/values -[^10]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `set_ctrl` method mirrors control bytes -[^11]: `src/control/tag.rs:5-9` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — Tag EMPTY=0xFF, DELETED=0x80 -[^12]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `rehash_in_place` method -[^13]: `crates/hashmap-bench/prefix_map.rs:4` — `CTRL_EMPTY: u8 = 0x00` -[^14]: `crates/hashmap-bench/prefix_map.rs:26` — doc comment: "Insertion-only hash map" -[^15]: `src/control/tag.rs:36-47` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `Tag::full` method -[^16]: `crates/hashmap-bench/prefix_map.rs:40-42` — `tag` function -[^17]: `crates/hashmap-bench/prefix_map.rs:59-62` — `match_empty` function -[^18]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `bucket_mask_to_capacity` function -[^19]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `record_item_insert_at` decrements `growth_left` only for EMPTY -[^20]: `crates/hashmap-bench/prefix_map.rs:148-154` — overflow exhaustion check triggering `grow()` -[^21]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `new_uninitialized` over-allocation handling -[^22]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `resize_inner` uses `ptr::copy_nonoverlapping` -[^23]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `prepare_resize` returns `ScopeGuard` -[^24]: 
`crates/hashmap-bench/prefix_map.rs:216-241` — `grow` method -[^25]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — uses `likely()` / `unlikely()` from `crate::util` -[^26]: `crates/hashmap-bench/prefix_map.rs:45-47` — `slot_hint` function -[^27]: `crates/hashmap-bench/prefix_map.rs:98-114` — fast path check in `insert` -[^28]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `RawTable` -[^29]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `IS_ZERO_SIZED` special cases throughout -[^30]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `#[cold] #[inline(never)]` on `reserve_rehash` -[^31]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `find_inner` uses `&mut dyn FnMut(usize) -> bool` -[^32]: `src/raw.rs` in [rust-lang/hashbrown](https://github.com/rust-lang/hashbrown) — `capacity_to_buckets` with `min_cap` thresholds +The current PrefixHashMap **matches hashbrown+FxHash** on pre-sized inserts, +**beats all hashbrown variants** on overwrites, and has **2× faster growth**. diff --git a/crates/hashmap-bench/README.md b/crates/hashmap-bench/README.md index 5b93b23..290060b 100644 --- a/crates/hashmap-bench/README.md +++ b/crates/hashmap-bench/README.md @@ -1,29 +1,25 @@ # hashmap-bench -Benchmarks comparing the custom `PrefixHashMap` (an insertion-only hash map for -pre-hashed `u32` keys) against Rust's standard library and several third-party -hash map implementations. +Benchmarks comparing `PrefixHashMap` — an insertion-only hash map with +overflow chaining and SIMD group scanning — against Rust's standard library +and several third-party hash map implementations. ## Design -`PrefixHashMap` is a Swiss-table-inspired hash map optimized for the case where -keys are already well-distributed `u32` hashes (e.g. trigram fingerprints). It -skips the hash function entirely and uses the key bits directly for bucket -selection and tag matching. - -Key design choices: +`PrefixHashMap` is a Swiss-table-inspired hash map that uses: - **Overflow chaining** instead of open addressing — groups that fill up link to overflow groups rather than probing into neighbours. -- **Slot hint** — a preferred slot index derived from the key, checked before +- **Slot hint** — a preferred slot index derived from the hash, checked before scanning the group. Gives a direct hit on most inserts at low load. +- **SIMD group scanning** — uses NEON on aarch64, SSE2 on x86\_64, and a + scalar fallback elsewhere to scan 8–16 control bytes in parallel. - **AoS group layout** — each group stores its control bytes, keys, and values together, keeping a single insert's data within 1–2 cache lines. - **Optimized growth** — during resize, elements are re-inserted without duplicate checking and copied via raw pointers. - -`SimdPrefixHashMap` adds platform-specific SIMD for the control byte scan -(NEON on aarch64, SSE2 on x86\_64, scalar fallback elsewhere). +- **Generic key/value/hasher** — supports any `K: Hash + Eq`, any + `S: BuildHasher`, and `Borrow`-based lookups. ## Benchmark results @@ -35,56 +31,38 @@ M-series (aarch64). 
 | Rank | Map | Time (µs) | vs best |
 |------|-----|-----------|---------|
-| 🥇 | FoldHashMap | 2.31 | — |
-| 🥈 | **SimdPrefixHashMap** | **2.51** | +9% |
-| 🥉 | FxHashMap | 2.65 | +15% |
-| 4 | hashbrown::HashMap | 2.67 | +16% |
-| 5 | hashbrown+Identity | 2.72 | +18% |
-| 6 | NoHintSimd | 2.76 | +19% |
-| 7 | **PrefixHashMap** | **3.00** | +30% |
-| 8 | std::HashMap+FNV | 3.10 | +34% |
-| 9 | AHashMap | 3.33 | +44% |
-| 10 | GxHashMap | 3.74 | +62% |
-| 11 | std::HashMap | 8.52 | +269% |
+| 🥇 | FoldHashMap | 2.44 | — |
+| 🥈 | FxHashMap | 2.61 | +7% |
+| 🥉 | hashbrown::HashMap | 2.67 | +9% |
+| 4 | **PrefixHashMap** | **2.71** | +11% |
+| 5 | hashbrown+Identity | 2.74 | +12% |
+| 6 | AHashMap | 3.22 | +32% |
+| 7 | std::HashMap+FNV | 3.27 | +34% |
+| 8 | GxHashMap | 3.69 | +51% |
+| 9 | std::HashMap | 8.49 | +248% |
 
 ### Re-insert same keys (all overwrites)
 
 | Map | Time (µs) |
 |-----|-----------|
-| **SimdPrefixHashMap** | **2.15** ✅ |
-| hashbrown+Identity | 2.33 |
-| PrefixHashMap | 3.24 |
+| **PrefixHashMap** | **2.36** ✅ |
+| hashbrown+Identity | 2.58 |
 
 ### Growth from small (`with_capacity(128)`, 3 resize rounds)
 
-| Map | Time (µs) | Growth cost |
-|-----|-----------|-------------|
-| **SimdPrefixHashMap** | **7.21** | +4.70 |
-| **PrefixHashMap** | **7.68** | +4.68 |
-| hashbrown+Identity | 10.05 | +7.33 |
-
-### Overflow reserve sizing (from small, 3 resize rounds)
-
-| Reserve | Time (µs) |
-|---------|-----------|
-| 0 (grow immediately) | 6.96 |
-| m/8 (12.5%, default) | 8.04 |
-| m/4 (25%) | 8.33 |
-| m/2 (50%) | 8.93 |
-| m/1 (100%) | 10.31 |
-| hashbrown+Identity | 9.86 |
+| Map | Time (µs) | Growth penalty |
+|-----|-----------|----------------|
+| **PrefixHashMap** | **4.85** | +2.14 |
+| hashbrown+Identity | 9.77 | +7.03 |
 
 ### Key takeaways
 
-- **SimdPrefixHashMap beats every hashbrown variant** except FoldHashMap on
-  first-time inserts, and is **the fastest** for overwrites.
-- **Growth is ~40% cheaper** than hashbrown thanks to the optimized
+- **PrefixHashMap matches the fastest hashbrown configurations** on pre-sized
+  first-time inserts and is **the fastest for overwrites** (usage sketch below).
+- **Growth is ~2× faster** than hashbrown thanks to the optimized
   `insert_for_grow` path that skips duplicate checking and uses raw copies.
-- **Smaller overflow reserves are faster** — growing early is cheaper than
-  traversing overflow chains.
-- The remaining ~9% gap to FoldHashMap comes from hashbrown's highly optimized
-  code generation (branch hints, `#[cold]` paths, monomorphization reduction)
-  and its SoA memory layout advantage for SIMD group scans.
+- The remaining gap to FoldHashMap (~11%) comes from foldhash's extremely
+  efficient hash function that pipelines well with hashbrown's SIMD scan.
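+
+## Example
+
+A minimal sketch of the counting workload measured above, assuming the module
+layout at this point in the series (`SimdPrefixHashMap` in
+`hashmap_bench::prefix_map_simd`, `IdentityBuildHasher` from the benchmark
+helpers); the trigram values are made up:
+
+```rust
+use hashmap_bench::prefix_map_simd::SimdPrefixHashMap;
+use hashmap_bench::IdentityBuildHasher;
+
+fn main() {
+    // Keys are pre-hashed u32 trigrams, so IdentityBuildHasher avoids
+    // re-hashing them.
+    let trigrams: &[u32] = &[0x00AB_12CD, 0x00EF_34AB, 0x00AB_12CD];
+    let mut counts: SimdPrefixHashMap<u32, u32, IdentityBuildHasher> =
+        SimdPrefixHashMap::with_capacity_and_hasher(trigrams.len(), Default::default());
+    for &t in trigrams {
+        // entry() walks the overflow chain once and stashes the slot,
+        // so or_default() writes directly without a second lookup.
+        *counts.entry(t).or_default() += 1;
+    }
+    assert_eq!(counts.get(&0x00AB_12CD), Some(&2));
+}
+```
+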
## Running From 200f837625a73f4d2147f8bef76e9f4350a7972c Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Wed, 29 Apr 2026 15:24:21 +0200 Subject: [PATCH 11/22] reorganize --- Cargo.toml | 2 +- crates/hash-sorted-map/Cargo.toml | 10 + .../OPTIMIZATIONS.md | 18 +- .../README.md | 37 ++- .../benchmarks}/Cargo.toml | 8 +- .../benchmarks}/lib.rs | 13 +- .../benchmarks/performance.rs} | 84 +++---- crates/hash-sorted-map/src/group_ops.rs | 170 ++++++++++++++ .../src/hash_sorted_map.rs} | 211 ++---------------- crates/hash-sorted-map/src/lib.rs | 2 + 10 files changed, 297 insertions(+), 258 deletions(-) create mode 100644 crates/hash-sorted-map/Cargo.toml rename crates/{hashmap-bench => hash-sorted-map}/OPTIMIZATIONS.md (92%) rename crates/{hashmap-bench => hash-sorted-map}/README.md (61%) rename crates/{hashmap-bench => hash-sorted-map/benchmarks}/Cargo.toml (67%) rename crates/{hashmap-bench => hash-sorted-map/benchmarks}/lib.rs (82%) rename crates/{hashmap-bench/hashmap_insert.rs => hash-sorted-map/benchmarks/performance.rs} (69%) create mode 100644 crates/hash-sorted-map/src/group_ops.rs rename crates/{hashmap-bench/prefix_map_simd.rs => hash-sorted-map/src/hash_sorted_map.rs} (80%) create mode 100644 crates/hash-sorted-map/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 524e62a..7547f1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ members = [ "crates/*", "crates/bpe/benchmarks", "crates/bpe/tests", - "crates/hashmap-bench", + "crates/hash-sorted-map/benchmarks", ] resolver = "2" diff --git a/crates/hash-sorted-map/Cargo.toml b/crates/hash-sorted-map/Cargo.toml new file mode 100644 index 0000000..84ffa02 --- /dev/null +++ b/crates/hash-sorted-map/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "hash-sorted-map" +authors = ["The blackbird team "] +version = "0.1.0" +edition = "2021" +description = "A hash map with hash-ordered iteration and linear-time merge, designed for search-index term maps." +repository = "https://github.com/github/rust-gems" +license = "MIT" +keywords = ["hashmap", "sorted", "merge", "simd"] +categories = ["algorithms", "data-structures"] diff --git a/crates/hashmap-bench/OPTIMIZATIONS.md b/crates/hash-sorted-map/OPTIMIZATIONS.md similarity index 92% rename from crates/hashmap-bench/OPTIMIZATIONS.md rename to crates/hash-sorted-map/OPTIMIZATIONS.md index 86dc4ce..c88c517 100644 --- a/crates/hashmap-bench/OPTIMIZATIONS.md +++ b/crates/hash-sorted-map/OPTIMIZATIONS.md @@ -1,8 +1,8 @@ -# PrefixHashMap vs. Rust Swiss Table (hashbrown): Optimization Analysis +# HashSortedMap vs. Rust Swiss Table (hashbrown): Optimization Analysis ## Executive Summary -`PrefixHashMap` is a Swiss-table-inspired hash map that uses **overflow +`HashSortedMap` is a Swiss-table-inspired hash map that uses **overflow chaining** (instead of open addressing), **SIMD group scanning** (NEON/SSE2), a **slot-hint fast path**, and an **optimized growth strategy**. It is generic over key type, value type, and hash builder. @@ -29,7 +29,7 @@ experimental results that guided the current design. └──────────────────────────────────────────────────────────────────┘ ┌──────────────────────────────────────────────────────────────────┐ -│ PrefixHashMap │ +│ HashSortedMap │ │ │ │ Vec> where each Group (AoS): │ │ { ctrl: [u8; 8], keys: [MaybeUninit; 8], │ @@ -58,7 +58,7 @@ modest because the slot-hint fast path often skips the group scan entirely. ### 2. 
Open Addressing with Triangular Probing ❌ Rejected -Tested an open-addressing variant (`OpenPrefixHashMap`) with triangular +Tested an open-addressing variant (`OpenHashSortedMap`) with triangular probing over AoS groups. **Benchmark result**: **40% slower** than overflow chaining. With the AoS @@ -68,7 +68,7 @@ faster because most inserts land in the first group. ### 3. SoA Memory Layout ❌ Rejected -Tested a SoA variant (`SoaPrefixHashMap`) with separate control byte and +Tested a SoA variant (`SoaHashSortedMap`) with separate control byte and key/value arrays, combined with triangular probing. **Benchmark result**: **Slowest variant** — even slower than AoS open @@ -82,7 +82,7 @@ The original `with_capacity` allocated `capacity / 8` groups, giving ~100% slot utilization. hashbrown uses `capacity * 8 / 7`, giving ~50% load. **Fix**: Changed to `capacity * 8 / 7` (87.5% max load factor), matching -hashbrown. This was the **single biggest improvement** — PrefixHashMap went +hashbrown. This was the **single biggest improvement** — HashSortedMap went from 2× slower to matching hashbrown. ### 5. Optimized Growth ✅ Implemented @@ -108,9 +108,9 @@ the overflow path. SIMD version** by pessimizing NEON code generation. Removed from the SIMD implementation, kept in the scalar version. -### 7. Slot Hint Fast Path (Unique to PrefixHashMap) +### 7. Slot Hint Fast Path (Unique to HashSortedMap) -PrefixHashMap checks a preferred slot before scanning the group: +HashSortedMap checks a preferred slot before scanning the group: ```rust let hint = slot_hint(hash); // 3 bits from hash → slot index if ctrl[hint] == EMPTY { /* direct insert */ } @@ -172,5 +172,5 @@ entropy in both halves. Also changed trigram generation to use | Branch hints (scalar only) | **−2–6%** | | IdentityHasher fix | Enabled fair comparison | -The current PrefixHashMap **matches hashbrown+FxHash** on pre-sized inserts, +The current HashSortedMap **matches hashbrown+FxHash** on pre-sized inserts, **beats all hashbrown variants** on overwrites, and has **2× faster growth**. diff --git a/crates/hashmap-bench/README.md b/crates/hash-sorted-map/README.md similarity index 61% rename from crates/hashmap-bench/README.md rename to crates/hash-sorted-map/README.md index 290060b..63a2dbb 100644 --- a/crates/hashmap-bench/README.md +++ b/crates/hash-sorted-map/README.md @@ -1,12 +1,31 @@ -# hashmap-bench +# hash-sorted-map -Benchmarks comparing `PrefixHashMap` — an insertion-only hash map with -overflow chaining and SIMD group scanning — against Rust's standard library -and several third-party hash map implementations. +A hash map whose groups are ordered by hash prefix, enabling efficient +sorted-order iteration and linear-time merging of two maps. + +## Motivation + +In a search index, each document produces a **term map** (term → frequency). +At index time, term maps from many documents must be **merged** into a single +posting list, and the result is **serialized in hash-key order** so that +lookups can use a skip-list approach, leveraging the hash ordering to +efficiently jump to the right region of the serialized data. + +A conventional hash map stores entries in arbitrary order, so merging two maps +requires collecting, sorting, and reshuffling all entries — an expensive step +that dominates indexing time for large term maps typical of code search, where +documents contain massive numbers of tokens. + +`HashSortedMap` avoids this by organizing its groups by hash prefix. 
+Iterating through the groups in order yields entries sorted by their hashed
+keys, which means:
+
+- **Merging** two maps is a single linear scan (like merge-sort's merge step).
+- **Serialization** in hash-key order requires no extra sorting or copying.
 
 ## Design
 
-`PrefixHashMap` is a Swiss-table-inspired hash map that uses:
+`HashSortedMap` is a Swiss-table-inspired hash map that uses:
 
 - **Overflow chaining** instead of open addressing — groups that fill up link
   to overflow groups rather than probing into neighbours.
@@ -34,7 +53,7 @@ M-series (aarch64).
 | 🥇 | FoldHashMap | 2.44 | — |
 | 🥈 | FxHashMap | 2.61 | +7% |
 | 🥉 | hashbrown::HashMap | 2.67 | +9% |
-| 4 | **PrefixHashMap** | **2.71** | +11% |
+| 4 | **HashSortedMap** | **2.71** | +11% |
 | 5 | hashbrown+Identity | 2.74 | +12% |
 | 6 | AHashMap | 3.22 | +32% |
 | 7 | std::HashMap+FNV | 3.27 | +34% |
@@ -45,19 +64,19 @@ M-series (aarch64).
 
 | Map | Time (µs) |
 |-----|-----------|
-| **PrefixHashMap** | **2.36** ✅ |
+| **HashSortedMap** | **2.36** ✅ |
 | hashbrown+Identity | 2.58 |
 
 ### Growth from small (`with_capacity(128)`, 3 resize rounds)
 
 | Map | Time (µs) | Growth penalty |
 |-----|-----------|----------------|
-| **PrefixHashMap** | **4.85** | +2.14 |
+| **HashSortedMap** | **4.85** | +2.14 |
 | hashbrown+Identity | 9.77 | +7.03 |
 
 ### Key takeaways
 
-- **PrefixHashMap matches the fastest hashbrown configurations** on pre-sized
+- **HashSortedMap matches the fastest hashbrown configurations** on pre-sized
   first-time inserts and is **the fastest for overwrites** (usage sketch below).
 - **Growth is ~2× faster** than hashbrown thanks to the optimized
   `insert_for_grow` path that skips duplicate checking and uses raw copies.
diff --git a/crates/hashmap-bench/Cargo.toml b/crates/hash-sorted-map/benchmarks/Cargo.toml
similarity index 67%
rename from crates/hashmap-bench/Cargo.toml
rename to crates/hash-sorted-map/benchmarks/Cargo.toml
index cae08a7..75e51c3 100644
--- a/crates/hashmap-bench/Cargo.toml
+++ b/crates/hash-sorted-map/benchmarks/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "hashmap-bench"
+name = "hash-sorted-map-benchmarks"
 edition = "2021"
 
 [lib]
@@ -7,12 +7,13 @@ path = "lib.rs"
 test = false
 
 [[bench]]
-name = "hashmap_insert"
-path = "hashmap_insert.rs"
+name = "performance"
+path = "performance.rs"
 harness = false
 test = false
 
 [dependencies]
+hash-sorted-map = { path = ".." }
 criterion = "0.7"
 rand = "0.9"
 rustc-hash = "2"
@@ -20,5 +21,4 @@ ahash = "0.8"
 hashbrown = "0.15"
 foldhash = "0.1"
 gxhash = "3"
-smallvec = "1"
 fnv = "1"
diff --git a/crates/hashmap-bench/lib.rs b/crates/hash-sorted-map/benchmarks/lib.rs
similarity index 82%
rename from crates/hashmap-bench/lib.rs
rename to crates/hash-sorted-map/benchmarks/lib.rs
index d06be51..4f5d2cb 100644
--- a/crates/hashmap-bench/lib.rs
+++ b/crates/hash-sorted-map/benchmarks/lib.rs
@@ -1,21 +1,18 @@
-pub mod prefix_map_simd;
+use std::hash::{BuildHasherDefault, Hasher};
 
 use rand::Rng;
-use std::hash::{BuildHasherDefault, Hasher};
+
+const ARBITRARY0: u64 = 0x243f6a8885a308d3;
 
 /// Folded multiply: full u64×u64→u128, then XOR the two halves.
-/// Produces a u64 with good bit independence between high and low halves.
 #[inline(always)]
 pub fn folded_multiply(x: u64, y: u64) -> u64 {
     let full = (x as u128).wrapping_mul(y as u128);
     (full as u64) ^ ((full >> 64) as u64)
 }
 
-const ARBITRARY0: u64 = 0x243f6a8885a308d3;
-
-/// A hasher that expands a u32 key into a well-distributed u64 using
-/// folded_multiply so that both hashbrown's bucket index (low bits) and
-/// tag (top 7 bits) have independent entropy.
+/// A hasher that passes through u32 keys without hashing, suitable for +/// keys that are already well-distributed. #[derive(Default)] pub struct IdentityHasher(u64); diff --git a/crates/hashmap-bench/hashmap_insert.rs b/crates/hash-sorted-map/benchmarks/performance.rs similarity index 69% rename from crates/hashmap-bench/hashmap_insert.rs rename to crates/hash-sorted-map/benchmarks/performance.rs index e13e3e6..44fc93b 100644 --- a/crates/hashmap-bench/hashmap_insert.rs +++ b/crates/hash-sorted-map/benchmarks/performance.rs @@ -1,11 +1,13 @@ use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; -use hashmap_bench::random_trigram_hashes; +use hash_sorted_map_benchmarks::{random_trigram_hashes, IdentityBuildHasher}; -fn bench_hashmap_insert(c: &mut Criterion) { - let trigrams = random_trigram_hashes(1000); +fn trigrams() -> Vec { + random_trigram_hashes(1000) +} - // ── Main comparison: insert 1000 trigrams ─────────────────────────── - let mut group = c.benchmark_group("hashmap_insert_1000_trigrams"); +fn bench_insert(c: &mut Criterion) { + let trigrams = trigrams(); + let mut group = c.benchmark_group("presized_insert_1000_trigrams"); group.bench_function("std::HashMap", |b| { b.iter_batched( @@ -103,7 +105,7 @@ fn bench_hashmap_insert(c: &mut Criterion) { group.bench_function("hashbrown+Identity", |b| { b.iter_batched( - || hashbrown::HashMap::::with_capacity_and_hasher( + || hashbrown::HashMap::::with_capacity_and_hasher( trigrams.len(), Default::default(), ), @@ -117,11 +119,11 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group.bench_function("PrefixHashMap", |b| { + group.bench_function("HashSortedMap", |b| { b.iter_batched( - || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher( + || hash_sorted_map::hash_sorted_map::HashSortedMap::with_capacity_and_hasher( trigrams.len(), - hashmap_bench::IdentityBuildHasher::default(), + IdentityBuildHasher::default(), ), |mut map| { for (i, &key) in trigrams.iter().enumerate() { @@ -134,14 +136,16 @@ fn bench_hashmap_insert(c: &mut Criterion) { }); group.finish(); +} - // ── Re-insert: insert same keys twice (second pass = all overwrites) ─ - let mut group2 = c.benchmark_group("reinsert_1000_trigrams"); +fn bench_reinsert(c: &mut Criterion) { + let trigrams = trigrams(); + let mut group = c.benchmark_group("reinsert_1000_trigrams"); - group2.bench_function("hashbrown+Identity", |b| { + group.bench_function("hashbrown+Identity", |b| { b.iter_batched( || { - let mut map = hashbrown::HashMap::::with_capacity_and_hasher( + let mut map = hashbrown::HashMap::::with_capacity_and_hasher( trigrams.len(), Default::default(), ); @@ -160,12 +164,12 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group2.bench_function("PrefixHashMap", |b| { + group.bench_function("HashSortedMap", |b| { b.iter_batched( || { - let mut map = hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher( + let mut map = hash_sorted_map::hash_sorted_map::HashSortedMap::with_capacity_and_hasher( trigrams.len(), - hashmap_bench::IdentityBuildHasher::default(), + IdentityBuildHasher::default(), ); for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -182,14 +186,16 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group2.finish(); + group.finish(); +} - // ── Growth penalty: start small (128), force 3 growths ────────────── - let mut group3 = c.benchmark_group("grow_from_128_insert_1000_trigrams"); +fn bench_grow(c: &mut Criterion) { + let trigrams = trigrams(); + let mut group = 
c.benchmark_group("grow_from_128_insert_1000_trigrams"); - group3.bench_function("hashbrown+Identity", |b| { + group.bench_function("hashbrown+Identity", |b| { b.iter_batched( - || hashbrown::HashMap::::with_capacity_and_hasher( + || hashbrown::HashMap::::with_capacity_and_hasher( 128, Default::default(), ), @@ -203,11 +209,11 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group3.bench_function("PrefixHashMap", |b| { + group.bench_function("HashSortedMap", |b| { b.iter_batched( - || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::with_capacity_and_hasher( + || hash_sorted_map::hash_sorted_map::HashSortedMap::with_capacity_and_hasher( 128, - hashmap_bench::IdentityBuildHasher::default(), + IdentityBuildHasher::default(), ), |mut map| { for (i, &key) in trigrams.iter().enumerate() { @@ -219,21 +225,21 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group3.finish(); + group.finish(); +} - // ── get_or_default: count trigram occurrences ────────────────────── - // Counting workload: most lookups hit existing keys, so this stresses - // the find-existing path of get_or_default / entry().or_insert(). +fn bench_count(c: &mut Criterion) { + let trigrams = trigrams(); let mut counted_trigrams = Vec::with_capacity(trigrams.len() * 4); for _ in 0..4 { counted_trigrams.extend_from_slice(&trigrams); } - let mut group4 = c.benchmark_group("count_4000_trigrams_get_or_default"); + let mut group = c.benchmark_group("count_4000_trigrams_get_or_default"); - group4.bench_function("hashbrown+Identity entry()", |b| { + group.bench_function("hashbrown+Identity entry()", |b| { b.iter_batched( - || hashbrown::HashMap::::with_capacity_and_hasher( + || hashbrown::HashMap::::with_capacity_and_hasher( trigrams.len(), Default::default(), ), @@ -247,11 +253,11 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group4.bench_function("PrefixHashMap get_or_default", |b| { + group.bench_function("HashSortedMap get_or_default", |b| { b.iter_batched( - || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::::with_capacity_and_hasher( + || hash_sorted_map::hash_sorted_map::HashSortedMap::::with_capacity_and_hasher( trigrams.len(), - hashmap_bench::IdentityBuildHasher::default(), + IdentityBuildHasher::default(), ), |mut map| { for &key in &counted_trigrams { @@ -263,11 +269,11 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group4.bench_function("PrefixHashMap entry().or_default()", |b| { + group.bench_function("HashSortedMap entry().or_default()", |b| { b.iter_batched( - || hashmap_bench::prefix_map_simd::SimdPrefixHashMap::::with_capacity_and_hasher( + || hash_sorted_map::hash_sorted_map::HashSortedMap::::with_capacity_and_hasher( trigrams.len(), - hashmap_bench::IdentityBuildHasher::default(), + IdentityBuildHasher::default(), ), |mut map| { for &key in &counted_trigrams { @@ -279,8 +285,8 @@ fn bench_hashmap_insert(c: &mut Criterion) { ); }); - group4.finish(); + group.finish(); } -criterion_group!(benches, bench_hashmap_insert); +criterion_group!(benches, bench_insert, bench_reinsert, bench_grow, bench_count); criterion_main!(benches); diff --git a/crates/hash-sorted-map/src/group_ops.rs b/crates/hash-sorted-map/src/group_ops.rs new file mode 100644 index 0000000..bfae626 --- /dev/null +++ b/crates/hash-sorted-map/src/group_ops.rs @@ -0,0 +1,170 @@ +// Platform-dependent group size: 16 on x86_64 (SSE2), 8 everywhere else. 
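+// The Mask type mirrors this: SSE2's movemask packs one bit per slot into a
+// u32, while on aarch64 and the scalar fallback the mask is the 8-byte group
+// word itself with bit 7 of each byte marking a slot (hence the
+// `trailing_zeros() >> 3` in `lowest`).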
+#[cfg(target_arch = "x86_64")]
+pub const GROUP_SIZE: usize = 16;
+#[cfg(not(target_arch = "x86_64"))]
+pub const GROUP_SIZE: usize = 8;
+
+pub const CTRL_EMPTY: u8 = 0x00;
+
+#[cfg(target_arch = "x86_64")]
+pub type Mask = u32;
+#[cfg(not(target_arch = "x86_64"))]
+pub type Mask = u64;
+
+// ── SIMD group operations ───────────────────────────────────────────────────
+
+#[cfg(target_arch = "x86_64")]
+mod arch {
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86 as x86;
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64 as x86;
+
+    use super::{Mask, GROUP_SIZE};
+
+    #[inline(always)]
+    pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask {
+        unsafe {
+            let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i);
+            let cmp = x86::_mm_cmpeq_epi8(group, x86::_mm_set1_epi8(tag as i8));
+            x86::_mm_movemask_epi8(cmp) as u32
+        }
+    }
+
+    #[inline(always)]
+    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        match_tag(ctrl, super::CTRL_EMPTY)
+    }
+
+    /// Mask of slots whose ctrl byte has the high bit set (occupied).
+    /// Uses SSE2 `_mm_movemask_epi8` which extracts the top bit of each byte.
+    #[inline(always)]
+    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        unsafe {
+            let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i);
+            x86::_mm_movemask_epi8(group) as u32
+        }
+    }
+
+    #[inline(always)]
+    pub fn lowest(mask: Mask) -> usize {
+        mask.trailing_zeros() as usize
+    }
+
+    #[inline(always)]
+    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
+        mask & !(1u32 << slot)
+    }
+
+    #[inline(always)]
+    pub fn next_match(mask: &mut Mask) -> Option<usize> {
+        if *mask == 0 {
+            return None;
+        }
+        let i = lowest(*mask);
+        *mask &= *mask - 1;
+        Some(i)
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+mod arch {
+    use core::arch::aarch64 as neon;
+
+    use super::{Mask, GROUP_SIZE};
+
+    #[inline(always)]
+    pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask {
+        unsafe {
+            let group = neon::vld1_u8(ctrl.as_ptr());
+            let cmp = neon::vceq_u8(group, neon::vdup_n_u8(tag));
+            neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080
+        }
+    }
+
+    #[inline(always)]
+    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        unsafe {
+            let group = neon::vld1_u8(ctrl.as_ptr());
+            let cmp = neon::vceq_u8(group, neon::vdup_n_u8(0));
+            neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080
+        }
+    }
+
+    /// Mask of slots whose ctrl byte has the high bit set (occupied).
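+    /// NEON has no movemask instruction, so the group is reinterpreted as a
+    /// u64 and the per-byte high bits are read directly; `lowest` and
+    /// `next_match` then walk these `0x80` marker bits.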
+    #[inline(always)]
+    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        unsafe {
+            let group = neon::vld1_u8(ctrl.as_ptr());
+            neon::vget_lane_u64(neon::vreinterpret_u64_u8(group), 0) & 0x8080808080808080
+        }
+    }
+
+    #[inline(always)]
+    pub fn lowest(mask: Mask) -> usize {
+        (mask.trailing_zeros() >> 3) as usize
+    }
+
+    #[inline(always)]
+    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
+        mask & !(0x80u64 << (slot * 8))
+    }
+
+    #[inline(always)]
+    pub fn next_match(mask: &mut Mask) -> Option<usize> {
+        if *mask == 0 {
+            return None;
+        }
+        let i = lowest(*mask);
+        *mask &= *mask - 1;
+        Some(i)
+    }
+}
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+mod arch {
+    use super::{Mask, GROUP_SIZE};
+
+    #[inline(always)]
+    pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask {
+        let word = u64::from_ne_bytes(*ctrl);
+        let broadcast = 0x0101010101010101u64 * (tag as u64);
+        let xor = word ^ broadcast;
+        (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080
+    }
+
+    #[inline(always)]
+    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        let word = u64::from_ne_bytes(*ctrl);
+        !word & 0x8080808080808080
+    }
+
+    /// Mask of slots whose ctrl byte has the high bit set (occupied).
+    #[inline(always)]
+    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
+        let word = u64::from_ne_bytes(*ctrl);
+        word & 0x8080808080808080
+    }
+
+    #[inline(always)]
+    pub fn lowest(mask: Mask) -> usize {
+        (mask.trailing_zeros() >> 3) as usize
+    }
+
+    #[inline(always)]
+    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
+        mask & !(0x80u64 << (slot * 8))
+    }
+
+    #[inline(always)]
+    pub fn next_match(mask: &mut Mask) -> Option<usize> {
+        if *mask == 0 {
+            return None;
+        }
+        let i = lowest(*mask);
+        *mask &= *mask - 1;
+        Some(i)
+    }
+}
+
+pub use arch::*;
diff --git a/crates/hashmap-bench/prefix_map_simd.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs
similarity index 80%
rename from crates/hashmap-bench/prefix_map_simd.rs
rename to crates/hash-sorted-map/src/hash_sorted_map.rs
index b709031..0c56354 100644
--- a/crates/hashmap-bench/prefix_map_simd.rs
+++ b/crates/hash-sorted-map/src/hash_sorted_map.rs
@@ -3,176 +3,10 @@
 use std::borrow::Borrow;
 use std::collections::hash_map::RandomState;
 use std::hash::{BuildHasher, Hash};
 
-// Platform-dependent group size: 16 on x86_64 (SSE2), 8 everywhere else.
-#[cfg(target_arch = "x86_64")]
-const GROUP_SIZE: usize = 16;
-#[cfg(not(target_arch = "x86_64"))]
-const GROUP_SIZE: usize = 8;
+use super::group_ops::{self, CTRL_EMPTY, GROUP_SIZE};
 
-const CTRL_EMPTY: u8 = 0x00;
 const NO_OVERFLOW: u32 = u32::MAX;
 
-#[cfg(target_arch = "x86_64")]
-type Mask = u32;
-#[cfg(not(target_arch = "x86_64"))]
-type Mask = u64;
-
-// ── SIMD group operations ───────────────────────────────────────────────────
-
-#[cfg(target_arch = "x86_64")]
-mod group_ops {
-    #[cfg(target_arch = "x86")]
-    use core::arch::x86 as x86;
-    #[cfg(target_arch = "x86_64")]
-    use core::arch::x86_64 as x86;
-
-    use super::{Mask, GROUP_SIZE};
-
-    #[inline(always)]
-    pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask {
-        unsafe {
-            let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i);
-            let cmp = x86::_mm_cmpeq_epi8(group, x86::_mm_set1_epi8(tag as i8));
-            x86::_mm_movemask_epi8(cmp) as u32
-        }
-    }
-
-    #[inline(always)]
-    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        match_tag(ctrl, super::CTRL_EMPTY)
-    }
-
-    /// Mask of slots whose ctrl byte has the high bit set (occupied).
-    /// Uses SSE2 `_mm_movemask_epi8` which extracts the top bit of each byte.
-    #[inline(always)]
-    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        unsafe {
-            let group = x86::_mm_loadu_si128(ctrl.as_ptr() as *const x86::__m128i);
-            x86::_mm_movemask_epi8(group) as u32
-        }
-    }
-
-    #[inline(always)]
-    pub fn lowest(mask: Mask) -> usize {
-        mask.trailing_zeros() as usize
-    }
-
-    #[inline(always)]
-    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
-        mask & !(1u32 << slot)
-    }
-
-    #[inline(always)]
-    pub fn next_match(mask: &mut Mask) -> Option<usize> {
-        if *mask == 0 {
-            return None;
-        }
-        let i = lowest(*mask);
-        *mask &= *mask - 1;
-        Some(i)
-    }
-}
-
-#[cfg(target_arch = "aarch64")]
-mod group_ops {
-    use core::arch::aarch64 as neon;
-
-    use super::{Mask, GROUP_SIZE};
-
-    #[inline(always)]
-    pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask {
-        unsafe {
-            let group = neon::vld1_u8(ctrl.as_ptr());
-            let cmp = neon::vceq_u8(group, neon::vdup_n_u8(tag));
-            neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080
-        }
-    }
-
-    #[inline(always)]
-    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        unsafe {
-            let group = neon::vld1_u8(ctrl.as_ptr());
-            let cmp = neon::vceq_u8(group, neon::vdup_n_u8(0));
-            neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0) & 0x8080808080808080
-        }
-    }
-
-    /// Mask of slots whose ctrl byte has the high bit set (occupied).
-    #[inline(always)]
-    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        unsafe {
-            let group = neon::vld1_u8(ctrl.as_ptr());
-            neon::vget_lane_u64(neon::vreinterpret_u64_u8(group), 0) & 0x8080808080808080
-        }
-    }
-
-    #[inline(always)]
-    pub fn lowest(mask: Mask) -> usize {
-        (mask.trailing_zeros() >> 3) as usize
-    }
-
-    #[inline(always)]
-    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
-        mask & !(0x80u64 << (slot * 8))
-    }
-
-    #[inline(always)]
-    pub fn next_match(mask: &mut Mask) -> Option<usize> {
-        if *mask == 0 {
-            return None;
-        }
-        let i = lowest(*mask);
-        *mask &= *mask - 1;
-        Some(i)
-    }
-}
-
-#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
-mod group_ops {
-    use super::{Mask, GROUP_SIZE};
-
-    #[inline(always)]
-    pub fn match_tag(ctrl: &[u8; GROUP_SIZE], tag: u8) -> Mask {
-        let word = u64::from_ne_bytes(*ctrl);
-        let broadcast = 0x0101010101010101u64 * (tag as u64);
-        let xor = word ^ broadcast;
-        (xor.wrapping_sub(0x0101010101010101)) & !xor & 0x8080808080808080
-    }
-
-    #[inline(always)]
-    pub fn match_empty(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        let word = u64::from_ne_bytes(*ctrl);
-        !word & 0x8080808080808080
-    }
-
-    /// Mask of slots whose ctrl byte has the high bit set (occupied).
-    #[inline(always)]
-    pub fn match_full(ctrl: &[u8; GROUP_SIZE]) -> Mask {
-        let word = u64::from_ne_bytes(*ctrl);
-        word & 0x8080808080808080
-    }
-
-    #[inline(always)]
-    pub fn lowest(mask: Mask) -> usize {
-        (mask.trailing_zeros() >> 3) as usize
-    }
-
-    #[inline(always)]
-    pub fn clear_slot(mask: Mask, slot: usize) -> Mask {
-        mask & !(0x80u64 << (slot * 8))
-    }
-
-    #[inline(always)]
-    pub fn next_match(mask: &mut Mask) -> Option<usize> {
-        if *mask == 0 {
-            return None;
-        }
-        let i = lowest(*mask);
-        *mask &= *mask - 1;
-        Some(i)
-    }
-}
-
 // ── Helpers ─────────────────────────────────────────────────────────────────
 
 #[inline]
@@ -207,7 +41,7 @@ impl<K, V> Group<K, V> {
 ///
 /// Uses NEON on aarch64, SSE2 on x86_64, scalar fallback elsewhere.
 /// Generic over key type `K`, value type `V`, and hash builder `S`.
-pub struct SimdPrefixHashMap<K, V, S = RandomState> {
+pub struct HashSortedMap<K, V, S = RandomState> {
     groups: Box<[Group<K, V>]>,
     num_groups: u32,
     n_bits: u32,
@@ -215,7 +49,7 @@ pub struct SimdPrefixHashMap<K, V, S = RandomState> {
     hash_builder: S,
 }
 
-impl<K, V> SimdPrefixHashMap<K, V> {
+impl<K, V> HashSortedMap<K, V> {
     pub fn new() -> Self {
         Self::with_capacity_and_hasher(0, RandomState::new())
     }
@@ -225,7 +59,7 @@ impl<K, V> SimdPrefixHashMap<K, V> {
     }
 }
 
-impl<K, V, S> SimdPrefixHashMap<K, V, S> {
+impl<K, V, S> HashSortedMap<K, V, S> {
     pub fn with_hasher(hash_builder: S) -> Self {
         Self::with_capacity_and_hasher(0, hash_builder)
    }
@@ -265,7 +99,7 @@ impl<K, V, S> SimdPrefixHashMap<K, V, S> {
     }
 }
 
-impl<K: Hash + Eq, V, S: BuildHasher> SimdPrefixHashMap<K, V, S> {
+impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
     pub fn insert(&mut self, key: K, value: V) -> Option<V> {
         let hash = self.hash_builder.hash_one(&key);
         self.insert_hashed(hash, key, value)
     }
@@ -563,7 +397,7 @@ enum Insertion<K, V> {
     NeedsOverflow { tail: *mut Group<K, V> },
 }
 
-/// View into a single entry in a [`SimdPrefixHashMap`], either occupied or vacant.
+/// View into a single entry in a [`HashSortedMap`], either occupied or vacant.
 pub enum Entry<'a, K, V, S> {
     Occupied(OccupiedEntry<'a, V>),
     Vacant(VacantEntry<'a, K, V, S>),
@@ -577,7 +411,7 @@ pub struct OccupiedEntry<'a, V> {
 /// View into a vacant entry. Holds the borrow of the map plus the hash, key,
 /// and pre-computed insertion slot.
 pub struct VacantEntry<'a, K, V, S> {
-    map: &'a mut SimdPrefixHashMap<K, V, S>,
+    map: &'a mut HashSortedMap<K, V, S>,
     hash: u64,
     key: K,
     insertion: Insertion<K, V>,
@@ -645,6 +479,7 @@ impl<'a, V> OccupiedEntry<'a, V> {
 }
 
 impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
+    /// Insert `value` and return a mutable reference to it.
     /// Writes directly to the slot pre-computed during `entry()`; only re-walks
     /// the chain on the rare grow path (where the pre-computed pointers become
@@ -699,7 +534,7 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
 #[cold]
 #[inline(never)]
 fn insert_after_grow<'a, K: Hash + Eq, V, S: BuildHasher>(
-    map: &'a mut SimdPrefixHashMap<K, V, S>,
+    map: &'a mut HashSortedMap<K, V, S>,
     hash: u64,
     key: K,
     value: V,
@@ -726,7 +561,7 @@ fn insert_after_grow<'a, K: Hash + Eq, V, S: BuildHasher>(
     }
 }
 
-impl<K, V, S> Drop for SimdPrefixHashMap<K, V, S> {
+impl<K, V, S> Drop for HashSortedMap<K, V, S> {
     fn drop(&mut self) {
         for group in &mut self.groups[..self.num_groups as usize] {
             for i in 0..GROUP_SIZE {
@@ -745,7 +580,7 @@ mod tests {
 
     #[test]
     fn insert_and_get() {
-        let mut map = SimdPrefixHashMap::new();
+        let mut map = HashSortedMap::new();
         map.insert(100, "hello");
         map.insert(200, "world");
         assert_eq!(map.get(&100), Some(&"hello"));
@@ -756,7 +591,7 @@ mod tests {
 
     #[test]
     fn insert_overwrite() {
-        let mut map = SimdPrefixHashMap::new();
+        let mut map = HashSortedMap::new();
         map.insert(42, "a");
         assert_eq!(map.insert(42, "b"), Some("a"));
         assert_eq!(map.get(&42), Some(&"b"));
@@ -765,7 +600,7 @@ mod tests {
 
     #[test]
     fn grow_preserves_entries() {
-        let mut map = SimdPrefixHashMap::new();
+        let mut map = HashSortedMap::new();
         for i in 0..200u32 {
             map.insert(i, i * 10);
         }
@@ -777,7 +612,7 @@ mod tests {
 
     #[test]
     fn many_entries() {
-        let mut map = SimdPrefixHashMap::with_capacity(2000);
+        let mut map = HashSortedMap::with_capacity(2000);
         for i in 0..2000u32 {
             map.insert(i.wrapping_mul(2654435761), i);
         }
@@ -789,7 +624,7 @@ mod tests {
 
     #[test]
     fn overflow_chain() {
-        let mut map = SimdPrefixHashMap::with_capacity(8);
+        let mut map = HashSortedMap::with_capacity(8);
         for i in 0..20u32 {
             let key = i | 0xAB000000;
             map.insert(key, i);
         }
@@ -803,7 +638,7 @@ mod tests {
 
     #[test]
     fn grow_on_overflow_exhaustion() {
-        let mut map = SimdPrefixHashMap::with_capacity(1);
+        let mut map = HashSortedMap::with_capacity(1);
        let old_n_bits = map.n_bits;
         for i in 0..100u32 {
             let key = i | 0xFF000000;
@@ -819,7 +654,7 @@ mod tests {
 
     #[test]
     fn string_keys() {
-        let mut map = SimdPrefixHashMap::new();
+        let mut map = HashSortedMap::new();
         map.insert("hello".to_string(), 1);
         map.insert("world".to_string(), 2);
         assert_eq!(map.get("hello"), Some(&1));
@@ -834,7 +669,7 @@ mod tests {
 
     #[test]
     fn get_or_default_basics() {
-        let mut map: SimdPrefixHashMap<&str, i32> = SimdPrefixHashMap::new();
+        let mut map: HashSortedMap<&str, i32> = HashSortedMap::new();
         // Inserts default (0), then mutates.
         *map.get_or_default("a") += 5;
         *map.get_or_default("b") += 7;
@@ -847,7 +682,7 @@ mod tests {
 
     #[test]
     fn get_or_insert_with_lazy() {
-        let mut map: SimdPrefixHashMap<u32, String> = SimdPrefixHashMap::new();
+        let mut map: HashSortedMap<u32, String> = HashSortedMap::new();
         let mut call_count = 0;
         let mut make = |s: &str| {
             call_count += 1;
@@ -865,7 +700,7 @@ mod tests {
 
     #[test]
     fn get_or_default_survives_grow() {
-        let mut map: SimdPrefixHashMap<u32, u32> = SimdPrefixHashMap::with_capacity(1);
+        let mut map: HashSortedMap<u32, u32> = HashSortedMap::with_capacity(1);
         for i in 0..500u32 {
             *map.get_or_default(i) = i * 2;
         }
@@ -878,7 +713,7 @@ mod tests {
 
     #[test]
     fn entry_or_default_counting() {
         // Classic counting workload via Entry API.
-        let mut map: SimdPrefixHashMap<&str, u32> = SimdPrefixHashMap::new();
+        let mut map: HashSortedMap<&str, u32> = HashSortedMap::new();
         for word in ["a", "b", "a", "c", "b", "a"] {
             *map.entry(word).or_default() += 1;
         }
@@ -890,7 +725,7 @@ mod tests {
 
     #[test]
     fn entry_or_insert_lazy() {
-        let mut map: SimdPrefixHashMap<u32, String> = SimdPrefixHashMap::new();
+        let mut map: HashSortedMap<u32, String> = HashSortedMap::new();
         let mut call_count = 0;
         let mut make = |s: &str| {
             call_count += 1;
@@ -907,7 +742,7 @@ mod tests {
 
     #[test]
     fn entry_and_modify() {
-        let mut map: SimdPrefixHashMap<u32, u32> = SimdPrefixHashMap::new();
+        let mut map: HashSortedMap<u32, u32> = HashSortedMap::new();
         // Vacant: and_modify is a no-op, then or_insert(0) runs.
         *map.entry(7).and_modify(|v| *v *= 10).or_insert(1) += 100;
         assert_eq!(map.get(&7), Some(&101));
diff --git a/crates/hash-sorted-map/src/lib.rs b/crates/hash-sorted-map/src/lib.rs
new file mode 100644
index 0000000..63d3e8e
--- /dev/null
+++ b/crates/hash-sorted-map/src/lib.rs
@@ -0,0 +1,2 @@
+pub mod group_ops;
+pub mod hash_sorted_map;

From 427d9826f4e2c42221d2f4933d5ff21fb2fb4f60 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Wed, 29 Apr 2026 15:30:42 +0200
Subject: [PATCH 12/22] remove gxhash which doesn't compile with some configurations

---
 crates/hash-sorted-map/README.md                 |  3 +--
 crates/hash-sorted-map/benchmarks/Cargo.toml     |  1 -
 crates/hash-sorted-map/benchmarks/performance.rs | 13 -------------
 3 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/crates/hash-sorted-map/README.md b/crates/hash-sorted-map/README.md
index 63a2dbb..ebd5ef6 100644
--- a/crates/hash-sorted-map/README.md
+++ b/crates/hash-sorted-map/README.md
@@ -57,8 +57,7 @@ M-series (aarch64).
| 5 | hashbrown+Identity | 2.74 | +12% | | 6 | std::HashMap+FNV | 3.27 | +34% | | 7 | AHashMap | 3.22 | +32% | -| 8 | GxHashMap | 3.69 | +51% | -| 9 | std::HashMap | 8.49 | +248% | +| 8 | std::HashMap | 8.49 | +248% | ### Re-insert same keys (all overwrites) diff --git a/crates/hash-sorted-map/benchmarks/Cargo.toml b/crates/hash-sorted-map/benchmarks/Cargo.toml index 75e51c3..252bd21 100644 --- a/crates/hash-sorted-map/benchmarks/Cargo.toml +++ b/crates/hash-sorted-map/benchmarks/Cargo.toml @@ -20,5 +20,4 @@ rustc-hash = "2" ahash = "0.8" hashbrown = "0.15" foldhash = "0.1" -gxhash = "3" fnv = "1" diff --git a/crates/hash-sorted-map/benchmarks/performance.rs b/crates/hash-sorted-map/benchmarks/performance.rs index 44fc93b..2b88cc9 100644 --- a/crates/hash-sorted-map/benchmarks/performance.rs +++ b/crates/hash-sorted-map/benchmarks/performance.rs @@ -77,19 +77,6 @@ fn bench_insert(c: &mut Criterion) { ); }); - group.bench_function("GxHashMap", |b| { - b.iter_batched( - || gxhash::HashMap::with_capacity_and_hasher(trigrams.len(), Default::default()), - |mut map| { - for (i, &key) in trigrams.iter().enumerate() { - map.insert(key, i); - } - map - }, - BatchSize::SmallInput, - ); - }); - group.bench_function("std::HashMap+FNV", |b| { b.iter_batched( || std::collections::HashMap::with_capacity_and_hasher(trigrams.len(), fnv::FnvBuildHasher::default()), From 7195a44ba3ca37c951b9945d72813fca792d6da0 Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Thu, 30 Apr 2026 08:34:49 +0200 Subject: [PATCH 13/22] Update crates/hash-sorted-map/src/lib.rs Co-authored-by: Jason Orendorff --- crates/hash-sorted-map/src/lib.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/hash-sorted-map/src/lib.rs b/crates/hash-sorted-map/src/lib.rs index 63d3e8e..b34b492 100644 --- a/crates/hash-sorted-map/src/lib.rs +++ b/crates/hash-sorted-map/src/lib.rs @@ -1,2 +1,4 @@ -pub mod group_ops; -pub mod hash_sorted_map; +mod group_ops; +mod hash_sorted_map; + +pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, VacandEntry}; From 4e1a0383a1e4c23db7f20b60f0d8ea30d225282b Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Thu, 30 Apr 2026 09:24:20 +0200 Subject: [PATCH 14/22] fix initial capacity (and typo) --- crates/hash-sorted-map/src/group_ops.rs | 7 +++++++ crates/hash-sorted-map/src/hash_sorted_map.rs | 4 ++-- crates/hash-sorted-map/src/lib.rs | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/crates/hash-sorted-map/src/group_ops.rs b/crates/hash-sorted-map/src/group_ops.rs index bfae626..f2e35c3 100644 --- a/crates/hash-sorted-map/src/group_ops.rs +++ b/crates/hash-sorted-map/src/group_ops.rs @@ -4,6 +4,13 @@ pub const GROUP_SIZE: usize = 16; #[cfg(not(target_arch = "x86_64"))] pub const GROUP_SIZE: usize = 8; +/// Maximum safe fill ratio (keys / primary slots) that keeps overflow within +/// the 12.5% reserve budget at p95 confidence. Derived from simulation. 
+#[cfg(target_arch = "x86_64")]
+pub const MAX_FILL: f64 = 0.71;
+#[cfg(not(target_arch = "x86_64"))]
+pub const MAX_FILL: f64 = 0.67;
+
 pub const CTRL_EMPTY: u8 = 0x00;
 
 #[cfg(target_arch = "x86_64")]
diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs
index 0c56354..5c04c3c 100644
--- a/crates/hash-sorted-map/src/hash_sorted_map.rs
+++ b/crates/hash-sorted-map/src/hash_sorted_map.rs
@@ -65,8 +65,8 @@ impl<K, V, S> HashSortedMap<K, V, S> {
     }
 
     pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> Self {
-        let adjusted = capacity.checked_mul(8).unwrap_or(usize::MAX) / 7;
-        let min_groups = (adjusted / GROUP_SIZE).max(1).next_power_of_two();
+        let adjusted = (capacity as f64 / group_ops::MAX_FILL).ceil() as usize;
+        let min_groups = (adjusted.div_ceil(GROUP_SIZE)).max(1).next_power_of_two();
         let n_bits = min_groups.trailing_zeros().max(1);
         let (groups, num_primary) = Self::alloc_groups(n_bits);
         Self {
diff --git a/crates/hash-sorted-map/src/lib.rs b/crates/hash-sorted-map/src/lib.rs
index b34b492..79dac69 100644
--- a/crates/hash-sorted-map/src/lib.rs
+++ b/crates/hash-sorted-map/src/lib.rs
@@ -1,4 +1,4 @@
 mod group_ops;
 mod hash_sorted_map;
 
-pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, VacandEntry};
+pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, VacantEntry};

From 865757a65c3f666b4d8d9421caa03b710d81ece9 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Mon, 4 May 2026 11:01:10 +0200
Subject: [PATCH 15/22] lints + build errors

---
 crates/hash-sorted-map/benchmarks/performance.rs | 11 ++++++-----
 crates/hash-sorted-map/src/hash_sorted_map.rs    | 16 +++++++++++++---
 crates/string-offsets/benchmarks/performance.rs  |  3 ++-
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/crates/hash-sorted-map/benchmarks/performance.rs b/crates/hash-sorted-map/benchmarks/performance.rs
index 2b88cc9..d4b42cd 100644
--- a/crates/hash-sorted-map/benchmarks/performance.rs
+++ b/crates/hash-sorted-map/benchmarks/performance.rs
@@ -1,4 +1,5 @@
 use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
+use hash_sorted_map::HashSortedMap;
 use hash_sorted_map_benchmarks::{random_trigram_hashes, IdentityBuildHasher};
 
 fn trigrams() -> Vec<u32> {
@@ -108,7 +109,7 @@ fn bench_insert(c: &mut Criterion) {
 
     group.bench_function("HashSortedMap", |b| {
         b.iter_batched(
-            || hash_sorted_map::hash_sorted_map::HashSortedMap::with_capacity_and_hasher(
+            || HashSortedMap::with_capacity_and_hasher(
                 trigrams.len(),
                 IdentityBuildHasher::default(),
             ),
@@ -154,7 +155,7 @@ fn bench_reinsert(c: &mut Criterion) {
     group.bench_function("HashSortedMap", |b| {
         b.iter_batched(
             || {
-                let mut map = hash_sorted_map::hash_sorted_map::HashSortedMap::with_capacity_and_hasher(
+                let mut map = HashSortedMap::with_capacity_and_hasher(
                     trigrams.len(),
                     IdentityBuildHasher::default(),
                 );
@@ -198,7 +199,7 @@ fn bench_grow(c: &mut Criterion) {
 
     group.bench_function("HashSortedMap", |b| {
         b.iter_batched(
-            || hash_sorted_map::hash_sorted_map::HashSortedMap::with_capacity_and_hasher(
+            || HashSortedMap::with_capacity_and_hasher(
                 128,
                 IdentityBuildHasher::default(),
             ),
@@ -242,7 +243,7 @@ fn bench_count(c: &mut Criterion) {
 
     group.bench_function("HashSortedMap get_or_default", |b| {
         b.iter_batched(
-            || hash_sorted_map::hash_sorted_map::HashSortedMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
+            || HashSortedMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
                 trigrams.len(),
                 IdentityBuildHasher::default(),
             ),
@@ -258,7 +259,7 @@
group.bench_function("HashSortedMap entry().or_default()", |b| { b.iter_batched( - || hash_sorted_map::hash_sorted_map::HashSortedMap::::with_capacity_and_hasher( + || HashSortedMap::::with_capacity_and_hasher( trigrams.len(), IdentityBuildHasher::default(), ), diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs index 5c04c3c..527aa8b 100644 --- a/crates/hash-sorted-map/src/hash_sorted_map.rs +++ b/crates/hash-sorted-map/src/hash_sorted_map.rs @@ -49,6 +49,12 @@ pub struct HashSortedMap { hash_builder: S, } +impl Default for HashSortedMap { + fn default() -> Self { + Self::new() + } +} + impl HashSortedMap { pub fn new() -> Self { Self::with_capacity_and_hasher(0, RandomState::new()) @@ -97,6 +103,10 @@ impl HashSortedMap { pub fn len(&self) -> usize { self.len } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } } impl HashSortedMap { @@ -533,12 +543,12 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> { /// capacity check.) #[cold] #[inline(never)] -fn insert_after_grow<'a, K: Hash + Eq, V, S: BuildHasher>( - map: &'a mut HashSortedMap, +fn insert_after_grow( + map: &mut HashSortedMap, hash: u64, key: K, value: V, -) -> &'a mut V { +) -> &mut V { map.grow(); match map.find_or_insertion_slot(hash, &key) { FindResult::Vacant(Insertion::Empty { group, slot }) => { diff --git a/crates/string-offsets/benchmarks/performance.rs b/crates/string-offsets/benchmarks/performance.rs index c4e6cb4..199a053 100644 --- a/crates/string-offsets/benchmarks/performance.rs +++ b/crates/string-offsets/benchmarks/performance.rs @@ -1,4 +1,5 @@ -use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use std::hint::black_box; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use rand::{rng, RngExt}; use string_offsets::{AllConfig, OnlyLines, StringOffsets}; From 0ebfb796d52c4ea66c53f2091191f15bb45c105c Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Mon, 4 May 2026 12:48:27 +0200 Subject: [PATCH 16/22] and more :( --- .../hash-sorted-map/benchmarks/performance.rs | 89 ++++++++++++------- crates/hash-sorted-map/src/group_ops.rs | 2 +- crates/hash-sorted-map/src/hash_sorted_map.rs | 54 ++++++----- .../string-offsets/benchmarks/performance.rs | 2 +- 4 files changed, 90 insertions(+), 57 deletions(-) diff --git a/crates/hash-sorted-map/benchmarks/performance.rs b/crates/hash-sorted-map/benchmarks/performance.rs index d4b42cd..5a04801 100644 --- a/crates/hash-sorted-map/benchmarks/performance.rs +++ b/crates/hash-sorted-map/benchmarks/performance.rs @@ -80,7 +80,12 @@ fn bench_insert(c: &mut Criterion) { group.bench_function("std::HashMap+FNV", |b| { b.iter_batched( - || std::collections::HashMap::with_capacity_and_hasher(trigrams.len(), fnv::FnvBuildHasher::default()), + || { + std::collections::HashMap::with_capacity_and_hasher( + trigrams.len(), + fnv::FnvBuildHasher::default(), + ) + }, |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -93,10 +98,12 @@ fn bench_insert(c: &mut Criterion) { group.bench_function("hashbrown+Identity", |b| { b.iter_batched( - || hashbrown::HashMap::::with_capacity_and_hasher( - trigrams.len(), - Default::default(), - ), + || { + hashbrown::HashMap::::with_capacity_and_hasher( + trigrams.len(), + Default::default(), + ) + }, |mut map| { for (i, &key) in trigrams.iter().enumerate() { map.insert(key, i); @@ -109,10 +116,12 @@ fn bench_insert(c: &mut Criterion) { group.bench_function("HashSortedMap", |b| { 
        b.iter_batched(
-            || HashSortedMap::with_capacity_and_hasher(
-                trigrams.len(),
-                IdentityBuildHasher::default(),
-            ),
+            || {
+                HashSortedMap::with_capacity_and_hasher(
+                    trigrams.len(),
+                    IdentityBuildHasher::default(),
+                )
+            },
             |mut map| {
                 for (i, &key) in trigrams.iter().enumerate() {
                     map.insert(key, i);
@@ -133,10 +142,11 @@ fn bench_reinsert(c: &mut Criterion) {
     group.bench_function("hashbrown+Identity", |b| {
         b.iter_batched(
             || {
-                let mut map = hashbrown::HashMap::<u32, usize, IdentityBuildHasher>::with_capacity_and_hasher(
-                    trigrams.len(),
-                    Default::default(),
-                );
+                let mut map =
+                    hashbrown::HashMap::<u32, usize, IdentityBuildHasher>::with_capacity_and_hasher(
+                        trigrams.len(),
+                        Default::default(),
+                    );
                 for (i, &key) in trigrams.iter().enumerate() {
                     map.insert(key, i);
                 }
@@ -183,10 +193,12 @@ fn bench_grow(c: &mut Criterion) {
 
     group.bench_function("hashbrown+Identity", |b| {
         b.iter_batched(
-            || hashbrown::HashMap::<u32, usize, IdentityBuildHasher>::with_capacity_and_hasher(
-                128,
-                Default::default(),
-            ),
+            || {
+                hashbrown::HashMap::<u32, usize, IdentityBuildHasher>::with_capacity_and_hasher(
+                    128,
+                    Default::default(),
+                )
+            },
             |mut map| {
                 for (i, &key) in trigrams.iter().enumerate() {
                     map.insert(key, i);
@@ -199,10 +211,7 @@ fn bench_grow(c: &mut Criterion) {
 
     group.bench_function("HashSortedMap", |b| {
         b.iter_batched(
-            || HashSortedMap::with_capacity_and_hasher(
-                128,
-                IdentityBuildHasher::default(),
-            ),
+            || HashSortedMap::with_capacity_and_hasher(128, IdentityBuildHasher::default()),
             |mut map| {
                 for (i, &key) in trigrams.iter().enumerate() {
                     map.insert(key, i);
@@ -227,10 +236,12 @@ fn bench_count(c: &mut Criterion) {
 
     group.bench_function("hashbrown+Identity entry()", |b| {
         b.iter_batched(
-            || hashbrown::HashMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
-                trigrams.len(),
-                Default::default(),
-            ),
+            || {
+                hashbrown::HashMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
+                    trigrams.len(),
+                    Default::default(),
+                )
+            },
             |mut map| {
                 for &key in &counted_trigrams {
                     *map.entry(key).or_insert(0) += 1;
@@ -243,10 +254,12 @@ fn bench_count(c: &mut Criterion) {
 
     group.bench_function("HashSortedMap get_or_default", |b| {
         b.iter_batched(
-            || HashSortedMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
-                trigrams.len(),
-                IdentityBuildHasher::default(),
-            ),
+            || {
+                HashSortedMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
+                    trigrams.len(),
+                    IdentityBuildHasher::default(),
+                )
+            },
             |mut map| {
                 for &key in &counted_trigrams {
                     *map.get_or_default(key) += 1;
@@ -259,10 +272,12 @@ fn bench_count(c: &mut Criterion) {
 
     group.bench_function("HashSortedMap entry().or_default()", |b| {
         b.iter_batched(
-            || HashSortedMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
-                trigrams.len(),
-                IdentityBuildHasher::default(),
-            ),
+            || {
+                HashSortedMap::<u32, u32, IdentityBuildHasher>::with_capacity_and_hasher(
+                    trigrams.len(),
+                    IdentityBuildHasher::default(),
+                )
+            },
             |mut map| {
                 for &key in &counted_trigrams {
                     *map.entry(key).or_default() += 1;
@@ -276,5 +291,11 @@ fn bench_count(c: &mut Criterion) {
     group.finish();
 }
 
-criterion_group!(benches, bench_insert, bench_reinsert, bench_grow, bench_count);
+criterion_group!(
+    benches,
+    bench_insert,
+    bench_reinsert,
+    bench_grow,
+    bench_count
+);
 criterion_main!(benches);
diff --git a/crates/hash-sorted-map/src/group_ops.rs b/crates/hash-sorted-map/src/group_ops.rs
index f2e35c3..a1b92ec 100644
--- a/crates/hash-sorted-map/src/group_ops.rs
+++ b/crates/hash-sorted-map/src/group_ops.rs
@@ -23,7 +23,7 @@ pub type Mask = u64;
 #[cfg(target_arch = "x86_64")]
 mod arch {
     #[cfg(target_arch = "x86")]
-    use core::arch::x86 as x86;
+    use core::arch::x86;
     #[cfg(target_arch = "x86_64")]
     use core::arch::x86_64 as x86;
 
diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs
index 527aa8b..22b43d8 100644
--- a/crates/hash-sorted-map/src/hash_sorted_map.rs
+++ b/crates/hash-sorted-map/src/hash_sorted_map.rs
@@ -174,10 +174,7 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
             return None;
         }
         if c == tag && unsafe { group.keys[hint].assume_init_ref() } == &key {
-            let old = std::mem::replace(
-                unsafe { group.values[hint].assume_init_mut() },
-                value,
-            );
+            let old = std::mem::replace(unsafe { group.values[hint].assume_init_mut() }, value);
             return Some(old);
         }
         // Slow path: SIMD scan group for tag match.
@@ -185,10 +182,8 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
         tag_mask = group_ops::clear_slot(tag_mask, hint);
         while let Some(i) = group_ops::next_match(&mut tag_mask) {
             if unsafe { group.keys[i].assume_init_ref() } == &key {
-                let old = std::mem::replace(
-                    unsafe { group.values[i].assume_init_mut() },
-                    value,
-                );
+                let old =
+                    std::mem::replace(unsafe { group.values[i].assume_init_mut() }, value);
                 return Some(old);
             }
         }
@@ -240,9 +235,7 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
 
         // Fast path: preferred slot.
         let c = group.ctrl[hint];
-        if c == tag
-            && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key
-        {
+        if c == tag && unsafe { group.keys[hint].assume_init_ref() }.borrow() == key {
             return Some(unsafe { group.values[hint].assume_init_ref() });
         }
 
@@ -338,9 +331,9 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
         for group in &old_groups[..old_num_groups] {
             let mut full_mask = group_ops::match_full(&group.ctrl);
             while let Some(i) = group_ops::next_match(&mut full_mask) {
-                let hash = self.hash_builder.hash_one(unsafe {
-                    group.keys[i].assume_init_ref()
-                });
+                let hash = self
+                    .hash_builder
+                    .hash_one(unsafe { group.keys[i].assume_init_ref() });
                 self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr());
             }
         }
@@ -378,8 +371,16 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
             }
         }
         group.ctrl[hint] = tag;
-        unsafe { group.keys[hint].as_mut_ptr().copy_from_nonoverlapping(key_src, 1) };
-        unsafe { group.values[hint].as_mut_ptr().copy_from_nonoverlapping(value_src, 1) };
+        unsafe {
+            group.keys[hint]
+                .as_mut_ptr()
+                .copy_from_nonoverlapping(key_src, 1)
+        };
+        unsafe {
+            group.values[hint]
+                .as_mut_ptr()
+                .copy_from_nonoverlapping(value_src, 1)
+        };
         self.len += 1;
     }
 }
@@ -402,7 +403,10 @@ enum FindResult<K, V> {
 /// long as no reallocation occurs (the grow path re-walks via the slow path).
 enum Insertion<K, V> {
     /// An empty slot is waiting at `(group, slot)`.
-    Empty { group: *mut Group<K, V>, slot: usize },
+    Empty {
+        group: *mut Group<K, V>,
+        slot: usize,
+    },
     /// The chain is full; allocate a new overflow group and link via `tail`.
     NeedsOverflow { tail: *mut Group<K, V> },
 }
@@ -489,7 +493,6 @@ impl<'a, V> OccupiedEntry<'a, V> {
 }
 
 impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
-
     /// Insert `value` and return a mutable reference to it.
     /// Writes directly to the slot pre-computed during `entry()`; only re-walks
     /// the chain on the rare grow path (where the pre-computed pointers become
@@ -699,11 +702,20 @@ mod tests {
             s.to_string()
         };
         // First call: f runs, inserts "first".
-        assert_eq!(map.get_or_insert_with(1, || make("first")), &mut "first".to_string());
+        assert_eq!(
+            map.get_or_insert_with(1, || make("first")),
+            &mut "first".to_string()
+        );
         // Second call with same key: f does NOT run; returns existing.
-        assert_eq!(map.get_or_insert_with(1, || make("second")), &mut "first".to_string());
+        assert_eq!(
+            map.get_or_insert_with(1, || make("second")),
+            &mut "first".to_string()
+        );
         // New key: f runs.
- assert_eq!(map.get_or_insert_with(2, || make("third")), &mut "third".to_string()); + assert_eq!( + map.get_or_insert_with(2, || make("third")), + &mut "third".to_string() + ); assert_eq!(call_count, 2); assert_eq!(map.len(), 2); } diff --git a/crates/string-offsets/benchmarks/performance.rs b/crates/string-offsets/benchmarks/performance.rs index 199a053..8f62e8f 100644 --- a/crates/string-offsets/benchmarks/performance.rs +++ b/crates/string-offsets/benchmarks/performance.rs @@ -1,6 +1,6 @@ -use std::hint::black_box; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use rand::{rng, RngExt}; +use std::hint::black_box; use string_offsets::{AllConfig, OnlyLines, StringOffsets}; fn only_lines_construction_benchmark(c: &mut Criterion) { From d213be8f684f3bc6f636a58a8cda84a71c60573b Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Mon, 4 May 2026 12:54:09 +0200 Subject: [PATCH 17/22] Update equivalence.rs --- crates/bpe/benchmarks/equivalence.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bpe/benchmarks/equivalence.rs b/crates/bpe/benchmarks/equivalence.rs index 4019602..d325dbf 100644 --- a/crates/bpe/benchmarks/equivalence.rs +++ b/crates/bpe/benchmarks/equivalence.rs @@ -30,7 +30,7 @@ fn test_compare_dictionary() { hugging_tokens.remove(added_token); } let mut hugging_tokens: Vec<_> = hugging_tokens.into_iter().collect(); - hugging_tokens.sort_by(|(_, a), (_, b)| a.cmp(b)); + hugging_tokens.sort_by_key(|(_, a)| *a); let hugging_tokens: Vec<_> = hugging_tokens .into_iter() .map(|(token, _)| token.chars().map(char_to_byte).collect()) From f36c8dfa2ce94f338ca33d65593bc723a11797da Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Mon, 4 May 2026 19:36:27 -0500 Subject: [PATCH 18/22] Add test for growth with collisions --- crates/hash-sorted-map/src/hash_sorted_map.rs | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs index 22b43d8..73877e2 100644 --- a/crates/hash-sorted-map/src/hash_sorted_map.rs +++ b/crates/hash-sorted-map/src/hash_sorted_map.rs @@ -589,6 +589,8 @@ impl Drop for HashSortedMap { #[cfg(test)] mod tests { + use std::hash::{BuildHasher, Hasher}; + use super::*; #[test] @@ -772,4 +774,37 @@ mod tests { *map.entry(7).and_modify(|v| *v *= 2).or_insert(99) += 1; assert_eq!(map.get(&7), Some(&203)); } + + /// Degenerate hasher that returns a fixed hash code, for forcing collisions. 
+    struct FixedHasher(u64);
+
+    impl Hasher for FixedHasher {
+        fn finish(&self) -> u64 {
+            self.0
+        }
+        fn write(&mut self, _bytes: &[u8]) {}
+    }
+
+    #[derive(Clone)]
+    struct FixedState(u64);
+
+    impl BuildHasher for FixedState {
+        type Hasher = FixedHasher;
+        fn build_hasher(&self) -> FixedHasher {
+            FixedHasher(self.0)
+        }
+    }
+
+    #[test]
+    fn test_collisions() {
+        // Tiny initial capacity + all collisions
+        let mut m = HashSortedMap::with_capacity_and_hasher(1, FixedState(0));
+        for i in 0..200u32 {
+            m.insert(i, i);
+        }
+        assert_eq!(m.len(), 200);
+        for i in 0..200u32 {
+            assert_eq!(m.get(&i), Some(&i));
+        }
+    }
 }

From 908d3e9774aea3ccbc12be1d7afb9b41e759db8b Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 5 May 2026 15:23:32 +0200
Subject: [PATCH 19/22] Update crates/hash-sorted-map/src/hash_sorted_map.rs

Co-authored-by: Jason Orendorff
---
 crates/hash-sorted-map/src/hash_sorted_map.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs
index 73877e2..aaeae4b 100644
--- a/crates/hash-sorted-map/src/hash_sorted_map.rs
+++ b/crates/hash-sorted-map/src/hash_sorted_map.rs
@@ -425,7 +425,8 @@ pub struct OccupiedEntry<'a, V> {
 /// View into a vacant entry. Holds the borrow of the map plus the hash, key,
 /// and pre-computed insertion slot.
 pub struct VacantEntry<'a, K, V, S> {
-    map: &'a mut HashSortedMap<K, V, S>,
+    phantom: PhantomData<&'a mut HashSortedMap<K, V, S>>,
+    map: *mut HashSortedMap<K, V, S>,
     hash: u64,
     key: K,
     insertion: Insertion<K, V>,

From 7cac054f46e75bbf8f72291287754112f14d54da Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 5 May 2026 15:26:43 +0200
Subject: [PATCH 20/22] Apply suggestions from code review

Co-authored-by: Jason Orendorff
---
 crates/hash-sorted-map/src/hash_sorted_map.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs
index aaeae4b..a4bcad9 100644
--- a/crates/hash-sorted-map/src/hash_sorted_map.rs
+++ b/crates/hash-sorted-map/src/hash_sorted_map.rs
@@ -374,13 +374,11 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
         unsafe {
             group.keys[hint]
                 .as_mut_ptr()
-                .copy_from_nonoverlapping(key_src, 1)
-        };
-        unsafe {
+                .copy_from_nonoverlapping(key_src, 1);
             group.values[hint]
                 .as_mut_ptr()
-                .copy_from_nonoverlapping(value_src, 1)
-        };
+                .copy_from_nonoverlapping(value_src, 1);
+        }
         self.len += 1;
     }
 }

From 7d2a74b1e6c862f19665bc6b37ec7014aec8fe11 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Tue, 5 May 2026 16:04:06 +0200
Subject: [PATCH 21/22] address review comments

---
 crates/hash-sorted-map/benchmarks/Cargo.toml  |  4 +--
 crates/hash-sorted-map/benchmarks/lib.rs      |  2 +-
 crates/hash-sorted-map/src/hash_sorted_map.rs | 30 +++++++++++--------
 3 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/crates/hash-sorted-map/benchmarks/Cargo.toml b/crates/hash-sorted-map/benchmarks/Cargo.toml
index 252bd21..9ee37dc 100644
--- a/crates/hash-sorted-map/benchmarks/Cargo.toml
+++ b/crates/hash-sorted-map/benchmarks/Cargo.toml
@@ -14,8 +14,8 @@ test = false
 
 [dependencies]
 hash-sorted-map = { path = ".." }
-criterion = "0.7"
-rand = "0.9"
+criterion = "0.8"
+rand = "0.10"
 rustc-hash = "2"
 ahash = "0.8"
 hashbrown = "0.15"
 foldhash = "0.1"
 fnv = "1"
diff --git a/crates/hash-sorted-map/benchmarks/lib.rs b/crates/hash-sorted-map/benchmarks/lib.rs
index 4f5d2cb..b80c3e4 100644
--- a/crates/hash-sorted-map/benchmarks/lib.rs
+++ b/crates/hash-sorted-map/benchmarks/lib.rs
@@ -1,6 +1,6 @@
 use std::hash::{BuildHasherDefault, Hasher};
 
-use rand::Rng;
+use rand::RngExt;
 
 const ARBITRARY0: u64 = 0x243f6a8885a308d3;
 
diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs
index a4bcad9..26a4ecd 100644
--- a/crates/hash-sorted-map/src/hash_sorted_map.rs
+++ b/crates/hash-sorted-map/src/hash_sorted_map.rs
@@ -2,6 +2,7 @@ use core::mem::MaybeUninit;
 use std::borrow::Borrow;
 use std::collections::hash_map::RandomState;
 use std::hash::{BuildHasher, Hash};
+use std::marker::PhantomData;
 
 use super::group_ops::{self, CTRL_EMPTY, GROUP_SIZE};
 
@@ -150,6 +151,7 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
             value: unsafe { &mut *ptr },
         }),
         FindResult::Vacant(insertion) => Entry::Vacant(VacantEntry {
+            phantom: PhantomData,
             map: self,
             hash,
             key,
@@ -364,7 +366,7 @@ impl<K: Hash + Eq, V, S: BuildHasher> HashSortedMap<K, V, S> {
             group = &mut self.groups[overflow as usize];
         } else {
             let new_gi = self.num_groups as usize;
-            self.groups[gi].overflow = new_gi as u32;
+            group.overflow = new_gi as u32;
             self.num_groups += 1;
             group = &mut self.groups[new_gi];
             break;
@@ -505,30 +507,34 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> {
         let (group_ptr, slot) = match self.insertion {
             Insertion::Empty { group, slot } => (group, slot),
             Insertion::NeedsOverflow { tail } => {
-                if map.num_groups as usize == map.groups.len() {
-                    return insert_after_grow(map, hash, key, value);
-                }
-                let new_gi = map.num_groups as usize;
-                map.num_groups += 1;
-                // SAFETY: `tail` was obtained from `&mut self.groups[..]` and
-                // remains valid because no reallocation occurred between
-                // `entry()` and now (we hold the only `&mut self`).
+                let (new_gi, new_group) = unsafe {
+                    let map = &mut *map;
+                    if map.num_groups as usize == map.groups.len() {
+                        return insert_after_grow(map, hash, key, value);
+                    }
+                    let new_gi = map.num_groups as usize;
+                    map.num_groups += 1;
+                    let new_group: *mut Group<K, V> = &mut map.groups[new_gi];
+                    (new_gi, new_group)
+                };
                 unsafe {
+                    // SAFETY: `tail` was obtained from `&mut self.groups[..]` and
+                    // remains valid because no reallocation occurred between
+                    // `entry()` and now (we hold the only `&mut self`).
                     (*tail).overflow = new_gi as u32;
                 }
-                let new_group: *mut Group<K, V> = &mut map.groups[new_gi];
                 (new_group, slot_hint(hash))
             }
         };
         let tag = tag(hash);
-        // SAFETY: `group_ptr` points into `map.groups` and is valid for `'a`.
         unsafe {
+            (*map).len += 1;
+            // SAFETY: `group_ptr` points into `map.groups` and is valid for `'a`.
            let group = &mut *group_ptr;
             group.ctrl[slot] = tag;
             group.keys[slot] = MaybeUninit::new(key);
             group.values[slot] = MaybeUninit::new(value);
-            map.len += 1;
             group.values[slot].assume_init_mut()
         }
     }

From 2a5c666d77229b6f110aeb83eb34fa68d7077bed Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Wed, 6 May 2026 11:56:30 +0200
Subject: [PATCH 22/22] last comments

---
 crates/hash-sorted-map/OPTIMIZATIONS.md | 63 ++++++++++++-------------
 1 file changed, 29 insertions(+), 34 deletions(-)

diff --git a/crates/hash-sorted-map/OPTIMIZATIONS.md b/crates/hash-sorted-map/OPTIMIZATIONS.md
index c88c517..0b04520 100644
--- a/crates/hash-sorted-map/OPTIMIZATIONS.md
+++ b/crates/hash-sorted-map/OPTIMIZATIONS.md
@@ -20,8 +20,8 @@ experimental results that guided the current design.
 │ hashbrown Swiss Table                                             │
 │                                                                   │
 │ Single contiguous allocation (SoA):                               │
-│ [Padding] [T_n ... T_1 T_0] [CT_0 CT_1 ... CT_n] [CT_extra]      │
-│   data                      control bytes (mirrored)             │
+│ [Padding] [T_n ... T_1 T_0] [CT_0 CT_1 ... CT_n] [CT_extra]       │
+│   data                      control bytes (mirrored)              │
 │                                                                   │
 │ • Open addressing, triangular probing                             │
 │ • 16-byte groups (SSE2) or 8-byte groups (NEON/generic)           │
@@ -32,7 +32,7 @@ experimental results that guided the current design.
 │ HashSortedMap                                                     │
 │                                                                   │
 │ Vec<Group<K, V>> where each Group (AoS):                          │
-│   { ctrl: [u8; 8], keys: [MaybeUninit<K>; 8],                    │
+│   { ctrl: [u8; 8], keys: [MaybeUninit<K>; 8],                     │
 │     values: [MaybeUninit<V>; 8], overflow: u32 }                  │
 │                                                                   │
 │ • Overflow chaining (linked groups)                               │
@@ -58,8 +58,9 @@ modest because the slot-hint fast path often skips the group scan entirely.
 
 ### 2. Open Addressing with Triangular Probing ❌ Rejected
 
-Tested an open-addressing variant (`OpenHashSortedMap`) with triangular
-probing over AoS groups.
+This is not really an option for this hash map, since it would prevent
+efficient sorting. Additionally, we didn't observe any performance improvement
+compared to the linked overflow buffer approach. The biggest benefit of
+triangular probing is that it allows a much higher load factor, i.e. it
+reduces memory consumption, which is not our main concern here.
 
 **Benchmark result**: **40% slower** than overflow chaining. With the AoS
 layout, each group is ~112 bytes, so probing to the next group jumps over
@@ -78,12 +79,9 @@ SoA is worse than AoS for this use case.
 
 ### 4. Capacity Sizing ✅ Implemented
 
-The original `with_capacity` allocated `capacity / 8` groups, giving ~100%
-slot utilization. hashbrown uses `capacity * 8 / 7`, giving ~50% load.
+Without the correct sizing, there was always the penalty of a grow operation.
 
-**Fix**: Changed to `capacity * 8 / 7` (87.5% max load factor), matching
-hashbrown. This was the **single biggest improvement** — HashSortedMap went
-from 2× slower to matching hashbrown.
+**Fix**: Changed to ~70% max load factor. This was the **single biggest
+improvement** — HashSortedMap went from 2× slower to matching hashbrown.
+(A worked example of the sizing arithmetic appears in the appendix below.)
 
 ### 5. Optimized Growth ✅ Implemented
@@ -118,25 +116,22 @@ if ctrl[hint] == tag && keys[hint] == key { /* direct hit */ }
 ```
 
 hashbrown does **not** have this optimization — it always does a full SIMD
-group scan. At ~50% load, the hint hits ~58% of the time, avoiding the scan
-entirely.
+group scan. The performance difference is probably due to the different
+overflow strategies and load factors. (A toy model of this fast path appears
+in the appendix below.)
 
 ### 8. 
Overflow Reserve Sizing ✅ Validated Tested overflow reserves from 0% to 100% of primary groups: | Reserve | Growth scenario (µs) | -|---------|---------------------| -| m/8 (12.5%, default) | 8.04 | -| m/4 (25%) | 8.33 | -| m/2 (50%) | 8.93 | -| m/1 (100%) | 10.31 | -| 0 (grow immediately) | 6.96 | +|---------|----------------------| +| m/8 (12.5%, default) | 8.04 | +| m/4 (25%) | 8.33 | +| m/2 (50%) | 8.93 | +| m/1 (100%) | 10.31 | +| 0 (grow immediately) | 6.96 | **Conclusion**: Smaller reserves are faster — growing early is cheaper than -traversing overflow chains. The `m/8` default implicitly enforces ~62.5% max -load, which aligns with the mathematical analysis (Poisson model, 3σ -confidence). +traversing overflow chains. ### 9. IdentityHasher Fix ✅ Implemented @@ -152,25 +147,25 @@ entropy in both halves. Also changed trigram generation to use ## Optimizations Not Implemented (and Why) -| Optimization | Reason | -|---|---| +| Optimization | Reason | +|---------------------------------|------------------------------------------| | **Tombstone / DELETED support** | Insertion-only map — no deletions needed | -| **In-place rehashing** | No tombstones to reclaim | -| **Control byte mirroring** | Not needed with overflow chaining (no wrap-around) | -| **Custom allocator support** | Out of scope for benchmarking | -| **Over-allocation utilization** | Uses `Vec` (no raw allocator control) | +| **In-place rehashing** | No tombstones to reclaim | +| **Control byte mirroring** | Not needed with overflow chaining (no wrap-around) | +| **Custom allocator support** | Out of scope for benchmarking | +| **Over-allocation utilization** | Uses `Vec` (no raw allocator control) | --- ## Summary of Impact -| Change | Effect on insert time | -|---|---| -| Capacity sizing fix (`*8/7`) | **−50%** (biggest win) | -| Optimized growth path | **−10%** on growth scenarios | -| SIMD group scanning | **−5%** | -| Branch hints (scalar only) | **−2–6%** | -| IdentityHasher fix | Enabled fair comparison | +| Change | Effect on insert time | +|----------------------------|------------------------------| +| Capacity sizing fix | **−50%** (biggest win) | +| Optimized growth path | **−10%** on growth scenarios | +| SIMD group scanning | **−5%** | +| Branch hints (scalar only) | **−2–6%** | +| IdentityHasher fix | Enabled fair comparison | The current HashSortedMap **matches hashbrown+FxHash** on pre-sized inserts, **beats all hashbrown variants** on overwrites, and has **2× faster growth**.
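+
+---
+
+## Appendix: Worked Examples
+
+A minimal sketch of the §4 sizing arithmetic, assuming the x86_64 constants
+(`GROUP_SIZE = 16`, `MAX_FILL = 0.71`); the helper `groups_for_capacity` is an
+illustrative name, not part of the crate API:
+
+```rust
+const GROUP_SIZE: usize = 16;
+const MAX_FILL: f64 = 0.71;
+
+/// Primary group count allocated for a requested capacity.
+fn groups_for_capacity(capacity: usize) -> usize {
+    // Scale the request up so the table stays below ~71% load...
+    let adjusted = (capacity as f64 / MAX_FILL).ceil() as usize;
+    // ...then round the group count up to the next power of two.
+    adjusted.div_ceil(GROUP_SIZE).max(1).next_power_of_two()
+}
+
+fn main() {
+    // 1000 keys -> ceil(1000 / 0.71) = 1409 slots
+    //          -> ceil(1409 / 16)   = 89 groups
+    //          -> next power of two = 128 groups (2048 primary slots).
+    assert_eq!(groups_for_capacity(1000), 128);
+}
+```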
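+
+The §7 slot-hint fast path can be modeled in isolation. The sketch below is a
+self-contained toy over a single scalar 8-slot group, not the crate's actual
+lookup code; `lookup`, `tag`, and `hint` are illustrative names:
+
+```rust
+const GROUP_SIZE: usize = 8;
+
+/// Toy single-group lookup: try the hinted slot first, then scan the rest.
+fn lookup(
+    ctrl: &[u8; GROUP_SIZE],
+    keys: &[u32; GROUP_SIZE],
+    tag: u8,
+    hint: usize,
+    key: u32,
+) -> Option<usize> {
+    // Fast path: a single compare against the preferred slot.
+    if ctrl[hint] == tag && keys[hint] == key {
+        return Some(hint);
+    }
+    // Slow path: scan the remaining slots for a matching tag and key.
+    (0..GROUP_SIZE).find(|&i| i != hint && ctrl[i] == tag && keys[i] == key)
+}
+
+fn main() {
+    let mut ctrl = [0u8; GROUP_SIZE];
+    let mut keys = [0u32; GROUP_SIZE];
+    // Key 42 sits at its hinted slot 3 (control byte = 7-bit tag + high bit).
+    ctrl[3] = 0x81;
+    keys[3] = 42;
+    assert_eq!(lookup(&ctrl, &keys, 0x81, 3, 42), Some(3));
+    // Key 7 was displaced to slot 5; the tag scan still finds it.
+    ctrl[5] = 0x81;
+    keys[5] = 7;
+    assert_eq!(lookup(&ctrl, &keys, 0x81, 3, 7), Some(5));
+}
+```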