Conversation
|
I found that pre hash hurts the compression ratio. the following patch can improve the compression ratio for lz4hc: Detailsdiff --git a/src/block/compress_hc/hash_chain.rs b/src/block/compress_hc/hash_chain.rs
index a84230f..865312c 100644
--- a/src/block/compress_hc/hash_chain.rs
+++ b/src/block/compress_hc/hash_chain.rs
@@ -487,19 +487,6 @@ impl HashTableHCU32 {
self.dictionary[hash] as usize
}
- /// Set dictionary slot at hash index.
- #[inline]
- fn set_dictionary_at(&mut self, hash: usize, pos: usize) {
- self.dictionary[hash] = pos as u32;
- }
-
- /// Set chain value at position
- #[inline]
- fn set_chain(&mut self, pos: usize, delta: u16) {
- let chain_index = pos & self.chain_mask();
- self.chain_table[chain_index] = delta;
- }
-
/// Insert hashes for all positions up to the given local offset.
/// Positions stored in the hash table are absolute (`local_pos + stream_offset`).
#[inline]
@@ -739,39 +726,6 @@ pub(super) fn find_longer_hash_chain_match(
}
}
-/// Update the hash chain for the first match found at `cur`.
-///
-/// This mirrors the pre-hash step from the LZ4 HC reference: positions inside the
-/// first accepted match are inserted eagerly so later searches can skip ahead.
-fn prehash_first_match(
- hash_table: &mut HashTableHCU32,
- input: &[u8],
- cur_absolute: usize,
- stream_offset: usize,
- first_match_length: usize,
- delta: usize,
-) {
- let mut hash_pos = cur_absolute;
- let end_pos = cur_absolute + first_match_length - 3;
-
- while hash_pos < end_pos - delta {
- hash_table.set_chain(hash_pos, delta as u16);
- hash_pos += 1;
- }
-
- loop {
- hash_table.set_chain(hash_pos, delta as u16);
- let local_hash_pos = hash_pos - stream_offset;
- hash_table.set_dictionary_at(get_hash_at(input, local_hash_pos), hash_pos);
- hash_pos += 1;
- if hash_pos >= end_pos {
- break;
- }
- }
-
- hash_table.next_to_update = end_pos;
-}
-
/// Insert `cur` into the hash/chain tables, then search the chain for the
/// longest match starting at `cur`.
///
@@ -795,8 +749,6 @@ fn find_best_hash_chain_match(
match_length: 0,
candidate: 0,
};
- let mut first_match_delta = 0usize;
- let mut first_match_length = 0usize;
let cur_absolute = cur + stream_offset;
let ext_dict_stream_offset = stream_offset - ext_dict.len();
@@ -805,7 +757,7 @@ fn find_best_hash_chain_match(
let mut candidate = hash_table.get_dictionary_at(get_hash_at(input, cur));
- for attempt in 0..hash_table.max_attempts {
+ for _ in 0..hash_table.max_attempts {
if !hash_table.in_range(candidate, cur_absolute) {
break;
}
@@ -836,28 +788,12 @@ fn find_best_hash_chain_match(
best_match.match_length = match_length as u32;
}
- if attempt == 0 && match_length > 0 {
- first_match_length = match_length;
- first_match_delta = cur_absolute - candidate;
- }
-
let Some(next_candidate) = hash_table.advance(candidate, cur_absolute) else {
break;
};
candidate = next_candidate;
}
- if first_match_length != 0 {
- prehash_first_match(
- hash_table,
- input,
- cur_absolute,
- stream_offset,
- first_match_length,
- first_match_delta,
- );
- }
-
if best_match.match_length == 0 {
None
} else {
@@ -1165,13 +1101,14 @@ pub(super) fn compress_hash_chain_internal(
// Do not extend matches into the last `LAST_LITERALS` bytes (they are literals).
let match_limit = input_end - LAST_LITERALS;
- let mut cur = input_pos + 1;
+ // Match C's LZ4HC main loop: start at block start and scan through `mflimit` inclusive.
+ let mut cur = input_pos;
let mut literal_start = input_pos;
let mut previous_match;
let mut current_match;
let mut next_match;
- while cur < end_pos_check {
+ while cur <= end_pos_check {
let Some(found_match) = find_best_hash_chain_match(
hash_table,
input,
diff --git a/src/block/compress_hc/tests.rs b/src/block/compress_hc/tests.rs
index 07f8a06..9bd9fc6 100644
--- a/src/block/compress_hc/tests.rs
+++ b/src/block/compress_hc/tests.rs
@@ -257,8 +257,8 @@ fn test_compressed_sizes_exact() {
// (level, html_like, json_like, code_like)
let expected: &[(u8, usize, usize, usize)] = &[
(1, 16_350, 16_183, 6_246),
- (4, 15_620, 16_198, 6_071),
- (9, 15_509, 15_698, 5_985),
+ (4, 15_642, 16_363, 6_081),
+ (9, 15_111, 15_482, 5_984),
(10, 15_153, 15_102, 5_990),
(12, 15_100, 15_083, 5_979),
]; |
- cursor_pos → cur, literal_anchor_pos → literal_start - candidate_absolute_position → candidate, absolute_byte_offset → cur_absolute - reference_local_position → candidate_local - external_dictionary → ext_dict, external_dictionary_stream_offset → ext_dict_stream_offset - match_extension_end_pos → match_limit, max_main_cursor_pos → end_pos_check - MIN_MATCH → MINMATCH (use shared constant from mod.rs) - MIN_BYTES_FROM_CURSOR_TO_BLOCK_END → MFLIMIT (use shared constant) - Shorten verbose names in opt parser and mid compressor - Ignore 10MB tests for faster iteration
The same ext_dict candidate matching logic (boundary-crossing reads, min-match check, forward count) was duplicated across 3 search methods in HashTableHCU32. Extract into a shared helper function.
- Extract in_range() and advance() helpers to replace repeated chain-validity checks - Use early continue/break to reduce indentation depth - Shorten local variable names in insert_and_find_wider_match
…loops
Replace the hard-to-follow break true/break false pattern in the
match0/match1/match2/match3 lazy evaluation with labeled loops ('lazy
and 'resolve), making control flow explicit and adding comments
explaining each branch.
…ctions Move add_hash4/add_hash8 to methods on HashTableMid and resolve_candidate to a standalone resolve_mid_candidate function, eliminating nested fn definitions and reducing parameter passing.
Remove test_lz4mid_debug (println-based debug test). Make Match, HashTableHCU32, and HashTableMid private since they're only used within compress_hc.rs. Restrict HcLevelParams fields to pub(crate).
Align compress_hc.rs naming with compress.rs conventions where the earlier match position is consistently called 'candidate'.
|
Any further concerns? Also, please take a look at #216 (comment). Thanks. |
There was a problem hiding this comment.
Pull request overview
This PR adds multi-level compression support (LZ4MID / LZ4HC / LZ4OPT-style behavior) to lz4_flex, integrating it into both block compression APIs and the frame encoder, along with CLI, tests/fuzzing, and benchmarking/docs updates.
Changes:
- Introduces a new
block::compress_hc*API family with level-based strategy selection (two-hash-tables, hash-chain HC, optimal parsing) and reusable compression tables. - Extends
frame::FrameEncoderwithwith_compression_level(...)and routes frame block compression through fast vs HC/optimal paths. - Updates
lz4_bin, benches, tests, fuzz targets, and adds documentation/perf notes for the new algorithms.
Reviewed changes
Copilot reviewed 23 out of 23 changed files in this pull request and generated 6 comments.
Show a summary per file
| File | Description |
|---|---|
tests/tests.rs |
Expands roundtrip/cross-compat tests to cover levels 1–12 and adds linked-mode HC tests; marks some large tests ignored. |
src/frame/compress.rs |
Adds level-aware frame encoding, switching between fast and HC/optimal compressors and maintaining reusable state. |
src/block/mod.rs |
Wires in the new compress_hc module and re-exports its public API; minor doc link tweak. |
src/block/compress.rs |
Refactors shared helpers (count_same_bytes, handle_last_literals, encode_sequence) for reuse by HC implementations. |
src/block/compress_hc/mod.rs |
New high-compression module: strategy selection, reusable tables, public APIs, and linked-block support hooks for frames. |
src/block/compress_hc/two_hashtables.rs |
New “mid” (two-hash-tables) compressor used for low levels. |
src/block/compress_hc/hash_chain.rs |
New hash-chain HC compressor implementation (levels ~3–9). |
src/block/compress_hc/optimal.rs |
New optimal parsing implementation (levels ~10–12). |
src/block/compress_hc/tests.rs |
Adds unit tests for the new HC/optimal compressors and clamping behavior. |
perf.md |
Adds performance investigation notes for the two-hash-tables strategy. |
lz4_Block_format.md |
Adds a copy of the upstream LZ4 block format description for reference. |
lz4_bin/src/main.rs |
Adds CLI flags for compression level and block size; uses the new frame encoder constructor. |
lz4_bin/Cargo.toml |
Enables the frame feature for lz4_flex in the CLI crate. |
fuzz/fuzz_targets/fuzz_roundtrip_hc.rs |
Adds block HC roundtrip fuzzing with varied levels. |
fuzz/fuzz_targets/fuzz_roundtrip_hc_cpp.rs |
Adds block HC fuzzing with C++ (lzzzz) decompression validation. |
fuzz/fuzz_targets/fuzz_roundtrip_hc_frame.rs |
Adds frame fuzzing across levels, block modes, block sizes, and checksum settings. |
fuzz/Cargo.toml |
Registers the new fuzz targets as binaries. |
docs/compress_hc_algorithms.md |
Adds high-level documentation describing the three HC strategies and how to read the implementation. |
CLAUDE.md |
Adds repository code style guidance. |
Cargo.toml |
Updates bench profile settings (adds debug = true). |
benches/binggan_bench.rs |
Extends benchmarking to cover block HC levels and compare against lz4 reference where applicable. |
benches/bench.rs |
Removes an old commented-out benchmark file. |
.rgignore |
Adjusts ripgrep ignore rules to skip large benchmark corpora while keeping bench source searchable. |
Comments suppressed due to low confidence (1)
tests/tests.rs:741
- This test is now unconditionally marked
#[ignore], so it won’t run in CI and may stop catching regressions in block sizing/compression behavior. If it’s too slow for default CI, consider gating it behind an env var/feature (e.g.SLOW_TESTS) rather than ignoring it entirely.
#[test]
#[cfg_attr(miri, ignore)]
#[ignore]
fn block_size() {
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| // The unsafe version copies blocks of 8bytes, and therefore may copy up to 7bytes more than | ||
| // needed. This is safe, because the last 12 bytes (MF_LIMIT) are handled in | ||
| // handle_last_literals. | ||
| copy_literals_wild(output, literal, 0, literal.len()); |
| /// Reset the table for reuse. | ||
| /// | ||
| /// Stale entries are harmless: `resolve_candidate` bounds-checks every | ||
| /// position before use, so old entries just fail the match attempt. | ||
| /// Skipping the 128 KB memset is a large win for small inputs. | ||
| pub(super) fn reset(&mut self) { | ||
| // Intentionally not zeroed — see resolve_candidate. |
| #[test] | ||
| #[cfg_attr(miri, ignore)] | ||
| #[ignore] | ||
| fn test_text_10mb() { |
Prehashing the first match can reduce compression ratio by skipping useful later candidates. Remove that step and scan the HC main loop from the block start through mflimit inclusive. Benchmark: cargo bench "block_compress AND flex" Level 9 impact: - 725: 19.548 MB/s (-0.43%), reuse 16.284 MB/s (-1.26%), output 542 - 34308: 62.703 MB/s (-10.59%), reuse 61.468 MB/s (-11.68%), output 15_901 (-0.48%) - 64723: 54.728 MB/s (-14.94%), reuse 54.194 MB/s (-19.79%), output 29_088 (-0.52%) - 66675: 87.811 MB/s (-14.86%), reuse 86.612 MB/s (-13.28%), output 12_249 (-0.25%) - 9991663: 29.519 MB/s (-21.45%), reuse 29.523 MB/s (-21.64%), output 4_393_516 (-0.80%)
Thanks, I applied the diff here 5739b5b |
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Based on the PR from @yujincheng08 #209 with some changes on top
closes #21
closes #165