diff --git a/src/complevel_estimator.rs b/src/complevel_estimator.rs index 7a740d9..77bb5be 100644 --- a/src/complevel_estimator.rs +++ b/src/complevel_estimator.rs @@ -11,11 +11,12 @@ use crate::hash_algorithm::{ HashAlgorithm, LibdeflateRotatingHash4, MiniZHash, ZlibNGHash, ZlibRotatingHash, MINIZ_LEVEL1_HASH_SIZE_MASK, }; -use crate::hash_chain::{HashChain, MAX_UPDATE_HASH_BATCH}; +use crate::hash_chain::{DictionaryAddPolicy, HashChain, MAX_UPDATE_HASH_BATCH}; use crate::preflate_constants; use crate::preflate_input::PreflateInput; use crate::preflate_parse_config::{FAST_PREFLATE_PARSER_SETTINGS, SLOW_PREFLATE_PARSER_SETTINGS}; use crate::preflate_token::{BlockType, PreflateToken, PreflateTokenBlock, PreflateTokenReference}; +use crate::skip_length_estimator::estimate_skip_length; #[derive(Default)] pub struct CompLevelInfo { @@ -26,9 +27,9 @@ pub struct CompLevelInfo { pub very_far_matches_detected: bool, pub max_dist_3_matches: u16, pub min_len: u32, + pub add_policy: DictionaryAddPolicy, pub hash_mask: u16, pub hash_shift: u32, - pub is_fast_compressor: bool, pub hash_algorithm: HashAlgorithm, pub good_length: u32, pub max_lazy: u32, @@ -46,7 +47,7 @@ enum HashChainType { struct CandidateInfo { hash_mask: u16, hash_shift: u32, - skip_length: Option, + add_policy: DictionaryAddPolicy, hash_chain: HashChainType, longest_dist_at_hop_0: u32, @@ -58,14 +59,14 @@ impl CandidateInfo { fn new( hash_mask: u16, hash_shift: u32, - skip_length: Option, + add_policy: DictionaryAddPolicy, hash_algorithm: HashAlgorithm, input: &PreflateInput, ) -> Self { CandidateInfo { hash_mask, hash_shift, - skip_length, + add_policy, hash_chain: match hash_algorithm { HashAlgorithm::Zlib => HashChainType::Zlib(HashChain::::new( hash_shift, hash_mask, &input, @@ -88,21 +89,25 @@ impl CandidateInfo { } } - fn invoke_update_hash(&mut self, len: u32, input: &PreflateInput) { - match self.hash_chain { - HashChainType::Zlib(ref mut h) => h.update_hash::(len, input), - HashChainType::MiniZ(ref mut h) => h.update_hash::(len, input), - HashChainType::LibFlate4(ref mut h) => h.update_hash::(len, input), - HashChainType::ZlibNG(ref mut h) => h.update_hash::(len, input), - } - } - - fn invoke_skip_hash(&mut self, len: u32, input: &PreflateInput) { + fn invoke_update_hash( + &mut self, + len: u32, + input: &PreflateInput, + add_policy: DictionaryAddPolicy, + ) { match self.hash_chain { - HashChainType::Zlib(ref mut h) => h.skip_hash::(len, input), - HashChainType::MiniZ(ref mut h) => h.skip_hash::(len, input), - HashChainType::LibFlate4(ref mut h) => h.skip_hash::(len, input), - HashChainType::ZlibNG(ref mut h) => h.skip_hash::(len, input), + HashChainType::Zlib(ref mut h) => { + h.update_hash_with_policy::(len, input, add_policy) + } + HashChainType::MiniZ(ref mut h) => { + h.update_hash_with_policy::(len, input, add_policy) + } + HashChainType::LibFlate4(ref mut h) => { + h.update_hash_with_policy::(len, input, add_policy) + } + HashChainType::ZlibNG(ref mut h) => { + h.update_hash_with_policy::(len, input, add_policy) + } } } @@ -165,18 +170,6 @@ impl CandidateInfo { } } - fn skip_or_update_hash(&mut self, len: u32, input: &PreflateInput) { - if let Some(skip_length) = self.skip_length { - if len <= skip_length { - self.invoke_update_hash(len, input); - } else { - self.invoke_skip_hash(len, input); - } - } else { - self.invoke_update_hash(len, input); - } - } - fn max_chain_found(&self) -> u32 { self.max_chain_found } @@ -189,10 +182,6 @@ impl CandidateInfo { self.hash_shift } - fn skip_length(&self) -> Option { - self.skip_length - } - fn hash_algorithm(&self) -> HashAlgorithm { match self.hash_chain { HashChainType::Zlib(_) => HashAlgorithm::Zlib, @@ -229,6 +218,8 @@ impl<'a> CompLevelEstimatorState<'a> { plain_text: &'a [u8], blocks: &'a Vec, ) -> Self { + let add_policy = estimate_skip_length(blocks); + let hash_bits = mem_level + 7; let mem_hash_shift = (hash_bits + 2) / 3; let mem_hash_mask = ((1u32 << hash_bits) - 1) as u16; @@ -246,33 +237,19 @@ impl<'a> CompLevelEstimatorState<'a> { let mut candidates: Vec> = Vec::new(); - // add the ZlibRotatingHash candidates - for config in &FAST_PREFLATE_PARSER_SETTINGS { - for &(hash_shift, hash_mask) in hashparameters.iter() { - candidates.push(Box::new(CandidateInfo::new( - hash_mask, - hash_shift, - Some(config.max_lazy), - HashAlgorithm::Zlib, - &input, - ))); - } - } - candidates.push(Box::new(CandidateInfo::new( MINIZ_LEVEL1_HASH_SIZE_MASK, 0, - Some(2), + add_policy, HashAlgorithm::MiniZFast, &input, ))); - // slow compressor candidates for (hash_shift, hash_mask) in [(5, 32767), (4, 2047)] { candidates.push(Box::new(CandidateInfo::new( hash_mask, hash_shift, - None, + add_policy, HashAlgorithm::Zlib, &input, ))); @@ -282,16 +259,16 @@ impl<'a> CompLevelEstimatorState<'a> { candidates.push(Box::new(CandidateInfo::new( 0xffff, 0, - None, + add_policy, HashAlgorithm::Libdeflate4, &input, ))); - // ZlibNG slow candidate + // ZlibNG candidate candidates.push(Box::new(CandidateInfo::new( 0xffff, 0, - None, + add_policy, HashAlgorithm::ZlibNG, &input, ))); @@ -309,25 +286,20 @@ impl<'a> CompLevelEstimatorState<'a> { } } - fn update_hash(&mut self, mut length: u32) { + fn update_hash(&mut self, mut length: u32, override_add_policy: bool) { while length > 0 { let batch_len = std::cmp::min(length, MAX_UPDATE_HASH_BATCH); for i in &mut self.candidates { - i.invoke_update_hash(batch_len, &self.input); - } - - self.input.advance(batch_len); - length -= batch_len; - } - } - - fn skip_or_update_hash(&mut self, mut length: u32) { - while length > 0 { - let batch_len = std::cmp::min(length, MAX_UPDATE_HASH_BATCH); - - for c in &mut self.candidates { - c.skip_or_update_hash(batch_len, &self.input); + i.invoke_update_hash( + batch_len, + &self.input, + if override_add_policy { + DictionaryAddPolicy::AddAll + } else { + i.add_policy + }, + ); } self.input.advance(batch_len); @@ -362,17 +334,17 @@ impl<'a> CompLevelEstimatorState<'a> { fn check_dump(&mut self) { for (_i, b) in self.blocks.iter().enumerate() { if b.block_type == BlockType::Stored { - self.update_hash(b.uncompressed_len); + self.update_hash(b.uncompressed_len, true); continue; } for (_j, t) in b.tokens.iter().enumerate() { match t { PreflateToken::Literal => { - self.update_hash(1); + self.update_hash(1, true); } PreflateToken::Reference(r) => { self.check_match(r); - self.skip_or_update_hash(r.len()); + self.update_hash(r.len(), false); } } } @@ -396,28 +368,24 @@ impl<'a> CompLevelEstimatorState<'a> { let hash_mask = candidate.hash_mask(); let hash_shift = candidate.hash_shift(); + let add_policy = candidate.add_policy; let max_chain = candidate.max_chain_found() + 1; let hash_algorithm = candidate.hash_algorithm(); let longest_dist_at_hop_0 = candidate.longest_dist_at_hop_0; let longest_dist_at_hop_1_plus = candidate.longest_dist_at_hop_1_plus; - let fast_compressor; - - match candidate.skip_length() { - Some(skip_length) => { - max_lazy = skip_length; - fast_compressor = true; + match candidate.add_policy { + DictionaryAddPolicy::AddFirst(_) | DictionaryAddPolicy::AddFirstAndLast(_) => { for config in &FAST_PREFLATE_PARSER_SETTINGS { if candidate.max_chain_found() < config.max_chain { good_length = config.good_length; nice_length = config.nice_length; + max_lazy = 0; break; } } } - None => { - fast_compressor = false; - + DictionaryAddPolicy::AddAll => { for config in &SLOW_PREFLATE_PARSER_SETTINGS { if candidate.max_chain_found() < config.max_chain { good_length = config.good_length; @@ -448,7 +416,7 @@ impl<'a> CompLevelEstimatorState<'a> { max_dist_3_matches: self.longest_len_3_dist as u16, hash_mask, hash_shift, - is_fast_compressor: fast_compressor, + add_policy, good_length, max_lazy, nice_length, @@ -457,7 +425,7 @@ impl<'a> CompLevelEstimatorState<'a> { hash_algorithm, zlib_compatible: !self.match_to_start && !very_far_matches - && (self.longest_len_3_dist < 4096 || fast_compressor), + && (self.longest_len_3_dist < 4096 || add_policy != DictionaryAddPolicy::AddAll), }) } diff --git a/src/hash_chain.rs b/src/hash_chain.rs index d6759a1..0a660fe 100644 --- a/src/hash_chain.rs +++ b/src/hash_chain.rs @@ -17,6 +17,21 @@ use crate::{ pub const MAX_UPDATE_HASH_BATCH: u32 = 0x180; +pub const UPDATE_MODE_ALL: u32 = 0; +pub const UPDATE_MODE_FIRST: u32 = 1; +pub const UPDATE_MODE_FIRST_AND_LAST: u32 = 2; + +#[derive(Default, Eq, PartialEq, Debug, Clone, Copy)] +pub enum DictionaryAddPolicy { + /// Add all substrings of a match to the dictionary + #[default] + AddAll, + /// Add only the first substring of a match to the dictionary that are larger than the limit + AddFirst(u16), + /// Add only the first and last substring of a match to the dictionary that are larger than the limit + AddFirstAndLast(u16), +} + pub trait HashChainTrait: Default {} #[derive(Default, Copy, Clone, Eq, PartialEq, Debug)] @@ -130,7 +145,7 @@ impl HashTable { self.running_hash = self.running_hash.append(b, self.hash_shift); } - fn update_chain( + fn update_chain( &mut self, chars: &[u8], mut pos: InternalPosition, @@ -143,10 +158,14 @@ impl HashTable { return; } - for i in 0..cmp::min(length as usize, chars.len() - offset) { + let last = cmp::min(length as usize, chars.len() - offset); + for i in 0..last { self.update_running_hash(chars[i + offset]); - if !IS_FAST_COMPRESSOR || i == 0 { + if UPDATE_MODE == UPDATE_MODE_ALL + || (UPDATE_MODE == UPDATE_MODE_FIRST && i == 0) + || (UPDATE_MODE == UPDATE_MODE_FIRST_AND_LAST && (i == 0 || i == last - 1)) + { let h = self.get_running_hash(); if MAINTAIN_DEPTH { @@ -425,27 +444,38 @@ impl HashChain { }) } - pub fn update_hash(&mut self, length: u32, input: &PreflateInput) { - assert!(length <= MAX_UPDATE_HASH_BATCH); - - self.reshift_if_necessary::(input); - - let pos = InternalPosition::from_absolute(input.pos(), self.total_shift); - let chars = input.cur_chars(0); - - self.hash_table - .update_chain::(chars, pos, length); - - // maintain the extra 3 length chain if we have it - if let Some(x) = self.hash_table_3_len.as_mut() { - x.update_chain::(chars, pos, length); + pub fn update_hash_with_policy( + &mut self, + length: u32, + input: &PreflateInput, + add_policy: DictionaryAddPolicy, + ) { + match add_policy { + DictionaryAddPolicy::AddAll => { + self.update_hash::(length, input); + } + DictionaryAddPolicy::AddFirst(limit) => { + if length > limit.into() { + self.update_hash::(length, input); + } else { + self.update_hash::(length, input); + } + } + DictionaryAddPolicy::AddFirstAndLast(limit) => { + if length > limit.into() { + self.update_hash::(length, input); + } else { + self.update_hash::(length, input); + } + } } - - //let c = self.checksum_whole_struct(); - //println!("u {} = {}", length, c); } - pub fn skip_hash(&mut self, length: u32, input: &PreflateInput) { + fn update_hash( + &mut self, + length: u32, + input: &PreflateInput, + ) { assert!(length <= MAX_UPDATE_HASH_BATCH); self.reshift_if_necessary::(input); @@ -454,12 +484,15 @@ impl HashChain { let chars = input.cur_chars(0); self.hash_table - .update_chain::(chars, pos, length); + .update_chain::(chars, pos, length); // maintain the extra 3 length chain if we have it if let Some(x) = self.hash_table_3_len.as_mut() { - x.update_chain::(chars, pos, length); + x.update_chain::(chars, pos, length); } + + //let c = self.checksum_whole_struct(); + //println!("u {} = {}", length, c); } pub fn match_depth( diff --git a/src/lib.rs b/src/lib.rs index e3440a5..0120543 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ mod preflate_stream_info; mod preflate_token; mod process; mod scan_deflate; +mod skip_length_estimator; mod statistical_codec; mod token_predictor; mod tree_predictor; diff --git a/src/predictor_state.rs b/src/predictor_state.rs index bdd5b67..b2ef2f6 100644 --- a/src/predictor_state.rs +++ b/src/predictor_state.rs @@ -6,10 +6,10 @@ use crate::bit_helper::DebugHash; use crate::hash_algorithm::RotatingHashTrait; -use crate::hash_chain::{HashChain, MAX_UPDATE_HASH_BATCH}; +use crate::hash_chain::{DictionaryAddPolicy, HashChain, MAX_UPDATE_HASH_BATCH}; use crate::preflate_constants::{MAX_MATCH, MIN_LOOKAHEAD, MIN_MATCH}; use crate::preflate_input::PreflateInput; -use crate::preflate_parameter_estimator::PreflateParameters; +use crate::preflate_parameter_estimator::{PreflateParameters, PreflateStrategy}; use crate::preflate_token::PreflateTokenReference; use std::cmp; use std::sync::atomic; @@ -55,22 +55,21 @@ impl<'a, H: RotatingHashTrait> PredictorState<'a, H> { self.hash.checksum(checksum); } - pub fn update_hash(&mut self, mut length: u32) { - while length > 0 { - let batch_len = cmp::min(length, MAX_UPDATE_HASH_BATCH); - - self.hash.update_hash::(batch_len, &self.input); - - self.input.advance(batch_len); - length -= batch_len; - } + pub fn update_hash_with_policy(&mut self, length: u32, add_policy: DictionaryAddPolicy) { + self.hash + .update_hash_with_policy::(length, &self.input, add_policy); + self.input.advance(length); } - pub fn skip_hash(&mut self, mut length: u32) { + pub fn update_hash_batch(&mut self, mut length: u32) { while length > 0 { let batch_len = cmp::min(length, MAX_UPDATE_HASH_BATCH); - self.hash.skip_hash::(batch_len, &self.input); + self.hash.update_hash_with_policy::( + batch_len, + &self.input, + DictionaryAddPolicy::AddAll, + ); self.input.advance(batch_len); length -= batch_len; } @@ -146,9 +145,20 @@ impl<'a, H: RotatingHashTrait> PredictorState<'a, H> { cur_max_dist_hop0 = cmp::min(max_dist_to_start, self.window_size()); cur_max_dist_hop1_plus = cur_max_dist_hop0; } else { - let max_dist: u32 = self.window_size() - MIN_LOOKAHEAD + 1; - cur_max_dist_hop0 = cmp::min(max_dist_to_start, max_dist); - cur_max_dist_hop1_plus = cmp::min(max_dist_to_start, max_dist - 1); + match self.params.strategy { + PreflateStrategy::HuffOnly | PreflateStrategy::Store => { + return MatchResult::NoMoreMatchesFound; + } + PreflateStrategy::RleOnly => { + cur_max_dist_hop0 = 1; + cur_max_dist_hop1_plus = 1; + } + _ => { + let max_dist: u32 = self.window_size() - MIN_LOOKAHEAD + 1; + cur_max_dist_hop0 = cmp::min(max_dist_to_start, max_dist); + cur_max_dist_hop1_plus = cmp::min(max_dist_to_start, max_dist - 1); + } + } } let nice_len = std::cmp::min(self.params.nice_length, max_len); diff --git a/src/preflate_parameter_estimator.rs b/src/preflate_parameter_estimator.rs index fc4a7b5..f59dc9d 100644 --- a/src/preflate_parameter_estimator.rs +++ b/src/preflate_parameter_estimator.rs @@ -10,6 +10,7 @@ use crate::{ bit_helper::bit_length, complevel_estimator::estimate_preflate_comp_level, hash_algorithm::HashAlgorithm, + hash_chain::DictionaryAddPolicy, preflate_constants::{self}, preflate_stream_info::{extract_preflate_info, PreflateStreamInfo}, preflate_token::PreflateTokenBlock, @@ -49,14 +50,16 @@ pub struct PreflateParameters { /// Zlib does not match to first byte of a file in order to reserve 0 for the end of chain pub matches_to_start_detected: bool, - /// the fast compressor only adds each match to the dictionary, not each character of the match - pub is_fast_compressor: bool, pub good_length: u32, pub max_lazy: u32, pub nice_length: u32, pub max_chain: u32, pub hash_algorithm: HashAlgorithm, pub min_len: u32, + + /// if something, then we use the "fast" compressor, which only adds smaller substrings + /// to the dictionary + pub add_policy: DictionaryAddPolicy, } const FILE_VERSION: u16 = 1; @@ -74,7 +77,6 @@ impl PreflateParameters { let max_dist_3_matches = decoder.decode_value(16); let very_far_matches_detected = decoder.decode_value(1) != 0; let matches_to_start_detected = decoder.decode_value(1) != 0; - let is_fast_compressor = decoder.decode_value(1) != 0; let good_length = decoder.decode_value(16); let max_lazy = decoder.decode_value(16); let nice_length = decoder.decode_value(16); @@ -82,6 +84,13 @@ impl PreflateParameters { let hash_algorithm = decoder.decode_value(4); let min_len = decoder.decode_value(16); + let add_policy = match decoder.decode_value(2) { + 0 => DictionaryAddPolicy::AddAll, + 1 => DictionaryAddPolicy::AddFirst(u16::from(decoder.decode_value(8))), + 2 => DictionaryAddPolicy::AddFirstAndLast(u16::from(decoder.decode_value(8))), + _ => panic!("invalid add policy"), + }; + const STRATEGY_DEFAULT: u16 = PreflateStrategy::Default as u16; const STRATEGY_RLE_ONLY: u16 = PreflateStrategy::RleOnly as u16; const STRATEGY_HUFF_ONLY: u16 = PreflateStrategy::HuffOnly as u16; @@ -118,12 +127,12 @@ impl PreflateParameters { max_dist_3_matches, very_far_matches_detected, matches_to_start_detected, - is_fast_compressor, good_length: good_length.into(), max_lazy: max_lazy.into(), nice_length: nice_length.into(), max_chain: max_chain.into(), min_len: min_len.into(), + add_policy, hash_algorithm: match hash_algorithm { HASH_ALGORITHM_ZLIB => HashAlgorithm::Zlib, HASH_ALGORITHM_MINIZ_FAST => HashAlgorithm::MiniZFast, @@ -146,13 +155,24 @@ impl PreflateParameters { encoder.encode_value(self.max_dist_3_matches, 16); encoder.encode_value(u16::try_from(self.very_far_matches_detected).unwrap(), 1); encoder.encode_value(u16::try_from(self.matches_to_start_detected).unwrap(), 1); - encoder.encode_value(u16::try_from(self.is_fast_compressor).unwrap(), 1); encoder.encode_value(u16::try_from(self.good_length).unwrap(), 16); encoder.encode_value(u16::try_from(self.max_lazy).unwrap(), 16); encoder.encode_value(u16::try_from(self.nice_length).unwrap(), 16); encoder.encode_value(u16::try_from(self.max_chain).unwrap(), 16); encoder.encode_value(self.hash_algorithm as u16, 4); encoder.encode_value(u16::try_from(self.min_len).unwrap(), 16); + + match self.add_policy { + DictionaryAddPolicy::AddAll => encoder.encode_value(0, 2), + DictionaryAddPolicy::AddFirst(v) => { + encoder.encode_value(1, 2); + encoder.encode_value(v as u16, 8); + } + DictionaryAddPolicy::AddFirstAndLast(v) => { + encoder.encode_value(2, 2); + encoder.encode_value(v as u16, 8); + } + } } } @@ -227,13 +247,13 @@ pub fn estimate_preflate_parameters( max_dist_3_matches: cl.max_dist_3_matches, very_far_matches_detected: cl.very_far_matches_detected, matches_to_start_detected: cl.matches_to_start_detected, - is_fast_compressor: cl.is_fast_compressor, good_length: cl.good_length, max_lazy: cl.max_lazy, nice_length: cl.nice_length, max_chain: cl.max_chain, hash_algorithm: cl.hash_algorithm, min_len: cl.min_len, + add_policy: cl.add_policy, }) } @@ -255,17 +275,31 @@ fn verify_zlib_recognition() { assert_eq!(params.strategy, PreflateStrategy::Store); } else if i >= 1 && i < 4 { let config = &FAST_PREFLATE_PARSER_SETTINGS[i as usize - 1]; + assert!( + params.max_chain <= config.max_chain, + "max_chain mismatch {} should be <= {}", + params.max_chain, + config.max_chain + ); assert_eq!(params.good_length, config.good_length); - assert_eq!(params.max_lazy, config.max_lazy); + assert_eq!( + params.add_policy, + DictionaryAddPolicy::AddFirst(config.max_lazy as u16) + ); assert_eq!(params.nice_length, config.nice_length); - assert!(params.max_chain <= config.max_chain); assert_eq!(params.strategy, PreflateStrategy::Default); } else if i >= 4 { let config = &SLOW_PREFLATE_PARSER_SETTINGS[i as usize - 4]; + assert!( + params.max_chain <= config.max_chain, + "max_chain mismatch {} should be <= {}", + params.max_chain, + config.max_chain + ); assert_eq!(params.good_length, config.good_length); assert_eq!(params.max_lazy, config.max_lazy); assert_eq!(params.nice_length, config.nice_length); - assert!(params.max_chain <= config.max_chain); + assert_eq!(params.add_policy, DictionaryAddPolicy::AddAll); assert_eq!(params.strategy, PreflateStrategy::Default); } } diff --git a/src/process.rs b/src/process.rs index 51a4404..829974c 100644 --- a/src/process.rs +++ b/src/process.rs @@ -464,16 +464,19 @@ fn verify_zlib_compressed_perfect() { let v = read_file(&format!("compressed_zlib_level{}.deflate", i)); let config; - let is_fast_compressor; + let add_policy; let max_dist_3_matches; + let max_lazy; if i < 4 { config = &FAST_PREFLATE_PARSER_SETTINGS[i as usize - 1]; - is_fast_compressor = true; + add_policy = crate::hash_chain::DictionaryAddPolicy::AddFirst(config.max_lazy as u16); max_dist_3_matches = 32768; + max_lazy = 0; } else { config = &SLOW_PREFLATE_PARSER_SETTINGS[i as usize - 4]; - is_fast_compressor = false; + add_policy = crate::hash_chain::DictionaryAddPolicy::AddAll; max_dist_3_matches = 4096; + max_lazy = config.max_lazy; } let params = PreflateParameters { @@ -487,13 +490,13 @@ fn verify_zlib_compressed_perfect() { max_dist_3_matches, very_far_matches_detected: false, matches_to_start_detected: false, - is_fast_compressor, good_length: config.good_length, - max_lazy: config.max_lazy, + max_lazy: max_lazy, nice_length: config.nice_length, max_chain: config.max_chain, hash_algorithm: HashAlgorithm::Zlib, min_len: 3, + add_policy, }; let contents = parse_deflate(&v, 1).unwrap(); @@ -533,13 +536,13 @@ fn verify_miniz1_compressed_perfect() { max_dist_3_matches: 8192, very_far_matches_detected: false, matches_to_start_detected: false, - is_fast_compressor: true, good_length: 258, - max_lazy: 2, + max_lazy: 0, nice_length: 258, max_chain: 2, hash_algorithm: HashAlgorithm::MiniZFast, min_len: 3, + add_policy: crate::hash_chain::DictionaryAddPolicy::AddFirst(0), }; encode_mispredictions(&contents, ¶ms, &mut cabac_encoder).unwrap(); diff --git a/src/skip_length_estimator.rs b/src/skip_length_estimator.rs new file mode 100644 index 0000000..d553d6f --- /dev/null +++ b/src/skip_length_estimator.rs @@ -0,0 +1,65 @@ +/// Different versions of Zlib use some length criterea to decide whether to add all the substrings of +/// a large match to the hash table. For example, zlib level 1 will only add all the substrings of matches +/// of length 4 in order to save on CPU. +/// +/// What we do here is walk through all the matches and record how long the matching +/// substrings are. The we see what the largest string was that we fully added to the +/// dictionary. +/// +/// This will be the limit that we use when we decide whether to +/// use skip_hash or update_hash. +use crate::{ + hash_chain::DictionaryAddPolicy, + preflate_token::{PreflateToken, PreflateTokenBlock}, +}; + +pub fn estimate_skip_length(token_blocks: &[PreflateTokenBlock]) -> DictionaryAddPolicy { + let mut current_window = vec![0u16; 32768]; + let mut max_distance: u32 = 0; + let mut max_distance_last_add = 0; + let mut current_offset: u32 = 0; + let mut counters = [0u32; 259]; + let mut counters_b = [0u32; 259]; + + for token_block in token_blocks { + for token in token_block.tokens.iter() { + match token { + PreflateToken::Literal => { + current_window[(current_offset & 0x7fff) as usize] = 0; + current_offset += 1; + } + PreflateToken::Reference(r) => { + let match_length = + u32::from(current_window[((current_offset - r.dist()) & 0x7fff) as usize]); + + counters[(match_length & 0x7fff) as usize] += 1; + + max_distance = std::cmp::max(max_distance, match_length & 0x7fff); + if (match_length & 0x8000) == 0 { + counters_b[(match_length & 0x7fff) as usize] += 1; + + max_distance_last_add = + std::cmp::max(max_distance_last_add, match_length & 0x7fff); + } + + current_window[(current_offset & 0x7fff) as usize] = 0; + current_offset += 1; + + for i in 1..r.len() { + current_window[(current_offset & 0x7fff) as usize] = + r.len() as u16 | if i == r.len() - 1 { 0x8000 } else { 0 }; + current_offset += 1; + } + } + } + } + } + + if max_distance_last_add < max_distance { + DictionaryAddPolicy::AddFirstAndLast(max_distance_last_add as u16) + } else if max_distance < 258 { + DictionaryAddPolicy::AddFirst(max_distance as u16) + } else { + DictionaryAddPolicy::AddAll + } +} diff --git a/src/token_predictor.rs b/src/token_predictor.rs index a825ae4..ca873ce 100644 --- a/src/token_predictor.rs +++ b/src/token_predictor.rs @@ -71,7 +71,7 @@ impl<'a, H: RotatingHashTrait> TokenPredictor<'a, H> { codec.encode_value(block.uncompressed_len as u16, 16); codec.encode_correction(CodecCorrection::NonZeroPadding, block.padding_bits.into()); - self.state.update_hash(block.uncompressed_len); + self.state.update_hash_batch(block.uncompressed_len); return Ok(()); } @@ -234,7 +234,7 @@ impl<'a, H: RotatingHashTrait> TokenPredictor<'a, H> { block.uncompressed_len = codec.decode_value(16).into(); block.padding_bits = codec.decode_correction(CodecCorrection::NonZeroPadding) as u8; - self.state.update_hash(block.uncompressed_len); + self.state.update_hash_batch(block.uncompressed_len); return Ok(block); } BT_STATICHUFF => { @@ -362,10 +362,6 @@ impl<'a, H: RotatingHashTrait> TokenPredictor<'a, H> { return PreflateToken::Literal; } - if self.params.is_fast_compressor { - return PreflateToken::Reference(match_token); - } - // match is too small and far way to be worth encoding as a distance/length pair. if match_token.len() == 3 && match_token.dist() > self.params.max_dist_3_matches.into() { @@ -446,20 +442,15 @@ impl<'a, H: RotatingHashTrait> TokenPredictor<'a, H> { block.add_literal(self.state.input_cursor()[0]); } - self.state.update_hash(1); + self.state.update_hash_batch(1); } PreflateToken::Reference(t) => { if let Some(block) = block { block.add_reference(t.len(), t.dist(), t.get_irregular258()); } - // max_lazy is reused by the fast compressor to mean that if a match is larger than a - // certain size it should not be added to the dictionary in order to save on speed. - if self.params.is_fast_compressor && t.len() > self.params.max_lazy { - self.state.skip_hash(t.len()); - } else { - self.state.update_hash(t.len()); - } + self.state + .update_hash_with_policy(t.len(), self.params.add_policy); } }