Skip to content

Commit

Permalink
Add scanning for zlib streams to the library (#4)
Browse files Browse the repository at this point in the history
* added full wrapper for zlib compressed files

* cleanup debug

* update config.toml
  • Loading branch information
mcroomp authored Dec 21, 2023
1 parent 33560a0 commit 9b03c26
Show file tree
Hide file tree
Showing 5 changed files with 327 additions and 7 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "preflate-rs"
version = "0.5.0"
version = "0.6.0"
edition = "2021"
authors = ["Kristof Roomp <kristofr@microsoft.com>"]
license = "Apache-2.0"
Expand All @@ -25,6 +25,7 @@ anyhow = { version="1.0", features = ["backtrace"]}
byteorder = "1.4"
cabac = "0.6.0"
default-boxed = "0.2"
zstd = "0.13.0"

[dev-dependencies]
crc32fast = "1.3"
Expand Down
Binary file added samples/samplezip.zip
Binary file not shown.
169 changes: 163 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,17 @@ mod preflate_parse_config;
mod preflate_stream_info;
mod preflate_token;
mod process;
mod scan_deflate;
mod statistical_codec;
mod token_predictor;
mod tree_predictor;

use anyhow::{self};
use cabac::{
debug::{DebugReader, DebugWriter},
vp8::{VP8Reader, VP8Writer},
};
use cabac::vp8::{VP8Reader, VP8Writer};
use preflate_error::PreflateError;
use preflate_parameter_estimator::{estimate_preflate_parameters, PreflateParameters};
use process::parse_deflate;
use scan_deflate::search_for_deflate_streams;
use std::io::Cursor;

use crate::{
Expand All @@ -45,6 +44,8 @@ use crate::{
statistical_codec::PredictionEncoder,
};

/// Format version byte written as the first byte of the buffer produced by
/// `expand_zlib_chunks` and checked by `recreated_zlib_chunks` before any
/// segment is parsed.
const COMPRESSED_WRAPPER_VERSION_1: u8 = 1;

/// result of decompress_deflate_stream
pub struct DecompressResult {
/// the plaintext that was decompressed from the stream
Expand All @@ -58,6 +59,12 @@ pub struct DecompressResult {
pub compressed_size: usize,
}

/// Compact `Debug` output: prints the lengths of the two byte buffers and the
/// compressed size instead of the buffer contents.
impl core::fmt::Debug for DecompressResult {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let plain_len = self.plain_text.len();
        let corrections_len = self.prediction_corrections.len();
        let compressed_size = self.compressed_size;

        write!(
            f,
            "DecompressResult {{ plain_text: {}, prediction_corrections: {}, compressed_size: {} }}",
            plain_len, corrections_len, compressed_size
        )
    }
}

/// decompresses a deflate stream and returns the plaintext and cabac_encoded data that can be used to reconstruct it
pub fn decompress_deflate_stream(
compressed_data: &[u8],
Expand All @@ -68,7 +75,7 @@ pub fn decompress_deflate_stream(
let mut cabac_encoder =
PredictionEncoderCabac::new(VP8Writer::new(&mut cabac_encoded).unwrap());

let contents = parse_deflate(compressed_data, 1)?;
let contents = parse_deflate(compressed_data, 0)?;

/*
let mut writecomp = File::create("c:\\temp\\lastop.deflate").unwrap();
Expand Down Expand Up @@ -130,16 +137,19 @@ pub fn recompress_deflate_stream(

/// decompresses a deflate stream and returns the plaintext and cabac_encoded data that can be used to reconstruct it
/// This version uses DebugWriter and DebugReader, which are slower but can be used to debug the cabac encoding errors.
#[cfg(test)]
pub fn decompress_deflate_stream_assert(
compressed_data: &[u8],
verify: bool,
) -> Result<DecompressResult, PreflateError> {
use cabac::debug::{DebugReader, DebugWriter};

let mut cabac_encoded = Vec::new();

let mut cabac_encoder =
PredictionEncoderCabac::new(DebugWriter::new(&mut cabac_encoded).unwrap());

let contents = parse_deflate(compressed_data, 1)?;
let contents = parse_deflate(compressed_data, 0)?;

let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks)
.map_err(|e| PreflateError::AnalyzeFailed(e))?;
Expand Down Expand Up @@ -175,10 +185,13 @@ pub fn decompress_deflate_stream_assert(

/// recompresses a deflate stream using the cabac_encoded data that was returned from decompress_deflate_stream
/// This version uses DebugWriter and DebugReader, which are slower and don't compress but can be used to debug the cabac encoding errors.
#[cfg(test)]
pub fn recompress_deflate_stream_assert(
plain_text: &[u8],
prediction_corrections: &[u8],
) -> Result<Vec<u8>, PreflateError> {
use cabac::debug::DebugReader;

let mut cabac_decoder = PredictionDecoderCabac::new(
DebugReader::new(Cursor::new(&prediction_corrections)).unwrap(),
);
Expand All @@ -191,6 +204,133 @@ pub fn recompress_deflate_stream_assert(
Ok(recompressed)
}

/// Appends `src` to `dst` as a length-prefixed segment: a 4-byte little-endian
/// `u32` byte count followed by the bytes themselves.
///
/// # Panics
///
/// Panics if `src` is longer than `u32::MAX` bytes. The length prefix cannot
/// represent such a segment, and writing a truncated length would silently
/// produce a wrapper that can no longer be decoded.
fn append_with_length(dst: &mut Vec<u8>, src: &[u8]) {
    let len = u32::try_from(src.len()).expect("segment length must fit in the u32 length prefix");

    // One reservation for prefix + payload instead of two incremental grows.
    dst.reserve(4 + src.len());
    dst.extend_from_slice(&len.to_le_bytes());
    dst.extend_from_slice(src);
}

/// Reads one length-prefixed segment from `src` starting at `*index`.
///
/// A segment is a 4-byte little-endian `u32` length followed by that many
/// bytes. On success the returned slice borrows the payload from `src` and
/// `*index` is advanced to the first byte after it.
///
/// # Errors
///
/// Returns `PreflateError::InvalidCompressedWrapper` if the prefix or the
/// payload would extend past the end of `src`, or if the offsets involved do
/// not fit in `usize`. `*index` is left unchanged in that case.
fn read_segment_with_length<'a>(
    src: &'a [u8],
    index: &mut usize,
) -> Result<&'a [u8], PreflateError> {
    let prefix_start = *index;

    // Checked arithmetic: the length comes from untrusted data, so a plain `+`
    // could overflow (notably on 32-bit targets) and panic or wrap.
    let body_start = prefix_start
        .checked_add(4)
        .ok_or(PreflateError::InvalidCompressedWrapper)?;

    let len = match src.get(prefix_start..body_start) {
        // u32 -> usize is lossless on 32- and 64-bit targets.
        Some(&[b0, b1, b2, b3]) => u32::from_le_bytes([b0, b1, b2, b3]) as usize,
        _ => return Err(PreflateError::InvalidCompressedWrapper),
    };

    let body_end = body_start
        .checked_add(len)
        .ok_or(PreflateError::InvalidCompressedWrapper)?;
    let segment = src
        .get(body_start..body_end)
        .ok_or(PreflateError::InvalidCompressedWrapper)?;

    // Commit the new position only once the whole segment is known to be valid.
    *index = body_end;
    Ok(segment)
}

/// scans for deflate streams in a zlib compressed file, decompresses the streams and
/// returns an uncompressed file that can then be recompressed using a better algorithm.
/// This can then be passed back into recreated_zlib_chunks to recreate the exact original file.
pub fn expand_zlib_chunks(compressed_data: &[u8]) -> Vec<u8> {
let mut locations_found = Vec::new();

search_for_deflate_streams(compressed_data, &mut locations_found);

let mut plain_text = Vec::new();
plain_text.push(COMPRESSED_WRAPPER_VERSION_1); // version 1 of format. Definitely will improved.

let mut prev: Option<scan_deflate::DeflateStreamLocation> = None;
for loc in locations_found {
//println!("loc: {:?}", loc);

if let Some(prev) = prev {
append_with_length(
&mut plain_text,
&compressed_data[prev.start + prev.data.compressed_size..loc.start],
);
} else {
append_with_length(&mut plain_text, &compressed_data[0..loc.start]);
}

append_with_length(&mut plain_text, &loc.data.prediction_corrections);
append_with_length(&mut plain_text, &loc.data.plain_text);
prev = Some(loc);
}

// append the last chunk
if let Some(prev) = prev {
append_with_length(
&mut plain_text,
&compressed_data[prev.start + prev.data.compressed_size..],
);
}

plain_text
}

/// Rebuilds the original byte stream from a buffer created by `expand_zlib_chunks`.
///
/// After the version byte the buffer is read as length-prefixed segments: a segment
/// that is copied through verbatim, then a (corrections, plain text) pair that is turned
/// back into a deflate stream with `recompress_deflate_stream`, and so on until the
/// buffer is exhausted.
///
/// # Errors
///
/// Returns `PreflateError::InvalidCompressedWrapper` if the buffer is shorter than five
/// bytes or does not start with the expected version byte, and propagates any error
/// from reading a segment or from `recompress_deflate_stream`.
pub fn recreated_zlib_chunks(compressed_data: &[u8]) -> Result<Vec<u8>, PreflateError> {
    let total = compressed_data.len();

    // Smallest accepted wrapper: the version byte plus one 4-byte segment length.
    if total < 5 || compressed_data[0] != COMPRESSED_WRAPPER_VERSION_1 {
        return Err(PreflateError::InvalidCompressedWrapper);
    }

    let mut rebuilt = Vec::new();
    // Position 0 holds the version byte, so parsing starts right after it.
    let mut cursor = 1;

    loop {
        let verbatim = read_segment_with_length(compressed_data, &mut cursor)?;
        rebuilt.extend_from_slice(verbatim);
        if cursor == total {
            // reached end of file
            break;
        }

        let corrections = read_segment_with_length(compressed_data, &mut cursor)?;
        let plain_text = read_segment_with_length(compressed_data, &mut cursor)?;
        let stream = recompress_deflate_stream(plain_text, corrections)?;
        rebuilt.extend_from_slice(&stream);
        if cursor == total {
            break;
        }
    }

    Ok(rebuilt)
}

/// Expands the zlib/deflate streams in `zlib_compressed_data` with `expand_zlib_chunks`
/// and then compresses the expanded buffer with Zstd at compression level 9.
///
/// The output can be turned back into the original bytes with `decompress_zstd`.
///
/// NOTE(review): an earlier comment described level 9 as "the maximum level"; Zstd
/// accepts higher levels, so confirm which setting was intended.
///
/// # Panics
///
/// Panics if the Zstd encoder reports an error.
pub fn compress_zstd(zlib_compressed_data: &[u8]) -> Vec<u8> {
    const ZSTD_LEVEL: i32 = 9;

    let expanded = expand_zlib_chunks(zlib_compressed_data);
    zstd::bulk::compress(&expanded, ZSTD_LEVEL)
        .expect("zstd compression of the expanded buffer failed")
}

/// Reverses `compress_zstd`: decompresses the Zstd data and then rebuilds the original
/// zlib compressed streams from the expanded buffer with `recreated_zlib_chunks`.
///
/// `capacity` is passed unchanged to `zstd::bulk::decompress`.
///
/// # Errors
///
/// Returns `PreflateError::ZstdError` if Zstd decompression fails; otherwise returns
/// whatever `recreated_zlib_chunks` returns.
pub fn decompress_zstd(compressed_data: &[u8], capacity: usize) -> Result<Vec<u8>, PreflateError> {
    match zstd::bulk::decompress(compressed_data, capacity) {
        Ok(expanded) => recreated_zlib_chunks(&expanded),
        Err(e) => Err(PreflateError::ZstdError(e)),
    }
}

/// Round-trips the sample zip through `expand_zlib_chunks` and `recreated_zlib_chunks`
/// and checks that the bytes come back unchanged.
#[test]
fn verify_zip_compress() {
    use crate::process::read_file;

    let v = read_file("samplezip.zip");

    // Expand and immediately rebuild; the intermediate buffer is not needed afterwards.
    let recompressed = recreated_zlib_chunks(&expand_zlib_chunks(&v)).unwrap();

    assert!(v == recompressed);
}

#[test]
fn verify_roundtrip_zlib() {
for i in 0..9 {
Expand Down Expand Up @@ -222,6 +362,23 @@ fn verify_file(filename: &str) {
assert!(v == recompressed);
}

/// Round-trips the sample zip through `compress_zstd` and `decompress_zstd`, checks that
/// the bytes come back unchanged and prints the sizes before and after.
#[test]
fn verify_zip_compress_zstd() {
    use crate::process::read_file;

    // Capacity argument handed to `decompress_zstd`.
    const DECOMPRESS_CAPACITY: usize = 256 * 1024 * 1024;

    let v = read_file("samplezip.zip");
    let zstd_bytes = compress_zstd(&v);
    let recreated = decompress_zstd(&zstd_bytes, DECOMPRESS_CAPACITY).unwrap();

    assert!(v == recreated);
    println!(
        "original zip = {} bytes, recompressed zip = {} bytes",
        v.len(),
        zstd_bytes.len()
    );
}

#[test]
fn verify_roundtrip_assert() {
use crate::process::read_file;
Expand Down
4 changes: 4 additions & 0 deletions src/preflate_error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ pub enum PreflateError {
RecreateBlock(usize, anyhow::Error),
RecreateTree(usize, anyhow::Error),
EncodeBlock(usize, anyhow::Error),
InvalidCompressedWrapper,
ZstdError(std::io::Error),
}

impl Display for PreflateError {
Expand All @@ -36,6 +38,8 @@ impl Display for PreflateError {
PreflateError::EncodeBlock(i, e) => write!(f, "EncodeBlock[{}]: {}", i, e),
PreflateError::RecompressFailed(e) => write!(f, "RecompressFailed: {}", e),
PreflateError::AnalyzeFailed(e) => write!(f, "AnalyzeFailed: {}", e),
PreflateError::InvalidCompressedWrapper => write!(f, "InvalidCompressedWrapper"),
PreflateError::ZstdError(e) => write!(f, "ZstdError: {}", e),
}
}
}
Expand Down
Loading

0 comments on commit 9b03c26

Please sign in to comment.