From 3ad25626a5432f9b801e7817710e9773127edd6c Mon Sep 17 00:00:00 2001
From: schombert <schombert@users.noreply.github.com>
Date: Mon, 8 Apr 2024 21:13:22 -0700
Subject: [PATCH] update zstd

---
 src/zstd/bits.h                               |  169 -
 src/zstd/bitstream.h                          |  432 -
 src/zstd/clevels.h                            |  136 -
 src/zstd/common/allocations.h                 |   55 +
 src/zstd/common/bits.h                        |  200 +
 src/zstd/common/bitstream.h                   |  457 ++
 src/zstd/common/compiler.h                    |  450 ++
 src/zstd/common/cpu.h                         |  249 +
 src/zstd/{ => common}/debug.c                 |    9 +-
 src/zstd/{ => common}/debug.h                 |   68 +-
 src/zstd/common/entropy_common.c              |  340 +
 src/zstd/common/error_private.c               |   63 +
 src/zstd/common/error_private.h               |  168 +
 src/zstd/{ => common}/fse.h                   |  391 +-
 src/zstd/common/fse_decompress.c              |  313 +
 src/zstd/{ => common}/huf.h                   |  179 +-
 src/zstd/common/mem.h                         |  426 +
 src/zstd/common/pool.c                        |  371 +
 src/zstd/{ => common}/pool.h                  |   14 +-
 src/zstd/{ => common}/portability_macros.h    |  108 +-
 src/zstd/common/threading.c                   |  182 +
 src/zstd/common/threading.h                   |  150 +
 src/zstd/{ => common}/xxhash.c                |   10 +-
 src/zstd/common/xxhash.h                      | 7020 ++++++++++++++++
 src/zstd/{ => common}/zstd_common.c           |   48 +-
 src/zstd/{ => common}/zstd_deps.h             |   14 +-
 src/zstd/common/zstd_internal.h               |  392 +
 src/zstd/{ => common}/zstd_trace.h            |  120 +-
 src/zstd/compiler.h                           |  358 -
 src/zstd/compress/clevels.h                   |  134 +
 src/zstd/compress/fse_compress.c              |  625 ++
 src/zstd/compress/hist.c                      |  181 +
 src/zstd/{ => compress}/hist.h                |   10 +-
 src/zstd/compress/huf_compress.c              | 1464 ++++
 src/zstd/compress/zstd_compress.c             | 7153 +++++++++++++++++
 src/zstd/compress/zstd_compress_internal.h    | 1534 ++++
 src/zstd/compress/zstd_compress_literals.c    |  235 +
 .../{ => compress}/zstd_compress_literals.h   |   21 +-
 src/zstd/compress/zstd_compress_sequences.c   |  442 +
 src/zstd/compress/zstd_compress_sequences.h   |   54 +
 src/zstd/compress/zstd_compress_superblock.c  |  688 ++
 .../{ => compress}/zstd_compress_superblock.h |   12 +-
 src/zstd/compress/zstd_cwksp.h                |  748 ++
 src/zstd/compress/zstd_double_fast.c          |  770 ++
 src/zstd/compress/zstd_double_fast.h          |   50 +
 src/zstd/compress/zstd_fast.c                 |  968 +++
 src/zstd/{ => compress}/zstd_fast.h           |   20 +-
 src/zstd/compress/zstd_lazy.c                 | 2199 +++++
 src/zstd/compress/zstd_lazy.h                 |  202 +
 src/zstd/compress/zstd_ldm.c                  |  730 ++
 src/zstd/{ => compress}/zstd_ldm.h            |   30 +-
 src/zstd/{ => compress}/zstd_ldm_geartab.h    |   25 +-
 src/zstd/compress/zstd_opt.c                  | 1576 ++++
 src/zstd/compress/zstd_opt.h                  |   80 +
 src/zstd/compress/zstdmt_compress.c           | 1882 +++++
 src/zstd/{ => compress}/zstdmt_compress.h     |   66 +-
 src/zstd/cpu.h                                |  219 -
 src/zstd/decompress/huf_decompress.c          | 1944 +++++
 .../{ => decompress}/huf_decompress_amd64.S   |   59 +-
 src/zstd/decompress/zstd_ddict.c              |  244 +
 src/zstd/{ => decompress}/zstd_ddict.h        |    8 +-
 src/zstd/decompress/zstd_decompress.c         | 2407 ++++++
 src/zstd/decompress/zstd_decompress_block.c   | 2215 +++++
 .../{ => decompress}/zstd_decompress_block.h  |   34 +-
 .../decompress/zstd_decompress_internal.h     |  240 +
 src/zstd/dictBuilder/cover.c                  | 1261 +++
 src/zstd/dictBuilder/cover.h                  |  152 +
 src/zstd/dictBuilder/divsufsort.c             | 1913 +++++
 src/zstd/dictBuilder/divsufsort.h             |   67 +
 src/zstd/dictBuilder/fastcover.c              |  766 ++
 src/zstd/dictBuilder/zdict.c                  | 1133 +++
 src/zstd/entropy_common.c                     |  324 -
 src/zstd/error_private.c                      |   96 -
 src/zstd/error_private.h                      |  160 -
 src/zstd/fse_compress.c                       |  660 --
 src/zstd/fse_decompress.c                     |  325 -
 src/zstd/hist.c                               |  202 -
 src/zstd/huf_compress.c                       | 1490 ----
 src/zstd/huf_decompress.c                     | 1914 -----
 src/zstd/mem.h                                |  392 -
 src/zstd/pool.c                               |  384 -
 src/zstd/threading.c                          |  167 -
 src/zstd/threading.h                          |  148 -
 src/zstd/xxhash.h                             | 5751 -------------
 src/zstd/zdict.h                              |  197 +-
 src/zstd/zstd.cpp                             |   42 +-
 src/zstd/zstd.h                               | 2060 ++---
 src/zstd/zstd_compress.c                      | 6835 ----------------
 src/zstd/zstd_compress_internal.h             | 1491 ----
 src/zstd/zstd_compress_literals.c             |  234 -
 src/zstd/zstd_compress_sequences.c            |  673 --
 src/zstd/zstd_compress_sequences.h            |   54 -
 src/zstd/zstd_compress_superblock.c           |  589 --
 src/zstd/zstd_cwksp.h                         |  666 --
 src/zstd/zstd_ddict.c                         |  238 -
 src/zstd/zstd_decompress.c                    | 2350 ------
 src/zstd/zstd_decompress_block.c              | 2275 ------
 src/zstd/zstd_decompress_internal.h           |  238 -
 src/zstd/zstd_double_fast.c                   |  800 --
 src/zstd/zstd_double_fast.h                   |   38 -
 src/zstd/zstd_errors.h                        |  118 +-
 src/zstd/zstd_fast.c                          |  968 ---
 src/zstd/zstd_internal.h                      |  397 -
 src/zstd/zstd_lazy.c                          | 2124 -----
 src/zstd/zstd_lazy.h                          |   72 -
 src/zstd/zstd_ldm.c                           |  706 --
 src/zstd/zstd_opt.c                           | 1483 ----
 src/zstd/zstd_opt.h                           |   55 -
 src/zstd/zstdmt_compress.c                    | 1852 -----
 109 files changed, 46855 insertions(+), 39166 deletions(-)
 delete mode 100644 src/zstd/bits.h
 delete mode 100644 src/zstd/bitstream.h
 delete mode 100644 src/zstd/clevels.h
 create mode 100644 src/zstd/common/allocations.h
 create mode 100644 src/zstd/common/bits.h
 create mode 100644 src/zstd/common/bitstream.h
 create mode 100644 src/zstd/common/compiler.h
 create mode 100644 src/zstd/common/cpu.h
 rename src/zstd/{ => common}/debug.c (70%)
 rename src/zstd/{ => common}/debug.h (66%)
 create mode 100644 src/zstd/common/entropy_common.c
 create mode 100644 src/zstd/common/error_private.c
 create mode 100644 src/zstd/common/error_private.h
 rename src/zstd/{ => common}/fse.h (67%)
 create mode 100644 src/zstd/common/fse_decompress.c
 rename src/zstd/{ => common}/huf.h (61%)
 create mode 100644 src/zstd/common/mem.h
 create mode 100644 src/zstd/common/pool.c
 rename src/zstd/{ => common}/pool.h (93%)
 rename src/zstd/{ => common}/portability_macros.h (59%)
 create mode 100644 src/zstd/common/threading.c
 create mode 100644 src/zstd/common/threading.h
 rename src/zstd/{ => common}/xxhash.c (59%)
 create mode 100644 src/zstd/common/xxhash.h
 rename src/zstd/{ => common}/zstd_common.c (54%)
 rename src/zstd/{ => common}/zstd_deps.h (84%)
 create mode 100644 src/zstd/common/zstd_internal.h
 rename src/zstd/{ => common}/zstd_trace.h (63%)
 delete mode 100644 src/zstd/compiler.h
 create mode 100644 src/zstd/compress/clevels.h
 create mode 100644 src/zstd/compress/fse_compress.c
 create mode 100644 src/zstd/compress/hist.c
 rename src/zstd/{ => compress}/hist.h (92%)
 create mode 100644 src/zstd/compress/huf_compress.c
 create mode 100644 src/zstd/compress/zstd_compress.c
 create mode 100644 src/zstd/compress/zstd_compress_internal.h
 create mode 100644 src/zstd/compress/zstd_compress_literals.c
 rename src/zstd/{ => compress}/zstd_compress_literals.h (56%)
 create mode 100644 src/zstd/compress/zstd_compress_sequences.c
 create mode 100644 src/zstd/compress/zstd_compress_sequences.h
 create mode 100644 src/zstd/compress/zstd_compress_superblock.c
 rename src/zstd/{ => compress}/zstd_compress_superblock.h (80%)
 create mode 100644 src/zstd/compress/zstd_cwksp.h
 create mode 100644 src/zstd/compress/zstd_double_fast.c
 create mode 100644 src/zstd/compress/zstd_double_fast.h
 create mode 100644 src/zstd/compress/zstd_fast.c
 rename src/zstd/{ => compress}/zstd_fast.h (58%)
 create mode 100644 src/zstd/compress/zstd_lazy.c
 create mode 100644 src/zstd/compress/zstd_lazy.h
 create mode 100644 src/zstd/compress/zstd_ldm.c
 rename src/zstd/{ => compress}/zstd_ldm.h (82%)
 rename src/zstd/{ => compress}/zstd_ldm_geartab.h (88%)
 create mode 100644 src/zstd/compress/zstd_opt.c
 create mode 100644 src/zstd/compress/zstd_opt.h
 create mode 100644 src/zstd/compress/zstdmt_compress.c
 rename src/zstd/{ => compress}/zstdmt_compress.h (60%)
 delete mode 100644 src/zstd/cpu.h
 create mode 100644 src/zstd/decompress/huf_decompress.c
 rename src/zstd/{ => decompress}/huf_decompress_amd64.S (89%)
 create mode 100644 src/zstd/decompress/zstd_ddict.c
 rename src/zstd/{ => decompress}/zstd_ddict.h (90%)
 create mode 100644 src/zstd/decompress/zstd_decompress.c
 create mode 100644 src/zstd/decompress/zstd_decompress_block.c
 rename src/zstd/{ => decompress}/zstd_decompress_block.h (56%)
 create mode 100644 src/zstd/decompress/zstd_decompress_internal.h
 create mode 100644 src/zstd/dictBuilder/cover.c
 create mode 100644 src/zstd/dictBuilder/cover.h
 create mode 100644 src/zstd/dictBuilder/divsufsort.c
 create mode 100644 src/zstd/dictBuilder/divsufsort.h
 create mode 100644 src/zstd/dictBuilder/fastcover.c
 create mode 100644 src/zstd/dictBuilder/zdict.c
 delete mode 100644 src/zstd/entropy_common.c
 delete mode 100644 src/zstd/error_private.c
 delete mode 100644 src/zstd/error_private.h
 delete mode 100644 src/zstd/fse_compress.c
 delete mode 100644 src/zstd/fse_decompress.c
 delete mode 100644 src/zstd/hist.c
 delete mode 100644 src/zstd/huf_compress.c
 delete mode 100644 src/zstd/huf_decompress.c
 delete mode 100644 src/zstd/mem.h
 delete mode 100644 src/zstd/pool.c
 delete mode 100644 src/zstd/threading.c
 delete mode 100644 src/zstd/threading.h
 delete mode 100644 src/zstd/xxhash.h
 delete mode 100644 src/zstd/zstd_compress.c
 delete mode 100644 src/zstd/zstd_compress_internal.h
 delete mode 100644 src/zstd/zstd_compress_literals.c
 delete mode 100644 src/zstd/zstd_compress_sequences.c
 delete mode 100644 src/zstd/zstd_compress_sequences.h
 delete mode 100644 src/zstd/zstd_compress_superblock.c
 delete mode 100644 src/zstd/zstd_cwksp.h
 delete mode 100644 src/zstd/zstd_ddict.c
 delete mode 100644 src/zstd/zstd_decompress.c
 delete mode 100644 src/zstd/zstd_decompress_block.c
 delete mode 100644 src/zstd/zstd_decompress_internal.h
 delete mode 100644 src/zstd/zstd_double_fast.c
 delete mode 100644 src/zstd/zstd_double_fast.h
 delete mode 100644 src/zstd/zstd_fast.c
 delete mode 100644 src/zstd/zstd_internal.h
 delete mode 100644 src/zstd/zstd_lazy.c
 delete mode 100644 src/zstd/zstd_lazy.h
 delete mode 100644 src/zstd/zstd_ldm.c
 delete mode 100644 src/zstd/zstd_opt.c
 delete mode 100644 src/zstd/zstd_opt.h
 delete mode 100644 src/zstd/zstdmt_compress.c

diff --git a/src/zstd/bits.h b/src/zstd/bits.h
deleted file mode 100644
index 98b94fe4f..000000000
--- a/src/zstd/bits.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_BITS_H
-#define ZSTD_BITS_H
-
-#include "mem.h"
-
-MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) {
-	assert(val != 0);
-	{
-		static const int DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
-		                                        30, 22, 20, 15, 25, 17, 4, 8,
-		                                        31, 27, 13, 23, 21, 19, 16, 7,
-		                                        26, 12, 18, 6, 11, 5, 10, 9};
-		return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-	}
-}
-
-MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) {
-	assert(val != 0);
-#if defined(_MSC_VER)
-#if STATIC_BMI2 == 1
-	return _tzcnt_u32(val);
-#else
-	if(val != 0) {
-		unsigned long r;
-		_BitScanForward(&r, val);
-		return (unsigned)r;
-	} else {
-		/* Should not reach this code path */
-		__assume(0);
-	}
-#endif
-#elif defined(__GNUC__) && (__GNUC__ >= 4)
-	return (unsigned)__builtin_ctz(val);
-#else
-	return ZSTD_countTrailingZeros32_fallback(val);
-#endif
-}
-
-MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
-	assert(val != 0);
-	{
-		static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
-		                                    11, 14, 16, 18, 22, 25, 3, 30,
-		                                    8, 12, 20, 28, 15, 17, 24, 7,
-		                                    19, 27, 23, 6, 26, 5, 4, 31};
-		val |= val >> 1;
-		val |= val >> 2;
-		val |= val >> 4;
-		val |= val >> 8;
-		val |= val >> 16;
-		return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
-	}
-}
-
-MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) {
-	assert(val != 0);
-#if defined(_MSC_VER)
-#if STATIC_BMI2 == 1
-	return _lzcnt_u32(val);
-#else
-	if(val != 0) {
-		unsigned long r;
-		_BitScanReverse(&r, val);
-		return (unsigned)(31 - r);
-	} else {
-		/* Should not reach this code path */
-		__assume(0);
-	}
-#endif
-#elif defined(__GNUC__) && (__GNUC__ >= 4)
-	return (unsigned)__builtin_clz(val);
-#else
-	return ZSTD_countLeadingZeros32_fallback(val);
-#endif
-}
-
-MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) {
-	assert(val != 0);
-#if defined(_MSC_VER) && defined(_WIN64)
-#if STATIC_BMI2 == 1
-	return (unsigned int)(_tzcnt_u64(val));
-#else
-	if(val != 0) {
-		unsigned long r;
-		_BitScanForward64(&r, val);
-		return (unsigned)r;
-	} else {
-		/* Should not reach this code path */
-		__assume(0);
-	}
-#endif
-#elif defined(__GNUC__) && (__GNUC__ >= 4) && defined(__LP64__)
-	return (unsigned int)__builtin_ctzll(val);
-#else
-	{
-		U32 mostSignificantWord = (U32)(val >> 32);
-		U32 leastSignificantWord = (U32)val;
-		if(leastSignificantWord == 0) {
-			return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
-		} else {
-			return ZSTD_countTrailingZeros32(leastSignificantWord);
-		}
-	}
-#endif
-}
-
-MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) {
-	assert(val != 0);
-#if defined(_MSC_VER) && defined(_WIN64)
-#if STATIC_BMI2 == 1
-	return (unsigned int)(_lzcnt_u64(val));
-#else
-	if(val != 0) {
-		unsigned long r;
-		_BitScanReverse64(&r, val);
-		return (unsigned)(63 - r);
-	} else {
-		/* Should not reach this code path */
-		__assume(0);
-	}
-#endif
-#elif defined(__GNUC__) && (__GNUC__ >= 4)
-	return (unsigned)(__builtin_clzll(val));
-#else
-	{
-		U32 mostSignificantWord = (U32)(val >> 32);
-		U32 leastSignificantWord = (U32)val;
-		if(mostSignificantWord == 0) {
-			return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
-		} else {
-			return ZSTD_countLeadingZeros32(mostSignificantWord);
-		}
-	}
-#endif
-}
-
-MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) {
-	if(MEM_isLittleEndian()) {
-		if(MEM_64bits()) {
-			return ZSTD_countTrailingZeros64((U64)val) >> 3;
-		} else {
-			return ZSTD_countTrailingZeros32((U32)val) >> 3;
-		}
-	} else { /* Big Endian CPU */
-		if(MEM_64bits()) {
-			return ZSTD_countLeadingZeros64((U64)val) >> 3;
-		} else {
-			return ZSTD_countLeadingZeros32((U32)val) >> 3;
-		}
-	}
-}
-
-MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */
-{
-	assert(val != 0);
-	return 31 - ZSTD_countLeadingZeros32(val);
-}
-
-#endif /* ZSTD_BITS_H */
diff --git a/src/zstd/bitstream.h b/src/zstd/bitstream.h
deleted file mode 100644
index 81d4c4c63..000000000
--- a/src/zstd/bitstream.h
+++ /dev/null
@@ -1,432 +0,0 @@
-/* ******************************************************************
- * bitstream
- * Part of FSE library
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * You can contact the author at :
- * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
-#ifndef BITSTREAM_H_MODULE
-#define BITSTREAM_H_MODULE
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-/*
- *  This API consists of small unitary functions, which must be inlined for best performance.
- *  Since link-time-optimization is not available for all compilers,
- *  these functions are defined into a .h to be included.
- */
-
-/*-****************************************
- *  Dependencies
- ******************************************/
-#include "mem.h"           /* unaligned access routines */
-#include "compiler.h"      /* UNLIKELY() */
-#include "debug.h"         /* assert(), DEBUGLOG(), RAWLOG() */
-#include "error_private.h" /* error codes and messages */
-#include "bits.h"          /* ZSTD_highbit32 */
-
-/*=========================================
-*  Target specific
-=========================================*/
-#ifndef ZSTD_NO_INTRINSICS
-#if(defined(__BMI__) || defined(__BMI2__)) && defined(__GNUC__)
-#include <immintrin.h> /* support for bextr (experimental)/bzhi */
-#elif defined(__ICCARM__)
-#include <intrinsics.h>
-#endif
-#endif
-
-#define STREAM_ACCUMULATOR_MIN_32 25
-#define STREAM_ACCUMULATOR_MIN_64 57
-#define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
-
-/*-******************************************
- *  bitStream encoding API (write forward)
- ********************************************/
-/* bitStream can mix input from multiple sources.
- * A critical property of these streams is that they encode and decode in **reverse** direction.
- * So the first bit sequence you add will be the last to be read, like a LIFO stack.
- */
-typedef struct {
-	size_t bitContainer;
-	unsigned bitPos;
-	char* startPtr;
-	char* ptr;
-	char* endPtr;
-} BIT_CStream_t;
-
-MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
-MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC);
-MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
-
-/* Start with initCStream, providing the size of buffer to write into.
- *  bitStream will never write outside of this buffer.
- *  `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
- *
- *  bits are first added to a local register.
- *  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
- *  Writing data into memory is an explicit operation, performed by the flushBits function.
- *  Hence keep track how many bits are potentially stored into local register to avoid register overflow.
- *  After a flushBits, a maximum of 7 bits might still be stored into local register.
- *
- *  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
- *
- *  Last operation is to close the bitStream.
- *  The function returns the final size of CStream in bytes.
- *  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
- */
-
-/*-********************************************
- *  bitStream decoding API (read backward)
- **********************************************/
-typedef struct {
-	size_t bitContainer;
-	unsigned bitsConsumed;
-	const char* ptr;
-	const char* start;
-	const char* limitPtr;
-} BIT_DStream_t;
-
-typedef enum { BIT_DStream_unfinished = 0,
-	           BIT_DStream_endOfBuffer = 1,
-	           BIT_DStream_completed = 2,
-	           BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */
-                                                              /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
-
-MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
-MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
-MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
-
-/* Start by invoking BIT_initDStream().
- *  A chunk of the bitStream is then stored into a local register.
- *  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
- *  You can then retrieve bitFields stored into the local register, **in reverse order**.
- *  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
- *  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
- *  Otherwise, it can be less than that, so proceed accordingly.
- *  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
- */
-
-/*-****************************************
- *  unsafe API
- ******************************************/
-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
-/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
-
-MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
-/* unsafe version; does not check buffer overflow */
-
-MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
-/* faster, but works only if nbBits >= 1 */
-
-/*=====    Local Constants   =====*/
-static const unsigned BIT_mask[] = {
-    0, 1, 3, 7, 0xF, 0x1F,
-    0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
-    0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
-    0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
-    0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
-    0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
-#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
-
-/*-**************************************************************
- *  bitStream encoding
- ****************************************************************/
-/*! BIT_initCStream() :
- *  `dstCapacity` must be > sizeof(size_t)
- *  @return : 0 if success,
- *            otherwise an error code (can be tested using ERR_isError()) */
-MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
-                                  void* startPtr, size_t dstCapacity) {
-	bitC->bitContainer = 0;
-	bitC->bitPos = 0;
-	bitC->startPtr = (char*)startPtr;
-	bitC->ptr = bitC->startPtr;
-	bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
-	if(dstCapacity <= sizeof(bitC->bitContainer))
-		return ERROR(dstSize_tooSmall);
-	return 0;
-}
-
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) {
-#if defined(STATIC_BMI2) && STATIC_BMI2 == 1 && !defined(ZSTD_NO_INTRINSICS)
-	return _bzhi_u64(bitContainer, nbBits);
-#else
-	assert(nbBits < BIT_MASK_SIZE);
-	return bitContainer & BIT_mask[nbBits];
-#endif
-}
-
-/*! BIT_addBits() :
- *  can add up to 31 bits into `bitC`.
- *  Note : does not check for register overflow ! */
-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
-                            size_t value, unsigned nbBits) {
-	DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
-	assert(nbBits < BIT_MASK_SIZE);
-	assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
-	bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
-	bitC->bitPos += nbBits;
-}
-
-/*! BIT_addBitsFast() :
- *  works only if `value` is _clean_,
- *  meaning all high bits above nbBits are 0 */
-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
-                                size_t value, unsigned nbBits) {
-	assert((value >> nbBits) == 0);
-	assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
-	bitC->bitContainer |= value << bitC->bitPos;
-	bitC->bitPos += nbBits;
-}
-
-/*! BIT_flushBitsFast() :
- *  assumption : bitContainer has not overflowed
- *  unsafe version; does not check buffer overflow */
-MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC) {
-	size_t const nbBytes = bitC->bitPos >> 3;
-	assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
-	assert(bitC->ptr <= bitC->endPtr);
-	MEM_writeLEST(bitC->ptr, bitC->bitContainer);
-	bitC->ptr += nbBytes;
-	bitC->bitPos &= 7;
-	bitC->bitContainer >>= nbBytes * 8;
-}
-
-/*! BIT_flushBits() :
- *  assumption : bitContainer has not overflowed
- *  safe version; check for buffer overflow, and prevents it.
- *  note : does not signal buffer overflow.
- *  overflow will be revealed later on using BIT_closeCStream() */
-MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC) {
-	size_t const nbBytes = bitC->bitPos >> 3;
-	assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
-	assert(bitC->ptr <= bitC->endPtr);
-	MEM_writeLEST(bitC->ptr, bitC->bitContainer);
-	bitC->ptr += nbBytes;
-	if(bitC->ptr > bitC->endPtr)
-		bitC->ptr = bitC->endPtr;
-	bitC->bitPos &= 7;
-	bitC->bitContainer >>= nbBytes * 8;
-}
-
-/*! BIT_closeCStream() :
- *  @return : size of CStream, in bytes,
- *            or 0 if it could not fit into dstBuffer */
-MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) {
-	BIT_addBitsFast(bitC, 1, 1); /* endMark */
-	BIT_flushBits(bitC);
-	if(bitC->ptr >= bitC->endPtr)
-		return 0; /* overflow detected */
-	return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
-}
-
-/*-********************************************************
- *  bitStream decoding
- **********************************************************/
-/*! BIT_initDStream() :
- *  Initialize a BIT_DStream_t.
- * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
- * `srcSize` must be the *exact* size of the bitStream, in bytes.
- * @return : size of stream (== srcSize), or an errorCode if a problem is detected
- */
-MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) {
-	if(srcSize < 1) {
-		ZSTD_memset(bitD, 0, sizeof(*bitD));
-		return ERROR(srcSize_wrong);
-	}
-
-	bitD->start = (const char*)srcBuffer;
-	bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
-
-	if(srcSize >= sizeof(bitD->bitContainer)) { /* normal case */
-		bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
-		bitD->bitContainer = MEM_readLEST(bitD->ptr);
-		{
-			BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize - 1];
-			bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
-			if(lastByte == 0)
-				return ERROR(GENERIC); /* endMark not present */
-		}
-	} else {
-		bitD->ptr = bitD->start;
-		bitD->bitContainer = *(const BYTE*)(bitD->start);
-		switch(srcSize) {
-		case 7:
-			bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer) * 8 - 16);
-			ZSTD_FALLTHROUGH;
-
-		case 6:
-			bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer) * 8 - 24);
-			ZSTD_FALLTHROUGH;
-
-		case 5:
-			bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer) * 8 - 32);
-			ZSTD_FALLTHROUGH;
-
-		case 4:
-			bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
-			ZSTD_FALLTHROUGH;
-
-		case 3:
-			bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
-			ZSTD_FALLTHROUGH;
-
-		case 2:
-			bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8;
-			ZSTD_FALLTHROUGH;
-
-		default:
-			break;
-		}
-		{
-			BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize - 1];
-			bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
-			if(lastByte == 0)
-				return ERROR(corruption_detected); /* endMark not present */
-		}
-		bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize) * 8;
-	}
-
-	return srcSize;
-}
-
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) {
-	return bitContainer >> start;
-}
-
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) {
-	U32 const regMask = sizeof(bitContainer) * 8 - 1;
-	/* if start > regMask, bitstream is corrupted, and result is undefined */
-	assert(nbBits < BIT_MASK_SIZE);
-	/* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better
-	 * than accessing memory. When bmi2 instruction is not present, we consider
-	 * such cpus old (pre-Haswell, 2013) and their performance is not of that
-	 * importance.
-	 */
-#if defined(__x86_64__) || defined(_M_X86)
-	return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1);
-#else
-	return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
-#endif
-}
-
-/*! BIT_lookBits() :
- *  Provides next n bits from local register.
- *  local register is not modified.
- *  On 32-bits, maxNbBits==24.
- *  On 64-bits, maxNbBits==56.
- * @return : value extracted */
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) {
-	/* arbitrate between double-shift and shift+mask */
-#if 1
-	/* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8,
-	 * bitstream is likely corrupted, and result is undefined */
-	return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer) * 8) - bitD->bitsConsumed - nbBits, nbBits);
-#else
-	/* this code path is slower on my os-x laptop */
-	U32 const regMask = sizeof(bitD->bitContainer) * 8 - 1;
-	return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask - nbBits) & regMask);
-#endif
-}
-
-/*! BIT_lookBitsFast() :
- *  unsafe version; only works if nbBits >= 1 */
-MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) {
-	U32 const regMask = sizeof(bitD->bitContainer) * 8 - 1;
-	assert(nbBits >= 1);
-	return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask + 1) - nbBits) & regMask);
-}
-
-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) {
-	bitD->bitsConsumed += nbBits;
-}
-
-/*! BIT_readBits() :
- *  Read (consume) next n bits from local register and update.
- *  Pay attention to not read more than nbBits contained into local register.
- * @return : extracted value. */
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) {
-	size_t const value = BIT_lookBits(bitD, nbBits);
-	BIT_skipBits(bitD, nbBits);
-	return value;
-}
-
-/*! BIT_readBitsFast() :
- *  unsafe version; only works if nbBits >= 1 */
-MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) {
-	size_t const value = BIT_lookBitsFast(bitD, nbBits);
-	assert(nbBits >= 1);
-	BIT_skipBits(bitD, nbBits);
-	return value;
-}
-
-/*! BIT_reloadDStreamFast() :
- *  Similar to BIT_reloadDStream(), but with two differences:
- *  1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
- *  2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this
- *     point you must use BIT_reloadDStream() to reload.
- */
-MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) {
-	if(UNLIKELY(bitD->ptr < bitD->limitPtr))
-		return BIT_DStream_overflow;
-	assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer) * 8);
-	bitD->ptr -= bitD->bitsConsumed >> 3;
-	bitD->bitsConsumed &= 7;
-	bitD->bitContainer = MEM_readLEST(bitD->ptr);
-	return BIT_DStream_unfinished;
-}
-
-/*! BIT_reloadDStream() :
- *  Refill `bitD` from buffer previously set in BIT_initDStream() .
- *  This function is safe, it guarantees it will not read beyond src buffer.
- * @return : status of `BIT_DStream_t` internal register.
- *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) {
-	if(bitD->bitsConsumed > (sizeof(bitD->bitContainer) * 8)) /* overflow detected, like end of stream */
-		return BIT_DStream_overflow;
-
-	if(bitD->ptr >= bitD->limitPtr) {
-		return BIT_reloadDStreamFast(bitD);
-	}
-	if(bitD->ptr == bitD->start) {
-		if(bitD->bitsConsumed < sizeof(bitD->bitContainer) * 8)
-			return BIT_DStream_endOfBuffer;
-		return BIT_DStream_completed;
-	}
-	/* start < ptr < limitPtr */
-	{
-		U32 nbBytes = bitD->bitsConsumed >> 3;
-		BIT_DStream_status result = BIT_DStream_unfinished;
-		if(bitD->ptr - nbBytes < bitD->start) {
-			nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */
-			result = BIT_DStream_endOfBuffer;
-		}
-		bitD->ptr -= nbBytes;
-		bitD->bitsConsumed -= nbBytes * 8;
-		bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
-		return result;
-	}
-}
-
-/*! BIT_endOfDStream() :
- * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
- */
-MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) {
-	return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer) * 8));
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* BITSTREAM_H_MODULE */
diff --git a/src/zstd/clevels.h b/src/zstd/clevels.h
deleted file mode 100644
index c5060113a..000000000
--- a/src/zstd/clevels.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_CLEVELS_H
-#define ZSTD_CLEVELS_H
-
-#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters  */
-#include "zstd.h"
-
-/*-=====  Pre-defined compression levels  =====-*/
-
-#define ZSTD_MAX_CLEVEL 22
-
-#ifdef __GNUC__
-__attribute__((__unused__))
-#endif
-
-static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL + 1] = {
-    {
-        /* "default" - for any srcSize > 256 KB */
-        /* W,  C,  H,  S,  L, TL, strat */
-        {19, 12, 13, 1, 6, 1, ZSTD_fast},       /* base for negative levels */
-        {19, 13, 14, 1, 7, 0, ZSTD_fast},       /* level  1 */
-        {20, 15, 16, 1, 6, 0, ZSTD_fast},       /* level  2 */
-        {21, 16, 17, 1, 5, 0, ZSTD_dfast},      /* level  3 */
-        {21, 18, 18, 1, 5, 0, ZSTD_dfast},      /* level  4 */
-        {21, 18, 19, 3, 5, 2, ZSTD_greedy},     /* level  5 */
-        {21, 18, 19, 3, 5, 4, ZSTD_lazy},       /* level  6 */
-        {21, 19, 20, 4, 5, 8, ZSTD_lazy},       /* level  7 */
-        {21, 19, 20, 4, 5, 16, ZSTD_lazy2},     /* level  8 */
-        {22, 20, 21, 4, 5, 16, ZSTD_lazy2},     /* level  9 */
-        {22, 21, 22, 5, 5, 16, ZSTD_lazy2},     /* level 10 */
-        {22, 21, 22, 6, 5, 16, ZSTD_lazy2},     /* level 11 */
-        {22, 22, 23, 6, 5, 32, ZSTD_lazy2},     /* level 12 */
-        {22, 22, 22, 4, 5, 32, ZSTD_btlazy2},   /* level 13 */
-        {22, 22, 23, 5, 5, 32, ZSTD_btlazy2},   /* level 14 */
-        {22, 23, 23, 6, 5, 32, ZSTD_btlazy2},   /* level 15 */
-        {22, 22, 22, 5, 5, 48, ZSTD_btopt},     /* level 16 */
-        {23, 23, 22, 5, 4, 64, ZSTD_btopt},     /* level 17 */
-        {23, 23, 22, 6, 3, 64, ZSTD_btultra},   /* level 18 */
-        {23, 24, 22, 7, 3, 256, ZSTD_btultra2}, /* level 19 */
-        {25, 25, 23, 7, 3, 256, ZSTD_btultra2}, /* level 20 */
-        {26, 26, 24, 7, 3, 512, ZSTD_btultra2}, /* level 21 */
-        {27, 27, 25, 9, 3, 999, ZSTD_btultra2}, /* level 22 */
-    },
-    {
-        /* for srcSize <= 256 KB */
-        /* W,  C,  H,  S,  L,  T, strat */
-        {18, 12, 13, 1, 5, 1, ZSTD_fast},        /* base for negative levels */
-        {18, 13, 14, 1, 6, 0, ZSTD_fast},        /* level  1 */
-        {18, 14, 14, 1, 5, 0, ZSTD_dfast},       /* level  2 */
-        {18, 16, 16, 1, 4, 0, ZSTD_dfast},       /* level  3 */
-        {18, 16, 17, 3, 5, 2, ZSTD_greedy},      /* level  4.*/
-        {18, 17, 18, 5, 5, 2, ZSTD_greedy},      /* level  5.*/
-        {18, 18, 19, 3, 5, 4, ZSTD_lazy},        /* level  6.*/
-        {18, 18, 19, 4, 4, 4, ZSTD_lazy},        /* level  7 */
-        {18, 18, 19, 4, 4, 8, ZSTD_lazy2},       /* level  8 */
-        {18, 18, 19, 5, 4, 8, ZSTD_lazy2},       /* level  9 */
-        {18, 18, 19, 6, 4, 8, ZSTD_lazy2},       /* level 10 */
-        {18, 18, 19, 5, 4, 12, ZSTD_btlazy2},    /* level 11.*/
-        {18, 19, 19, 7, 4, 12, ZSTD_btlazy2},    /* level 12.*/
-        {18, 18, 19, 4, 4, 16, ZSTD_btopt},      /* level 13 */
-        {18, 18, 19, 4, 3, 32, ZSTD_btopt},      /* level 14.*/
-        {18, 18, 19, 6, 3, 128, ZSTD_btopt},     /* level 15.*/
-        {18, 19, 19, 6, 3, 128, ZSTD_btultra},   /* level 16.*/
-        {18, 19, 19, 8, 3, 256, ZSTD_btultra},   /* level 17.*/
-        {18, 19, 19, 6, 3, 128, ZSTD_btultra2},  /* level 18.*/
-        {18, 19, 19, 8, 3, 256, ZSTD_btultra2},  /* level 19.*/
-        {18, 19, 19, 10, 3, 512, ZSTD_btultra2}, /* level 20.*/
-        {18, 19, 19, 12, 3, 512, ZSTD_btultra2}, /* level 21.*/
-        {18, 19, 19, 13, 3, 999, ZSTD_btultra2}, /* level 22.*/
-    },
-    {
-        /* for srcSize <= 128 KB */
-        /* W,  C,  H,  S,  L,  T, strat */
-        {17, 12, 12, 1, 5, 1, ZSTD_fast},        /* base for negative levels */
-        {17, 12, 13, 1, 6, 0, ZSTD_fast},        /* level  1 */
-        {17, 13, 15, 1, 5, 0, ZSTD_fast},        /* level  2 */
-        {17, 15, 16, 2, 5, 0, ZSTD_dfast},       /* level  3 */
-        {17, 17, 17, 2, 4, 0, ZSTD_dfast},       /* level  4 */
-        {17, 16, 17, 3, 4, 2, ZSTD_greedy},      /* level  5 */
-        {17, 16, 17, 3, 4, 4, ZSTD_lazy},        /* level  6 */
-        {17, 16, 17, 3, 4, 8, ZSTD_lazy2},       /* level  7 */
-        {17, 16, 17, 4, 4, 8, ZSTD_lazy2},       /* level  8 */
-        {17, 16, 17, 5, 4, 8, ZSTD_lazy2},       /* level  9 */
-        {17, 16, 17, 6, 4, 8, ZSTD_lazy2},       /* level 10 */
-        {17, 17, 17, 5, 4, 8, ZSTD_btlazy2},     /* level 11 */
-        {17, 18, 17, 7, 4, 12, ZSTD_btlazy2},    /* level 12 */
-        {17, 18, 17, 3, 4, 12, ZSTD_btopt},      /* level 13.*/
-        {17, 18, 17, 4, 3, 32, ZSTD_btopt},      /* level 14.*/
-        {17, 18, 17, 6, 3, 256, ZSTD_btopt},     /* level 15.*/
-        {17, 18, 17, 6, 3, 128, ZSTD_btultra},   /* level 16.*/
-        {17, 18, 17, 8, 3, 256, ZSTD_btultra},   /* level 17.*/
-        {17, 18, 17, 10, 3, 512, ZSTD_btultra},  /* level 18.*/
-        {17, 18, 17, 5, 3, 256, ZSTD_btultra2},  /* level 19.*/
-        {17, 18, 17, 7, 3, 512, ZSTD_btultra2},  /* level 20.*/
-        {17, 18, 17, 9, 3, 512, ZSTD_btultra2},  /* level 21.*/
-        {17, 18, 17, 11, 3, 999, ZSTD_btultra2}, /* level 22.*/
-    },
-    {
-        /* for srcSize <= 16 KB */
-        /* W,  C,  H,  S,  L,  T, strat */
-        {14, 12, 13, 1, 5, 1, ZSTD_fast},        /* base for negative levels */
-        {14, 14, 15, 1, 5, 0, ZSTD_fast},        /* level  1 */
-        {14, 14, 15, 1, 4, 0, ZSTD_fast},        /* level  2 */
-        {14, 14, 15, 2, 4, 0, ZSTD_dfast},       /* level  3 */
-        {14, 14, 14, 4, 4, 2, ZSTD_greedy},      /* level  4 */
-        {14, 14, 14, 3, 4, 4, ZSTD_lazy},        /* level  5.*/
-        {14, 14, 14, 4, 4, 8, ZSTD_lazy2},       /* level  6 */
-        {14, 14, 14, 6, 4, 8, ZSTD_lazy2},       /* level  7 */
-        {14, 14, 14, 8, 4, 8, ZSTD_lazy2},       /* level  8.*/
-        {14, 15, 14, 5, 4, 8, ZSTD_btlazy2},     /* level  9.*/
-        {14, 15, 14, 9, 4, 8, ZSTD_btlazy2},     /* level 10.*/
-        {14, 15, 14, 3, 4, 12, ZSTD_btopt},      /* level 11.*/
-        {14, 15, 14, 4, 3, 24, ZSTD_btopt},      /* level 12.*/
-        {14, 15, 14, 5, 3, 32, ZSTD_btultra},    /* level 13.*/
-        {14, 15, 15, 6, 3, 64, ZSTD_btultra},    /* level 14.*/
-        {14, 15, 15, 7, 3, 256, ZSTD_btultra},   /* level 15.*/
-        {14, 15, 15, 5, 3, 48, ZSTD_btultra2},   /* level 16.*/
-        {14, 15, 15, 6, 3, 128, ZSTD_btultra2},  /* level 17.*/
-        {14, 15, 15, 7, 3, 256, ZSTD_btultra2},  /* level 18.*/
-        {14, 15, 15, 8, 3, 256, ZSTD_btultra2},  /* level 19.*/
-        {14, 15, 15, 8, 3, 512, ZSTD_btultra2},  /* level 20.*/
-        {14, 15, 15, 9, 3, 512, ZSTD_btultra2},  /* level 21.*/
-        {14, 15, 15, 10, 3, 999, ZSTD_btultra2}, /* level 22.*/
-    },
-};
-
-#endif /* ZSTD_CLEVELS_H */
diff --git a/src/zstd/common/allocations.h b/src/zstd/common/allocations.h
new file mode 100644
index 000000000..5e8995501
--- /dev/null
+++ b/src/zstd/common/allocations.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides custom allocation primitives
+ */
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+
+#include "compiler.h" /* MEM_STATIC */
+#define ZSTD_STATIC_LINKING_ONLY
+#include "../zstd.h" /* ZSTD_customMem */
+
+#ifndef ZSTD_ALLOCATIONS_H
+#define ZSTD_ALLOCATIONS_H
+
+/* custom memory allocation functions */
+
+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return ZSTD_malloc(size);
+}
+
+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc emulated as malloc+memset (less efficient than a real calloc);
+         * a failed custom allocation is returned as NULL without being written to */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        if (ptr != NULL) ZSTD_memset(ptr, 0, size);
+        return ptr;
+    }
+    return ZSTD_calloc(1, size);
+}
+
+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            ZSTD_free(ptr);
+    }
+}
+
+#endif /* ZSTD_ALLOCATIONS_H */
diff --git a/src/zstd/common/bits.h b/src/zstd/common/bits.h
new file mode 100644
index 000000000..def56c474
--- /dev/null
+++ b/src/zstd/common/bits.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_BITS_H
+#define ZSTD_BITS_H
+
+#include "mem.h"
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val)
+{
+    assert(val != 0);
+    {
+        static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
+                                                30, 22, 20, 15, 25, 17, 4, 8,
+                                                31, 27, 13, 23, 21, 19, 16, 7,
+                                                26, 12, 18, 6, 11, 5, 10, 9};
+        return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_tzcnt_u32(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanForward(&r, val);
+                return (unsigned)r;
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4)
+        return (unsigned)__builtin_ctz(val);
+#   else
+        return ZSTD_countTrailingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
+    assert(val != 0);
+    {
+        static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
+                                            11, 14, 16, 18, 22, 25, 3, 30,
+                                            8, 12, 20, 28, 15, 17, 24, 7,
+                                            19, 27, 23, 6, 26, 5, 4, 31};
+        val |= val >> 1;
+        val |= val >> 2;
+        val |= val >> 4;
+        val |= val >> 8;
+        val |= val >> 16;
+        return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_lzcnt_u32(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanReverse(&r, val);
+                return (unsigned)(31 - r);
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4)
+        return (unsigned)__builtin_clz(val);
+#   else
+        return ZSTD_countLeadingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER) && defined(_WIN64)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_tzcnt_u64(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanForward64(&r, val);
+                return (unsigned)r;
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4) && defined(__LP64__)
+        return (unsigned)__builtin_ctzll(val);
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (leastSignificantWord == 0) {
+                return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
+            } else {
+                return ZSTD_countTrailingZeros32(leastSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER) && defined(_WIN64)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_lzcnt_u64(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanReverse64(&r, val);
+                return (unsigned)(63 - r);
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4)
+        return (unsigned)(__builtin_clzll(val));
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (mostSignificantWord == 0) {
+                return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
+            } else {
+                return ZSTD_countLeadingZeros32(mostSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+            return ZSTD_countTrailingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countTrailingZeros32((U32)val) >> 3;
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+            return ZSTD_countLeadingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countLeadingZeros32((U32)val) >> 3;
+        }
+    }
+}
+
+MEM_STATIC unsigned ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
+{
+    assert(val != 0);
+    return 31 - ZSTD_countLeadingZeros32(val);
+}
+
+/* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+ */
+MEM_STATIC
+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+    assert(count < 64);
+    count &= 0x3F; /* for fickle pattern recognition */
+    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+}
+
+MEM_STATIC
+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+    assert(count < 32);
+    count &= 0x1F; /* for fickle pattern recognition */
+    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+}
+
+MEM_STATIC
+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+    assert(count < 16);
+    count &= 0x0F; /* for fickle pattern recognition */
+    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+}
+
+#endif /* ZSTD_BITS_H */
diff --git a/src/zstd/common/bitstream.h b/src/zstd/common/bitstream.h
new file mode 100644
index 000000000..676044989
--- /dev/null
+++ b/src/zstd/common/bitstream.h
@@ -0,0 +1,457 @@
+/* ******************************************************************
+ * bitstream
+ * Part of FSE library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "mem.h"            /* unaligned access routines */
+#include "compiler.h"       /* UNLIKELY() */
+#include "debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
+#include "error_private.h"  /* error codes and messages */
+#include "bits.h"           /* ZSTD_highbit32 */
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#ifndef ZSTD_NO_INTRINSICS
+#  if (defined(__BMI__) || defined(__BMI2__)) && defined(__GNUC__)
+#    include <immintrin.h>   /* support for bextr (experimental)/bzhi */
+#  elif defined(__ICCARM__)
+#    include <intrinsics.h>
+#  endif
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32  25
+#define STREAM_ACCUMULATOR_MIN_64  57
+#define STREAM_ACCUMULATOR_MIN    ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct {
+    size_t bitContainer;
+    unsigned bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be > sizeof(bitC->bitContainer), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence keep track how many bits are potentially stored into local register to avoid register overflow.
+*  After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
+*  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef size_t BitContainerType;
+typedef struct {
+    BitContainerType bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+    const char* limitPtr;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,  /* fully refilled */
+               BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */
+               BIT_DStream_completed = 2,   /* bitstream entirely consumed, bit-exact */
+               BIT_DStream_overflow = 3     /* user requested more bits than present in bitstream */
+    } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+*  A chunk of the bitStream is then stored into a local register.
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType).
+*  You can then retrieve bitFields stored into the local register, **in reverse order**.
+*  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+*  A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+*  Otherwise, it can be less than that, so proceed accordingly.
+*  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+/*=====    Local Constants   =====*/
+static const unsigned BIT_mask[] = {
+    0,          1,         3,         7,         0xF,       0x1F,
+    0x3F,       0x7F,      0xFF,      0x1FF,     0x3FF,     0x7FF,
+    0xFFF,      0x1FFF,    0x3FFF,    0x7FFF,    0xFFFF,    0x1FFFF,
+    0x3FFFF,    0x7FFFF,   0xFFFFF,   0x1FFFFF,  0x3FFFFF,  0x7FFFFF,
+    0xFFFFFF,   0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+    0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
+#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
+
+/*-**************************************************************
+*  bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(size_t)
+ *  @return : 0 if success,
+ *            otherwise an error code (can be tested using ERR_isError()) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
+{
+    bitC->bitContainer = 0;
+    bitC->bitPos = 0;
+    bitC->startPtr = (char*)startPtr;
+    bitC->ptr = bitC->startPtr;
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+    if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
+    return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+#if defined(STATIC_BMI2) && STATIC_BMI2 == 1 && !defined(ZSTD_NO_INTRINSICS)
+    return  _bzhi_u64(bitContainer, nbBits);
+#else
+    assert(nbBits < BIT_MASK_SIZE);
+    return bitContainer & BIT_mask[nbBits];
+#endif
+}
+
+/*! BIT_addBits() :
+ *  can add up to 31 bits into `bitC`.
+ *  Note : does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+                            size_t value, unsigned nbBits)
+{
+    DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+    assert(nbBits < BIT_MASK_SIZE);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_,
+ *  meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
+                                size_t value, unsigned nbBits)
+{
+    assert((value>>nbBits) == 0);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= value << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  assumption : bitContainer has not overflowed
+ *  unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    assert(bitC->ptr <= bitC->endPtr);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_flushBits() :
+ *  assumption : bitContainer has not overflowed
+ *  safe version; check for buffer overflow, and prevents it.
+ *  note : does not signal buffer overflow.
+ *  overflow will be revealed later on using BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    assert(bitC->ptr <= bitC->endPtr);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+ *            or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+    BIT_addBitsFast(bitC, 1, 1);   /* endMark */
+    BIT_flushBits(bitC);
+    if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+    return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+
+/*-********************************************************
+*  bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+ *  Initialize a BIT_DStream_t.
+ * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+ * `srcSize` must be the *exact* size of the bitStream, in bytes.
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected
+ */
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { ZSTD_memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    bitD->start = (const char*)srcBuffer;
+    bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+        case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+                ZSTD_FALLTHROUGH;
+
+        case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+                ZSTD_FALLTHROUGH;
+
+        case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+                ZSTD_FALLTHROUGH;
+
+        case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24;
+                ZSTD_FALLTHROUGH;
+
+        case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16;
+                ZSTD_FALLTHROUGH;
+
+        case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) <<  8;
+                ZSTD_FALLTHROUGH;
+
+        default: break;
+        }
+        {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+            bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
+            if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
+        }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start)
+{
+    return bitContainer >> start;
+}
+
+FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits)
+{
+    U32 const regMask = sizeof(bitContainer)*8 - 1;
+    /* if start > regMask, bitstream is corrupted, and result is undefined */
+    assert(nbBits < BIT_MASK_SIZE);
+    /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better
+     * than accessing memory. When bmi2 instruction is not present, we consider
+     * such cpus old (pre-Haswell, 2013) and their performance is not of that
+     * importance.
+     */
+#if defined(__x86_64__) || defined(_M_X86)
+    return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1);
+#else
+    return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
+#endif
+}
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from local register.
+ *  local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ * @return : value extracted */
+FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
+{
+    /* arbitrate between double-shift and shift+mask */
+#if 1
+    /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8,
+     * bitstream is likely corrupted, and result is undefined */
+    return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+    /* this code path is slower on my os-x laptop */
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+ *  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    assert(nbBits >= 1);
+    return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
+}
+
+FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from local register and update.
+ *  Pay attention to not read more than nbBits contained into local register.
+ * @return : extracted value. */
+FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
+{
+    size_t const value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_readBitsFast() :
+ *  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
+{
+    size_t const value = BIT_lookBitsFast(bitD, nbBits);
+    assert(nbBits >= 1);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_reloadDStream_internal() :
+ *  Simple variant of BIT_reloadDStream(), with two conditions:
+ *  1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8
+ *  2. look window is valid after shifted down : bitD->ptr >= bitD->start
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD)
+{
+    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
+    bitD->ptr -= bitD->bitsConsumed >> 3;
+    assert(bitD->ptr >= bitD->start);
+    bitD->bitsConsumed &= 7;
+    bitD->bitContainer = MEM_readLEST(bitD->ptr);
+    return BIT_DStream_unfinished;
+}
+
+/*! BIT_reloadDStreamFast() :
+ *  Similar to BIT_reloadDStream(), but with two differences:
+ *  1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
+ *  2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this
+ *     point you must use BIT_reloadDStream() to reload.
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
+{
+    if (UNLIKELY(bitD->ptr < bitD->limitPtr))
+        return BIT_DStream_overflow;
+    return BIT_reloadDStream_internal(bitD);
+}
+
+/*! BIT_reloadDStream() :
+ *  Refill `bitD` from buffer previously set in BIT_initDStream() .
+ *  This function is safe, it guarantees it will not read beyond src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    /* note : once in overflow mode, a bitstream remains in this mode until it's reset */
+    if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) {
+        static const BitContainerType zeroFilled = 0;
+        bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */
+        /* overflow detected, erroneous scenario or end of stream: no update */
+        return BIT_DStream_overflow;
+    }
+
+    assert(bitD->ptr >= bitD->start);
+
+    if (bitD->ptr >= bitD->limitPtr) {
+        return BIT_reloadDStream_internal(bitD);
+    }
+    if (bitD->ptr == bitD->start) {
+        /* reached end of bitStream => no update */
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    /* start < ptr < limitPtr => cautious update */
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
diff --git a/src/zstd/common/compiler.h b/src/zstd/common/compiler.h
new file mode 100644
index 000000000..31880ecbe
--- /dev/null
+++ b/src/zstd/common/compiler.h
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPILER_H
+#define ZSTD_COMPILER_H
+
+#include <stddef.h>
+
+#include "portability_macros.h"
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+/* force inlining */
+
+#if !defined(ZSTD_NO_INLINE)
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__) || defined(__ICCARM__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+#else
+
+#define INLINE_KEYWORD
+#define FORCE_INLINE_ATTR
+
+#endif
+
+/**
+  On MSVC qsort requires that functions passed into it use the __cdecl calling convention (CC).
+  This explicitly marks such functions as __cdecl so that the code will still compile
+  if a CC other than __cdecl has been made the default.
+*/
+#if  defined(_MSC_VER)
+#  define WIN_CDECL __cdecl
+#else
+#  define WIN_CDECL
+#endif
+
+/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+#if defined(__GNUC__)
+#  define UNUSED_ATTR __attribute__((unused))
+#else
+#  define UNUSED_ATTR
+#endif
+
+/**
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
+ * branches.
+ */
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR
+/**
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compilers
+ * performance.
+ *
+ * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
+ * always_inline attribute.
+ *
+ * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline
+ * attribute.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+#  define HINT_INLINE static INLINE_KEYWORD
+#else
+#  define HINT_INLINE FORCE_INLINE_TEMPLATE
+#endif
+
+/* "soft" inline :
+ * The compiler is free to select if it's a good idea to inline or not.
+ * The main objective is to silence compiler warnings
+ * when a defined function is included but not used.
+ *
+ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit.
+ * Updating the prefix is probably preferable, but requires a fairly large codemod,
+ * since this name is used everywhere.
+ */
+#ifndef MEM_STATIC  /* already defined in Linux Kernel mem.h */
+#if defined(__GNUC__)
+#  define MEM_STATIC static __inline UNUSED_ATTR
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+#  define FORCE_NOINLINE static __declspec(noinline)
+#else
+#  if defined(__GNUC__) || defined(__ICCARM__)
+#    define FORCE_NOINLINE static __attribute__((__noinline__))
+#  else
+#    define FORCE_NOINLINE static
+#  endif
+#endif
+
+
+/* target attribute */
+#if defined(__GNUC__) || defined(__ICCARM__)
+#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+#  define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Target attribute for BMI2 dynamic dispatch.
+ * Enable lzcnt, bmi, and bmi2.
+ * We test for bmi1 & bmi2. lzcnt is included in bmi1.
+ */
+#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2")
+
+/* prefetch
+ * can be disabled, by declaring NO_PREFETCH build macro */
+#if defined(NO_PREFETCH)
+#  define PREFETCH_L1(ptr)  do { (void)(ptr); } while (0)  /* disabled */
+#  define PREFETCH_L2(ptr)  do { (void)(ptr); } while (0)  /* disabled */
+#else
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC)  /* _mm_prefetch() is not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#    define PREFETCH_L2(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#    define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#  elif defined(__aarch64__)
+#    define PREFETCH_L1(ptr)  do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
+#    define PREFETCH_L2(ptr)  do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
+#  else
+#    define PREFETCH_L1(ptr) do { (void)(ptr); } while (0)  /* disabled */
+#    define PREFETCH_L2(ptr) do { (void)(ptr); } while (0)  /* disabled */
+#  endif
+#endif  /* NO_PREFETCH */
+
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s)                              \
+    do {                                                 \
+        const char* const _ptr = (const char*)(p);       \
+        size_t const _size = (size_t)(s);                \
+        size_t _pos;                                     \
+        for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
+            PREFETCH_L2(_ptr + _pos);                    \
+        }                                                \
+    } while (0)
+
+/* vectorization
+ * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax,
+ * and some compilers, like Intel ICC and MCST LCC, do not support it at all. */
+#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__)
+#  if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
+#    define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
+#  else
+#    define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")")
+#  endif
+#else
+#  define DONT_VECTORIZE
+#endif
+
+/* Tell the compiler that a branch is likely or unlikely.
+ * Only use these macros if it causes the compiler to generate better code.
+ * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
+ * and clang, please do.
+ */
+#if defined(__GNUC__)
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+#  define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0)
+#else
+#  define ZSTD_UNREACHABLE do { assert(0); } while (0)
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/
+#ifndef STATIC_BMI2
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))
+#    ifdef __AVX2__  //MSVC does not have a BMI2 specific flag, but every CPU that supports AVX2 also supports BMI2
+#       define STATIC_BMI2 1
+#    endif
+#  elif defined(__BMI2__) && defined(__x86_64__) && defined(__GNUC__)
+#    define STATIC_BMI2 1
+#  endif
+#endif
+
+#ifndef STATIC_BMI2
+    #define STATIC_BMI2 0
+#endif
+
+/* compile time determination of SIMD support */
+#if !defined(ZSTD_NO_INTRINSICS)
+#  if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
+#    define ZSTD_ARCH_X86_SSE2
+#  endif
+#  if defined(__ARM_NEON) || defined(_M_ARM64)
+#    define ZSTD_ARCH_ARM_NEON
+#  endif
+#
+#  if defined(ZSTD_ARCH_X86_SSE2)
+#    include <emmintrin.h>
+#  elif defined(ZSTD_ARCH_ARM_NEON)
+#    include <arm_neon.h>
+#  endif
+#endif
+
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+# define ZSTD_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define ZSTD_HAS_C_ATTRIBUTE(x) 0
+#endif
+
+/* Only use C++ attributes in C++. Some compilers report support for C++
+ * attributes when compiling with C.
+ */
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define ZSTD_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define ZSTD_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+/* Define ZSTD_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute.
+ * - C23: https://en.cppreference.com/w/c/language/attributes/fallthrough
+ * - CPP17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * - Else: __attribute__((__fallthrough__))
+ */
+#ifndef ZSTD_FALLTHROUGH
+# if ZSTD_HAS_C_ATTRIBUTE(fallthrough)
+#  define ZSTD_FALLTHROUGH [[fallthrough]]
+# elif ZSTD_HAS_CPP_ATTRIBUTE(fallthrough)
+#  define ZSTD_FALLTHROUGH [[fallthrough]]
+# elif __has_attribute(__fallthrough__)
+/* Leading semicolon is to satisfy gcc-11 with -pedantic. Without the semicolon
+ * gcc complains about: a label can only be part of a statement and a declaration is not a statement.
+ */
+#  define ZSTD_FALLTHROUGH ; __attribute__((__fallthrough__))
+# else
+#  define ZSTD_FALLTHROUGH
+# endif
+#endif
+
+/*-**************************************************************
+*  Alignment check
+*****************************************************************/
+
+/* this test was initially positioned in mem.h,
+ * but this file is removed (or replaced) for linux kernel
+ * so it's now hosted in compiler.h,
+ * which remains valid for both user & kernel spaces.
+ */
+
+#ifndef ZSTD_ALIGNOF
+# if defined(__GNUC__) || defined(_MSC_VER)
+/* covers gcc, clang & MSVC */
+/* note : this section must come first, before C11,
+ * due to a limitation in the kernel source generator */
+#  define ZSTD_ALIGNOF(T) __alignof(T)
+
+# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+/* C11 support */
+#  include <stdalign.h>
+#  define ZSTD_ALIGNOF(T) alignof(T)
+
+# else
+/* No known support for alignof() - imperfect backup */
+#  define ZSTD_ALIGNOF(T) (sizeof(void*) < sizeof(T) ? sizeof(void*) : sizeof(T))
+
+# endif
+#endif /* ZSTD_ALIGNOF */
+
+/*-**************************************************************
+*  Sanitizer
+*****************************************************************/
+
+/**
+ * Zstd relies on pointer overflow in its decompressor.
+ * We add this attribute to functions that rely on pointer overflow.
+ */
+#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+#  if __has_attribute(no_sanitize)
+#    if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8
+       /* gcc < 8 only has signed-integer-overflow which triggers on pointer overflow */
+#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow")))
+#    else
+       /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */
+#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow")))
+#    endif
+#  else
+#    define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+#  endif
+#endif
+
+/**
+ * Helper function to perform a wrapped pointer difference without triggering
+ * UBSAN (via ZSTD_ALLOW_POINTER_OVERFLOW_ATTR, which expands to nothing when `no_sanitize` is unavailable).
+ *
+ * @returns lhs - rhs with wrapping
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs)
+{
+    return lhs - rhs;
+}
+
+/**
+ * Helper function to perform a wrapped pointer add without triggering UBSAN.
+ * The exemption comes from ZSTD_ALLOW_POINTER_OVERFLOW_ATTR, which expands to nothing when `no_sanitize` is unavailable.
+ * @return ptr + add with wrapping
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add)
+{
+    return ptr + add;
+}
+
+/**
+ * Helper function to perform a wrapped pointer subtraction without triggering
+ * UBSAN (via ZSTD_ALLOW_POINTER_OVERFLOW_ATTR, which expands to nothing when `no_sanitize` is unavailable).
+ *
+ * @return ptr - sub with wrapping
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub)
+{
+    return ptr - sub;
+}
+
+/**
+ * Helper function to add to a pointer that works around C's undefined behavior
+ * of adding 0 to NULL. Any non-zero `add`, positive or negative, is applied to `ptr`.
+ *
+ * @returns `ptr + add` except it defines `NULL + 0 == NULL`.
+ */
+MEM_STATIC
+unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add)
+{
+    return add != 0 ? ptr + add : ptr;   /* only the zero offset is special-cased, so `ptr` may be NULL when add == 0 */
+}
+
+/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
+ * abundance of caution, disable our custom poisoning on mingw. */
+#ifdef __MINGW32__
+#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
+#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
+#endif
+#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
+#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
+#endif
+#endif
+
+#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
+/* Not all platforms that support msan provide sanitizers/msan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+#include <stddef.h>  /* size_t */
+#define ZSTD_DEPS_NEED_STDINT
+#include "zstd_deps.h"  /* intptr_t */
+
+/* Make memory region fully initialized (without changing its contents). */
+void __msan_unpoison(const volatile void *a, size_t size);
+
+/* Make memory region fully uninitialized (without changing its contents).
+   This is a legacy interface that does not update origin information. Use
+   __msan_allocated_memory() instead. */
+void __msan_poison(const volatile void *a, size_t size);
+
+/* Returns the offset of the first (at least partially) poisoned byte in the
+   memory range, or -1 if the whole range is good. */
+intptr_t __msan_test_shadow(const volatile void *x, size_t size);
+
+/* Print shadow and origin for the memory range to stderr in a human-readable
+   format. */
+void __msan_print_shadow(const volatile void *x, size_t size);
+#endif
+
+#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
+/* Not all platforms that support asan provide sanitizers/asan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+#include <stddef.h>  /* size_t */
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as unaddressable.
+ *
+ * This memory must be previously allocated by your program. Instrumented
+ * code is forbidden from accessing addresses in this region until it is
+ * unpoisoned. This function is not guaranteed to poison the entire region -
+ * it could poison only a subregion of <c>[addr, addr+size)</c> due to ASan
+ * alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can poison or
+ * unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_poison_memory_region(void const volatile *addr, size_t size);
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as addressable.
+ *
+ * This memory must be previously allocated by your program. Accessing
+ * addresses in this region is allowed until this region is poisoned again.
+ * This function could unpoison a super-region of <c>[addr, addr+size)</c> due
+ * to ASan alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can
+ * poison or unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
+#endif
+
+#endif /* ZSTD_COMPILER_H */
diff --git a/src/zstd/common/cpu.h b/src/zstd/common/cpu.h
new file mode 100644
index 000000000..d9cdf8feb
--- /dev/null
+++ b/src/zstd/common/cpu.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include "mem.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+    U32 f1c;
+    U32 f1d;
+    U32 f7b;
+    U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {   /* collects cpuid(1) ecx/edx and cpuid(7, ecx=0) ebx/ecx; a field stays 0 when its query is not performed */
+    U32 f1c = 0;
+    U32 f1d = 0;
+    U32 f7b = 0;
+    U32 f7c = 0;
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if !defined(__clang__) || __clang_major__ >= 16
+    int reg[4];   /* NOTE(review): per the MSVC intrinsic docs the order is eax, ebx, ecx, edx - confirm */
+    __cpuid((int*)reg, 0);
+    {
+        int const n = reg[0];   /* value returned for leaf 0; gates the leaf 1 and leaf 7 queries below */
+        if (n >= 1) {
+            __cpuid((int*)reg, 1);
+            f1c = (U32)reg[2];
+            f1d = (U32)reg[3];
+        }
+        if (n >= 7) {
+            __cpuidex((int*)reg, 7, 0);
+            f7b = (U32)reg[1];
+            f7c = (U32)reg[2];
+        }
+    }
+#else
+    /* Clang compiler has a bug (fixed in https://reviews.llvm.org/D101338) in
+     * which the `__cpuid` intrinsic does not save and restore `rbx` as it needs
+     * to due to being a reserved register. So in that case, do the `cpuid`
+     * ourselves. Clang supports inline assembly anyway.
+     */
+    U32 n;
+    __asm__(
+        "pushq %%rbx\n\t"
+        "cpuid\n\t"
+        "popq %%rbx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "rcx", "rdx");
+    if (n >= 1) {
+      U32 f1a;   /* receives eax; not read afterwards */
+      __asm__(
+          "pushq %%rbx\n\t"
+          "cpuid\n\t"
+          "popq %%rbx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1)
+          :);
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushq %%rbx\n\t"
+          "cpuid\n\t"
+          "movq %%rbx, %%rax\n\t"   /* copy the ebx result into rax before rbx is restored, so it comes back through "=a" */
+          "popq %%rbx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "rdx");
+    }
+#endif
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+    /* The following block is like the normal cpuid branch below, but gcc
+     * reserves ebx for use as its pic register so we must specially
+     * handle the save and restore to avoid clobbering the register
+     */
+    U32 n;
+    __asm__(
+        "pushl %%ebx\n\t"
+        "cpuid\n\t"
+        "popl %%ebx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;   /* receives eax; not read afterwards */
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "popl %%ebx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1));
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "movl %%ebx, %%eax\n\t"   /* copy the ebx result into eax before ebx is restored, so it comes back through "=a" */
+          "popl %%ebx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "edx");
+    }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+    U32 n;   /* generic x86 path: ebx is listed as a clobber or read directly as an output */
+    __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;   /* receives eax; not read afterwards */
+      __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+    }
+    if (n >= 7) {
+      U32 f7a;   /* receives eax; not read afterwards */
+      __asm__("cpuid"
+              : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+              : "a"(7), "c"(0)
+              : "edx");
+    }
+#endif
+    {   /* pack the four collected values into the result struct */
+        ZSTD_cpuid_t cpuid;
+        cpuid.f1c = f1c;
+        cpuid.f1d = f1d;
+        cpuid.f7b = f7b;
+        cpuid.f7c = f7c;
+        return cpuid;
+    }
+}
+
+#define X(name, r, bit)                                                        \
+  MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) {                 \
+    return ((cpuid.r) & (1U << bit)) != 0;                                     \
+  }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+  B(bmi1, 3)
+  B(hle, 4)
+  B(avx2, 5)
+  B(smep, 7)
+  B(bmi2, 8)
+  B(erms, 9)
+  B(invpcid, 10)
+  B(rtm, 11)
+  B(mpx, 14)
+  B(avx512f, 16)
+  B(avx512dq, 17)
+  B(rdseed, 18)
+  B(adx, 19)
+  B(smap, 20)
+  B(avx512ifma, 21)
+  B(pcommit, 22)
+  B(clflushopt, 23)
+  B(clwb, 24)
+  B(avx512pf, 26)
+  B(avx512er, 27)
+  B(avx512cd, 28)
+  B(sha, 29)
+  B(avx512bw, 30)
+  B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+  C(prefetchwt1, 0)
+  C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */
diff --git a/src/zstd/debug.c b/src/zstd/common/debug.c
similarity index 70%
rename from src/zstd/debug.c
rename to src/zstd/common/debug.c
index 3402dc43e..9d0b7d229 100644
--- a/src/zstd/debug.c
+++ b/src/zstd/common/debug.c
@@ -10,7 +10,8 @@
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  * in the COPYING file in the root directory of this source tree).
  * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
+****************************************************************** */
+
 
 /*
  * This module only hosts one global variable
@@ -20,4 +21,10 @@
 
 #include "debug.h"
 
+#if !defined(ZSTD_LINUX_KERNEL) || (DEBUGLEVEL>=2)
+/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a
+ * translation unit is empty. So remove this from Linux kernel builds, but
+ * otherwise just leave it in.
+ */
 int g_debuglevel = DEBUGLEVEL;
+#endif
diff --git a/src/zstd/debug.h b/src/zstd/common/debug.h
similarity index 66%
rename from src/zstd/debug.h
rename to src/zstd/common/debug.h
index 9d1e70194..a16b69e57 100644
--- a/src/zstd/debug.h
+++ b/src/zstd/common/debug.h
@@ -10,7 +10,8 @@
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  * in the COPYING file in the root directory of this source tree).
  * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
+****************************************************************** */
+
 
 /*
  * The purpose of this header is to enable debug functions.
@@ -31,22 +32,25 @@
 #ifndef DEBUG_H_12987983217
 #define DEBUG_H_12987983217
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
+
 /* static assert is triggered at compile time, leaving no runtime artefact.
  * static assert only works with compile-time constants.
  * Also, this variant can only be used inside a function. */
 #define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1])
 
+
 /* DEBUGLEVEL is expected to be defined externally,
  * typically through compiler command line.
  * Value must be a number. */
 #ifndef DEBUGLEVEL
-#define DEBUGLEVEL 0
+#  define DEBUGLEVEL 0
 #endif
 
+
 /* recommended values for DEBUGLEVEL :
  * 0 : release mode, no debug, all run-time checks disabled
  * 1 : enables assert() only, no display
@@ -62,18 +66,18 @@ extern "C" {
  * by modifying g_debug_level.
  */
 
-#if(DEBUGLEVEL >= 1)
-#define ZSTD_DEPS_NEED_ASSERT
-#include "zstd_deps.h"
+#if (DEBUGLEVEL>=1)
+#  define ZSTD_DEPS_NEED_ASSERT
+#  include "zstd_deps.h"
 #else
-#ifndef assert                      /* assert may be already defined, due to prior #include <assert.h> */
-#define assert(condition) ((void)0) /* disable assert (default) */
-#endif
+#  ifndef assert   /* assert may be already defined, due to prior #include <assert.h> */
+#    define assert(condition) ((void)0)   /* disable assert (default) */
+#  endif
 #endif
 
-#if(DEBUGLEVEL >= 2)
-#define ZSTD_DEPS_NEED_IO
-#include "zstd_deps.h"
+#if (DEBUGLEVEL>=2)
+#  define ZSTD_DEPS_NEED_IO
+#  include "zstd_deps.h"
 extern int g_debuglevel; /* the variable is only declared,
                             it actually lives in debug.c,
                             and is shared by the whole process.
@@ -81,27 +85,31 @@ extern int g_debuglevel; /* the variable is only declared,
                             It's useful when enabling very verbose levels
                             on selective conditions (such as position in src) */
 
-#define RAWLOG(l, ...)                     \
-	{                                      \
-		if(l <= g_debuglevel) {            \
-			ZSTD_DEBUG_PRINT(__VA_ARGS__); \
-		}                                  \
-	}
-#define DEBUGLOG(l, ...)                                 \
-	{                                                    \
-		if(l <= g_debuglevel) {                          \
-			ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \
-			ZSTD_DEBUG_PRINT(" \n");                     \
-		}                                                \
-	}
+#  define RAWLOG(l, ...)                   \
+    do {                                   \
+        if (l<=g_debuglevel) {             \
+            ZSTD_DEBUG_PRINT(__VA_ARGS__); \
+        }                                  \
+    } while (0)
+
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+#define LINE_AS_STRING TOSTRING(__LINE__)
+
+#  define DEBUGLOG(l, ...)                               \
+    do {                                                 \
+        if (l<=g_debuglevel) {                           \
+            ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \
+            ZSTD_DEBUG_PRINT(" \n");                     \
+        }                                                \
+    } while (0)
 #else
-#define RAWLOG(l, ...) \
-	{ } /* disabled */
-#define DEBUGLOG(l, ...) \
-	{ } /* disabled */
+#  define RAWLOG(l, ...)   do { } while (0)    /* disabled */
+#  define DEBUGLOG(l, ...) do { } while (0)    /* disabled */
 #endif
 
-#if defined(__cplusplus)
+
+#if defined (__cplusplus)
 }
 #endif
 
diff --git a/src/zstd/common/entropy_common.c b/src/zstd/common/entropy_common.c
new file mode 100644
index 000000000..c54deadb8
--- /dev/null
+++ b/src/zstd/common/entropy_common.c
@@ -0,0 +1,340 @@
+/* ******************************************************************
+ * Common functions of New Generation Entropy library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "mem.h"
+#include "error_private.h"       /* ERR_*, ERROR */
+#define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
+#include "fse.h"
+#include "huf.h"
+#include "bits.h"                /* ZSTD_highbit32, ZSTD_countTrailingZeros32 */
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }   /* returns the FSE_VERSION_NUMBER this file was compiled with */
+
+
+/*===   Error Management   ===*/   /* NOTE(review): the FSE_isError() and HUF_isError() definitions below are commented out, yet FSE_isError() is still called in this file - presumably supplied elsewhere; confirm */
+//unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }   /* error code -> description string, via ERR_getErrorName() */
+
+//unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }   /* identical mapping, exposed under the HUF_ name */
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+FORCE_INLINE_TEMPLATE   /* decodes a normalized-count header from headerBuffer into normalizedCounter[]; on success sets *maxSVPtr and *tableLogPtr and returns the number of bytes read, otherwise returns an error code */
+size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                           const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    unsigned const maxSV1 = *maxSVPtr + 1;   /* number of normalizedCounter[] entries that are cleared below and may be filled */
+    int previous0 = 0;                       /* non-zero when the count stored last was 0 */
+
+    if (hbSize < 8) {
+        /* This function only works when hbSize >= 8 */
+        char buffer[8] = {0};
+        ZSTD_memcpy(buffer, headerBuffer, hbSize);
+        {   size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+                                                    buffer, sizeof(buffer));
+            if (FSE_isError(countSize)) return countSize;
+            if (countSize > hbSize) return ERROR(corruption_detected);   /* the header ran into the zero padding added above */
+            return countSize;
+    }   }
+    assert(hbSize >= 8);
+
+    /* init */
+    ZSTD_memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0]));   /* all symbols not present in NCount have a frequency of 0 */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;   /* decoding stops once this drops to 1 or below; any final value other than 1 is reported as corruption */
+    threshold = 1<<nbBits;
+    nbBits++;                    /* a full-size count field is one bit wider than threshold-1 */
+
+    for (;;) {
+        if (previous0) {
+            /* Count the number of repeats. Each time the
+             * 2-bit repeat code is 0b11 there is another
+             * repeat.
+             * Avoid UB by setting the high bit to 1.
+             */
+            int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+            while (repeats >= 12) {
+                charnum += 3 * 12;
+                if (LIKELY(ip <= iend-7)) {
+                    ip += 3;
+                } else {
+                    bitCount -= (int)(8 * (iend - 7 - ip));
+                    bitCount &= 31;
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> bitCount;
+                repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+            }
+            charnum += 3 * repeats;
+            bitStream >>= 2 * repeats;
+            bitCount += 2 * repeats;
+
+            /* Add the final repeat which isn't 0b11. */
+            assert((bitStream & 3) < 3);
+            charnum += bitStream & 3;
+            bitCount += 2;
+
+            /* This is an error, but break and return an error
+             * at the end, because returning out of a loop makes
+             * it harder for the compiler to optimize.
+             */
+            if (charnum >= maxSV1) break;
+
+            /* We don't need to set the normalized count to 0
+             * because we already memset the whole buffer to 0.
+             */
+
+            if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                assert((bitCount >> 3) <= 3); /* For first condition to work */
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                bitCount &= 31;
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> bitCount;
+        }
+        {
+            int const max = (2*threshold-1) - remaining;   /* low-bit values below max are read with nbBits-1 bits, the others with nbBits */
+            int count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = bitStream & (threshold-1);
+                bitCount += nbBits-1;
+            } else {
+                count = bitStream & (2*threshold-1);
+                if (count >= threshold) count -= max;
+                bitCount += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            /* When it matters (small blocks), this is a
+             * predictable branch, because we don't use -1.
+             */
+            if (count >= 0) {
+                remaining -= count;
+            } else {
+                assert(count == -1);
+                remaining += count;   /* a count of -1 still takes 1 out of remaining */
+            }
+            normalizedCounter[charnum++] = (short)count;
+            previous0 = !count;
+
+            assert(threshold > 1);
+            if (remaining < threshold) {
+                /* This branch can be folded into the
+                 * threshold update condition because we
+                 * know that threshold > 1.
+                 */
+                if (remaining <= 1) break;
+                nbBits = ZSTD_highbit32(remaining) + 1;
+                threshold = 1 << (nbBits - 1);
+            }
+            if (charnum >= maxSV1) break;
+
+            if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                bitCount &= 31;
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> bitCount;
+    }   }
+    if (remaining != 1) return ERROR(corruption_detected);
+    /* Only possible when there are too many zeros. */
+    if (charnum > maxSV1) return ERROR(maxSymbolValue_tooSmall);
+    if (bitCount > 32) return ERROR(corruption_detected);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;   /* round the consumed bits up to whole bytes */
+    return ip-istart;
+}
+
+/* Plain static wrapper : avoids the FORCE_INLINE of the _body() function at the call site. Used by FSE_readNCount_bmi2() when the BMI2 variant is not selected. */
+static size_t FSE_readNCount_body_default(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize)
+{
+    return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+}
+
+#if DYNAMIC_BMI2   /* the variant below is only compiled when DYNAMIC_BMI2 is non-zero */
+BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize)
+{   /* same call as the default wrapper, built under BMI2_TARGET_ATTRIBUTE */
+    return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+}
+#endif   /* DYNAMIC_BMI2 */
+
+size_t FSE_readNCount_bmi2(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize, int bmi2)
+{   /* dispatcher : calls the BMI2 wrapper when it was compiled in and bmi2 is non-zero, the default wrapper otherwise */
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        return FSE_readNCount_body_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+    }
+#endif
+    (void)bmi2;   /* keeps bmi2 referenced when the branch above is compiled out */
+    return FSE_readNCount_body_default(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+}
+
+size_t FSE_readNCount(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize)
+{   /* same as FSE_readNCount_bmi2() with bmi2 forced to 0 */
+    return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0);
+}
+
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    `rankStats` is assumed to be a table of at least (HUF_TABLELOG_MAX + 1) U32.
+    @return : size read from `src`, or an error code.
+    Note : Needed by HUF_readCTable() and HUF_readDTableX?().
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];   /* on-stack workspace forwarded to HUF_readStats_wksp() */
+    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                   U32* nbSymbolsPtr, U32* tableLogPtr,
+                   const void* src, size_t srcSize,
+                   void* workSpace, size_t wkspSize,
+                   int bmi2)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* ZSTD_memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;
+        iSize = ((oSize+1)/2);
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        /* max (hwSize-1) values decoded, as last one is implied */
+        oSize = FSE_decompress_wksp_bmi2(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize, bmi2);
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << ZSTD_highbit32(rest);
+            U32 const lastWeight = ZSTD_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
+
+/* Plain static wrapper : avoids the FORCE_INLINE of the _body() function at the call site. Passes bmi2 = 0. */
+static size_t HUF_readStats_body_default(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize,
+                     void* workSpace, size_t wkspSize)
+{
+    return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 0);
+}
+
+#if DYNAMIC_BMI2   /* the variant below is only compiled when DYNAMIC_BMI2 is non-zero */
+static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize,
+                     void* workSpace, size_t wkspSize)
+{   /* same call as the default wrapper, built under BMI2_TARGET_ATTRIBUTE and passing bmi2 = 1 */
+    return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 1);
+}
+#endif   /* DYNAMIC_BMI2 */
+
+size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize,
+                     void* workSpace, size_t wkspSize,
+                     int flags)
+{   /* dispatcher : calls the BMI2 wrapper when it was compiled in and flags has HUF_flags_bmi2 set, the default wrapper otherwise */
+#if DYNAMIC_BMI2
+    if (flags & HUF_flags_bmi2) {
+        return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
+    }
+#endif
+    (void)flags;   /* keeps flags referenced when the branch above is compiled out */
+    return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
+}
diff --git a/src/zstd/common/error_private.c b/src/zstd/common/error_private.c
new file mode 100644
index 000000000..075fc5ef4
--- /dev/null
+++ b/src/zstd/common/error_private.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+#include "error_private.h"
+
+const char* ERR_getErrorString(ERR_enum code)   /* maps an error enum value to a constant, human-readable string */
+{
+#ifdef ZSTD_STRIP_ERROR_STRINGS   /* when defined : every code maps to the single generic string below */
+    (void)code;
+    return "Error strings stripped";
+#else
+    static const char* const notErrorCode = "Unspecified error code";   /* returned for maxCode and for any value not listed */
+    switch( code )
+    {
+    case PREFIX(no_error): return "No error detected";
+    case PREFIX(GENERIC):  return "Error (generic)";
+    case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+    case PREFIX(version_unsupported): return "Version not supported";
+    case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+    case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+    case PREFIX(corruption_detected): return "Data corruption detected";
+    case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
+    case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
+    case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+    case PREFIX(init_missing): return "Context should be init first";
+    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
+    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+    case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected";
+    case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+    case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+    case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+    case PREFIX(srcSize_wrong): return "Src size is incorrect";
+    case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
+    case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full";
+    case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty";
+        /* following error codes are not stable and may be removed or changed in a future version */
+    case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+    case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+    case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
+    case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
+    case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code";
+    case PREFIX(externalSequences_invalid): return "External sequences are not valid";
+    case PREFIX(maxCode):   /* shares the default string */
+    default: return notErrorCode;
+    }
+#endif
+}
diff --git a/src/zstd/common/error_private.h b/src/zstd/common/error_private.h
new file mode 100644
index 000000000..0156010c7
--- /dev/null
+++ b/src/zstd/common/error_private.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependencies
+******************************************/
+#include "../zstd_errors.h"  /* enum list */
+#include "compiler.h"
+#include "debug.h"
+#include "zstd_deps.h"       /* size_t */
+
+
+/* ****************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))   /* GNU C : static, carrying the unused attribute */
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline   /* C++, or C99 and later */
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline   /* MSVC when neither of the branches above applies */
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+*  Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;   /* error enum type used by the ERR_ helpers in this header */
+#define PREFIX(name) ZSTD_error_##name   /* pastes name onto ZSTD_error_ to form the enumerator identifier */
+
+
+/*-****************************************
+*  Error codes handling
+******************************************/
+#undef ERROR   /* already defined on Visual Studio */
+#define ERROR(name) ZSTD_ERROR(name)   /* short alias for ZSTD_ERROR() */
+#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))   /* error value : the negated enumerator, converted to size_t */
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { return ERR_isError(code) ? (ERR_enum) (0-code) : (ERR_enum)0; }   /* enum value 0 for non-errors, otherwise the negated code */
+
+/* check and forward error code */
+#define CHECK_V_F(e, f)     /* declares e = f in the enclosing scope, then returns e from the calling function if it is an error */ \
+    size_t const e = f;     \
+    do {                    \
+        if (ERR_isError(e)) \
+            return e;       \
+    } while (0)
+#define CHECK_F(f)   do { CHECK_V_F(_var_err__, f); } while (0)   /* same check, with the temporary confined to the do-while block */
+
+
+/*-****************************************
+*  Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code);   /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{   ERR_enum const errorCode = ERR_getErrorCode(code);   /* enum value 0 when code is not an error */
+    return ERR_getErrorString(errorCode);
+}
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * This is a helper function to help force C99-correctness during compilation.
+ * Under strict compilation modes, variadic macro arguments can't be empty.
+ * However, variadic function arguments can be. Using a function therefore lets
+ * us statically check that at least one (string) argument was passed,
+ * independent of the compilation flags.
+ */
+static INLINE_KEYWORD UNUSED_ATTR
+void _force_has_format_string(const char *format, ...) {
+  (void)format;
+}
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * We want to force this function invocation to be syntactically correct, but
+ * we don't want to force runtime evaluation of its arguments.
+ */
+#define _FORCE_HAS_FORMAT_STRING(...)              \
+    do {                                           \
+        if (0) {                                   \
+            _force_has_format_string(__VA_ARGS__); \
+        }                                          \
+    } while (0)
+
+#define ERR_QUOTE(str) #str
+
+/**
+ * Return the specified error if the condition evaluates to true.
+ *
+ * In debug modes, prints additional information.
+ * In order to do that (particularly, printing the conditional that failed),
+ * this can't just wrap RETURN_ERROR().
+ */
+#define RETURN_ERROR_IF(cond, err, ...)                                        \
+    do {                                                                       \
+        if (cond) {                                                            \
+            RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s",          \
+                  __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
+            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                             \
+            RAWLOG(3, ": " __VA_ARGS__);                                       \
+            RAWLOG(3, "\n");                                                   \
+            return ERROR(err);                                                 \
+        }                                                                      \
+    } while (0)
+
+/**
+ * Unconditionally return the specified error.
+ *
+ * In debug modes, prints additional information.
+ */
+#define RETURN_ERROR(err, ...)                                               \
+    do {                                                                     \
+        RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
+              __FILE__, __LINE__, ERR_QUOTE(ERROR(err)));                    \
+        _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                               \
+        RAWLOG(3, ": " __VA_ARGS__);                                         \
+        RAWLOG(3, "\n");                                                     \
+        return ERROR(err);                                                   \
+    } while(0)
+
+/**
+ * If the provided expression evaluates to an error code, returns that error code.
+ *
+ * In debug modes, prints additional information.
+ */
+#define FORWARD_IF_ERROR(err, ...)                                                 \
+    do {                                                                           \
+        size_t const err_code = (err);                                             \
+        if (ERR_isError(err_code)) {                                               \
+            RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s",                 \
+                  __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
+            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                                 \
+            RAWLOG(3, ": " __VA_ARGS__);                                           \
+            RAWLOG(3, "\n");                                                       \
+            return err_code;                                                       \
+        }                                                                          \
+    } while(0)
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
diff --git a/src/zstd/fse.h b/src/zstd/common/fse.h
similarity index 67%
rename from src/zstd/fse.h
rename to src/zstd/common/fse.h
index 7dfe132c1..2ae128e60 100644
--- a/src/zstd/fse.h
+++ b/src/zstd/common/fse.h
@@ -10,58 +10,62 @@
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  * in the COPYING file in the root directory of this source tree).
  * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
+****************************************************************** */
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
 #ifndef FSE_H
 #define FSE_H
 
+
 /*-*****************************************
- *  Dependencies
- ******************************************/
-#include "zstd_deps.h" /* size_t, ptrdiff_t */
+*  Dependencies
+******************************************/
+#include "zstd_deps.h"    /* size_t, ptrdiff_t */
+
 
 /*-*****************************************
- *  FSE_PUBLIC_API : control library symbols visibility
- ******************************************/
-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT == 1) && defined(__GNUC__) && (__GNUC__ >= 4)
-#define FSE_PUBLIC_API __attribute__((visibility("default")))
-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT == 1) /* Visual expected */
-#define FSE_PUBLIC_API __declspec(dllexport)
-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT == 1)
-#define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+*  FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define FSE_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define FSE_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
 #else
-#define FSE_PUBLIC_API
+#  define FSE_PUBLIC_API
 #endif
 
 /*------   Version   ------*/
-#define FSE_VERSION_MAJOR 0
-#define FSE_VERSION_MINOR 9
-#define FSE_VERSION_RELEASE 0
+#define FSE_VERSION_MAJOR    0
+#define FSE_VERSION_MINOR    9
+#define FSE_VERSION_RELEASE  0
 
 #define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
 #define FSE_QUOTE(str) #str
 #define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
 #define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
 
-#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR * 100 * 100 + FSE_VERSION_MINOR * 100 + FSE_VERSION_RELEASE)
-FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */
+#define FSE_VERSION_NUMBER  (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /**< library version number; to be used when checking dll version */
+
 
 /*-*****************************************
- *  Tool functions
- ******************************************/
-FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size);       /* maximum compressed size */
 
 /* Error Management */
-FSE_PUBLIC_API unsigned FSE_isError(size_t code);         /* tells if a return value is an error code */
-FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */
+FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
 
 /*-*****************************************
- *  FSE detailed API
- ******************************************/
+*  FSE detailed API
+******************************************/
 /*!
 FSE_compress() does the following:
 1. count symbol occurrence from source[] into table count[] (see hist.h)
@@ -100,7 +104,7 @@ FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize
     @return : tableLog,
               or an errorCode, which can be tested using FSE_isError() */
 FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
-                                         const unsigned* count, size_t srcSize, unsigned maxSymbolValue, unsigned useLowProbCount);
+                    const unsigned* count, size_t srcSize, unsigned maxSymbolValue, unsigned useLowProbCount);
 
 /*! FSE_NCountWriteBound():
     Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
@@ -111,13 +115,13 @@ FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tab
     Compactly save 'normalizedCounter' into 'buffer'.
     @return : size of the compressed table,
               or an errorCode, which can be tested using FSE_isError(). */
-FSE_PUBLIC_API size_t FSE_writeNCount(void* buffer, size_t bufferSize,
-                                      const short* normalizedCounter,
-                                      unsigned maxSymbolValue, unsigned tableLog);
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                                 const short* normalizedCounter,
+                                 unsigned maxSymbolValue, unsigned tableLog);
 
 /*! Constructor and Destructor of FSE_CTable.
     Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
-typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
 
 /*! FSE_buildCTable():
     Builds `ct`, which must be already allocated, using FSE_createCTable().
@@ -129,7 +133,7 @@ FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCou
     @return : size of compressed data (<= `dstCapacity`),
               or 0 if compressed data could not fit into `dst`,
               or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_compress_usingCTable(void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
 
 /*!
 Tutorial :
@@ -174,6 +178,7 @@ If it returns '0', compressed data could not fit into 'dst'.
 If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
 */
 
+
 /* *** DECOMPRESSION *** */
 
 /*! FSE_readNCount():
@@ -181,18 +186,18 @@ If there is an error, the function will return an ErrorCode (which can be tested
     @return : size read from 'rBuffer',
               or an errorCode, which can be tested using FSE_isError().
               maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
-FSE_PUBLIC_API size_t FSE_readNCount(short* normalizedCounter,
-                                     unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
-                                     const void* rBuffer, size_t rBuffSize);
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter,
+                           unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+                           const void* rBuffer, size_t rBuffSize);
 
 /*! FSE_readNCount_bmi2():
  * Same as FSE_readNCount() but pass bmi2=1 when your CPU supports BMI2 and 0 otherwise.
  */
 FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
-                                          unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
-                                          const void* rBuffer, size_t rBuffSize, int bmi2);
+                           unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+                           const void* rBuffer, size_t rBuffSize, int bmi2);
 
-typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
 
 /*!
 Tutorial :
@@ -222,7 +227,8 @@ FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<
 If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
 */
 
-#endif /* FSE_H */
+#endif  /* FSE_H */
+
 
 #if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
 #define FSE_H_FSE_STATIC_LINKING_ONLY
@@ -230,21 +236,23 @@ If there is an error, the function will return an error code, which can be teste
 /* *** Dependency *** */
 #include "bitstream.h"
 
+
 /* *****************************************
- *  Static allocation
- *******************************************/
+*  Static allocation
+*******************************************/
 /* FSE buffer bounds */
 #define FSE_NCOUNTBOUND 512
-#define FSE_BLOCKBOUND(size) ((size) + ((size) >> 7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */)
-#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+#define FSE_BLOCKBOUND(size) ((size) + ((size)>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */)
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
 
 /* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
-#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1 << ((maxTableLog)-1)) + (((maxSymbolValue) + 1) * 2))
-#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1 << (maxTableLog)))
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<((maxTableLog)-1)) + (((maxSymbolValue)+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<(maxTableLog)))
 
 /* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
-#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue) (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
-#define FSE_DTABLE_SIZE(maxTableLog) (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue)   (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog)                   (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
 
 /* *****************************************
  *  FSE advanced API
@@ -253,7 +261,7 @@ If there is an error, the function will return an error code, which can be teste
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
 /**< same as FSE_optimalTableLog(), which used `minus==2` */
 
-size_t FSE_buildCTable_rle(FSE_CTable* ct, unsigned char symbolValue);
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
 /**< build a fake FSE_CTable, designed to compress always the same symbolValue */
 
 /* FSE_buildCTable_wksp() :
@@ -261,7 +269,7 @@ size_t FSE_buildCTable_rle(FSE_CTable* ct, unsigned char symbolValue);
  * `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`.
  * See FSE_buildCTable_wksp() for breakdown of workspace usage.
  */
-#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog))) / 2 + sizeof(U64) / sizeof(U32) /* additional 8 bytes for potential table overwrite */)
+#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */)
 #define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog))
 size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
 
@@ -277,23 +285,23 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
  * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */
 
 typedef enum {
-	FSE_repeat_none,  /**< Cannot use the previous table */
-	FSE_repeat_check, /**< Can use the previous table but it must be checked */
-	FSE_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
-} FSE_repeat;
+   FSE_repeat_none,  /**< Cannot use the previous table */
+   FSE_repeat_check, /**< Can use the previous table but it must be checked */
+   FSE_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } FSE_repeat;
 
 /* *****************************************
- *  FSE symbol compression API
- *******************************************/
+*  FSE symbol compression API
+*******************************************/
 /*!
    This API consists of small unitary functions, which highly benefit from being inlined.
    Hence their body are included in next section.
 */
 typedef struct {
-	ptrdiff_t value;
-	const void* stateTable;
-	const void* symbolTT;
-	unsigned stateLog;
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
 } FSE_CState_t;
 
 static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
@@ -345,15 +353,17 @@ If there is an error, it returns an errorCode (which can be tested using FSE_isE
     size_t size = BIT_closeCStream(&bitStream);
 */
 
+
 /* *****************************************
- *  FSE symbol decompression API
- *******************************************/
+*  FSE symbol decompression API
+*******************************************/
 typedef struct {
-	size_t state;
-	const void* table; /* precise table may vary, depending on U16 */
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
 } FSE_DState_t;
 
-static void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
 
 static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
 
@@ -408,202 +418,223 @@ Check also the states. There might be some symbols left there, if some high prob
     FSE_endOfDState(&DState);
 */
 
+
 /* *****************************************
- *  FSE unsafe API
- *******************************************/
+*  FSE unsafe API
+*******************************************/
 static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
 /* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
 
+
 /* *****************************************
- *  Implementation of inlined functions
- *******************************************/
+*  Implementation of inlined functions
+*******************************************/
 typedef struct {
-	int deltaFindState;
-	U32 deltaNbBits;
+    int deltaFindState;
+    U32 deltaNbBits;
 } FSE_symbolCompressionTransform; /* total 8 bytes */
 
-MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct) {
-	const void* ptr = ct;
-	const U16* u16ptr = (const U16*)ptr;
-	const U32 tableLog = MEM_read16(ptr);
-	statePtr->value = (ptrdiff_t)1 << tableLog;
-	statePtr->stateTable = u16ptr + 2;
-	statePtr->symbolTT = ct + 1 + (tableLog ? (1 << (tableLog - 1)) : 1);
-	statePtr->stateLog = tableLog;
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+    const void* ptr = ct;
+    const U16* u16ptr = (const U16*) ptr;
+    const U32 tableLog = MEM_read16(ptr);
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = u16ptr+2;
+    statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
+    statePtr->stateLog = tableLog;
 }
 
+
 /*! FSE_initCState2() :
- *   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
- *   uses the smallest state value possible, saving the cost of this symbol */
-MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol) {
-	FSE_initCState(statePtr, ct);
-	{
-		const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
-		const U16* stateTable = (const U16*)(statePtr->stateTable);
-		U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1 << 15)) >> 16);
-		statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
-		statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
-	}
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+    }
 }
 
-MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol) {
-	FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
-	const U16* const stateTable = (const U16*)(statePtr->stateTable);
-	U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
-	BIT_addBits(bitC, statePtr->value, nbBitsOut);
-	statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol)
+{
+    FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    BIT_addBits(bitC,  (size_t)statePtr->value, nbBitsOut);
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
 }
 
-MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) {
-	BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
-	BIT_flushBits(bitC);
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+    BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog);
+    BIT_flushBits(bitC);
 }
 
+
 /* FSE_getMaxNbBits() :
  * Approximate maximum cost of a symbol, in bits.
  * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
  * note 1 : assume symbolValue is valid (<= maxSymbolValue)
  * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
-MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) {
-	const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*)symbolTTPtr;
-	return (symbolTT[symbolValue].deltaNbBits + ((1 << 16) - 1)) >> 16;
+MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
+{
+    const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+    return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16;
 }
 
 /* FSE_bitCost() :
  * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits)
  * note 1 : assume symbolValue is valid (<= maxSymbolValue)
  * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
-MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog) {
-	const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*)symbolTTPtr;
-	U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16;
-	U32 const threshold = (minNbBits + 1) << 16;
-	assert(tableLog < 16);
-	assert(accuracyLog < 31 - tableLog); /* ensure enough room for renormalization double shift */
-	{
-		U32 const tableSize = 1 << tableLog;
-		U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize);
-		U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */
-		U32 const bitMultiplier = 1 << accuracyLog;
-		assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold);
-		assert(normalizedDeltaFromThreshold <= bitMultiplier);
-		return (minNbBits + 1) * bitMultiplier - normalizedDeltaFromThreshold;
-	}
+MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog)
+{
+    const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+    U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16;
+    U32 const threshold = (minNbBits+1) << 16;
+    assert(tableLog < 16);
+    assert(accuracyLog < 31-tableLog);  /* ensure enough room for renormalization double shift */
+    {   U32 const tableSize = 1 << tableLog;
+        U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize);
+        U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog;   /* linear interpolation (very approximate) */
+        U32 const bitMultiplier = 1 << accuracyLog;
+        assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold);
+        assert(normalizedDeltaFromThreshold <= bitMultiplier);
+        return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold;
+    }
 }
 
+
 /* ======    Decompression    ====== */
 
 typedef struct {
-	U16 tableLog;
-	U16 fastMode;
-} FSE_DTableHeader; /* sizeof U32 */
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
 
 typedef struct
 {
-	unsigned short newState;
-	unsigned char symbol;
-	unsigned char nbBits;
-} FSE_decode_t; /* size == U32 */
-
-MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt) {
-	const void* ptr = dt;
-	const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
-	DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
-	BIT_reloadDStream(bitD);
-	DStatePtr->table = dt + 1;
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
 }
 
-MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr) {
-	FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-	return DInfo.symbol;
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
 }
 
-MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) {
-	FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-	U32 const nbBits = DInfo.nbBits;
-	size_t const lowBits = BIT_readBits(bitD, nbBits);
-	DStatePtr->state = DInfo.newState + lowBits;
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.newState + lowBits;
 }
 
-MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) {
-	FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-	U32 const nbBits = DInfo.nbBits;
-	BYTE const symbol = DInfo.symbol;
-	size_t const lowBits = BIT_readBits(bitD, nbBits);
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
 
-	DStatePtr->state = DInfo.newState + lowBits;
-	return symbol;
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
 }
 
 /*! FSE_decodeSymbolFast() :
     unsafe, only works if no symbol has a probability > 50% */
-MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) {
-	FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-	U32 const nbBits = DInfo.nbBits;
-	BYTE const symbol = DInfo.symbol;
-	size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
-
-	DStatePtr->state = DInfo.newState + lowBits;
-	return symbol;
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
 }
 
-MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) {
-	return DStatePtr->state == 0;
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
 }
 
+
+
 #ifndef FSE_COMMONDEFS_ONLY
 
 /* **************************************************************
- *  Tuning parameters
- ****************************************************************/
+*  Tuning parameters
+****************************************************************/
 /*!MEMORY_USAGE :
- *  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
- *  Increasing memory usage improves compression ratio
- *  Reduced memory usage can improve speed, due to cache effect
- *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
 #ifndef FSE_MAX_MEMORY_USAGE
-#define FSE_MAX_MEMORY_USAGE 14
+#  define FSE_MAX_MEMORY_USAGE 14
 #endif
 #ifndef FSE_DEFAULT_MEMORY_USAGE
-#define FSE_DEFAULT_MEMORY_USAGE 13
+#  define FSE_DEFAULT_MEMORY_USAGE 13
 #endif
-#if(FSE_DEFAULT_MEMORY_USAGE > FSE_MAX_MEMORY_USAGE)
-#error "FSE_DEFAULT_MEMORY_USAGE must be <= FSE_MAX_MEMORY_USAGE"
+#if (FSE_DEFAULT_MEMORY_USAGE > FSE_MAX_MEMORY_USAGE)
+#  error "FSE_DEFAULT_MEMORY_USAGE must be <= FSE_MAX_MEMORY_USAGE"
 #endif
 
 /*!FSE_MAX_SYMBOL_VALUE :
- *  Maximum symbol value authorized.
- *  Required for proper stack allocation */
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
 #ifndef FSE_MAX_SYMBOL_VALUE
-#define FSE_MAX_SYMBOL_VALUE 255
+#  define FSE_MAX_SYMBOL_VALUE 255
 #endif
 
 /* **************************************************************
- *  template functions type & suffix
- ****************************************************************/
+*  template functions type & suffix
+****************************************************************/
 #define FSE_FUNCTION_TYPE BYTE
 #define FSE_FUNCTION_EXTENSION
 #define FSE_DECODE_TYPE FSE_decode_t
 
-#endif /* !FSE_COMMONDEFS_ONLY */
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
 
 /* ***************************************************************
- *  Constants
- *****************************************************************/
-#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE - 2)
-#define FSE_MAX_TABLESIZE (1U << FSE_MAX_TABLELOG)
-#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE - 1)
-#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE - 2)
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
 #define FSE_MIN_TABLELOG 5
 
 #define FSE_TABLELOG_ABSOLUTE_MAX 15
 #if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
-#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
 #endif
 
-#define FSE_TABLESTEP(tableSize) (((tableSize) >> 1) + ((tableSize) >> 3) + 3)
+#define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3)
+
 
 #endif /* FSE_STATIC_LINKING_ONLY */
 
-#if defined(__cplusplus)
+
+#if defined (__cplusplus)
 }
 #endif
diff --git a/src/zstd/common/fse_decompress.c b/src/zstd/common/fse_decompress.c
new file mode 100644
index 000000000..0dcc4640d
--- /dev/null
+++ b/src/zstd/common/fse_decompress.c
@@ -0,0 +1,313 @@
+/* ******************************************************************
+ * FSE : Finite State Entropy decoder
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "debug.h"      /* assert */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+#include "zstd_deps.h"  /* ZSTD_memcpy */
+#include "bits.h"       /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16* symbolNext = (U16*)workSpace;
+    BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1);
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge);
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = (U16)normalizedCounter[s];
+        }   }   }
+        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    if (highThreshold == tableSize - 1) {
+        size_t const tableMask = tableSize-1;
+        size_t const step = FSE_TABLESTEP(tableSize);
+        /* First lay down the symbols in order.
+         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+         * misses since small blocks generally have small table logs, so nearly
+         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+         * our buffer to handle the over-write.
+         */
+        {   U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                pos += (size_t)n;
+        }   }
+        /* Now we spread those positions across the table.
+         * The benefit of doing it in two stages is that we avoid the
+         * variable size inner loop, which caused lots of branch misses.
+         * Now we can run through all the positions without any branch misses.
+         * We unroll the loop twice, since that is what empirically worked best.
+         */
+        {
+            size_t position = 0;
+            size_t s;
+            size_t const unroll = 2;
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableDecode[uPosition].symbol = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);
+        }
+    } else {
+        U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_buildDTable_internal(dt, normalizedCounter, maxSymbolValue, tableLog, workSpace, wkspSize);
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    assert(op >= ostart);
+    return (size_t)(op-ostart);
+}
+
+typedef struct {
+    short ncount[FSE_MAX_SYMBOL_VALUE + 1];
+} FSE_DecompressWksp;
+
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
+        void* dst, size_t dstCapacity,
+        const void* cSrc, size_t cSrcSize,
+        unsigned maxLog, void* workSpace, size_t wkspSize,
+        int bmi2)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
+    size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable);
+    FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos;
+
+    FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
+    if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC);
+
+    /* correct offset to dtable depends on this property */
+    FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0);
+
+    /* normal FSE decoding mode */
+    {   size_t const NCountLength =
+            FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
+        if (FSE_isError(NCountLength)) return NCountLength;
+        if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+        assert(NCountLength <= cSrcSize);
+        ip += NCountLength;
+        cSrcSize -= NCountLength;
+    }
+
+    if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
+    assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
+    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
+    wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
+
+    CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
+
+    {
+        const void* ptr = dtable;
+        const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+        const U32 fastMode = DTableH->fastMode;
+
+        /* select fast mode (static) */
+        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1);
+        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0);
+    }
+}
+
+/* Avoids the FORCE_INLINE of the _body() function. */
+static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0);
+}
+
+#if DYNAMIC_BMI2
+BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1);
+}
+#endif
+
+size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2)
+{
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
+    }
+#endif
+    (void)bmi2;
+    return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
+}
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/src/zstd/huf.h b/src/zstd/common/huf.h
similarity index 61%
rename from src/zstd/huf.h
rename to src/zstd/common/huf.h
index 2c6312a9e..99bf85d6f 100644
--- a/src/zstd/huf.h
+++ b/src/zstd/common/huf.h
@@ -10,9 +10,9 @@
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  * in the COPYING file in the root directory of this source tree).
  * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
+****************************************************************** */
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
@@ -20,97 +20,102 @@ extern "C" {
 #define HUF_H_298734234
 
 /* *** Dependencies *** */
-#include "zstd_deps.h" /* size_t */
-#include "mem.h"       /* U32 */
+#include "zstd_deps.h"    /* size_t */
+#include "mem.h"          /* U32 */
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
 
+
 /* ***   Tool functions *** */
-#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */
-size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)   /**< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size);   /**< maximum compressed size (worst case) */
 
 /* Error Management */
-unsigned HUF_isError(size_t code);         /**< tells if a return value is an error code */
-const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */
+unsigned    HUF_isError(size_t code);       /**< tells if a return value is an error code */
+const char* HUF_getErrorName(size_t code);  /**< provides error code string (useful for debugging) */
+
 
 #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
 #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
 
 /* *** Constants *** */
-#define HUF_TABLELOG_MAX 12     /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
-#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */
-#define HUF_SYMBOLVALUE_MAX 255
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
+#define HUF_TABLELOG_DEFAULT  11      /* default tableLog value when none specified */
+#define HUF_SYMBOLVALUE_MAX  255
 
-#define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
-#if(HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
-#error "HUF_TABLELOG_MAX is too large !"
+#define HUF_TABLELOG_ABSOLUTEMAX  12  /* absolute limit of HUF_TABLELOG_MAX. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#  error "HUF_TABLELOG_MAX is too large !"
 #endif
 
+
 /* ****************************************
- *  Static allocation
- ******************************************/
+*  Static allocation
+******************************************/
 /* HUF buffer bounds */
 #define HUF_CTABLEBOUND 129
-#define HUF_BLOCKBOUND(size) (size + (size >> 8) + 8)                    /* only true when incompressible is pre-filtered with fast heuristic */
-#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+#define HUF_BLOCKBOUND(size) ((size) + ((size)>>8) + 8)   /* only true when incompressible is pre-filtered with fast heuristic; argument is parenthesized so compound expressions expand correctly */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
 
 /* static allocation of HUF's Compression Table */
 /* this is a private definition, just exposed for allocation and strict aliasing purpose. never EVER access its members directly */
-typedef size_t HUF_CElt;                                          /* consider it an incomplete type */
-#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue) + 2) /* Use tables of size_t, for proper alignment */
-#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t))
+typedef size_t HUF_CElt;   /* consider it an incomplete type */
+#define HUF_CTABLE_SIZE_ST(maxSymbolValue)   ((maxSymbolValue)+2)   /* Use tables of size_t, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue)       (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t))
 #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
-	HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */
+    HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */
 
 /* static allocation of HUF's DTable */
 typedef U32 HUF_DTable;
-#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1 << (maxTableLog)))
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
 #define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \
-	HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = {((U32)((maxTableLog)-1) * 0x01000001)}
+        HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
 #define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
-	HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = {((U32)(maxTableLog)*0x01000001)}
+        HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
 
 /* ****************************************
- *  Advanced decompression functions
- ******************************************/
+*  Advanced decompression functions
+******************************************/
 
 /**
  * Huffman flags bitset.
  * For all flags, 0 is the default value.
  */
 typedef enum {
-	/**
-	 * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
-	 * Otherwise: Ignored.
-	 */
-	HUF_flags_bmi2 = (1 << 0),
-	/**
-	 * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
-	 * If unset: Use heuristic to find the table depth.
-	 */
-	HUF_flags_optimalDepth = (1 << 1),
-	/**
-	 * If set: If the previous table can encode the input, always reuse the previous table.
-	 * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
-	 */
-	HUF_flags_preferRepeat = (1 << 2),
-	/**
-	 * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
-	 * If unset: Always histogram the entire input.
-	 */
-	HUF_flags_suspectUncompressible = (1 << 3),
-	/**
-	 * If set: Don't use assembly implementations
-	 * If unset: Allow using assembly implementations
-	 */
-	HUF_flags_disableAsm = (1 << 4),
-	/**
-	 * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
-	 * If unset: Use the fast decoding loop when possible.
-	 */
-	HUF_flags_disableFast = (1 << 5)
+    /**
+     * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
+     * Otherwise: Ignored.
+     */
+    HUF_flags_bmi2 = (1 << 0),
+    /**
+     * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
+     * If unset: Use heuristic to find the table depth.
+     */
+    HUF_flags_optimalDepth = (1 << 1),
+    /**
+     * If set: If the previous table can encode the input, always reuse the previous table.
+     * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
+     */
+    HUF_flags_preferRepeat = (1 << 2),
+    /**
+     * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
+     * If unset: Always histogram the entire input.
+     */
+    HUF_flags_suspectUncompressible = (1 << 3),
+    /**
+     * If set: Don't use assembly implementations
+     * If unset: Allow using assembly implementations
+     */
+    HUF_flags_disableAsm = (1 << 4),
+    /**
+     * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
+     * If unset: Use the fast decoding loop when possible.
+     */
+    HUF_flags_disableFast = (1 << 5)
 } HUF_flags_e;
 
+
 /* ****************************************
  *  HUF detailed API
  * ****************************************/
@@ -130,17 +135,17 @@ typedef enum {
 unsigned HUF_minTableLog(unsigned symbolCardinality);
 unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
 unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
-                             size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
+ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
 size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 
 typedef enum {
-	HUF_repeat_none,  /**< Cannot use the previous table */
-	HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
-	HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
-} HUF_repeat;
+   HUF_repeat_none,  /**< Cannot use the previous table */
+   HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+   HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } HUF_repeat;
 
 /** HUF_compress4X_repeat() :
  *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
@@ -149,10 +154,10 @@ typedef enum {
  *  If preferRepeat then the old table will always be used if valid.
  *  If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
 size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
-                             const void* src, size_t srcSize,
-                             unsigned maxSymbolValue, unsigned tableLog,
-                             void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                             HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,    /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
 /** HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
@@ -160,9 +165,9 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
  */
 #define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
 #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
-size_t HUF_buildCTable_wksp(HUF_CElt* tree,
-                            const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
-                            void* workSpace, size_t wkspSize);
+size_t HUF_buildCTable_wksp (HUF_CElt* tree,
+                       const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+                             void* workSpace, size_t wkspSize);
 
 /*! HUF_readStats() :
  *  Read compact Huffman tree, saved by HUF_writeCTable().
@@ -178,7 +183,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
  * 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE.
  * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
  */
-#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX - 1)
+#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1)
 #define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned))
 size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
                           U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
@@ -188,13 +193,26 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
 
 /** HUF_readCTable() :
  *  Loading a CTable saved with HUF_writeCTable() */
-size_t HUF_readCTable(HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights);
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);
 
 /** HUF_getNbBitsFromCTable() :
  *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
- *  Note 1 : is not inlined, as HUF_CElt definition is private */
+ *  Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0
+ *  Note 2 : is not inlined, as HUF_CElt definition is private
+ */
 U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
 
+typedef struct {
+    BYTE tableLog;
+    BYTE maxSymbolValue;
+    BYTE unused[sizeof(size_t) - 2];
+} HUF_CTableHeader;
+
+/** HUF_readCTableHeader() :
+ *  @returns The header from the CTable specifying the tableLog and the maxSymbolValue.
+ */
+HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable);
+
 /*
  * HUF_decompress() does the following:
  * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
@@ -207,7 +225,7 @@ U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
  *  based on a set of pre-computed metrics.
  * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
  *  Assumption : 0 < dstSize <= 128 KB */
-U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
 
 /**
  *  The minimum workspace size for the `workSpace` used in
@@ -222,6 +240,7 @@ U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
 #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
 #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 
+
 /* ====================== */
 /* single stream variants */
 /* ====================== */
@@ -234,14 +253,14 @@ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, si
  *  If preferRepeat then the old table will always be used if valid.
  *  If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
 size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
-                             const void* src, size_t srcSize,
-                             unsigned maxSymbolValue, unsigned tableLog,
-                             void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                             HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
 size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /**< double-symbols decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /**< double-symbols decoder */
 #endif
 
 /* BMI2 variants.
@@ -260,8 +279,8 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
 
-#endif /* HUF_H_298734234 */
+#endif   /* HUF_H_298734234 */
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
diff --git a/src/zstd/common/mem.h b/src/zstd/common/mem.h
new file mode 100644
index 000000000..096f4be51
--- /dev/null
+++ b/src/zstd/common/mem.h
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>  /* size_t, ptrdiff_t */
+#include "compiler.h"  /* __has_builtin */
+#include "debug.h"  /* DEBUG_STATIC_ASSERT */
+#include "zstd_deps.h"  /* ZSTD_memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  if defined(_AIX)
+#    include <inttypes.h>
+#  else
+#    include <stdint.h> /* intptr_t */
+#  endif
+  typedef   uint8_t BYTE;
+  typedef   uint8_t U8;
+  typedef    int8_t S8;
+  typedef  uint16_t U16;
+  typedef   int16_t S16;
+  typedef  uint32_t U32;
+  typedef   int32_t S32;
+  typedef  uint64_t U64;
+  typedef   int64_t S64;
+#else
+# include <limits.h>
+#if CHAR_BIT != 8
+#  error "this implementation requires char to be exactly 8-bit type"
+#endif
+  typedef unsigned char      BYTE;
+  typedef unsigned char      U8;
+  typedef   signed char      S8;
+#if USHRT_MAX != 65535
+#  error "this implementation requires short to be exactly 16-bit type"
+#endif
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+#if UINT_MAX != 4294967295
+#  error "this implementation requires int to be exactly 32-bit type"
+#endif
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+/* note : there are no limits defined for long long type in C90.
+ * limits exist in C99, however, in such case, <stdint.h> is preferred */
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O API
+*****************************************************************/
+/*=== Static platform detection ===*/
+MEM_STATIC unsigned MEM_32bits(void);
+MEM_STATIC unsigned MEM_64bits(void);
+MEM_STATIC unsigned MEM_isLittleEndian(void);
+
+/*=== Native unaligned read/write ===*/
+MEM_STATIC U16 MEM_read16(const void* memPtr);
+MEM_STATIC U32 MEM_read32(const void* memPtr);
+MEM_STATIC U64 MEM_read64(const void* memPtr);
+MEM_STATIC size_t MEM_readST(const void* memPtr);
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value);
+MEM_STATIC void MEM_write32(void* memPtr, U32 value);
+MEM_STATIC void MEM_write64(void* memPtr, U64 value);
+
+/*=== Little endian unaligned read/write ===*/
+MEM_STATIC U16 MEM_readLE16(const void* memPtr);
+MEM_STATIC U32 MEM_readLE24(const void* memPtr);
+MEM_STATIC U32 MEM_readLE32(const void* memPtr);
+MEM_STATIC U64 MEM_readLE64(const void* memPtr);
+MEM_STATIC size_t MEM_readLEST(const void* memPtr);
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val);
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val);
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32);
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64);
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val);
+
+/*=== Big endian unaligned read/write ===*/
+MEM_STATIC U32 MEM_readBE32(const void* memPtr);
+MEM_STATIC U64 MEM_readBE64(const void* memPtr);
+MEM_STATIC size_t MEM_readBEST(const void* memPtr);
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32);
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64);
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val);
+
+/*=== Byteswap ===*/
+MEM_STATIC U32 MEM_swap32(U32 in);
+MEM_STATIC U64 MEM_swap64(U64 in);
+MEM_STATIC size_t MEM_swapST(size_t in);
+
+
+/*-**************************************************************
+*  Memory I/O Implementation
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
+ * Method 0 : always use `memcpy()`. Safe and portable.
+ * Method 1 : Use compiler extension to set unaligned access.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ * Default  : method 1 if supported, else method 0
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  ifdef __GNUC__
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+    return 1;
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return 0;
+#elif defined(__clang__) && __LITTLE_ENDIAN__
+    return 1;
+#elif defined(__clang__) && __BIG_ENDIAN__
+    return 0;
+#elif defined(_MSC_VER) && (_M_AMD64 || _M_IX86)
+    return 1;
+#elif defined(__DMC__) && defined(_M_IX86)
+    return 1;
+#else
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+#endif
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* Violates the C standard by lying about structure alignment.
+Only use if there is no other way to achieve best performance on the target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+typedef __attribute__((aligned(1))) U16 unalign16;
+typedef __attribute__((aligned(1))) U32 unalign32;
+typedef __attribute__((aligned(1))) U64 unalign64;
+typedef __attribute__((aligned(1))) size_t unalignArch;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return *(const unalignArch*)ptr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+    size_t val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    ZSTD_memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    ZSTD_memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    ZSTD_memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32_fallback(U32 in)
+{
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+}
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+  || (defined(__clang__) && __has_builtin(__builtin_bswap32))
+    return __builtin_bswap32(in);
+#else
+    return MEM_swap32_fallback(in);
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64_fallback(U64 in)
+{
+     return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+  || (defined(__clang__) && __has_builtin(__builtin_bswap64))
+    return __builtin_bswap64(in);
+#else
+    return MEM_swap64_fallback(in);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_swap32((U32)in);
+    else
+        return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)
+{
+    return (U32)MEM_readLE16(memPtr) + ((U32)(((const BYTE*)memPtr)[2]) << 16);
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)
+{
+    MEM_writeLE16(memPtr, (U16)val);
+    ((BYTE*)memPtr)[2] = (BYTE)(val>>16);
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, val32);
+    else
+        MEM_write32(memPtr, MEM_swap32(val32));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, val64);
+    else
+        MEM_write64(memPtr, MEM_swap64(val64));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeLE32(memPtr, (U32)val);
+    else
+        MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap32(MEM_read32(memPtr));
+    else
+        return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, MEM_swap32(val32));
+    else
+        MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap64(MEM_read64(memPtr));
+    else
+        return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, MEM_swap64(val64));
+    else
+        MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readBE32(memPtr);
+    else
+        return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeBE32(memPtr, (U32)val);
+    else
+        MEM_writeBE64(memPtr, (U64)val);
+}
+
+/* code only tested on 32-bit and 64-bit systems */
+MEM_STATIC void MEM_check(void) { DEBUG_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
diff --git a/src/zstd/common/pool.c b/src/zstd/common/pool.c
new file mode 100644
index 000000000..3adcefc9a
--- /dev/null
+++ b/src/zstd/common/pool.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Dependencies   ======= */
+#include "../common/allocations.h"  /* ZSTD_customCalloc, ZSTD_customFree */
+#include "zstd_deps.h" /* size_t */
+#include "debug.h"     /* assert */
+#include "pool.h"
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+#include "threading.h"   /* pthread adaptation */
+
+/* A job is a function pointer paired with the opaque argument it is invoked with */
+typedef struct POOL_job_s {
+    POOL_function function;   /* callback executed by a worker thread */
+    void *opaque;             /* passed unchanged to `function` */
+} POOL_job;
+
+struct POOL_ctx_s {
+    ZSTD_customMem customMem;    /* used by POOL_free() to release queue, threads and the context */
+    /* Keep track of the threads */
+    ZSTD_pthread_t* threads;     /* thread handles ; the first `threadCapacity` entries are joined on shutdown */
+    size_t threadCapacity;       /* number of threads that were created */
+    size_t threadLimit;          /* workers take a new job only while numThreadsBusy < threadLimit */
+
+    /* The queue is a circular buffer */
+    POOL_job *queue;
+    size_t queueHead;            /* index of the next job to pop */
+    size_t queueTail;            /* index where the next job is stored */
+    size_t queueSize;            /* number of slots : requested queue size + 1 */
+
+    /* The number of threads working on jobs */
+    size_t numThreadsBusy;
+    /* Indicates if the queue is empty */
+    int queueEmpty;
+
+    /* The mutex protects the queue */
+    ZSTD_pthread_mutex_t queueMutex;
+    /* Condition variable for pushers (and POOL_joinJobs()) to wait on when the queue is full */
+    ZSTD_pthread_cond_t queuePushCond;
+    /* Condition variable for poppers to wait on when the queue is empty */
+    ZSTD_pthread_cond_t queuePopCond;
+    /* Indicates if the queue is shutting down */
+    int shutdown;
+};
+
+/* POOL_thread() :
+ * Work thread for the thread pool.
+ * Waits for jobs and executes them until the pool is shut down.
+ * @returns : NULL if `opaque` is NULL, else `opaque` once shutdown is observed.
+ */
+static void* POOL_thread(void* opaque) {
+    POOL_ctx* const ctx = (POOL_ctx*)opaque;
+    if (!ctx) { return NULL; }
+    for (;;) {
+        /* Lock the mutex and wait for a non-empty queue or until shutdown */
+        ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+
+        while ( ctx->queueEmpty
+            || (ctx->numThreadsBusy >= ctx->threadLimit) ) {
+            if (ctx->shutdown) {
+                /* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit),
+                 * a few threads will be shutdown while !queueEmpty,
+                 * but enough threads will remain active to finish the queue */
+                ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+                return opaque;
+            }
+            ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+        }
+        /* Pop a job off the queue (the mutex is held at this point) */
+        {   POOL_job const job = ctx->queue[ctx->queueHead];
+            ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+            ctx->numThreadsBusy++;
+            ctx->queueEmpty = (ctx->queueHead == ctx->queueTail);
+            /* Signal a pusher that a slot was freed, unlock the mutex, and run the job */
+            ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+
+            job.function(job.opaque);
+
+            /* Mark this thread idle ; signal again (intended queue size 0, or a POOL_joinJobs() waiter) */
+            ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+            ctx->numThreadsBusy--;
+            ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        }
+    }  /* for (;;) */
+    assert(0);  /* Unreachable */
+}
+
+/* ZSTD_createThreadPool() : public access point, forwards to POOL_create() with a queueSize of 0 */
+POOL_ctx* ZSTD_createThreadPool(size_t numThreads) {
+    return POOL_create (numThreads, 0);
+}
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {   /* POOL_create_advanced() with ZSTD_defaultCMem */
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+/* POOL_create_advanced() :
+ * Builds a pool of `numThreads` workers with `queueSize` + 1 queue slots, allocated through `customMem`.
+ * @return : the new context, or NULL if numThreads==0 or on allocation / init / thread-creation failure. */
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem)
+{
+    POOL_ctx* ctx;
+    /* Check parameters */
+    if (!numThreads) { return NULL; }
+    /* Allocate the context and zero initialize */
+    ctx = (POOL_ctx*)ZSTD_customCalloc(sizeof(POOL_ctx), customMem);
+    if (!ctx) { return NULL; }
+    ctx->customMem = customMem;   /* set first : every POOL_free(ctx) below must release with this allocator */
+    /* Initialize the job queue. One extra slot is allocated,
+     * since one slot is wasted to differentiate empty and full queues. */
+    ctx->queueSize = queueSize + 1;
+    ctx->queue = (POOL_job*)ZSTD_customCalloc(ctx->queueSize * sizeof(POOL_job), customMem);
+    ctx->queueHead = 0;
+    ctx->queueTail = 0;
+    ctx->numThreadsBusy = 0;
+    ctx->queueEmpty = 1;
+    {   int error = 0;
+        error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);
+        error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
+        error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
+        if (error) { POOL_free(ctx); return NULL; }
+    }
+    ctx->shutdown = 0;
+    /* Allocate space for the thread handles */
+    ctx->threads = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+    ctx->threadCapacity = 0;
+    /* Check for errors */
+    if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+    /* Initialize the threads */
+    {   size_t i;
+        for (i = 0; i < numThreads; ++i) {
+            if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+                ctx->threadCapacity = i;   /* POOL_free() joins only the threads that actually started */
+                POOL_free(ctx);
+                return NULL;
+        }   }
+        ctx->threadCapacity = numThreads;
+        ctx->threadLimit = numThreads;
+    }
+    return ctx;
+}
+
+/*! POOL_join() :
+    Shutdown the queue, wake any sleeping threads, and join all `threadCapacity` threads.
+*/
+static void POOL_join(POOL_ctx* ctx) {
+    /* Shut down the queue : the flag is written under the mutex that workers hold when reading it */
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    ctx->shutdown = 1;
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    /* Wake up sleeping threads : pushers blocked in POOL_add() and idle workers alike */
+    ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+    /* Join all of the threads */
+    {   size_t i;
+        for (i = 0; i < ctx->threadCapacity; ++i) {
+            ZSTD_pthread_join(ctx->threads[i]);  /* note : could fail ; the result is ignored */
+    }   }
+}
+
+void POOL_free(POOL_ctx *ctx) {   /* stops the workers, then releases everything owned by ctx */
+    if (!ctx) { return; }   /* freeing NULL is a no-op */
+    POOL_join(ctx);   /* workers are joined before their mutex / condition variables are destroyed */
+    ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
+    ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
+    ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
+    ZSTD_customFree(ctx->queue, ctx->customMem);
+    ZSTD_customFree(ctx->threads, ctx->customMem);
+    ZSTD_customFree(ctx, ctx->customMem);   /* the context itself goes last, after the members it owns */
+}
+
+/*! POOL_joinJobs() :
+ *  Blocks until the queue is empty and no thread is busy. `ctx` is dereferenced unchecked.
+ */
+void POOL_joinJobs(POOL_ctx* ctx) {
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    while(!ctx->queueEmpty || ctx->numThreadsBusy > 0) {
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);   /* signaled by workers after each pop and each finished job */
+    }
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+void ZSTD_freeThreadPool (ZSTD_threadPool* pool) {   /* public wrapper : forwards to POOL_free() */
+  POOL_free (pool);
+}
+
+size_t POOL_sizeof(const POOL_ctx* ctx) {
+    size_t total = 0;   /* bytes owned by the pool : context + job queue + thread handles */
+    if (ctx == NULL) return total;  /* supports sizeof NULL */
+    total = sizeof(*ctx) + ctx->queueSize * sizeof(POOL_job);
+    return total + ctx->threadCapacity * sizeof(ZSTD_pthread_t);
+}
+
+
+/* Shrinks by lowering threadLimit, or grows by spawning extra threads. @return : 0 on success, 1 on error */
+static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
+{
+    if (numThreads <= ctx->threadCapacity) {
+        if (!numThreads) return 1;   /* a request for 0 threads is rejected */
+        ctx->threadLimit = numThreads;   /* no thread is stopped ; only the concurrency cap changes */
+        return 0;
+    }
+    /* numThreads > threadCapacity */
+    {   ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
+        if (!threadPool) return 1;
+        /* replace existing thread pool : copy the current handles into the larger array */
+        ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(ZSTD_pthread_t));
+        ZSTD_customFree(ctx->threads, ctx->customMem);
+        ctx->threads = threadPool;
+        /* Initialize additional threads */
+        {   size_t threadId;
+            for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) {
+                if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) {
+                    ctx->threadCapacity = threadId;   /* threads created so far stay tracked ; threadLimit is unchanged */
+                    return 1;
+            }   }
+    }   }
+    /* successfully expanded */
+    ctx->threadCapacity = numThreads;
+    ctx->threadLimit = numThreads;
+    return 0;
+}
+
+/* Runs the resize under queueMutex. @return : 0 on success, 1 on error (including ctx==NULL) */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads)
+{
+    int result;
+    if (ctx==NULL) return 1;
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    result = POOL_resize_internal(ctx, numThreads);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);   /* wake waiting workers so they re-check threadLimit */
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return result;
+}
+
+/**
+ * Reports whether the queue can accept no more jobs (1) or still has room (0).
+ *
+ * With queueSize == 1 (pool created with an intended queueSize of 0), a job
+ * is accepted only if a thread is free _and_ no job is already waiting.
+ */
+static int isQueueFull(POOL_ctx const* ctx) {
+    if (ctx->queueSize <= 1) {
+        /* zero-length queue : full once every allowed thread is busy or a job is pending */
+        return !ctx->queueEmpty || (ctx->numThreadsBusy == ctx->threadLimit);
+    }
+    /* circular buffer : full when advancing the tail would land on the head */
+    return ((ctx->queueTail + 1) % ctx->queueSize) == ctx->queueHead;
+}
+
+
+static void
+POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)
+{
+    POOL_job job;
+    job.function = function;
+    job.opaque = opaque;
+    assert(ctx != NULL);
+    if (ctx->shutdown) return;   /* the job is silently dropped once shutdown has started */
+    /* Append at the tail ; both callers hold queueMutex and have checked isQueueFull() */
+    ctx->queueEmpty = 0;
+    ctx->queue[ctx->queueTail] = job;
+    ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
+    ZSTD_pthread_cond_signal(&ctx->queuePopCond);   /* wake one worker waiting for a job */
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    /* Wait until there is space in the queue for the new job, or until shutdown begins */
+    while (isQueueFull(ctx) && (!ctx->shutdown)) {
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+    }
+    POOL_add_internal(ctx, function, opaque);   /* drops the job if the pool is shutting down */
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    int accepted = 0;   /* becomes 1 once the job has been handed to POOL_add_internal() */
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    if (!isQueueFull(ctx)) {   /* never waits : a full queue just reports 0 */
+        POOL_add_internal(ctx, function, opaque);
+        accepted = 1;
+    }
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return accepted;
+}
+
+
+#else  /* ZSTD_MULTITHREAD  not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
+struct POOL_ctx_s {
+    int dummy;   /* placeholder member ; not accessed anywhere in this section */
+};
+static POOL_ctx g_poolCtx;   /* the single context returned by POOL_create_advanced() */
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {   /* POOL_create_advanced() with ZSTD_defaultCMem */
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx*
+POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem)
+{   /* no-multithreading variant : every parameter is ignored */
+    (void)numThreads;
+    (void)queueSize;
+    (void)customMem;
+    return &g_poolCtx;   /* always the same static context ; nothing is allocated */
+}
+
+void POOL_free(POOL_ctx* ctx) {   /* nothing to release : the only valid context is the static g_poolCtx */
+    assert(!ctx || ctx == &g_poolCtx);
+    (void)ctx;
+}
+
+void POOL_joinJobs(POOL_ctx* ctx){   /* no-op : POOL_add() runs each job before returning, so none is pending */
+    assert(!ctx || ctx == &g_poolCtx);
+    (void)ctx;
+}
+
+int POOL_resize(POOL_ctx* ctx, size_t numThreads) {   /* no threads to resize : both arguments are ignored */
+    (void)ctx; (void)numThreads;
+    return 0;   /* always reports success */
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);   /* the job runs immediately, on the calling thread */
+}
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);   /* the job runs immediately, on the calling thread */
+    return 1;           /* so the job is always reported as accepted */
+}
+
+size_t POOL_sizeof(const POOL_ctx* ctx) {
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    assert(ctx == &g_poolCtx);
+    return sizeof(*ctx);   /* size of the placeholder struct only */
+}
+
+#endif  /* ZSTD_MULTITHREAD */
diff --git a/src/zstd/pool.h b/src/zstd/common/pool.h
similarity index 93%
rename from src/zstd/pool.h
rename to src/zstd/common/pool.h
index ae61b4695..cca4de73a 100644
--- a/src/zstd/pool.h
+++ b/src/zstd/common/pool.h
@@ -11,12 +11,13 @@
 #ifndef POOL_H
 #define POOL_H
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
+
 #include "zstd_deps.h"
-#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_customMem */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_customMem */
 #include "../zstd.h"
 
 typedef struct POOL_ctx_s POOL_ctx;
@@ -26,7 +27,7 @@ typedef struct POOL_ctx_s POOL_ctx;
  * `numThreads` must be at least 1.
  *  The maximum number of queued jobs before blocking is `queueSize`.
  * @return : POOL_ctx pointer on success, else NULL.
- */
+*/
 POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
 
 POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
@@ -37,6 +38,7 @@ POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
  */
 void POOL_free(POOL_ctx* ctx);
 
+
 /*! POOL_joinJobs() :
  *  Waits for all queued jobs to finish executing.
  */
@@ -45,7 +47,7 @@ void POOL_joinJobs(POOL_ctx* ctx);
 /*! POOL_resize() :
  *  Expands or shrinks pool's number of threads.
  *  This is more efficient than releasing + creating a new context,
- *  since it tries to preserve and re-use existing threads.
+ *  since it tries to preserve and reuse existing threads.
  * `numThreads` must be at least 1.
  * @return : 0 when resize was successful,
  *           !0 (typically 1) if there is an error.
@@ -72,6 +74,7 @@ typedef void (*POOL_function)(void*);
  */
 void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
 
+
 /*! POOL_tryAdd() :
  *  Add the job `function(opaque)` to thread pool _if_ a queue slot is available.
  *  Returns immediately even if not (does not block).
@@ -79,7 +82,8 @@ void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
  */
 int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
 
-#if defined(__cplusplus)
+
+#if defined (__cplusplus)
 }
 #endif
 
diff --git a/src/zstd/portability_macros.h b/src/zstd/common/portability_macros.h
similarity index 59%
rename from src/zstd/portability_macros.h
rename to src/zstd/common/portability_macros.h
index 24a8b7193..e50314a78 100644
--- a/src/zstd/portability_macros.h
+++ b/src/zstd/common/portability_macros.h
@@ -20,66 +20,73 @@
  *
  */
 
+
 /* compat. with non-clang compilers */
 #ifndef __has_attribute
-#define __has_attribute(x) 0
+  #define __has_attribute(x) 0
 #endif
 
 /* compat. with non-clang compilers */
 #ifndef __has_builtin
-#define __has_builtin(x) 0
+#  define __has_builtin(x) 0
 #endif
 
 /* compat. with non-clang compilers */
 #ifndef __has_feature
-#define __has_feature(x) 0
+#  define __has_feature(x) 0
 #endif
 
 /* detects whether we are being compiled under msan */
 #ifndef ZSTD_MEMORY_SANITIZER
-#if __has_feature(memory_sanitizer)
-#define ZSTD_MEMORY_SANITIZER 1
-#else
-#define ZSTD_MEMORY_SANITIZER 0
-#endif
+#  if __has_feature(memory_sanitizer)
+#    define ZSTD_MEMORY_SANITIZER 1
+#  else
+#    define ZSTD_MEMORY_SANITIZER 0
+#  endif
 #endif
 
 /* detects whether we are being compiled under asan */
 #ifndef ZSTD_ADDRESS_SANITIZER
-#if __has_feature(address_sanitizer)
-#define ZSTD_ADDRESS_SANITIZER 1
-#elif defined(__SANITIZE_ADDRESS__)
-#define ZSTD_ADDRESS_SANITIZER 1
-#else
-#define ZSTD_ADDRESS_SANITIZER 0
-#endif
+#  if __has_feature(address_sanitizer)
+#    define ZSTD_ADDRESS_SANITIZER 1
+#  elif defined(__SANITIZE_ADDRESS__)
+#    define ZSTD_ADDRESS_SANITIZER 1
+#  else
+#    define ZSTD_ADDRESS_SANITIZER 0
+#  endif
 #endif
 
 /* detects whether we are being compiled under dfsan */
 #ifndef ZSTD_DATAFLOW_SANITIZER
-#if __has_feature(dataflow_sanitizer)
-#define ZSTD_DATAFLOW_SANITIZER 1
-#else
-#define ZSTD_DATAFLOW_SANITIZER 0
-#endif
+# if __has_feature(dataflow_sanitizer)
+#  define ZSTD_DATAFLOW_SANITIZER 1
+# else
+#  define ZSTD_DATAFLOW_SANITIZER 0
+# endif
 #endif
 
 /* Mark the internal assembly functions as hidden  */
 #ifdef __ELF__
-#define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func
+# define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func
+#elif defined(__APPLE__)
+# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func
 #else
-#define ZSTD_HIDE_ASM_FUNCTION(func)
+# define ZSTD_HIDE_ASM_FUNCTION(func)
 #endif
 
 /* Enable runtime BMI2 dispatch based on the CPU.
  * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
  */
 #ifndef DYNAMIC_BMI2
-#if((defined(__clang__) && __has_attribute(__target__)) || (defined(__GNUC__) && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) && (defined(__x86_64__) || defined(_M_X64)) && !defined(__BMI2__)
-#define DYNAMIC_BMI2 1
-#else
-#define DYNAMIC_BMI2 0
-#endif
+  #if ((defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
+      && (defined(__x86_64__) || defined(_M_X64)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
 #endif
 
 /**
@@ -94,19 +101,19 @@
  * 100% of code to be instrumented to work.
  */
 #if defined(__GNUC__)
-#if defined(__linux__) || defined(__linux) || defined(__APPLE__)
-#if ZSTD_MEMORY_SANITIZER
-#define ZSTD_ASM_SUPPORTED 0
-#elif ZSTD_DATAFLOW_SANITIZER
-#define ZSTD_ASM_SUPPORTED 0
-#else
-#define ZSTD_ASM_SUPPORTED 1
-#endif
+#  if defined(__linux__) || defined(__linux) || defined(__APPLE__)
+#    if ZSTD_MEMORY_SANITIZER
+#      define ZSTD_ASM_SUPPORTED 0
+#    elif ZSTD_DATAFLOW_SANITIZER
+#      define ZSTD_ASM_SUPPORTED 0
+#    else
+#      define ZSTD_ASM_SUPPORTED 1
+#    endif
+#  else
+#    define ZSTD_ASM_SUPPORTED 0
+#  endif
 #else
-#define ZSTD_ASM_SUPPORTED 0
-#endif
-#else
-#define ZSTD_ASM_SUPPORTED 0
+#  define ZSTD_ASM_SUPPORTED 0
 #endif
 
 /**
@@ -120,13 +127,13 @@
  *   - DYNAMIC_BMI2 is enabled
  *   - BMI2 is supported at compile time
  */
-#if !defined(ZSTD_DISABLE_ASM) && \
-    ZSTD_ASM_SUPPORTED &&         \
-    defined(__x86_64__) &&        \
+#if !defined(ZSTD_DISABLE_ASM) &&                                 \
+    ZSTD_ASM_SUPPORTED &&                                         \
+    defined(__x86_64__) &&                                        \
     (DYNAMIC_BMI2 || defined(__BMI2__))
-#define ZSTD_ENABLE_ASM_X86_64_BMI2 1
+# define ZSTD_ENABLE_ASM_X86_64_BMI2 1
 #else
-#define ZSTD_ENABLE_ASM_X86_64_BMI2 0
+# define ZSTD_ENABLE_ASM_X86_64_BMI2 0
 #endif
 
 /*
@@ -136,15 +143,16 @@
  * Additionally, any function that may be called indirectly must begin
  * with ZSTD_CET_ENDBRANCH.
  */
-#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) && defined(__has_include)
-#if __has_include(<cet.h>)
-#include <cet.h>
-#define ZSTD_CET_ENDBRANCH _CET_ENDBR
-#endif
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+# if __has_include(<cet.h>)
+#  include <cet.h>
+#  define ZSTD_CET_ENDBRANCH _CET_ENDBR
+# endif
 #endif
 
 #ifndef ZSTD_CET_ENDBRANCH
-#define ZSTD_CET_ENDBRANCH
+# define ZSTD_CET_ENDBRANCH
 #endif
 
 #endif /* ZSTD_PORTABILITY_MACROS_H */
diff --git a/src/zstd/common/threading.c b/src/zstd/common/threading.c
new file mode 100644
index 000000000..25bb8b981
--- /dev/null
+++ b/src/zstd/common/threading.c
@@ -0,0 +1,182 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**
+ * This file holds the wrappers for systems that do not support pthreads
+ */
+
+#include "threading.h"
+
+/* dummy symbol : avoids an "empty translation unit" warning when no wrapper below is compiled */
+int g_ZSTD_threading_useless_symbol;
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper
+ */
+
+
+/* ===  Dependencies  === */
+#include <process.h>
+#include <errno.h>
+
+
+/* ===  Implementation  === */
+
+typedef struct {
+    void* (*start_routine)(void*);   /* thread entry point requested by the caller */
+    void* arg;                        /* argument forwarded to start_routine */
+    int initialized;                  /* set to 1 by worker() once it has copied the two fields above */
+    ZSTD_pthread_cond_t initialized_cond;     /* signaled by worker() after setting `initialized` */
+    ZSTD_pthread_mutex_t initialized_mutex;   /* held while `initialized` is written or polled */
+} ZSTD_thread_params_t;
+
+static unsigned __stdcall worker(void *arg)
+{
+    void* (*start_routine)(void*);
+    void* thread_arg;
+
+    /* Initialize thread_arg and start_routine, then signal the creating thread that it
+     * no longer needs to wait.
+     */
+    {
+        ZSTD_thread_params_t*  thread_param = (ZSTD_thread_params_t*)arg;
+        thread_arg = thread_param->arg;
+        start_routine = thread_param->start_routine;
+
+        /* Signal main thread that we are running and do not depend on its memory anymore */
+        ZSTD_pthread_mutex_lock(&thread_param->initialized_mutex);
+        thread_param->initialized = 1;
+        ZSTD_pthread_cond_signal(&thread_param->initialized_cond);
+        ZSTD_pthread_mutex_unlock(&thread_param->initialized_mutex);
+    }
+
+    start_routine(thread_arg);   /* the routine's return value is discarded */
+
+    return 0;   /* the thread exit code is always 0 */
+}
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+            void* (*start_routine) (void*), void* arg)
+{   /* @return : 0 on success, -1 on NULL `thread` or sync-init failure, errno if the thread cannot be spawned */
+    ZSTD_thread_params_t thread_param;   /* lives on this stack frame : worker() copies it before we return */
+    (void)unused;
+
+    if (thread==NULL) return -1;
+    *thread = NULL;
+
+    thread_param.start_routine = start_routine;
+    thread_param.arg = arg;
+    thread_param.initialized = 0;
+
+    /* Setup thread initialization synchronization */
+    if(ZSTD_pthread_cond_init(&thread_param.initialized_cond, NULL)) {
+        /* Should never happen on Windows */
+        return -1;
+    }
+    if(ZSTD_pthread_mutex_init(&thread_param.initialized_mutex, NULL)) {
+        /* Should never happen on Windows */
+        ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
+        return -1;
+    }
+
+    /* Spawn thread : worker() receives a pointer to the stack-allocated thread_param */
+    *thread = (HANDLE)_beginthreadex(NULL, 0, worker, &thread_param, 0, NULL);
+    if (*thread==NULL) {
+        ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex);
+        ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
+        return errno;
+    }
+
+    /* Wait for thread to be initialized, so that thread_param can safely go out of scope */
+    ZSTD_pthread_mutex_lock(&thread_param.initialized_mutex);
+    while(!thread_param.initialized) {
+        ZSTD_pthread_cond_wait(&thread_param.initialized_cond, &thread_param.initialized_mutex);
+    }
+    ZSTD_pthread_mutex_unlock(&thread_param.initialized_mutex);
+    ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex);
+    ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
+
+    return 0;
+}
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread)
+{   /* Waits for `thread` to end and closes its handle. @return : 0 on success or NULL handle, else an error code */
+    DWORD result;
+    DWORD waitError;
+
+    if (!thread) return 0;
+    result = WaitForSingleObject(thread, INFINITE);
+    waitError = GetLastError();   /* capture now : CloseHandle() below may overwrite the last-error code */
+    CloseHandle(thread);
+    switch (result) {
+    case WAIT_OBJECT_0:
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return (int)waitError;
+    }
+}
+
+#endif   /* ZSTD_MULTITHREAD */
+
+#if defined(ZSTD_MULTITHREAD) && DEBUGLEVEL >= 1 && !defined(_WIN32)
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"
+
+int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr)
+{   int ret = 1;   /* @return : 0 on success, 1 if the allocation fails, else the pthread_mutex_init() error */
+    assert(mutex != NULL);
+    *mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t));
+    if (*mutex) ret = pthread_mutex_init(*mutex, attr);
+    if (*mutex && ret) { ZSTD_free(*mutex); *mutex = NULL; }   /* failed init : no leak, destroy() stays a no-op */
+    return ret;
+}
+
+int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex)
+{
+    int status = 0;   /* a NULL (never allocated) mutex is reported as success */
+    assert(mutex != NULL);
+    if (*mutex) {
+        /* destroy the pthread object first, then give its heap storage back */
+        status = pthread_mutex_destroy(*mutex);
+        ZSTD_free(*mutex);
+    }
+    return status;
+}
+
+int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr)
+{   int ret = 1;   /* @return : 0 on success, 1 if the allocation fails, else the pthread_cond_init() error */
+    assert(cond != NULL);
+    *cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t));
+    if (*cond) ret = pthread_cond_init(*cond, attr);
+    if (*cond && ret) { ZSTD_free(*cond); *cond = NULL; }   /* failed init : no leak, destroy() stays a no-op */
+    return ret;
+}
+
+int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond)
+{
+    int status = 0;   /* a NULL (never allocated) condition variable is reported as success */
+    assert(cond != NULL);
+    if (*cond) {
+        /* destroy the pthread object first, then give its heap storage back */
+        status = pthread_cond_destroy(*cond);
+        ZSTD_free(*cond);
+    }
+    return status;
+}
+
+#endif
diff --git a/src/zstd/common/threading.h b/src/zstd/common/threading.h
new file mode 100644
index 000000000..fb5c1c878
--- /dev/null
+++ b/src/zstd/common/threading.h
@@ -0,0 +1,150 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+#include "debug.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper
+ */
+#ifdef WINVER
+#  undef WINVER
+#endif
+#define WINVER       0x0600
+
+#ifdef _WIN32_WINNT
+#  undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex : mapped onto a Win32 CRITICAL_SECTION ; init ignores its attribute argument and always yields 0 */
+#define ZSTD_pthread_mutex_t           CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable : mapped onto a Win32 CONDITION_VARIABLE ; destroy is a no-op, wait uses INFINITE */
+#define ZSTD_pthread_cond_t             CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef HANDLE ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)    /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems   === */
+#  include <pthread.h>
+
+#if DEBUGLEVEL < 1
+/* DEBUGLEVEL < 1 : every wrapper is a plain alias of the matching pthread type or call */
+#define ZSTD_pthread_mutex_t            pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b)   pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a)   pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b)    pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a)    pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a)         pthread_join((a),NULL)
+
+#else /* DEBUGLEVEL >= 1 */
+
+/* Debug implementation of threading.
+ * In this implementation we use pointers for mutexes and condition variables.
+ * This way, if we forget to init/destroy them the program will crash or ASAN
+ * will report leaks. init/destroy are real functions, defined in threading.c.
+ */
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t*
+int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr);
+int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex);
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock(*(a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock(*(a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t*
+int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr);
+int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond);
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait(*(a), *(b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal(*(a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast(*(a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a)         pthread_join((a),NULL)
+
+#endif
+
+#else  /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support : every wrapper only evaluates its arguments ; the init macros yield 0 */
+
+typedef int ZSTD_pthread_mutex_t;
+#define ZSTD_pthread_mutex_init(a, b)   ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a)   ((void)(a))
+#define ZSTD_pthread_mutex_lock(a)      ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a)    ((void)(a))
+
+typedef int ZSTD_pthread_cond_t;
+#define ZSTD_pthread_cond_init(a, b)    ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a)     ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a)  ((void)(a))
+
+/* do not use ZSTD_pthread_t : no thread type is defined in this configuration */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */
diff --git a/src/zstd/xxhash.c b/src/zstd/common/xxhash.c
similarity index 59%
rename from src/zstd/xxhash.c
rename to src/zstd/common/xxhash.c
index bab85c865..052cd5228 100644
--- a/src/zstd/xxhash.c
+++ b/src/zstd/common/xxhash.c
@@ -1,10 +1,6 @@
 /*
- *  xxHash - Fast Hash algorithm
- *  Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - xxHash homepage: https://cyan4973.github.io/xxHash/
- *  - xxHash source repository : https://github.com/Cyan4973/xxHash
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (c) Yann Collet - Meta Platforms, Inc
  *
  * This source code is licensed under both the BSD-style license (found in the
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
@@ -17,6 +13,6 @@
  */
 
 #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
-#define XXH_IMPLEMENTATION		/* access definitions */
+#define XXH_IMPLEMENTATION      /* access definitions */
 
 #include "xxhash.h"
diff --git a/src/zstd/common/xxhash.h b/src/zstd/common/xxhash.h
new file mode 100644
index 000000000..e59e44267
--- /dev/null
+++ b/src/zstd/common/xxhash.h
@@ -0,0 +1,7020 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (c) Yann Collet - Meta Platforms, Inc
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Local adaptations for Zstandard */
+
+#ifndef XXH_NO_XXH3
+# define XXH_NO_XXH3
+#endif
+
+#ifndef XXH_NAMESPACE
+# define XXH_NAMESPACE ZSTD_
+#endif
+
+/*!
+ * @mainpage xxHash
+ *
+ * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
+ * limits.
+ *
+ * It is proposed in four flavors, in three families:
+ * 1. @ref XXH32_family
+ *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
+ *     32-bit and 64-bit systems.
+ * 2. @ref XXH64_family
+ *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
+ *     64-bit systems (but _not_ 32-bit systems).
+ * 3. @ref XXH3_family
+ *   - Modern 64-bit and 128-bit hash function family which features improved
+ *     strength and performance across the board, especially on smaller data.
+ *     It benefits greatly from SIMD and 64-bit without requiring it.
+ *
+ * Benchmarks
+ * ---
+ * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
+ * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
+ *
+ * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
+ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
+ * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
+ * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
+ * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
+ * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
+ * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
+ * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
+ * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
+ * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
+ * | City64               |         |    64 |        22.0 GB/s |                76.6 |
+ * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
+ * | City128              |         |   128 |        21.7 GB/s |                57.7 |
+ * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
+ * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
+ * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
+ * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
+ * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
+ * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
+ * | City32               |         |    32 |         9.1 GB/s |                66.0 |
+ * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
+ * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
+ * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
+ * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
+ * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
+ * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
+ * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
+ * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
+ * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
+ * @note
+ *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
+ *     even though it is mandatory on x64.
+ *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
+ *     by modern standards.
+ *   - Small data velocity is a rough average of algorithm's efficiency for small
+ *     data. For more accurate information, see the wiki.
+ *   - More benchmarks and strength tests are found on the wiki:
+ *         https://github.com/Cyan4973/xxHash/wiki
+ *
+ * Usage
+ * ------
+ * All xxHash variants use a similar API. Changing the algorithm is a trivial
+ * substitution.
+ *
+ * @pre
+ *    For functions which take an input and length parameter, the following
+ *    requirements are assumed:
+ *    - The range from [`input`, `input + length`) is valid, readable memory.
+ *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
+ *    - For C++, the objects must have the *TriviallyCopyable* property, as the
+ *      functions access bytes directly as if it was an array of `unsigned char`.
+ *
+ * @anchor single_shot_example
+ * **Single Shot**
+ *
+ * These functions are stateless functions which hash a contiguous block of memory,
+ * immediately returning the result. They are the easiest and usually the fastest
+ * option.
+ *
+ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
+ *
+ * @code{.c}
+ *   #include <string.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which hashes a null terminated string with XXH32().
+ *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
+ *   {
+ *       // NULL pointers are only valid if the length is zero
+ *       size_t length = (string == NULL) ? 0 : strlen(string);
+ *       return XXH32(string, length, seed);
+ *   }
+ * @endcode
+ *
+ *
+ * @anchor streaming_example
+ * **Streaming**
+ *
+ * These groups of functions allow incremental hashing of unknown size, even
+ * more than what would fit in a size_t.
+ *
+ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <assert.h>
+ *   #include "xxhash.h"
+ *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
+ *   XXH64_hash_t hashFile(FILE* f)
+ *   {
+ *       // Allocate a state struct. Do not just use malloc() or new.
+ *       XXH3_state_t* state = XXH3_createState();
+ *       assert(state != NULL && "Out of memory!");
+ *       // Reset the state to start a new hashing session.
+ *       XXH3_64bits_reset(state);
+ *       char buffer[4096];
+ *       size_t count;
+ *       // Read the file in chunks
+ *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
+ *           // Run update() as many times as necessary to process the data
+ *           XXH3_64bits_update(state, buffer, count);
+ *       }
+ *       // Retrieve the finalized hash. This will not change the state.
+ *       XXH64_hash_t result = XXH3_64bits_digest(state);
+ *       // Free the state. Do not use free().
+ *       XXH3_freeState(state);
+ *       return result;
+ *   }
+ * @endcode
+ *
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ *
+ * @anchor canonical_representation_example
+ * **Canonical Representation**
+ *
+ * The default return values from XXH functions are unsigned 32, 64 and 128 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ *
+ * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(),
+ * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(),
+ * XXH128_canonicalFromHash(), XXH128_hashFromCanonical()
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which prints XXH32_hash_t in human readable format
+ *   void printXxh32(XXH32_hash_t hash)
+ *   {
+ *       XXH32_canonical_t cano;
+ *       XXH32_canonicalFromHash(&cano, hash);
+ *       size_t i;
+ *       for(i = 0; i < sizeof(cano.digest); ++i) {
+ *           printf("%02x", cano.digest[i]);
+ *       }
+ *       printf("\n");
+ *   }
+ *
+ *   // Example for a function which converts XXH32_canonical_t to XXH32_hash_t
+ *   XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano)
+ *   {
+ *       XXH32_hash_t hash = XXH32_hashFromCanonical(&cano);
+ *       return hash;
+ *   }
+ * @endcode
+ *
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ *  INLINE mode
+ ******************************/
+/*!
+ * @defgroup public Public API
+ * Contains details on the public xxHash functions.
+ * @{
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Gives access to internal state declaration, required for static allocation.
+ *
+ * Incompatible with dynamic linking, due to risks of ABI changes.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_STATIC_LINKING_ONLY
+/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */
+
+/*!
+ * @brief Gives access to internal definitions.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #define XXH_IMPLEMENTATION
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_IMPLEMENTATION
+/* Do not undef XXH_IMPLEMENTATION for Doxygen */
+
+/*!
+ * @brief Exposes the implementation and marks all functions as `inline`.
+ *
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * @endcode
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#  define XXH_INLINE_ALL
+#  undef XXH_INLINE_ALL
+/*!
+ * @brief Exposes the implementation without marking functions as inline.
+ */
+#  define XXH_PRIVATE_API
+#  undef XXH_PRIVATE_API
+/*!
+ * @brief Emulate a namespace by transparently prefixing all symbols.
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
+ */
+#  define XXH_NAMESPACE /* YOUR NAME HERE */
+#  undef XXH_NAMESPACE
+#endif
+
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+    && !defined(XXH_INLINE_ALL_31684351384)
+   /* this section should be traversed only once */
+#  define XXH_INLINE_ALL_31684351384
+   /* give access to the advanced API, required to compile implementations */
+#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
+#  define XXH_STATIC_LINKING_ONLY
+   /* make all functions private */
+#  undef XXH_PUBLIC_API
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* note: this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+
+   /*
+    * This part deals with the special case where a unit wants to inline xxHash,
+    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+    * such as part of some previously included *.h header file.
+    * Without further action, the new include would just be ignored,
+    * and functions would effectively _not_ be inlined (silent failure).
+    * The following macros solve this situation by prefixing all inlined names,
+    * avoiding naming collision with previous inclusions.
+    */
+   /* Before that, we unconditionally #undef all symbols,
+    * in case they were already defined with XXH_NAMESPACE.
+    * They will then be redefined for XXH_INLINE_ALL
+    */
+#  undef XXH_versionNumber
+    /* XXH32 */
+#  undef XXH32
+#  undef XXH32_createState
+#  undef XXH32_freeState
+#  undef XXH32_reset
+#  undef XXH32_update
+#  undef XXH32_digest
+#  undef XXH32_copyState
+#  undef XXH32_canonicalFromHash
+#  undef XXH32_hashFromCanonical
+    /* XXH64 */
+#  undef XXH64
+#  undef XXH64_createState
+#  undef XXH64_freeState
+#  undef XXH64_reset
+#  undef XXH64_update
+#  undef XXH64_digest
+#  undef XXH64_copyState
+#  undef XXH64_canonicalFromHash
+#  undef XXH64_hashFromCanonical
+    /* XXH3_64bits */
+#  undef XXH3_64bits
+#  undef XXH3_64bits_withSecret
+#  undef XXH3_64bits_withSeed
+#  undef XXH3_64bits_withSecretandSeed
+#  undef XXH3_createState
+#  undef XXH3_freeState
+#  undef XXH3_copyState
+#  undef XXH3_64bits_reset
+#  undef XXH3_64bits_reset_withSeed
+#  undef XXH3_64bits_reset_withSecret
+#  undef XXH3_64bits_update
+#  undef XXH3_64bits_digest
+#  undef XXH3_generateSecret
+    /* XXH3_128bits */
+#  undef XXH128
+#  undef XXH3_128bits
+#  undef XXH3_128bits_withSeed
+#  undef XXH3_128bits_withSecret
+#  undef XXH3_128bits_reset
+#  undef XXH3_128bits_reset_withSeed
+#  undef XXH3_128bits_reset_withSecret
+#  undef XXH3_128bits_reset_withSecretandSeed
+#  undef XXH3_128bits_update
+#  undef XXH3_128bits_digest
+#  undef XXH128_isEqual
+#  undef XXH128_cmp
+#  undef XXH128_canonicalFromHash
+#  undef XXH128_hashFromCanonical
+    /* Finally, free the namespace itself */
+#  undef XXH_NAMESPACE
+
+    /* employ the namespace for XXH_INLINE_ALL */
+#  define XXH_NAMESPACE XXH_INLINE_
+   /*
+    * Some identifiers (enums, type names) are not symbols,
+    * but they must nonetheless be renamed to avoid redeclaration.
+    * Alternative solution: do not redeclare them.
+    * However, this requires some #ifdefs, and has a more dispersed impact.
+    * Meanwhile, renaming can be achieved in a single place.
+    */
+#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
+#  define XXH_OK XXH_IPREF(XXH_OK)
+#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
+#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
+#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
+#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
+#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
+#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
+#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
+#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
+#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
+#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
+#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+   /* Ensure the header is parsed again, even if it was previously included */
+#  undef XXHASH_H_5627135585666179
+#  undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/* ****************************************************************
+ *  Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/*! @brief Marks a global symbol: `__declspec(dllexport)`/`__declspec(dllimport)` for MSVC DLL builds, empty otherwise. */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif defined(XXH_IMPORT)   /* test definedness, not value: an empty or 0-valued XXH_IMPORT must still select dllimport */
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+/* XXH32 */
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+/* XXH64 */
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+/* XXH3_64bits */
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+/* XXH3_128bits */
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Compiler specifics
+***************************************/
+
+/* specific declaration modes for Windows: export/import the public symbols when building or consuming a DLL with MSVC */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif defined(XXH_IMPORT)   /* test definedness, not value: an empty or 0-valued XXH_IMPORT must still select dllimport */
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#if defined (__GNUC__)
+# define XXH_CONSTF  __attribute__((const))
+# define XXH_PUREF   __attribute__((pure))
+# define XXH_MALLOCF __attribute__((malloc))
+#else
+# define XXH_CONSTF  /* disable */
+# define XXH_PUREF
+# define XXH_MALLOCF
+#endif
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    8
+#define XXH_VERSION_RELEASE  2
+/*! @brief Version number, encoded as two digits each */
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to header file.
+ *
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
+ */
+XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Common basic types
+******************************/
+#include <stddef.h>   /* size_t */
+/*!
+ * @brief Exit code for the streaming API.
+ */
+typedef enum {
+    XXH_OK = 0, /*!< OK */
+    XXH_ERROR   /*!< Error */
+} XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+    typedef uint32_t XXH32_hash_t;
+
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   elif ULONG_MAX == 0xFFFFFFFFUL
+      typedef unsigned long XXH32_hash_t;
+#   else
+#     error "unsupported platform: need a 32-bit type"
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
+ *   and 64-bit systems, and offers true 64/128 bit hash results.
+ *
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
+ * @see @ref XXH32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit xxHash32 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * @return An allocated pointer of @ref XXH32_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH32_freeState().
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH32_createState().
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH32_update().
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 32-bit xxHash32 value from that state.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst  The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+/*! @cond Doxygen ignores this part */
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
+ * leave as `201711L` (C17 + 1).
+ * TODO: Update to correct value when it's been specified.
+ */
+#define XXH_C23_VN 201711L
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((noescape))
+#else
+# define XXH_NOESCAPE
+#endif
+/*! @endcond */
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results.
+ *   It provides better speed for systems with vector processing capabilities.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit xxHash64 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*!
+ * @brief Allocates an @ref XXH64_state_t.
+ *
+ * @return An allocated pointer of @ref XXH64_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH64_freeState().
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
+
+/*!
+ * @brief Frees an @ref XXH64_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH64_createState().
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH64_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH64_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH64_update().
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 64-bit xxHash64 value from that state.
+ *
+ * @note
+ *   Calling XXH64_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH64_hash_t.
+ */
+typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.
+ *
+ * @param dst The @ref XXH64_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH64_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.
+ *
+ * @param src The @ref XXH64_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
+
+#ifndef XXH_NO_XXH3
+
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup XXH3_family XXH3 family
+ * @ingroup public
+ * @{
+ *
+ * XXH3 is a more recent hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
+ *
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
+ * at competitive speeds, even without vector support. Further details are
+ * explained in the implementation.
+ *
+ * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD
+ * implementations for many common platforms:
+ *   - AVX512
+ *   - AVX2
+ *   - SSE2
+ *   - ARM NEON
+ *   - WebAssembly SIMD128
+ *   - POWER8 VSX
+ *   - s390x ZVector
+ * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
+ * selects the best version according to predefined macros. For the x86 family, an
+ * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+/*-**********************************************************************
+*  XXH3 64-bit variant
+************************************************************************/
+
+/*!
+ * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *   This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however
+ *   it may have slightly better performance due to constant propagation of the
+ *   defaults.
+ *
+ * @see
+ *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed   The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+
+/*!
+ * @brief Calculates 64-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p len is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ */
+
+/*!
+ * @brief The opaque state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ */
+typedef struct XXH3_state_s XXH3_state_t;
+XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH3_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret with default parameters.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits()`.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret from `seed`.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits_withSeed()`.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   `secret` is referenced, it _must outlive_ the hash streaming session.
+ *
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 64-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+*  XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+/*!
+ * @brief Calculates 128-bit unseeded variant of XXH3 of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_128bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*!
+ * @brief Calculates 128-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have same meaning as their 64-bit counterpart.
+ */
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret with default parameters.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret from `seed`.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * `secret` is referenced, it _must outlive_ the hash streaming session.
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 128-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* The following helper functions make it possible to compare XXH128_hash_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * @brief Check equality of two XXH128_hash_t values
+ *
+ * @param h1 The 128-bit hash value.
+ * @param h2 Another 128-bit hash value.
+ *
+ * @return `1` if `h1` and `h2` are equal.
+ * @return `0` if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t
+ *
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @param h128_1 Left-hand side value
+ * @param h128_2 Right-hand side value
+ *
+ * @return >0 if @p h128_1  > @p h128_2
+ * @return =0 if @p h128_1 == @p h128_2
+ * @return <0 if @p h128_1  < @p h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+
+
+/*!
+ * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
+ *
+ * @param dst  The @ref XXH128_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH128_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
+ *
+ * @param src The @ref XXH128_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
+
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the total input length is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t v[4];         /*!< Accumulator lanes */
+   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t v[4];         /*!< Accumulator lanes */
+   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyway. */
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH64_state_t */
+
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword */
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64bits_update(), XXH3_128bits_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @internal
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t useSeed;
+       /*!< Reserved field. Needed for padding on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number of stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)                       \
+    do { /* argument is evaluated exactly once, below */     \
+        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+        tmp_xxh3_state_ptr->seed = 0;                        \
+        tmp_xxh3_state_ptr->extSecret = NULL;                \
+    } while(0) /* only `seed` and `extSecret` are written */
+
+
+/*!
+ * @brief Calculates the 128-bit hash of @p data using XXH3.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p len is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 128-bit XXH3 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*!
+ * @brief Derive a high-entropy secret from any user-defined content, named customSeed.
+ *
+ * @param secretBuffer    A writable buffer for derived high-entropy secret data.
+ * @param secretSize      Size of secretBuffer, in bytes.  Must be >= @ref XXH3_SECRET_SIZE_MIN.
+ * @param customSeed      A user-defined content.
+ * @param customSeedSize  Size of customSeed, in bytes.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <stdlib.h>
+ *    #include <string.h>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Hashes argv[2] using the entropy from argv[1].
+ *    int main(int argc, char* argv[])
+ *    {
+ *        char secret[XXH3_SECRET_SIZE_MIN];
+ *        if (argc != 3) { return 1; }
+ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *        XXH64_hash_t h = XXH3_64bits_withSecret(
+ *             argv[2], strlen(argv[2]),
+ *             secret, sizeof(secret)
+ *        );
+ *        printf("%016llx\n", (unsigned long long) h);
+ *    }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
+ * @param seed         The 64-bit seed to alter the hash result predictably.
+ *
+ * The generated secret can be used in combination with
+ *`*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *    #include <string>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Slow, seeds each time
+ *    class HashSlow {
+ *        XXH64_hash_t seed;
+ *    public:
+ *        HashSlow(XXH64_hash_t s) : seed{s} {}
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *        }
+ *    };
+ *    // Fast, caches the seeded secret for future uses.
+ *    class HashFast {
+ *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ *    public:
+ *        HashFast(XXH64_hash_t s) {
+ *            XXH3_generateSecret_fromSeed(secret, s);
+ *        }
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{
+ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *            };
+ *        }
+ *    };
+ * @endcode
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed       The 64-bit seed to alter the hash result predictably.
+ *
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, _withSecretandSeed variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact to the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+/*!
+ * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param input      The block of data to be hashed, at least @p length bytes in size.
+ * @param length     The length of @p input, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit XXH3 value of @p input, using @p seed64 and
+ *         @p secret as described for the 64-bit variant.
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+#endif /* !XXH_NO_STREAM */
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in /include directory.
+ *
+ * xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+ */
+#  define XXH_NO_LONG_LONG
+#  undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The below switch allows selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *   @par
+ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *     eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ *   @par
+ *     Depends on compiler extensions and is therefore not portable.
+ *     This method is safe _if_ your compiler supports it,
+ *     and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *  @par
+ *     Casts directly and dereferences. This method doesn't depend on the
+ *     compiler, but it violates the C standard as it directly dereferences an
+ *     unaligned pointer. It can generate buggy code on targets which do not
+ *     support unaligned memory accesses, but in some circumstances, it's the
+ *     only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *  @par
+ *     Also portable. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction. However, some compilers
+ *     will emit literal byteshifts even if the target supports unaligned access.
+ *
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+#  define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage of forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *    comes first.
+ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *    conservative and disables hacks that increase code size. It implies the
+ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *    Performance may cry. For example, the single shot functions just use the
+ *    streaming API.
+ */
+#  define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
+ * which are platforms known to offer good unaligned memory accesses performance.
+ *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+#  define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+ */
+#  define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+#  define XXH3_INLINE_SECRET 0
+
+/*!
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally preferable for performance,
+ * but depending on exact architecture, a jmp may be preferable.
+ *
+ * This setting is only possibly making a difference for very small inputs.
+ */
+#  define XXH32_ENDJMP 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+#  define XXH_OLD_NAMES
+#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * the streaming functions can improve code size significantly, especially with
+ * the @ref XXH3_family which tends to make constant folded copies of itself.
+ */
+#  define XXH_NO_STREAM
+#  undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* prefer __packed__ structures (method 1) for GCC
+    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
+    * which for some reason does unaligned loads. */
+#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+   /* default to 1 for -Os or -Oz */
+#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#    define XXH_SIZE_OPT 1
+#  else
+#    define XXH_SIZE_OPT 0
+#  endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+#  if XXH_SIZE_OPT >= 1 || \
+      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* visual */
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+#ifndef XXH3_INLINE_SECRET
+#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+     || !defined(XXH_INLINE_ALL)
+#    define XXH3_INLINE_SECRET 0
+#  else
+#    define XXH3_INLINE_SECRET 1
+#  endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+#  define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing: neither XXH_malloc() nor XXH_free() is defined in this configuration */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }  /* stub: ignores the size, always yields NULL */
+static void XXH_free(void* p) { (void)p; }  /* stub: ignores its argument, releases nothing */
+
+#else
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif  /* XXH_NO_STREAM / XXH_NO_STDLIB */
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);   /* hands memcpy()'s own return value back to the caller */
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  if defined(__GNUC__) || defined(__clang__)
+#    define XXH_FORCE_INLINE static __attribute__((unused))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(__GNUC__) || defined(__clang__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+#  define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#endif
+
+#if XXH3_INLINE_SECRET
+#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
+#else
+#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
+#endif
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+#  ifdef DEBUGLEVEL /* backwards compat */
+#    define XXH_DEBUGLEVEL DEBUGLEVEL
+#  else
+#    define XXH_DEBUGLEVEL 0
+#  endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  if defined(__INTEL_COMPILER)
+#    define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
+#  else
+#    define XXH_ASSERT(c)   XXH_ASSUME(c)
+#  endif
+#endif
+
+/* note: use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+#  else
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+#  endif
+#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
+#else
+#  define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* Specifically for NEON vectors which use the "w" constraint, on
+ * Clang. */
+#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
+#else
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
+#endif
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# ifdef _AIX
+#   include <inttypes.h>
+# else
+#   include <stdint.h>
+# endif
+  typedef uint8_t xxh_u8;
+#else
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
+#  define BYTE xxh_u8
+#  define U8   xxh_u8
+#  define U32  xxh_u32
+#endif
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * No XXH_read32() is defined here: XXH_readLE32 and XXH_readBE32 are used directly.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPUs which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;  /* legacy type name, declared only for XXH_OLD_NAMES builds */
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;  /* xxh_u32 with its alignment requirement set to 1 */
+    return *((const xxh_unalign32*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));  /* byte copy into a properly aligned local */
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid the nonstandard behavior
+ * in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Portable and well-defined behavior.
+     * Don't use static: it is detrimental to performance.
+     */
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define XXH_HAS_BUILTIN(x) 0
+#endif
+
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * #  include <stddef.h>
+ * #  ifdef unreachable
+ * #    define XXH_UNREACHABLE() unreachable()
+ * #  endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * #  include <utility>
+ * #  define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+#  define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+#  define XXH_UNREACHABLE() __assume(0)
+
+#else
+#  define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+#  define XXH_ASSUME(c) __builtin_assume(c)
+#else
+#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note: although _rotl exists for MinGW (GCC under Windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap.
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+    XXH_aligned,  /*!< Aligned */
+    XXH_unaligned /*!< Possibly unaligned */
+} XXH_alignment;
+
+/*
+ * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+ *
+ * This is ideal for older compilers which don't inline memcpy.
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u32)bytePtr[1] << 8)
+         | ((xxh_u32)bytePtr[2] << 16)
+         | ((xxh_u32)bytePtr[3] << 24);
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[3]
+         | ((xxh_u32)bytePtr[2] << 8)
+         | ((xxh_u32)bytePtr[1] << 16)
+         | ((xxh_u32)bytePtr[0] << 24);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+/*! @ingroup public */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup XXH32_impl XXH32 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
+ * @{
+ */
+ /* #define instead of static const, to be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME32_1 XXH_PRIME32_1
+#  define PRIME32_2 XXH_PRIME32_2
+#  define PRIME32_3 XXH_PRIME32_3
+#  define PRIME32_4 XXH_PRIME32_4
+#  define PRIME32_5 XXH_PRIME32_5
+#endif
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is the only thing that prevents GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // v >>= 19
+     *      por    v,  tmp // v |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+     * than half the speed.
+     *
+     * Additionally, this is used on WASM SIMD128 because it JITs to the same
+     * SIMD instructions and has the same issue.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param hash The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
+{
+    hash ^= hash >> 15;
+    hash *= XXH_PRIME32_2;
+    hash ^= hash >> 13;
+    hash *= XXH_PRIME32_3;
+    hash ^= hash >> 16;
+    return hash;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ * @see XXH64_finalize().
+ */
+static XXH_PUREF xxh_u32
+XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define XXH_PROCESS1 do {                             \
+    hash += (*ptr++) * XXH_PRIME32_5;                 \
+    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
+} while (0)
+
+#define XXH_PROCESS4 do {                             \
+    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
+    ptr += 4;                                         \
+    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
+} while (0)
+
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+
+    /* Compact rerolled version; generally faster */
+    if (!XXH32_ENDJMP) {
+        len &= 15;
+        while (len >= 4) {
+            XXH_PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {
+            XXH_PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(hash);
+    } else {
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 8:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 4:       XXH_PROCESS4;
+                         return XXH32_avalanche(hash);
+
+           case 13:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 9:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 5:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 14:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 10:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 6:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 15:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 11:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 7:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 3:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 2:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 1:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 0:       return XXH32_avalanche(hash);
+        }
+        XXH_ASSERT(0);
+        return hash;   /* reaching this point is deemed impossible */
+    }
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1 XXH_PROCESS1
+#  define PROCESS4 XXH_PROCESS4
+#else
+#  undef XXH_PROCESS1
+#  undef XXH_PROCESS4
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input , len , seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+    xxh_u32 h32;
+
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=16) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 15;
+        xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+        xxh_u32 v2 = seed + XXH_PRIME32_2;
+        xxh_u32 v3 = seed + 0;
+        xxh_u32 v4 = seed - XXH_PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+            v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+            v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+            v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+        } while (input < limit);
+
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + XXH_PRIME32_5;
+    }
+
+    h32 += (xxh_u32)len;
+
+    return XXH32_finalize(h32, input, len&15, align);
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+    statePtr->v[1] = seed + XXH_PRIME32_2;
+    statePtr->v[2] = seed + 0;
+    statePtr->v[3] = seed - XXH_PRIME32_1;
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL) {   /* nothing to consume; state is left untouched */
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len_32 += (XXH32_hash_t)len;
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+
+        if (state->memsize + len < 16)  {   /* not enough for one stripe yet: just buffer it */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from previous update: complete it to 16 bytes */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
+            {   const xxh_u32* p32 = state->mem32;
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;
+            state->memsize = 0;
+        }
+
+        if ((size_t)(bEnd - p) >= 16) {   /* compare sizes: bEnd-16 may lie before the input buffer */
+            const xxh_u8* const limit = bEnd - 16;
+
+            do {
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* keep the trailing (<16) bytes for the next update or digest */
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v[0], 1)
+            + XXH_rotl32(state->v[1], 7)
+            + XXH_rotl32(state->v[2], 12)
+            + XXH_rotl32(state->v[3], 18);
+    } else {
+        h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+#ifdef XXH_OLD_NAMES
+#  define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather than static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif
+
+/*! @copydoc XXH32_round */
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+    acc += input * XXH_PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= XXH_PRIME64_1;
+#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * DISABLE AUTOVECTORIZATION:
+     * A compiler fence is used to prevent GCC and Clang from
+     * autovectorizing the XXH64 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling AVX512.
+     *
+     * Autovectorization of XXH64 tends to be detrimental,
+     * though the exact outcome may change depending on exact cpu and compiler version.
+     * For information, it has been reported as detrimental for Skylake-X,
+     * but possibly beneficial for Zen4.
+     *
+     * The default is to disable auto-vectorization,
+     * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+    return acc;
+}
+
+/*! @copydoc XXH32_avalanche */
+static xxh_u64 XXH64_avalanche(xxh_u64 hash)
+{
+    hash ^= hash >> 33;
+    hash *= XXH_PRIME64_2;
+    hash ^= hash >> 29;
+    hash *= XXH_PRIME64_3;
+    hash ^= hash >> 32;
+    return hash;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-31 bytes of @p ptr.
+ *
+ * There may be up to 31 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 32.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+static XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    len &= 31;
+    while (len >= 8) {
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        hash ^= k1;
+        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {
+        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {
+        hash ^= (*ptr++) * XXH_PRIME64_5;
+        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return  XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+    xxh_u64 h64;
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=32) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 31;
+        xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+        xxh_u64 v2 = seed + XXH_PRIME64_2;
+        xxh_u64 v3 = seed + 0;
+        xxh_u64 v4 = seed - XXH_PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+        } while (input<limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) len;
+
+    return XXH64_finalize(h64, input, len, align);
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, (const xxh_u8*)input, len);
+    return XXH64_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+
+#endif
+}
+
+/*******   Hash Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH64_family*/
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    statePtr->v[1] = seed + XXH_PRIME64_2;
+    statePtr->v[2] = seed + 0;
+    statePtr->v[3] = seed - XXH_PRIME64_1;
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    if (input==NULL) {   /* nothing to consume; state is left untouched */
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len += len;
+
+        if (state->memsize + len < 32) {  /* not enough for one stripe yet: just buffer it */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from previous update: complete it to 32 bytes */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
+            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
+            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
+            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
+            p += 32 - state->memsize;
+            state->memsize = 0;
+        }
+
+        if ((size_t)(bEnd - p) >= 32) {   /* compare sizes: p+32 may lie past the end of the input */
+            const xxh_u8* const limit = bEnd - 32;
+
+            do {
+                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
+                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
+                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
+                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* keep the trailing (<32) bytes for the next update or digest */
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @brief Produces the 64-bit hash of everything streamed so far; the state is only read (const), so streaming can continue afterwards. @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
+{
+    xxh_u64 h64;
+
+    if (state->total_len >= 32) {  /* at least one 32-byte stripe went through the accumulators: merge all four */
+        h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
+        h64 = XXH64_mergeRound(h64, state->v[0]);
+        h64 = XXH64_mergeRound(h64, state->v[1]);
+        h64 = XXH64_mergeRound(h64, state->v[2]);
+        h64 = XXH64_mergeRound(h64, state->v[3]);
+    } else {  /* short input: accumulators were never updated, so v[2] still holds the seed written by XXH64_reset() */
+        h64  = state->v[2] /*seed*/ + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) state->total_len;
+
+    return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);  /* staged tail bytes are mixed in by XXH64_finalize(); total_len is passed as the length -- NOTE(review): presumably only its low bits select the tail size, confirm in XXH64_finalize */
+}
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation   *******/
+
+/*! @brief Writes @p hash into @p dst in canonical form (the byte order that XXH64_hashFromCanonical() reads back with XXH_readBE64()). @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));  /* the memcpy below relies on both types having the same size */
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);  /* swap only on little-endian CPUs so the stored bytes do not depend on host order */
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+
+/*! @brief Converts a canonical representation back into a hash value by reading @p src with XXH_readBE64(). @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);  /* counterpart of XXH64_canonicalFromHash() */
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup XXH3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define XXH_RESTRICT   /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+   || (defined (__clang__)) \
+   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
+   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
+/*
+ * There are a LOT more compilers that recognize __restrict but this
+ * covers the major ones.
+ */
+#  define XXH_RESTRICT   __restrict
+#else
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define XXH_likely(x) __builtin_expect(x, 1)
+#    define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
+
+#ifndef XXH_HAS_INCLUDE
+#  ifdef __has_include
+/*
+ * Not defined as XXH_HAS_INCLUDE(x) (function-like) because
+ * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)
+ */
+#    define XXH_HAS_INCLUDE __has_include
+#  else
+#    define XXH_HAS_INCLUDE(x) 0
+#  endif
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  endif
+#  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
+   || (defined(_M_ARM) && _M_ARM >= 7) \
+   || defined(_M_ARM64) || defined(_M_ARM64EC) \
+   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */
+#    define inline __inline__  /* circumvent a clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  elif defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+ * shifts is helpful, too.
+ *
+ * Therefore, we do a quick sanity check.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we will
+ * emit a warning, as it is not a "sane" platform to compile for.
+ *
+ * Usually, if this happens, it is because of an accident and you probably need
+ * to specify -march, as you likely meant to compile for a newer architecture.
+ *
+ * Credit: large sections of the vectorial and asm source code paths
+ *         have been contributed by @easyaspi314
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @ingroup tuning
+ * @brief Overrides the vectorization implementation chosen for XXH3.
+ *
+ * Can be defined to 0 to disable SIMD or any of the values mentioned in
+ * @ref XXH_VECTOR_TYPE.
+ *
+ * If this is not defined, it uses predefined macros to determine the best
+ * implementation.
+ */
+#  define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * internal macro XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+    XXH_SCALAR = 0,  /*!< Portable scalar version */
+    XXH_SSE2   = 1,  /*!<
+                      * SSE2 for Pentium 4, Opteron, all x86_64.
+                      *
+                      * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                      * Android x86.
+                      */
+    XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
+    XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
+    XXH_NEON   = 4,  /*!<
+                       * NEON for most ARMv7-A, all AArch64, and WASM SIMD128
+                       * via the SIMDeverywhere polyfill provided with the
+                       * Emscripten SDK.
+                       */
+    XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
+};
+/*!
+ * @ingroup tuning
+ * @brief Selects the minimum alignment for XXH3's accumulators.
+ *
+ * When using SIMD, this should match the alignment required for said vector
+ * type, so, for example, 32 for AVX2.
+ *
+ * Default: Auto detected.
+ */
+#  define XXH_ACC_ALIGN 8
+#endif
+
+/* Actual definition */
+#ifndef XXH_DOXYGEN
+#  define XXH_SCALAR 0
+#  define XXH_SSE2   1
+#  define XXH_AVX2   2
+#  define XXH_AVX512 3
+#  define XXH_NEON   4
+#  define XXH_VSX    5
+#  define XXH_SVE    6
+#endif
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__ARM_FEATURE_SVE)
+#    define XXH_VECTOR XXH_SVE
+#  elif ( \
+        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
+     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
+     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \
+   ) && ( \
+        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+   )
+#    define XXH_VECTOR XXH_NEON
+#  elif defined(__AVX512F__)
+#    define XXH_VECTOR XXH_AVX512
+#  elif defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
+     || (defined(__s390x__) && defined(__VEC__)) \
+     && defined(__GNUC__) /* TODO: IBM XL */
+#    define XXH_VECTOR XXH_VSX
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
+#  ifdef _MSC_VER
+#    pragma warning(once : 4606)
+#  else
+#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+#  endif
+#  undef XXH_VECTOR
+#  define XXH_VECTOR XXH_SCALAR
+#endif
+
+/*
+ * Controls the alignment of the accumulator,
+ * for compatibility with aligned vector loads, which are usually faster.
+ */
+#ifndef XXH_ACC_ALIGN
+#  if defined(XXH_X86DISPATCH)
+#     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
+#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == XXH_NEON  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_VSX   /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_SVE   /* sve */
+#     define XXH_ACC_ALIGN 64
+#  endif
+#endif
+
+#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
+    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_SVE
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#else
+#  define XXH_SEC_ALIGN 8
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_ALIASING __attribute__((may_alias))
+#else
+#  define XXH_ALIASING /* nothing */
+#endif
+
+/*
+ * UGLY HACK:
+ * GCC usually generates the best code with -O3 for xxHash.
+ *
+ * However, when targeting AVX2, it is overzealous in its unrolling resulting
+ * in code roughly 3/4 the speed of Clang.
+ *
+ * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+ * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
+ * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
+ *
+ * That is why when compiling the AVX2 version, it is recommended to use either
+ *   -O2 -mavx2 -march=haswell
+ * or
+ *   -O2 -mavx2 -mno-avx256-split-unaligned-load
+ * for decent performance, or to use Clang instead.
+ *
+ * Fortunately, we can control the first one with a pragma that forces GCC into
+ * -O2, but the other one we can't control without "failed to inline always
+ * inline function due to target mismatch" warnings.
+ */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC push_options
+#  pragma GCC optimize("-O2")
+#endif
+
+#if XXH_VECTOR == XXH_NEON
+
+/*
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
+ * optimizes out the entire hashLong loop because of the aliasing violation.
+ *
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
+ * so the only option is to mark it as aliasing.
+ */
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
+/*!
+ * @internal
+ * @brief `vld1q_u64` but faster and alignment-safe.
+ *
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
+ *
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
+ *
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
+ * unaligned load.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)  /* GCC (not Clang) targeting AArch64 */
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
+{
+    return *(xxh_aliasing_uint64x2_t const *)ptr;  /* plain dereference through the XXH_ALIASING-qualified vector typedef */
+}
+#else  /* every other compiler/target: byte-vector intrinsic load, then reinterpret as two 64-bit lanes */
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
+{
+    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
+}
+#endif
+
+/*!
+ * @internal
+ * @brief `vmlal_u32` on low and high halves of a vector.
+ *
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11  /* AArch64 GCC older than 11 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* Inline assembly is the only way to get the low-half multiply-accumulate merged on these compilers */
+    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));  /* "+w": acc is both read and written */
+    return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* This intrinsic works as expected, so no assembly is needed for the high half */
+    return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions: split each vector explicitly, then multiply-accumulate */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+}
+#endif
+
+/*!
+ * @ingroup tuning
+ * @brief Controls the NEON to scalar ratio for XXH3
+ *
+ * This can be set to 2, 4, 6, or 8.
+ *
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
+ *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
+ *
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
+ *
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
+ *
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
+ * most other CPUs:
+ *
+ *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
+ *  |:----------------------|:--------------------|----------:|-----------:|------:|
+ *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes, meaning
+ * it effectively becomes a worse version of 4.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+/*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb(): permutes @p val so the bytes of each 8-byte half come out in reverse order.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,    /* source bytes 7..0 for the first half */
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };  /* source bytes 15..8 for the second half */
+    return vec_perm(val, val, vByteSwap);  /* both permute inputs are val, so every index selects from val */
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian (XXH_VSX_BE builds).
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));  /* byte copy, so ptr needs no particular alignment */
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);  /* compiled in only when XXH_VSX_BE is non-zero */
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meanings swap depending on the version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly: the instructions are emitted directly instead of going through vec_mulo/vec_mule */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));  /* "=v": result is write-only; a and b are read-only vector inputs */
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));  /* same operand constraints as XXH_vec_mulo, different instruction */
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+#if XXH_VECTOR == XXH_SVE  /* ACCRND(acc, offset): one SVE accumulate step; the expansion site must provide `mask`, `xinput`, `xsecret` and `kSwap` */
+#define ACCRND(acc, offset) \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+    acc = svadd_u64_x(mask, acc, mul);                               \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE -- note that ACCRND assigns to its `acc` argument, so it must be an lvalue */
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if XXH_SIZE_OPT >= 1
+#    define XXH_PREFETCH(ptr) (void)(ptr)
+#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+/*! Pseudorandom secret taken directly from FARSH. */
+XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */
+static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */
+
+#ifdef XXH_OLD_NAMES
+#  define kSecret XXH3_kSecret
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+{
+   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+}
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t`, `_umul128` or `__umulh` if available, otherwise uses a
+ * portable scalar version built from 32-bit partial products.
+ *
+ * @param lhs , rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t (`low64` / `high64`).
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this type
+     * despite not having the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64  = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+
+#ifndef _MSC_VER  /* NOTE(review): the pragma is emitted only when _MSC_VER is NOT defined; MSVC itself gets <intrin.h> earlier in this file -- confirm intent */
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64  = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+    /*
+     * MSVC for ARM64's __umulh method.
+     *
+     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
+     */
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#ifndef _MSC_VER  /* NOTE(review): same inverted-looking guard as the _umul128 branch above -- confirm intent */
+#   pragma intrinsic(__umulh)
+#endif
+    XXH128_hash_t r128;
+    r128.low64  = lhs * rhs;
+    r128.high64 = __umulh(lhs, rhs);
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown below
+     * with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+     *     ---------
+     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+     *     + 6 9 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 69 (includes the carry out of cross)
+     *     ---------
+     *       6 9 7 5 // D4 res = ((27 % 10) * 10) + (15 % 10) + (69 * 100) = 6975
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+     *     This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARM's Digital Signal Processing extension
+     *     in 32-bit ARMv6 and later, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication, and
+     *     allows this to be calculated in only 4 instructions at speeds
+     *     comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be a couple
+     *     of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);  /* the shift discards the top half of cross, which was already added into upper */
+
+    XXH128_hash_t r128;
+    r128.low64  = lower;
+    r128.high64 = upper;
+    return r128;
+#endif
+}
+
+/*!
+ * @brief 64x64 -> 128-bit multiply whose two 64-bit halves are XOR-folded together.
+ *
+ * Kept as its own function so the 128-bit struct does not have to be passed
+ * around by value. The multiply is expected to inline, but is not forced to.
+ *
+ * @param lhs , rhs The 64-bit integers to multiply
+ * @return The high 64 bits of the product XOR'd with the low 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t const wide = XXH_mult64to128(lhs, rhs);
+    return wide.high64 ^ wide.low64;
+}
+
+/*! Xorshift step, v64 ^ (v64 >> shift). As a helper it seems to compile slightly better on GCC. */
+XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+    XXH_ASSERT(shift >= 0 && shift <= 63);
+    return (v64 >> shift) ^ v64;
+}
+
+/*
+ * Fast final avalanche: xorshift by 37, multiply by PRIME_MX1, xorshift by 32.
+ * Suitable when the input bits are already partially mixed.
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    xxh_u64 const folded = XXH_xorshift64(h64, 37);
+    xxh_u64 const scaled = folded * PRIME_MX1;
+
+    return XXH_xorshift64(scaled, 32);
+}
+
+/*
+ * Stronger avalanche, modeled on Pelle Evensen's rrmxmx.
+ * Preferable when the input has not been previously mixed;
+ * `len` is folded in between the two multiplications.
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+    /* rotate-rotate-xor, then two multiply rounds with len mixed in between */
+    xxh_u64 const rotated = h64 ^ XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+    xxh_u64 const round1  = rotated * PRIME_MX2;
+    xxh_u64 const blended = round1 ^ ((round1 >> 35) + len);
+    xxh_u64 const round2  = blended * PRIME_MX2;
+    return XXH_xorshift64(round2, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. They used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(len >= 1 && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /* Byte layout of the packed 32-bit word, lowest byte first:
+     * len = 1: packed = { input[0], 0x01, input[0], input[0] }
+     * len = 2: packed = { input[1], 0x02, input[0], input[1] }
+     * len = 3: packed = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u32 const first  = input[0];
+        xxh_u32 const middle = input[len >> 1];
+        xxh_u32 const last   = input[len - 1];
+        xxh_u32 const packed = (middle << 24) | (first << 16)
+                             | ((xxh_u32)len << 8) | last;
+        xxh_u64 const secretMix = XXH_readLE32(secret) ^ XXH_readLE32(secret+4);
+        xxh_u64 const keyed = (secretMix + seed) ^ (xxh_u64)packed;
+        return XXH64_avalanche(keyed);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(len >= 4 && len <= 8);
+    /* the two 4-byte reads below overlap whenever len < 8 */
+    {   xxh_u64 const mixedSeed = seed ^ ((xxh_u64)XXH_swap32((xxh_u32)seed) << 32);
+        xxh_u64 const head = XXH_readLE32(input);
+        xxh_u64 const tail = XXH_readLE32(input + len - 4);
+        xxh_u64 const secretMix = XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16);
+        xxh_u64 const keyed = ((head << 32) + tail) ^ (secretMix - mixedSeed);
+        return XXH3_rrmxmx(keyed, len);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(len >= 9 && len <= 16);
+    /* reads the first and the last 8 bytes; the two reads overlap when len < 16 */
+    {   xxh_u64 const flipLo = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+        xxh_u64 const flipHi = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+        xxh_u64 const head   = flipLo ^ XXH_readLE64(input);
+        xxh_u64 const tail   = flipHi ^ XXH_readLE64(input + len - 8);
+        xxh_u64 const folded = XXH3_mul128_fold64(head, tail);
+        xxh_u64 const acc    = (xxh_u64)len + XXH_swap64(head) + tail + folded;
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    /* dispatch on length, longest range first; an empty input hashes seed and secret only */
+    if (XXH_likely(len >= 9)) return XXH3_len_9to16_64b(input, len, secret, seed);
+    if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+    if (len != 0) return XXH3_len_1to3_64b(input, len, secret, seed);
+    return XXH64_avalanche((XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)) ^ seed);
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
+    /*
+     * UGLY HACK:
+     * GCC for x86 tends to autovectorize the 128-bit multiply, which
+     * results in slower code.
+     *
+     * Forcing seed64 into a register disrupts the cost model and makes
+     * GCC scalarize instead. See `XXH32_round()`
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is of interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    {   xxh_u64 const keyLo = XXH_readLE64(secret)   + seed64;
+        xxh_u64 const keyHi = XXH_readLE64(secret+8) - seed64;
+        xxh_u64 const mixLo = XXH_readLE64(input)    ^ keyLo;
+        xxh_u64 const mixHi = XXH_readLE64(input+8)  ^ keyHi;
+        /* multiply the two keyed halves to 128 bits and XOR-fold down to 64 bits */
+        return XXH3_mul128_fold64(mixLo, mixHi);
+    }
+}
+
+/* Mid range keys (17 to 128 bytes): XXH3 uses a Mum-hash variant, mixing 16-byte blocks from both ends. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(len > 16 && len <= 128);
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+#if XXH_SIZE_OPT >= 1
+        /* Compact loop form: smaller code, slightly slower. Pairs are visited from the innermost one outwards. */
+        unsigned int pair = (unsigned int)(len - 1) / 32 + 1;
+        while (pair-- != 0) {
+            acc += XXH3_mix16B(input+16 * pair, secret+32*pair, seed);
+            acc += XXH3_mix16B(input+len-16*(pair+1), secret+32*pair+16, seed);
+        }
+#else
+        if (len > 96) {
+            acc += XXH3_mix16B(input+48, secret+96, seed);
+            acc += XXH3_mix16B(input+len-64, secret+112, seed);
+        }
+        if (len > 64) {
+            acc += XXH3_mix16B(input+32, secret+64, seed);
+            acc += XXH3_mix16B(input+len-48, secret+80, seed);
+        }
+        if (len > 32) {
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+#endif
+        return XXH3_avalanche(acc);
+    }
+}
+
+/*!
+ * @brief Maximum size of a "short" key in bytes (the upper length bound asserted by XXH3_len_129to240_64b()).
+ */
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{   /* 64-bit hash for inputs of 129 to XXH3_MIDSIZE_MAX bytes; secretSize is only checked, never read. */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3  /* secret byte offset used by the rounds after the first 8 */
+    #define XXH3_MIDSIZE_LASTOFFSET  17 /* the last 16 input bytes use secret + XXH3_SECRET_SIZE_MIN - 17 */
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;  /* number of complete 16-byte blocks */
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+        for (i=0; i<8; i++) {  /* first 128 bytes: input and secret advance together, 16 bytes per round */
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        /* last 16 bytes of input; they overlap the final 16-byte block handled by the loops */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);  /* the first 8 rounds are avalanched before the remaining rounds are added */
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * In everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {  /* remaining complete blocks; the secret restarts at XXH3_MIDSIZE_STARTOFFSET */
+            /*
+             * Prevents clang from unrolling the acc loop and interleaving it with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64           /* nb of input bytes consumed at each accumulation */
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))  /* nb of xxh_u64 values in one stripe */
+
+#ifdef XXH_OLD_NAMES  /* unprefixed aliases, only defined on request */
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST  /* byte offset ahead of the current stripe handed to XXH_PREFETCH; can be predefined */
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * This macro generates an XXH3_accumulate() function.
+ * Its single argument selects the name suffix; inline and target attributes
+ * are written by the caller in front of the macro invocation.
+ * The name of the generated symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>() once per stripe, advancing the input by
+ * XXH_STRIPE_LEN and the secret by XXH_SECRET_CONSUME_RATE bytes each time.
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    xxh_u64 const le64 = XXH_CPU_LITTLE_ENDIAN ? v64 : XXH_swap64(v64);  /* byte-swap on big-endian hosts */
+    XXH_memcpy(dst, &le64, sizeof(le64));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define the __int64 type,
+ * so xxh_i64 is declared here as a portable signed 64-bit substitute.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;  /* C++ or C99 and later (except VMS): use the fixed-width type */
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+#if (XXH_VECTOR == XXH_AVX512) \
+     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
+
+#ifndef XXH_TARGET_AVX512
+# define XXH_TARGET_AVX512  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{   /* Accumulates one XXH_STRIPE_LEN-byte stripe; a single 512-bit vector covers the whole stripe. */
+    __m512i* const xacc = (__m512i *) acc;
+    XXH_ASSERT((((size_t)acc) & 63) == 0);  /* acc must be 64-byte aligned; input and secret use unaligned loads */
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+
+    {
+        /* data_vec    = input[0]; */
+        __m512i const data_vec    = _mm512_loadu_si512   (input);
+        /* key_vec     = secret[0]; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        /* data_key    = data_vec ^ key_vec; */
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+        /* data_key_lo = data_key >> 32;  (the upper 32 bits of each 64-bit lane, moved down) */
+        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
+        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
+        /* xacc[0] += swap(data_vec);  (swap exchanges the two 64-bit halves of each 128-bit lane) */
+        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
+        /* xacc[0] += product; */
+        *xacc = _mm512_add_epi64(product, sum);
+    }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)  /* defines XXH3_accumulate_avx512() */
+
+/*
+ * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+ *
+ * Multiplication isn't perfect, as explained by Google in HighwayHash:
+ *
+ *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ *  // varying degrees. In descending order of goodness, bytes
+ *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ *  // As expected, the upper and lower bytes are much worse.
+ *
+ * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
+ *
+ * Since our algorithm uses a pseudorandom secret to add some variance into the
+ * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
+ *
+ * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
+ * extraction.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* Per 64-bit lane: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1, truncated to 64 bits. */
+    XXH_ASSERT((((size_t)acc) & 63) == 0);  /* acc must be 64-byte aligned */
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+    {   __m512i* const xacc = (__m512i*) acc;
+        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+        /* xacc[0] ^= (xacc[0] >> 47) */
+        __m512i const acc_vec     = *xacc;
+        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+        /* xacc[0] ^= secret;  (both XORs are done by the single three-input XOR below) */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
+
+        /* xacc[0] *= XXH_PRIME32_1;  (64x32 multiply assembled from two 32x32->64 products) */
+        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
+        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{   /* Writes XXH3_kSecret into customSecret with seed64 added to even 64-bit lanes and subtracted from odd ones. */
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);
+    (void)(&XXH_writeLE64);  /* NOTE(review): presumably only here to silence an unused-function warning */
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
+        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);  /* mask 0xAA: odd lanes become 0 - seed64 */
+
+        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
+              __m512i* const dest = (      __m512i*) customSecret;
+        int i;
+        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 63) == 0);
+        for (i=0; i < nbRounds; ++i) {  /* aligned load from the default secret, aligned store to dest */
+            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_AVX2) \
+    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+
+#ifndef XXH_TARGET_AVX2
+# define XXH_TARGET_AVX2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{   /* Accumulates one XXH_STRIPE_LEN-byte stripe, one 256-bit vector per loop iteration. */
+    XXH_ASSERT((((size_t)acc) & 31) == 0);  /* acc must be 32-byte aligned */
+    {   __m256i* const xacc    =       (__m256i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xinput  = (const __m256i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32;  (the upper 32 bits of each 64-bit lane, moved down) */
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec);  (swap exchanges the two 64-bit halves of each 128-bit lane) */
+            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)  /* defines XXH3_accumulate_avx2() */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* Per 64-bit lane: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1, truncated to 64 bits. */
+    XXH_ASSERT((((size_t)acc) & 31) == 0);  /* acc must be 32-byte aligned */
+    {   __m256i* const xacc = (__m256i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1;  (64x32 multiply assembled from two 32x32->64 products) */
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{   /* Writes XXH3_kSecret into customSecret with seed64 added to even 64-bit lanes and subtracted from odd ones. */
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);  /* matches the 6 unrolled stores below */
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);  /* NOTE(review): presumably only here to silence an unused-function warning */
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
+
+        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
+              __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified makes the compiler:
+         *   - not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 31) == 0);
+
+        /* Unrolled by hand because GCC -O2 does not unroll this loop by itself */
+        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
+        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
+    }
+}
+
+#endif
+
+/* x86dispatch always generates SSE2 */
+#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+
+#ifndef XXH_TARGET_SSE2
+# define XXH_TARGET_SSE2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* SSE2 is just a half-scale version of the AVX2 version: one 128-bit vector per loop iteration. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);  /* acc must be 16-byte aligned */
+    {   __m128i* const xacc    =       (__m128i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xinput  = (const __m128i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32;  (done as a shuffle: only the low 32 bits of each lane feed the multiply) */
+            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec);  (swap exchanges the two 64-bit halves of the vector) */
+            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)  /* defines XXH3_accumulate_sse2() */
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* Per 64-bit lane: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1, truncated to 64 bits. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);  /* acc must be 16-byte aligned */
+    {   __m128i* const xacc = (__m128i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1;  (64x32 multiply assembled from two 32x32->64 products) */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{   /* Writes XXH3_kSecret into customSecret with seed64 added to even 64-bit lanes and subtracted from odd ones. */
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    (void)(&XXH_writeLE64);  /* NOTE(review): presumably only here to silence an unused-function warning */
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
+        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+#       endif
+        int i;
+
+        const void* const src16 = XXH3_kSecret;
+        __m128i* dst16 = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dst16' as modified makes the compiler:
+         *   - not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dst16);
+#       endif
+        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+
+        for (i=0; i < nbRounds; ++i) {  /* aligned load from the default secret, 16 bytes per round */
+            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+/* forward declarations for the scalar routines, which the NEON path calls for its scalar lanes */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret, size_t lane);
+
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret, size_t lane);
+
+/*!
+ * @internal
+ * @brief The bulk processing loop for NEON and WASM SIMD128.
+ *
+ * The NEON code path is actually partially scalar when running on AArch64. This
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
+ * CPU, and it also mitigates some GCC codegen issues.
+ *
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the optimal number of lanes for Cortexes is 6,
+ * there need to be *three* versions of the accumulate operation: the 4-lane
+ * NEON loop, the 2-lane NEON loop for the remaining 2 lanes, and the scalar round.
+ *
+ * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
+ * nearly perfectly.
+ */
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{   /* Accumulates one stripe into acc: lanes >= XXH3_NEON_LANES go through XXH3_scalarRound(), the others through NEON vectors. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
+    {   /* GCC for darwin arm64 does not like aliasing here */
+        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* xinput = (const uint8_t *) input;
+        uint8_t const* xsecret  = (const uint8_t *) secret;
+
+        size_t i;   /* 64-bit lane index in the scalar loop; index of a 16-byte vector (two lanes) in the NEON loops */
+#ifdef __wasm_simd128__
+        /*
+         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
+         * is constant propagated, which results in it converting it to this
+         * inside the loop:
+         *
+         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
+         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
+         *    ...
+         *
+         * This requires a full 32-bit address immediate (and therefore a 6 byte
+         * instruction) as well as an add for each offset.
+         *
+         * Putting an asm guard prevents it from folding (at the cost of losing
+         * the alignment hint), and uses the free offset in `v128.load` instead
+         * of adding secret_offset each time which overall reduces code size by
+         * about a kilobyte and improves performance.
+         */
+        XXH_COMPILER_GUARD(xsecret);
+#endif
+        /* Scalar lanes use the normal scalarRound routine */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarRound(acc, input, secret, i);
+        }
+        i = 0;   /* restart at the first vector for the NEON part */
+        /* 4 NEON lanes at a time. */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
+            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+            /*
+             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+             * get one vector with the low 32 bits of each lane, and one vector
+             * with the high 32 bits of each lane.
+             *
+             * The intrinsic returns a double vector because the original ARMv7-a
+             * instruction modified both arguments in place. AArch64 and SIMD128 emit
+             * two instructions from this intrinsic.
+             *
+             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+             */
+            uint32x4x2_t unzipped = vuzpq_u32(
+                vreinterpretq_u32_u64(data_key_1),
+                vreinterpretq_u32_u64(data_key_2)
+            );
+            /* data_key_lo = data_key & 0xFFFFFFFF */
+            uint32x4_t data_key_lo = unzipped.val[0];
+            /* data_key_hi = data_key >> 32 */
+            uint32x4_t data_key_hi = unzipped.val[1];
+            /*
+             * Then, we can split the vectors horizontally and multiply which, as for most
+             * widening intrinsics, have a variant that works on both high half vectors
+             * for free on AArch64. A similar instruction is available on SIMD128.
+             *
+             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+             */
+            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
+            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
+            /*
+             * Clang reorders
+             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             * to
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
+             *
+             * While it would make sense in theory since the addition is faster,
+             * for reasons likely related to umlal being limited to certain NEON
+             * pipelines, this is worse. A compiler guard fixes this.
+             */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i]   = vaddq_u64(xacc[i], sum_1);
+            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
+        }
+        /* Operate on the remaining NEON lanes 2 at a time. */
+        for (; i < XXH3_NEON_LANES / 2; i++) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* For two lanes, just use VMOVN and VSHRN. */
+            /* data_key_lo = data_key & 0xFFFFFFFF; */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* data_key_hi = data_key >> 32; */
+            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
+            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
+            /* Same Clang workaround as before */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i] = vaddq_u64 (xacc[i], sum);
+        }
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) /* NOTE(review): macro defined outside this chunk; presumably generates XXH3_accumulate_neon(), which the dispatch macros further down reference */
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* Scrambles acc in place; per the step comments below, each 64-bit lane x becomes (x ^ (x >> 47) ^ secret) * XXH_PRIME32_1. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_uint64x2_t* xacc       = (xxh_aliasing_uint64x2_t*) acc;
+        uint8_t const* xsecret = (uint8_t const*) secret;
+
+        size_t i;   /* 64-bit lane index in the scalar loop; 16-byte vector index in the NEON loop */
+        /* WASM uses operator overloads and doesn't need these. */
+#ifndef __wasm_simd128__
+        /* { prime32_1, prime32_1 } */
+        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
+        /* { 0, prime32_1, 0, prime32_1 } */
+        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
+#endif
+
+        /* AArch64 uses both scalar and neon at the same time */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarScrambleRound(acc, secret, i);
+        }
+        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            uint64x2_t acc_vec  = xacc[i];
+            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+            /* xacc[i] ^= xsecret[i]; */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* xacc[i] *= XXH_PRIME32_1 */
+#ifdef __wasm_simd128__
+            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
+            xacc[i] = data_key * XXH_PRIME32_1;
+#else
+            /*
+             * Expanded version with portable NEON intrinsics
+             *
+             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+             *
+             * prod_hi = hi(data_key) * lo(prime) << 32
+             *
+             * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector
+             * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits
+             * and avoid the shift.
+             */
+            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
+            /* Extract low bits for vmlal_u32  */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+#endif
+        }
+    }
+}
+#endif
+
+#if (XXH_VECTOR == XXH_VSX)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{   /* Adds one XXH_STRIPE_LEN-byte stripe of input, combined with secret, into acc, one xxh_u64x2 vector per iteration. */
+    /* presumed aligned */
+    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+    xxh_u8 const* const xinput   = (xxh_u8 const*) input;   /* no alignment restriction */
+    xxh_u8 const* const xsecret  = (xxh_u8 const*) secret;    /* no alignment restriction */
+    xxh_u64x2 const v32 = { 32, 32 };   /* per-lane rotate count used to build `shuffled` below */
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+        /* data_vec = xinput[i]; */
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
+        /* key_vec = xsecret[i]; */
+        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+        xxh_u64x2 const data_key = data_vec ^ key_vec;
+        /* shuffled = (data_key << 32) | (data_key >> 32); */
+        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+        /* acc_vec = xacc[i]; */
+        xxh_u64x2 acc_vec        = xacc[i];
+        acc_vec += product;
+
+        /* swap high and low halves of data_vec (the two 64-bit lanes) and add them in */
+#ifdef __s390x__
+        acc_vec += vec_permi(data_vec, data_vec, 2);
+#else
+        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
+#endif
+        xacc[i] = acc_vec;
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) /* NOTE(review): macro defined outside this chunk; presumably generates XXH3_accumulate_vsx(), which the dispatch macros further down reference */
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* Scrambles acc in place; per the step comments below, each 64-bit lane x becomes (x ^ (x >> 47) ^ secret) * XXH_PRIME32_1. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+        const xxh_u8* const xsecret = (const xxh_u8*) secret;
+        /* constants */
+        xxh_u64x2 const v32  = { 32, 32 };
+        xxh_u64x2 const v47 = { 47, 47 };
+        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            xxh_u64x2 const acc_vec  = xacc[i];
+            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+
+            /* xacc[i] ^= xsecret[i]; */
+            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+            xxh_u64x2 const data_key = data_vec ^ key_vec;
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            /* NOTE(review): given the recombination below, prod_even must be ((xxh_u64x2)data_key >> 32) * prime (high halves). */
+            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
+            /* NOTE(review): likewise prod_odd must be ((xxh_u64x2)data_key & 0xFFFFFFFF) * prime (low halves). */
+            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+            xacc[i] = prod_odd + (prod_even << v32);   /* lo * prime + ((hi * prime) << 32), truncated to 64 bits */
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_SVE)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{   /* Processes one stripe with SVE; the number of 64-bit elements handled per vector (8, 2 or 4) is chosen from svcntd(). */
+    uint64_t *xacc = (uint64_t *)acc;
+    const uint64_t *xinput = (const uint64_t *)(const void *)input;   /* NOTE(review): not referenced directly here; presumably used by the ACCRND macro (defined outside this chunk) */
+    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;   /* NOTE(review): same remark as for xinput */
+    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);   /* index vector XORed with 1; NOTE(review): presumably the adjacent-lane swap table for ACCRND */
+    uint64_t element_count = svcntd();
+    if (element_count >= 8) {   /* all 8 accumulators fit in one vector */
+        svbool_t mask = svptrue_pat_b64(SV_VL8);
+        svuint64_t vacc = svld1_u64(mask, xacc);
+        ACCRND(vacc, 0);
+        svst1_u64(mask, xacc, vacc);
+    } else if (element_count == 2) {   /* sve128 */
+        svbool_t mask = svptrue_pat_b64(SV_VL2);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 2);
+        ACCRND(acc2, 4);
+        ACCRND(acc3, 6);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 2, acc1);
+        svst1_u64(mask, xacc + 4, acc2);
+        svst1_u64(mask, xacc + 6, acc3);
+    } else {   /* remaining case (element_count is neither >= 8 nor 2): two groups of 4 accumulators */
+        svbool_t mask = svptrue_pat_b64(SV_VL4);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 4);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 4, acc1);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+               const xxh_u8* XXH_RESTRICT input,
+               const xxh_u8* XXH_RESTRICT secret,
+               size_t nbStripes)
+{   /* Processes nbStripes consecutive stripes with SVE, keeping the accumulators in vector variables across the whole loop. */
+    if (nbStripes != 0) {   /* the do/while loops below always run at least once, so zero stripes must be filtered out here */
+        uint64_t *xacc = (uint64_t *)acc;
+        const uint64_t *xinput = (const uint64_t *)(const void *)input;
+        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);   /* NOTE(review): not referenced directly; presumably used by the ACCRND macro (defined outside this chunk) */
+        uint64_t element_count = svcntd();
+        if (element_count >= 8) {
+            svbool_t mask = svptrue_pat_b64(SV_VL8);
+            svuint64_t vacc = svld1_u64(mask, xacc + 0);
+            do {
+                /* svprfd(svbool_t, void *, enum svfprop); */
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(vacc, 0);
+                xinput += 8;    /* 8 x uint64_t = 64 bytes of input per stripe */
+                xsecret += 1;   /* 1 x uint64_t = 8 bytes of secret per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, vacc);
+        } else if (element_count == 2) { /* sve128 */
+            svbool_t mask = svptrue_pat_b64(SV_VL2);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 2);
+                ACCRND(acc2, 4);
+                ACCRND(acc3, 6);
+                xinput += 8;    /* 8 x uint64_t = 64 bytes of input per stripe */
+                xsecret += 1;   /* 1 x uint64_t = 8 bytes of secret per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 2, acc1);
+           svst1_u64(mask, xacc + 4, acc2);
+           svst1_u64(mask, xacc + 6, acc3);
+        } else {   /* remaining case (element_count is neither >= 8 nor 2): two groups of 4 accumulators */
+            svbool_t mask = svptrue_pat_b64(SV_VL4);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 4);
+                xinput += 8;    /* 8 x uint64_t = 64 bytes of input per stripe */
+                xsecret += 1;   /* 1 x uint64_t = 8 bytes of secret per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 4, acc1);
+       }
+    }
+}
+
+#endif
+
+/* scalar variants - universal */
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
+/*
+ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
+ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
+ *
+ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
+ * big Cortex designs have a full 64-bit multiplier.
+ *
+ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
+ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
+ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
+ *
+ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
+ * not have this penalty and does the mask automatically.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{   /* Returns (low 32 bits of lhs) * (low 32 bits of rhs) + acc, forcing a single UMADDL instruction. */
+    xxh_u64 ret;
+    /* note: %x = 64-bit register, %w = 32-bit register */
+    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
+    return ret;
+}
+#else   /* portable version of the same computation */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{   /* Both operands are truncated to 32 bits before the widening multiply; acc is added as a full 64-bit value. */
+    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
+}
+#endif /* __aarch64__ && (__GNUC__ || __clang__) */
+
+/*!
+ * @internal
+ * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
+ *
+ * Updates accumulator `lane` and adds the raw input word to its neighbour `lane ^ 1`.
+ * It is a separate function because the NEON path uses a combination of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc,
+                 void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret,
+                 size_t lane)
+{
+    xxh_u64* xacc = (xxh_u64*) acc;
+    xxh_u8 const* xinput  = (xxh_u8 const*) input;
+    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    {
+        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);   /* this lane's 8 input bytes */
+        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);   /* input mixed with this lane's 8 secret bytes */
+        xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
+        xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);   /* acc += lo32(data_key) * hi32(data_key) */
+    }
+}
+
+/*!
+ * @internal
+ * @brief Processes a 64 byte block of data using the scalar path: one XXH3_scalarRound() per accumulator lane.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    size_t i;   /* lane index, 0 .. XXH_ACC_NB-1 */
+    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
+#if defined(__GNUC__) && !defined(__clang__) \
+  && (defined(__arm__) || defined(__thumb2__)) \
+  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
+  && XXH_SIZE_OPT <= 0
+#  pragma GCC unroll 8
+#endif
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarRound(acc, input, secret, i);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) /* NOTE(review): macro defined outside this chunk; presumably generates XXH3_accumulate_scalar(), which the dispatch macros further down reference */
+
+/*!
+ * @internal
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
+ *
+ * Rewrites accumulator `lane` in place from its old value and 8 bytes of `secret`.
+ * It is a separate function because the NEON path uses a combination of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);   /* this lane's 8 secret bytes */
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);   /* NOTE(review): presumably acc64 ^ (acc64 >> 47), as in the vector variants; helper is defined outside this chunk */
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read: one XXH3_scalarScrambleRound() per lane.
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;   /* lane index, 0 .. XXH_ACC_NB-1 */
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{   /* Fills customSecret (XXH_SECRET_DEFAULT_SIZE bytes) from XXH3_kSecret: per 16-byte group, first word + seed64, second word - seed64. */
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__GNUC__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes the compiler to assume
+     * that kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     *
+     * See XXH3_NEON_LANES for details on the pipeline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;   /* one round per 16-byte group of the secret */
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
+             */
+            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+    }   }
+}
+
+
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);   /* (acc, input, secret, nbStripes) */
+typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);   /* (acc, secret) */
+typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);   /* (customSecret, seed64) */
+
+
+#if (XXH_VECTOR == XXH_AVX512)   /* bind the generic XXH3_* names to the implementation selected by XXH_VECTOR */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+#elif (XXH_VECTOR == XXH_AVX2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+#elif (XXH_VECTOR == XXH_NEON)   /* secret initialization uses the scalar routine */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_VSX)   /* secret initialization uses the scalar routine */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_SVE)   /* only the accumulate steps are SVE; scramble and secret initialization are scalar */
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#else /* scalar */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#endif /* XXH_VECTOR dispatch */
+
+#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
+#  undef XXH3_initCustomSecret   /* drop whichever variant the XXH_VECTOR dispatch selected */
+#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#endif /* XXH_SIZE_OPT >= 1 */
+
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{   /* Feeds all of input into acc: full blocks (each followed by a scramble), then the partial block, then the last stripe. */
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;   /* stripes in one full block */
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;   /* bytes in one full block */
+    size_t const nb_blocks = (len - 1) / block_len;   /* (len - 1): an input that is an exact multiple of block_len leaves its final block to the code after the loop */
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);   /* scramble with the last XXH_STRIPE_LEN bytes of the secret */
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;   /* whole stripes left, excluding the final one */
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
+
+        /* last stripe: always the final XXH_STRIPE_LEN bytes of input, so it may overlap stripes already processed */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{   /* Combines acc[0] and acc[1], each XORed with 8 bytes of secret, through XXH3_mul128_fold64(). */
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{   /* Folds 8 accumulators (4 pairs) and 64 bytes of secret into one value, starting from `start`, then applies XXH3_avalanche(). */
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);   /* pair (acc[2i], acc[2i+1]) with 16 bytes of secret */
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }   /* initializer for the 8-entry accumulator array */
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{   /* Long-input 64-bit hash: runs the accumulation loop on a fresh accumulator array, then merges it into one value. */
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);   /* merge starts from len * XXH_PRIME64_1 */
+}
+
+/*
+ * It's important for performance to transmit secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{   /* Long-input hash using the caller's secret; seed64 is accepted to match XXH3_hashLong64_f but ignored. */
+    (void)seed64;
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, easier on the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of vector loop.
+ */
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{   /* Long-input hash with the built-in XXH3_kSecret; seed64, secret and secretLen only match XXH3_hashLong64_f and are ignored. */
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+ * and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate f_acc,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+#if XXH_SIZE_OPT <= 0   /* shortcut compiled only when not optimizing for size */
+    if (seed == 0)   /* no custom secret needed: hash directly with XXH3_kSecret */
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc, f_scramble);
+#endif
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];   /* stack buffer for the seed-derived secret */
+        f_initSec(secret, seed);
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{   /* Seeded long-input hash; secret and secretLen only match XXH3_hashLong64_f and are ignored. */
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);   /* (input, len, seed64, secret, secretLen) */
+
+/*
+ * Length dispatcher for every one-shot 64-bit XXH3 variant.
+ *
+ * Inputs of up to 16, up to 128 and up to XXH3_MIDSIZE_MAX bytes go to their
+ * dedicated short-input routines; anything longer is handed to `f_hashLong`.
+ * `f_hashLong` is never called when len <= XXH3_MIDSIZE_MAX.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    const xxh_u8* const data = (const xxh_u8*)input;
+    const xxh_u8* const key  = (const xxh_u8*)secret;
+
+    /*
+     * `secretLen >= XXH3_SECRET_SIZE_MIN` is a contract pre-condition.
+     * If a violation ever needs handling, this is the place to do it.
+     * It is only asserted because a real check and branch would cost
+     * performance at every hash, and the function signature offers no room
+     * to return an error anyway.
+     */
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+
+    if (len <= 16) {
+        return XXH3_len_0to16_64b(data, len, key, seed64);
+    } else if (len <= 128) {
+        return XXH3_len_17to128_64b(data, len, key, secretLen, seed64);
+    } else if (len <= XXH3_MIDSIZE_MAX) {
+        return XXH3_len_129to240_64b(data, len, key, secretLen, seed64);
+    } else {
+        return f_hashLong(input, len, seed64, key, secretLen);
+    }
+}
+
+
+/* ===   Public entry point   === */
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief One-shot 64-bit XXH3 hash with a zero seed and the default secret.
+ *
+ * @param input   Data to hash.
+ * @param length  Size of @p input, in bytes.
+ * @return The 64-bit hash of @p input.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
+{
+    return XXH3_64bits_internal(input, length,
+                                0,
+                                XXH3_kSecret, sizeof(XXH3_kSecret),
+                                XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief One-shot 64-bit XXH3 hash with a caller-provided secret and seed 0.
+ *
+ * @param input       Data to hash.
+ * @param length      Size of @p input, in bytes.
+ * @param secret      Secret to hash with.
+ * @param secretSize  Size of @p secret, in bytes. Expected to be at least
+ *                    XXH3_SECRET_SIZE_MIN; this is only asserted downstream.
+ * @return The 64-bit hash of @p input.
+ */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_internal(input, length,
+                                0,
+                                secret, secretSize,
+                                XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief One-shot 64-bit XXH3 hash with a caller-provided seed.
+ *
+ * The default secret is passed for the short-input routines; long inputs go
+ * through XXH3_hashLong_64b_withSeed().
+ *
+ * @param input   Data to hash.
+ * @param length  Size of @p input, in bytes.
+ * @param seed    Seed altering the hash result.
+ * @return The 64-bit hash of @p input.
+ */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, length,
+                                seed,
+                                XXH3_kSecret, sizeof(XXH3_kSecret),
+                                XXH3_hashLong_64b_withSeed);
+}
+
+/*!
+ * @brief One-shot 64-bit XXH3 hash taking both a secret and a seed.
+ *
+ * Inputs of at most XXH3_MIDSIZE_MAX bytes are hashed with the default secret
+ * and @p seed; @p secret is not consulted for them.
+ * Longer inputs are forwarded to XXH3_hashLong_64b_withSecret() together with
+ * @p secret and @p seed.
+ */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length > XXH3_MIDSIZE_MAX) {
+        return XXH3_hashLong_64b_withSecret(input, length, seed,
+                                            (const xxh_u8*)secret, secretSize);
+    }
+    /* No long-hash callback is needed: this length never reaches it. */
+    return XXH3_64bits_internal(input, length, seed,
+                                XXH3_kSecret, sizeof(XXH3_kSecret),
+                                NULL);
+}
+
+
+/* ===   XXH3 streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * Allocates `s` bytes whose address is a multiple of `align`.
+ *
+ * The result must be released with `XXH_alignedFree()`, never with a plain
+ * free.
+ *
+ * Why this exists: malloc typically guarantees 16 byte alignment on 64-bit
+ * systems and 8 byte alignment on 32-bit. That is not enough for the 32 byte
+ * aligned loads in AVX2 or, on 32-bit, the 16 byte aligned loads in SSE2 and
+ * NEON. The resulting underalignment once caused a rather obvious crash that
+ * went unnoticed because XXH3_createState() was not actually being tested
+ * (credit to RedSpah for noticing).
+ *
+ * Why it is done by hand: posix_memalign, _mm_malloc and friends are avoided
+ * because a portable fallback like this one would be needed anyway, and
+ * detecting library functions without external build tools is impossible.
+ *
+ * How: overallocate by `align` bytes, round the pointer up to the next
+ * multiple of `align`, and store the distance to the original pointer in the
+ * byte just before the returned address.
+ *
+ * `align` must be a power of 2 with 8 <= align <= 128.
+ * Returns NULL when the underlying allocation fails.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    xxh_u8* raw;
+    xxh_u8* aligned;
+    size_t shift;
+
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+
+    /* Room for the payload, the realignment slack and the offset byte. */
+    raw = (xxh_u8*)XXH_malloc(s + align);
+    if (raw == NULL) {
+        return NULL;
+    }
+
+    /*
+     * Distance to the next multiple of `align`. It is never zero: an already
+     * aligned `raw` yields a full `align`, so at least one byte is always
+     * available in front of the result for the bookkeeping byte.
+     */
+    shift = align - ((size_t)raw & (align - 1)); /* raw % align */
+    aligned = raw + shift;
+
+    XXH_ASSERT((size_t)aligned % align == 0);
+
+    /* Remember how far back the real allocation starts. */
+    aligned[-1] = (xxh_u8)shift;
+    return aligned;
+}
+/*
+ * Releases a pointer obtained from XXH_alignedMalloc(). NULL is accepted and
+ * ignored. Never pass a normally malloc'd pointer: the byte preceding `p` is
+ * read as the offset written by XXH_alignedMalloc().
+ */
+static void XXH_alignedFree(void* p)
+{
+    xxh_u8* const aligned = (xxh_u8*)p;
+
+    if (aligned == NULL) {
+        return;
+    }
+    /*
+     * The byte in front of the aligned pointer holds the distance back to
+     * the pointer originally returned by XXH_malloc().
+     */
+    XXH_free(aligned - aligned[-1]);
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocates a 64-byte aligned @ref XXH3_state_t and runs
+ *        XXH3_INITSTATE() on it.
+ *
+ * @return The newly allocated state on success.
+ * @return `NULL` if the allocation failed.
+ *
+ * @note Release the state with XXH3_freeState().
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state =
+        (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+
+    if (state != NULL) {
+        XXH3_INITSTATE(state);
+    }
+    return state;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Releases an @ref XXH3_state_t.
+ *
+ * @param statePtr State previously returned by @ref XXH3_createState().
+ *                 `NULL` is accepted and ignored.
+ *
+ * @return @ref XXH_OK, unconditionally.
+ *
+ * @note Only pass states obtained from XXH3_createState(): the pointer is
+ *       handed to XXH_alignedFree(), which expects that allocation layout.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree((void*)statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Copies the whole content of @p src_state into @p dst_state.
+ *
+ * @param dst_state Destination state, overwritten entirely.
+ * @param src_state State to copy from.
+ */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+{
+    /* Plain byte-wise copy of the full structure. */
+    XXH_memcpy(dst_state, src_state, sizeof(XXH3_state_t));
+}
+
+/*
+ * Common (re)initialization used by the XXH3 reset entry points.
+ *
+ * Zeroes the state members located from `bufferedSize` (included) up to
+ * `nbStripesPerBlock` (excluded), loads the eight initial accumulator values,
+ * then records the seed, the external secret pointer and the two values
+ * derived from `secretSize`.
+ *
+ * `statePtr` must not be NULL (asserted).
+ * `secret` is stored as a pointer, not copied. It may be NULL, in which case
+ * XXH3_update() and XXH3_64bits_digest() use `customSecret` instead.
+ * `secretSize` must be >= XXH3_SECRET_SIZE_MIN. This is only asserted, so
+ * callers are expected to validate it beforehand.
+ */
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    /* Byte range of the state to clear, computed from the member layout. */
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    /* Initial accumulator lanes. */
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    /* Callers may override useSeed afterwards (see the secret+seed reset). */
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /* Without the pre-condition above, this subtraction could wrap around. */
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Resets @p statePtr for a new hash with seed 0 and the default secret.
+ *
+ * @param statePtr State to reset.
+ * @return @ref XXH_OK on success, @ref XXH_ERROR if @p statePtr is `NULL`.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    if (statePtr != NULL) {
+        XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+        return XXH_OK;
+    }
+    return XXH_ERROR;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Resets @p statePtr for a new hash using a caller-provided secret
+ *        and seed 0.
+ *
+ * All arguments are validated before the state is modified, so a rejected
+ * call leaves @p statePtr untouched. Validating first also keeps
+ * XXH3_reset_internal() from ever seeing a @p secretSize below
+ * XXH3_SECRET_SIZE_MIN, which it asserts against and which could make its
+ * `secretLimit` computation wrap around.
+ *
+ * @param statePtr   State to reset.
+ * @param secret     Secret to hash with. Only the pointer is stored in the
+ *                   state; the bytes are not copied.
+ * @param secretSize Size of @p secret in bytes, at least XXH3_SECRET_SIZE_MIN.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR if @p statePtr or @p secret is `NULL`, or if
+ *         @p secretSize is smaller than XXH3_SECRET_SIZE_MIN.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Resets @p statePtr for a new hash using @p seed.
+ *
+ * A zero seed is delegated to XXH3_64bits_reset(). Otherwise the state's
+ * `customSecret` is regenerated from @p seed, unless the state already holds
+ * the same seed and has no external secret attached.
+ *
+ * @param statePtr State to reset.
+ * @param seed     Seed for the new hash.
+ * @return @ref XXH_OK on success, @ref XXH_ERROR if @p statePtr is `NULL`.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) {
+        return XXH_ERROR;
+    }
+    if (seed == 0) {
+        return XXH3_64bits_reset(statePtr);
+    }
+    {
+        /* Must be evaluated before the reset below overwrites these fields. */
+        int const needsNewSecret = (statePtr->seed != seed)
+                                || (statePtr->extSecret != NULL);
+        if (needsNewSecret) {
+            XXH3_initCustomSecret(statePtr->customSecret, seed);
+        }
+    }
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Resets @p statePtr for a new hash using both a secret and a seed.
+ *
+ * @param statePtr   State to reset.
+ * @param secret     Secret to hash with. Only the pointer is stored.
+ * @param secretSize Size of @p secret in bytes, at least XXH3_SECRET_SIZE_MIN.
+ * @param seed64     Seed recorded in the state.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR if @p statePtr or @p secret is `NULL`, or if
+ *         @p secretSize is smaller than XXH3_SECRET_SIZE_MIN. The state is
+ *         left untouched in that case.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if ((statePtr == NULL)
+     || (secret == NULL)
+     || (secretSize < XXH3_SECRET_SIZE_MIN)) {
+        return XXH_ERROR;
+    }
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    /* The seed path stays enabled for this mode even when seed64 == 0. */
+    statePtr->useSeed = 1;
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * Each time the stripe count reaches @p nbStripesPerBlock, the accumulators
+ * are scrambled and the count restarts from 0.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of stripes already
+ *                           consumed in the current block
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset into @p secret of the data passed to
+ *                           @p f_scramble
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    /* Resume in the secret where the current block left off. */
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        /* The last block was completed, so a new one starts from scratch. */
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+/*
+ * XXH3_STREAM_USE_STACK: when >= 1, XXH3_update() works on a stack copy of
+ * the accumulators instead of operating directly on the state.
+ * Unless the user already defined it, it defaults to 1 when XXH_SIZE_OPT <= 0
+ * and the compiler is not clang; otherwise it is left undefined.
+ */
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ *
+ * Appends `len` bytes from `input` to the streaming state:
+ *  - input that fits in the free room of state->buffer is only copied there;
+ *  - otherwise the buffer is completed and consumed, as many further stripes
+ *    as possible are consumed straight from `input`, and the remaining tail
+ *    (always at least one byte) is stored in the buffer.
+ *
+ * A NULL `input` is accepted as a no-op (`len` is then asserted to be 0).
+ * `state` must not be NULL (asserted only).
+ * Always returns XXH_OK.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        /* An external secret, when set, takes precedence over customSecret. */
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating accumulators directly into state.
+         * Operating into stack space seems to enable proper optimization.
+         * clang, on the other hand, doesn't seem to need this trick */
+        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+        XXH_memcpy(acc, state->acc, sizeof(acc));
+#else
+        xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+        state->totalLen += len;
+        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+        /* small input : just fill in tmp buffer */
+        /* (the accumulators are not modified on this path, so returning
+         *  without writing the stack copy back is fine) */
+        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+        /* More than a full buffer is left: consume stripes directly from input. */
+        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+            /* The "- 1" keeps at least one byte unconsumed for the buffer. */
+            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+            input = XXH3_consumeStripes(acc,
+                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                       input, nbStripes,
+                                       secret, state->secretLimit,
+                                       f_acc, f_scramble);
+            /* Keep the last consumed stripe at the end of the buffer:
+             * XXH3_digest_long() reads it back from there when fewer than
+             * XXH_STRIPE_LEN bytes are buffered. */
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+
+        }
+        /* Some remaining input (always) : buffer it */
+        XXH_ASSERT(input < bEnd);
+        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+        XXH_ASSERT(state->bufferedSize == 0);
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* save stack accumulators into state */
+        XXH_memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Feeds @p len bytes from @p input into the streaming state.
+ *
+ * Thin wrapper binding XXH3_update() to the default accumulate and scramble
+ * implementations.
+ *
+ * @return The value returned by XXH3_update().
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    const xxh_u8* const bytes = (const xxh_u8*)input;
+
+    return XXH3_update(state, bytes, len, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+
+/*
+ * Finishes the accumulation for a long streamed input.
+ *
+ * Copies state->acc into `acc`, consumes the stripes still held in
+ * state->buffer, then accumulates one final stripe made of the last
+ * XXH_STRIPE_LEN bytes of input. `state` itself is not modified.
+ *
+ * `acc` receives the resulting accumulators; merging them into a hash is left
+ * to the caller. `secret` is the secret in use for this state.
+ */
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        /* Local copy of the counter, so the state's own value is untouched. */
+        size_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        /* Copy to temp buffer */
+        /* The missing leading bytes come from the end of state->buffer,
+         * where XXH3_update() keeps the last stripe it consumed. */
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        lastStripePtr = lastStripe;
+    }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Returns the 64-bit hash of everything fed to @p state so far.
+ *
+ * The state is only read, so more input can be added afterwards.
+ * When at most XXH3_MIDSIZE_MAX bytes were received in total, they are all
+ * still in state->buffer and are hashed with the matching one-shot function.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret =
+        (state->extSecret != NULL) ? state->extSecret : state->customSecret;
+
+    if (state->totalLen <= XXH3_MIDSIZE_MAX) {
+        /* Short input: digest it in one shot from the internal buffer. */
+        size_t const shortLen = (size_t)state->totalLen;
+        if (state->useSeed) {
+            return XXH3_64bits_withSeed(state->buffer, shortLen, state->seed);
+        }
+        return XXH3_64bits_withSecret(state->buffer, shortLen,
+                                      secret, state->secretLimit + XXH_STRIPE_LEN);
+    }
+
+    {   /* Long input: finish the accumulation, then merge the lanes. */
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_mergeAccs(acc,
+                              secret + XXH_SECRET_MERGEACCS_START,
+                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
+    }
+}
+#endif /* !XXH_NO_STREAM */
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+/*
+ * 128-bit hash of a 1 to 3 byte input.
+ * A doubled version of 1to3_64b with different constants: the input bytes and
+ * the length are packed into one 32-bit word, a second word is derived from
+ * it by byte swap and rotation, and each word is keyed and avalanched
+ * separately to give the low and high halves.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * Layout of packedLo, from least to most significant byte:
+     * len = 1: { input[0], 0x01, input[0], input[0] }
+     * len = 2: { input[1], 0x02, input[0], input[1] }
+     * len = 3: { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const first  = input[0];
+        xxh_u8 const middle = input[len >> 1];
+        xxh_u8 const last   = input[len - 1];
+        xxh_u32 const packedLo = ((xxh_u32)last   <<  0)
+                               | ((xxh_u32)len    <<  8)
+                               | ((xxh_u32)first  << 16)
+                               | ((xxh_u32)middle << 24);
+        xxh_u32 const packedHi = XXH_rotl32(XXH_swap32(packedLo), 13);
+        xxh_u64 const flipLo = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const flipHi = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        XXH128_hash_t result;
+
+        result.low64  = XXH64_avalanche((xxh_u64)packedLo ^ flipLo);
+        result.high64 = XXH64_avalanche((xxh_u64)packedHi ^ flipHi);
+        return result;
+    }
+}
+
+/*
+ * 128-bit hash of a 4 to 8 byte input.
+ * The first and last 4 input bytes (which overlap when len < 8) are combined
+ * into one 64-bit word, keyed with secret bytes 16..31 and the seed, then
+ * spread over 128 bits by a 64x64 multiplication and mixed.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    /* Fold the byte-swapped low 32 bits of the seed into its high 32 bits. */
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left so that the value added to XXH_PRIME64_1 is
+         * always even. Presumably XXH_PRIME64_1 is odd (it is named as a
+         * prime), so the multiplier stays odd: this avoids multiplying by an
+         * even number. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        m128.low64   = XXH_xorshift64(m128.low64, 35);
+        m128.low64  *= PRIME_MX2;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+/*
+ * 128-bit hash of a 9 to 16 byte input.
+ * Reads the first and last 8 input bytes (which overlap when len < 16), keys
+ * them with secret bytes 32..63 and the seed, and mixes them through two
+ * multiplications before the final avalanche of each half.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         * Both branches below compute the same value.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             * The 32-bit branch computes:
+             *    b + (a * c)
+             * Inverse Property: x + y - x == y
+             *    b + (a * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    b + (a * 1) + (a * (c - 1))
+             * Identity Property: x * 1 == x
+             *    b + a + (a * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64  ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64   = XXH3_avalanche(h128.low64);
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/*
+ * 128-bit hash of an input of at most 16 bytes: dispatches on `len` to the
+ * 1-3, 4-8 and 9-16 byte routines, and handles the empty input inline.
+ *
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+
+    if (len == 0) {
+        /* Empty input: the result depends on the seed and the secret only. */
+        XXH128_hash_t empty;
+        xxh_u64 const flipLo = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+        xxh_u64 const flipHi = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+        empty.low64  = XXH64_avalanche(seed ^ flipLo);
+        empty.high64 = XXH64_avalanche(seed ^ flipHi);
+        return empty;
+    }
+    if (len < 4) {
+        return XXH3_len_1to3_128b(input, len, secret, seed);
+    }
+    if (len <= 8) {
+        return XXH3_len_4to8_128b(input, len, secret, seed);
+    }
+    return XXH3_len_9to16_128b(input, len, secret, seed);
+}
+
+/*
+ * Mixes two 16-byte input chunks into a 128-bit accumulator.
+ * Each half of `acc` receives the XXH3_mix16B() of one chunk and is then
+ * XORed with the sum of the two 64-bit words of the other chunk.
+ *
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* Low half: mix of chunk 1, crossed with the raw words of chunk 2. */
+    acc.low64  = acc.low64 + XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  = acc.low64 ^ (XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8));
+    /* High half: mix of chunk 2, crossed with the raw words of chunk 1. */
+    acc.high64 = acc.high64 + XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 = acc.high64 ^ (XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8));
+    return acc;
+}
+
+
+/*
+ * 128-bit hash of a 17 to 128 byte input.
+ * Mixes 16-byte chunks taken symmetrically from the start and from the end
+ * of the input (one XXH128_mix32B() call per pair, outermost pair last),
+ * then folds the accumulator, `len` and `seed` into the final two halves.
+ * `secretSize` is only checked by an assertion.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+
+#if XXH_SIZE_OPT >= 1
+        {
+            /* Smaller, but slightly slower. */
+            /* Loop form of the branch ladder below: i counts down to 0. */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        /* Unrolled: the innermost chunk pairs are mixed first. */
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+/*
+ * 128-bit hash of a 129 to XXH3_MIDSIZE_MAX byte input.
+ * Mixes the first 128 bytes in 32-byte steps, avalanches the accumulator,
+ * mixes the following 32-byte steps with a shifted secret offset, and finally
+ * mixes the last 32 bytes of the input with a negated seed.
+ * `secretSize` is only checked by an assertion.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        /*
+         * `i` is set to the byte offset + 32, i.e. the end of the current
+         * 32-byte step. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input  + i - 32,
+                                input  + i - 16,
+                                secret + i - 32,
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                seed);
+        }
+        /* last bytes */
+        /* (the two chunks are passed in reverse order and the seed is negated) */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            (XXH64_hash_t)0 - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+/*
+ * Core of the 128-bit long-input hash.
+ * Runs XXH3_hashLong_internal_loop() over the input, then merges the
+ * accumulators twice: once with the secret bytes starting at
+ * XXH_SECRET_MERGEACCS_START for the low half, and once with bytes located
+ * symmetrically from the end of the secret for the high half.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len,
+                                secret, secretSize,
+                                f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {
+        const xxh_u8* const secretLo = secret + XXH_SECRET_MERGEACCS_START;
+        const xxh_u8* const secretHi = secret + secretSize
+                                              - sizeof(acc) - XXH_SECRET_MERGEACCS_START;
+        XXH128_hash_t result;
+
+        result.low64  = XXH3_mergeAccs(acc, secretLo,
+                                       (xxh_u64)len * XXH_PRIME64_1);
+        result.high64 = XXH3_mergeAccs(acc, secretHi,
+                                       ~((xxh_u64)len * XXH_PRIME64_2));
+        return result;
+    }
+}
+
+/*
+ * 128-bit long-input hash with the default secret and no seed.
+ *
+ * It's important for performance that XXH3_hashLong() is not inlined,
+ * hence XXH_NO_INLINE.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    /* These parameters are accepted for signature compatibility only. */
+    (void)seed64;
+    (void)secret;
+    (void)secretLen;
+    return XXH3_hashLong_128b_internal(input, len,
+                                       XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * 128-bit long-input hash with a caller-provided secret; the seed is ignored.
+ *
+ * It's important for performance to pass @p secretLen (when it's static)
+ * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and
+ * FORCE_INLINE breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    const xxh_u8* const secretBytes = (const xxh_u8*)secret;
+
+    (void)seed64;   /* accepted for signature compatibility only */
+    return XXH3_hashLong_128b_internal(input, len,
+                                       secretBytes, secretLen,
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                XXH64_hash_t seed64,
+                                XXH3_f_accumulate f_acc,
+                                XXH3_f_scrambleAcc f_scramble,
+                                XXH3_f_initCustomSecret f_initSec)
+{   /* Seeded long-input path: seed 0 hashes with XXH3_kSecret directly; any other seed first builds a secret via f_initSec. */
+    if (seed64 == 0)
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];   /* local buffer filled by f_initSec from seed64 */
+        f_initSec(secret, seed64);
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc, f_scramble);
+    }
+}
+
+/*
+ * Seeded long-input entry point. It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;   /* unused: the secret is chosen/derived from seed64 by the helper below */
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);   /* (input, len, seed64, secret, secretLen): long-input handler passed to XXH3_128bits_internal */
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{   /* Length dispatcher: three short-input routines by size class, otherwise the long-input handler f_hl128. */
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);   /* only reached when len > XXH3_MIDSIZE_MAX */
+}
+
+
+/* ===   Public XXH128 API   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
+{   /* One-shot 128-bit hash of `len` bytes at `input`, using seed 0 and the built-in XXH3_kSecret. */
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+{   /* One-shot 128-bit hash using the caller's secret and seed 0; secretSize >= XXH3_SECRET_SIZE_MIN is only asserted downstream, not checked. */
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{   /* One-shot 128-bit hash with `seed`: short inputs use XXH3_kSecret plus the seed, long inputs go through the seeded long path. */
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{   /* Short inputs (len <= XXH3_MIDSIZE_MAX) use XXH3_kSecret + seed and ignore `secret`; longer inputs use `secret`, where the callee discards the seed. */
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);   /* NULL handler is never called for these lengths */
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{   /* Alias: identical to XXH3_128bits_withSeed(input, len, seed). */
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * All initialization and update functions are identical to 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{   /* Forwards to XXH3_64bits_reset and returns its result unchanged. */
+    return XXH3_64bits_reset(statePtr);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{   /* Forwards to XXH3_64bits_reset_withSecret and returns its result unchanged. */
+    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{   /* Forwards to XXH3_64bits_reset_withSeed and returns its result unchanged. */
+    return XXH3_64bits_reset_withSeed(statePtr, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{   /* Forwards to XXH3_64bits_reset_withSecretandSeed and returns its result unchanged. */
+    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{   /* Forwards to XXH3_64bits_update; only the digest step differs between the 64- and 128-bit streams. */
+    return XXH3_64bits_update(state, input, len);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{   /* Produces the 128-bit hash of everything fed so far; `state` is const and is not modified here. */
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;   /* external secret wins when one is set */
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;   /* same two-merge finalization as XXH3_hashLong_128b_internal, with secretLimit + XXH_STRIPE_LEN in place of secretSize */
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code, re-hashes state->buffer in one shot */
+    if (state->seed)   /* NOTE(review): tests seed != 0; with a custom secret and seed 0 this takes the withSecret path, whereas one-shot XXH3_128bits_withSecretandSeed uses XXH3_kSecret for short inputs - confirm intended */
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte, so a raw byte comparison of the two structs is sufficient */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort(); both arguments must point to XXH128_hash_t values.
+ * @return : >0 if *h128_1  > *h128_2
+ *           <0 if *h128_1  < *h128_2
+ *           =0 if *h128_1 == *h128_2  */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);   /* -1, 0 or +1 from the high halves */
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);   /* high halves tie: the low halves decide */
+}
+
+
+/*======   Canonical representation   ======*/
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+{   /* Serializes `hash` into `dst` as 16 bytes: high64 first, then low64, each byte-swapped on little-endian hosts. */
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        hash.high64 = XXH_swap64(hash.high64);   /* `hash` is a by-value copy, so swapping in place does not affect the caller */
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+{   /* Inverse of XXH128_canonicalFromHash: reads two 64-bit values from the 16-byte canonical form. */
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);                /* first 8 bytes -> high half */
+    h.low64  = XXH_readBE64(src->digest + 8);    /* next 8 bytes  -> low half */
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))   /* smaller of x and y; arguments are evaluated more than once, so avoid side effects */
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{   /* XORs h128 into the 16 bytes at dst, in place: low64 into bytes 0..7, high64 into bytes 8..15 (little-endian accessors). */
+    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
+{   /* Fills secretBuffer[0..secretSize) from customSeed: tile the seed across the buffer, then XOR hash-derived data over it. */
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+    /* production mode, assert() are disabled : invalid arguments are reported as XXH_ERROR instead */
+    if (secretBuffer == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+    if (customSeedSize == 0) {   /* no seed material supplied: fall back to the built-in secret */
+        customSeed = XXH3_kSecret;
+        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+    }
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(customSeed != NULL);
+#else
+    if (customSeed == NULL) return XXH_ERROR;   /* non-zero customSeedSize with a NULL pointer */
+#endif
+
+    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+    {   size_t pos = 0;
+        while (pos < secretSize) {
+            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);   /* last copy is truncated to what still fits */
+            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+            pos += toCopy;
+    }   }
+
+    {   size_t const nbSeg16 = secretSize / 16;   /* number of complete 16-byte segments */
+        size_t n;
+        XXH128_canonical_t scrambler;
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+        for (n=0; n<nbSeg16; n++) {
+            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);   /* segment index is used as the seed */
+            XXH3_combine16((char*)secretBuffer + n*16, h128);
+        }
+        /* last segment : final 16 bytes of the buffer, which includes any tail bytes beyond the last complete segment */
+        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+    }
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
+{   /* Writes the XXH_SECRET_DEFAULT_SIZE-byte secret derived from `seed` into secretBuffer, which must be at least that large. */
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];   /* aligned scratch copy; the caller's buffer alignment is not relied upon */
+    XXH3_initCustomSecret(secret, seed);
+    XXH_ASSERT(secretBuffer != NULL);   /* only checked when assertions are enabled */
+    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
+}
+
+
+
+/* Pop our optimization override from above */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC pop_options
+#endif
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#endif  /* XXH_NO_XXH3 */
+
+/*!
+ * @}
+ */
+#endif  /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus)
+} /* extern "C" */
+#endif
diff --git a/src/zstd/zstd_common.c b/src/zstd/common/zstd_common.c
similarity index 54%
rename from src/zstd/zstd_common.c
rename to src/zstd/common/zstd_common.c
index b3070c7b3..3f04c22ab 100644
--- a/src/zstd/zstd_common.c
+++ b/src/zstd/common/zstd_common.c
@@ -8,25 +8,28 @@
  * You may select, at your option, one of the above-listed licenses.
  */
 
+
+
 /*-*************************************
- *  Dependencies
- ***************************************/
+*  Dependencies
+***************************************/
 #define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
 #include "error_private.h"
 #include "zstd_internal.h"
 
+
 /*-****************************************
- *  Version
- ******************************************/
+*  Version
+******************************************/
 unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; }
 
 const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; }
 
+
 /*-****************************************
- *  ZSTD Error Management
- ******************************************/
-#undef ZSTD_isError /* defined within zstd_internal.h */
+*  ZSTD Error Management
+******************************************/
+#undef ZSTD_isError   /* defined within zstd_internal.h */
 /*! ZSTD_isError() :
  *  tells if a return value is an error code
  *  symbol is required for external callers */
@@ -43,32 +46,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
 /*! ZSTD_getErrorString() :
  *  provides error code string from enum */
 const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
-
-/*=**************************************************************
- *  Custom allocator
- ****************************************************************/
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) {
-	if(customMem.customAlloc)
-		return customMem.customAlloc(customMem.opaque, size);
-	return ZSTD_malloc(size);
-}
-
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) {
-	if(customMem.customAlloc) {
-		/* calloc implemented as malloc+memset;
-		 * not as efficient as calloc, but next best guess for custom malloc */
-		void* const ptr = customMem.customAlloc(customMem.opaque, size);
-		ZSTD_memset(ptr, 0, size);
-		return ptr;
-	}
-	return ZSTD_calloc(1, size);
-}
-
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) {
-	if(ptr != NULL) {
-		if(customMem.customFree)
-			customMem.customFree(customMem.opaque, ptr);
-		else
-			ZSTD_free(ptr);
-	}
-}
diff --git a/src/zstd/zstd_deps.h b/src/zstd/common/zstd_deps.h
similarity index 84%
rename from src/zstd/zstd_deps.h
rename to src/zstd/common/zstd_deps.h
index 69baa4002..4d767ae9b 100644
--- a/src/zstd/zstd_deps.h
+++ b/src/zstd/common/zstd_deps.h
@@ -29,13 +29,13 @@
 #include <string.h>
 
 #if defined(__GNUC__) && __GNUC__ >= 4
-#define ZSTD_memcpy(d, s, l) __builtin_memcpy((d), (s), (l))
-#define ZSTD_memmove(d, s, l) __builtin_memmove((d), (s), (l))
-#define ZSTD_memset(p, v, l) __builtin_memset((p), (v), (l))
+# define ZSTD_memcpy(d,s,l) __builtin_memcpy((d),(s),(l))
+# define ZSTD_memmove(d,s,l) __builtin_memmove((d),(s),(l))
+# define ZSTD_memset(p,v,l) __builtin_memset((p),(v),(l))
 #else
-#define ZSTD_memcpy(d, s, l) memcpy((d), (s), (l))
-#define ZSTD_memmove(d, s, l) memmove((d), (s), (l))
-#define ZSTD_memset(p, v, l) memset((p), (v), (l))
+# define ZSTD_memcpy(d,s,l) memcpy((d),(s),(l))
+# define ZSTD_memmove(d,s,l) memmove((d),(s),(l))
+# define ZSTD_memset(p,v,l) memset((p),(v),(l))
 #endif
 
 #endif /* ZSTD_DEPS_COMMON */
@@ -52,7 +52,7 @@
 #include <stdlib.h>
 
 #define ZSTD_malloc(s) malloc(s)
-#define ZSTD_calloc(n, s) calloc((n), (s))
+#define ZSTD_calloc(n,s) calloc((n), (s))
 #define ZSTD_free(p) free((p))
 
 #endif /* ZSTD_DEPS_MALLOC */
diff --git a/src/zstd/common/zstd_internal.h b/src/zstd/common/zstd_internal.h
new file mode 100644
index 000000000..ecb9cfba8
--- /dev/null
+++ b/src/zstd/common/zstd_internal.h
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "compiler.h"
+#include "cpu.h"
+#include "mem.h"
+#include "debug.h"                 /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
+#include "error_private.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "../zstd.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "huf.h"
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY  /* XXH64_state_t */
+#endif
+#include "xxhash.h"                /* XXH_reset, update, digest */
+#ifndef ZSTD_NO_TRACE
+#  include "zstd_trace.h"
+#else
+#  define ZSTD_TRACE 0
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---- static assert (debug) and error-check aliases --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
+#define ZSTD_isError ERR_isError   /* for inlining */
+#define FSE_isError  ERR_isError   /* FSE and HUF error checks resolve to the same ERR_isError */
+#define HUF_isError  ERR_isError
+
+
+/*-*************************************
+*  shared macros
+***************************************/
+#undef MIN   /* drop any earlier MIN/MAX so the definitions below are the ones in effect */
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))   /* NOTE: arguments are evaluated more than once; avoid side effects */
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+#define BOUNDED(min,val,max) (MAX(min,MIN(val,max)))   /* clamps val into [min, max] */
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTD_OPT_NUM    (1<<12)
+
+#define ZSTD_REP_NUM      3                 /* number of repcodes */
+static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };   /* presumably the initial repcode values - confirm against users */
+
+#define KB *(1 <<10)   /* postfix multipliers: written after a number, e.g. `4 KB` expands to `4 *(1 <<10)` */
+#define MB *(1 <<20)
+#define GB *(1U<<30)   /* unsigned constant, unlike KB and MB */
+
+#define BIT7 128   /* single-bit masks: BITn == 1 << n */
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static UNUSED_ATTR const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };   /* presumably frame-content-size field width in bytes, per 2-bit code - confirm */
+static UNUSED_ATTR const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };   /* presumably dictionary-ID field width in bytes, per 2-bit code - confirm */
+
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static UNUSED_ATTR const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;   /* same value as the macro, as a typed constant */
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;   /* values 0..3 in declaration order */
+
+#define ZSTD_FRAMECHECKSUMSIZE 4
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
+
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;   /* values 0..3 in declaration order */
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits  8
+#define LitHufLog 11
+#define MaxLit ((1<<Litbits) - 1)   /* 255 */
+#define MaxML   52
+#define MaxLL   35
+#define DefaultMaxOff 28
+#define MaxOff  31
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)   /* 9 with the values above */
+#define MaxMLBits 16
+#define MaxLLBits 16
+
+#define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
+/* Each table cannot take more than #symbols * FSELog bits; the +7 rounds up to whole bytes (133 with the values above) */
+#define ZSTD_MAX_FSE_HEADERS_SIZE (((MaxML + 1) * MLFSELog + (MaxLL + 1) * LLFSELog + (MaxOff + 1) * OffFSELog + 7) / 8)
+
+static UNUSED_ATTR const U8 LL_bits[MaxLL+1] = {   /* one entry per code 0..MaxLL; presumably the extra-bit count of each literal-length code - confirm */
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 2, 2, 3, 3,
+     4, 6, 7, 8, 9,10,11,12,
+    13,14,15,16
+};
+static UNUSED_ATTR const S16 LL_defaultNorm[MaxLL+1] = {   /* entries total 64 == 1 << LL_DEFAULTNORMLOG when each -1 is counted as 1 */
+     4, 3, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 1, 1, 1,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     2, 3, 2, 1, 1, 1, 1, 1,
+    -1,-1,-1,-1
+};
+#define LL_DEFAULTNORMLOG 6  /* for static allocation */
+static UNUSED_ATTR const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;   /* same value as the macro, as a typed constant */
+
+static UNUSED_ATTR const U8 ML_bits[MaxML+1] = {   /* one entry per code 0..MaxML; presumably the extra-bit count of each match-length code - confirm */
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 2, 2, 3, 3,
+     4, 4, 5, 7, 8, 9,10,11,
+    12,13,14,15,16
+};
+static UNUSED_ATTR const S16 ML_defaultNorm[MaxML+1] = {   /* entries total 64 == 1 << ML_DEFAULTNORMLOG when each -1 is counted as 1 */
+     1, 4, 3, 2, 2, 2, 2, 2,
+     2, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1,-1,-1,
+    -1,-1,-1,-1,-1
+};
+#define ML_DEFAULTNORMLOG 6  /* for static allocation */
+static UNUSED_ATTR const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;   /* same value as the macro, as a typed constant */
+
+static UNUSED_ATTR const S16 OF_defaultNorm[DefaultMaxOff+1] = {   /* sized by DefaultMaxOff (28), not MaxOff; entries total 32 == 1 << OF_DEFAULTNORMLOG when each -1 is counted as 1 */
+     1, 1, 1, 1, 1, 1, 2, 2,
+     2, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+    -1,-1,-1,-1,-1
+};
+#define OF_DEFAULTNORMLOG 5  /* for static allocation */
+static UNUSED_ATTR const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;   /* same value as the macro, as a typed constant */
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) {   /* copies exactly 8 bytes from src to dst */
+#if defined(ZSTD_ARCH_ARM_NEON)
+    vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));   /* NEON: load the 8 bytes, then store them */
+#else
+    ZSTD_memcpy(dst, src, 8);
+#endif
+}
+#define COPY8(d,s) do { ZSTD_copy8(d,s); d+=8; s+=8; } while (0)   /* copies 8 bytes, then advances both d and s by 8 */
+
+/* Need to use memmove here since the literal buffer can now be located within
+   the dst buffer. In circumstances where the op "catches up" to where the
+   literal buffer is, there can be partial overlaps in this call on the final
+   copy if the literal is being shifted by less than 16 bytes. */
+static void ZSTD_copy16(void* dst, const void* src) {   /* copies exactly 16 bytes; every branch reads all 16 before writing */
+#if defined(ZSTD_ARCH_ARM_NEON)
+    vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+#elif defined(ZSTD_ARCH_X86_SSE2)
+    _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));   /* unaligned load/store */
+#elif defined(__clang__)
+    ZSTD_memmove(dst, src, 16);
+#else
+    /* ZSTD_memmove is not inlined properly by gcc : go through a local buffer so overlapping src/dst still copy correctly */
+    BYTE copy16_buf[16];
+    ZSTD_memcpy(copy16_buf, src, 16);
+    ZSTD_memcpy(dst, copy16_buf, 16);
+#endif
+}
+#define COPY16(d,s) do { ZSTD_copy16(d,s); d+=16; s+=16; } while (0)   /* copies 16 bytes, then advances both d and s by 16 */
+
+#define WILDCOPY_OVERLENGTH 32   /* documented maximum over-read/over-write of ZSTD_wildcopy() */
+#define WILDCOPY_VECLEN 16       /* src/dst distance below which ZSTD_wildcopy() falls back to 8-byte steps */
+
+typedef enum {   /* overlap contract passed to ZSTD_wildcopy() */
+    ZSTD_no_overlap,
+    ZSTD_overlap_src_before_dst
+    /*  ZSTD_overlap_dst_before_src, (not supported) */
+} ZSTD_overlap_e;
+
+/*! ZSTD_wildcopy() :
+ *  Custom version of ZSTD_memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0)
+ *  @param ovtype controls the overlap detection
+ *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ *         - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
+ *           The src buffer must be before the dst buffer.
+ */
+MEM_STATIC FORCE_INLINE_ATTR
+void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype)
+{
+    ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;   /* signed distance dst - src */
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+
+    if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
+        /* Handle short offset copies : 8-byte steps; the do/while always copies at least one 8-byte chunk. */
+        do {
+            COPY8(op, ip);
+        } while (op < oend);
+    } else {
+        assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
+        /* Separate out the first COPY16() call because the copy length is
+         * almost certain to be short, so the branches have different
+         * probabilities. Since it is almost certain to be short, only do
+         * one COPY16() in the first call. Then, do two calls per loop since
+         * at that point it is more likely to have a high trip count.
+         */
+        ZSTD_copy16(op, ip);
+        if (16 >= length) return;   /* the 16 bytes just copied already cover the request */
+        op += 16;
+        ip += 16;
+        do {   /* 32 bytes per iteration, so the final iteration may write past oend */
+            COPY16(op, ip);
+            COPY16(op, ip);
+        }
+        while (op < oend);
+    }
+}
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   /* Copies min(dstCapacity, srcSize) bytes from src to dst and returns the number of bytes copied. */
+    size_t const length = MIN(dstCapacity, srcSize);
+    if (length > 0) {   /* ZSTD_memcpy is not called at all for a zero-length copy */
+        ZSTD_memcpy(dst, src, length);
+    }
+    return length;
+}
+
+/* define "workspace is too large" as this number of times larger than needed */
+#define ZSTD_WORKSPACETOOLARGE_FACTOR 3
+
+/* when workspace is continuously too large
+ * during at least this number of times,
+ * context's memory usage is considered wasteful,
+ * because it's sized to handle a worst case scenario which rarely happens.
+ * In which case, resize it down to free some memory */
+#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128
+
+/* Controls whether the input/output buffer is buffered or stable (explicit values 0 and 1). */
+typedef enum {
+    ZSTD_bm_buffered = 0,  /* Buffer the input/output */
+    ZSTD_bm_stable = 1     /* ZSTD_inBuffer/ZSTD_outBuffer is stable */
+} ZSTD_bufferMode_e;
+
+
+/*-*******************************************
+*  Private declarations
+*********************************************/
+typedef struct seqDef_s {   /* one stored sequence; see ZSTD_getSequenceLength() for lengths above the U16 range */
+    U32 offBase;   /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */
+    U16 litLength; /* low 16 bits of the literal length */
+    U16 mlBase;    /* mlBase == matchLength - MINMATCH */
+} seqDef;
+
+/* Controls whether seqStore has a single "long" litLength or matchLength (at most one of the two). See seqStore_t. */
+typedef enum {
+    ZSTD_llt_none = 0,             /* no longLengthType */
+    ZSTD_llt_literalLength = 1,    /* represents a long literal */
+    ZSTD_llt_matchLength = 2       /* represents a long match */
+} ZSTD_longLengthType_e;
+
+typedef struct {   /* sequence store: sequence array and literal buffer with their write cursors, plus code buffers */
+    seqDef* sequencesStart;
+    seqDef* sequences;      /* ptr to end of sequences */
+    BYTE*  litStart;
+    BYTE*  lit;             /* ptr to end of literals */
+    BYTE*  llCode;          /* presumably per-sequence literal-length / match-length / offset codes - confirm */
+    BYTE*  mlCode;
+    BYTE*  ofCode;
+    size_t maxNbSeq;
+    size_t maxNbLit;
+
+    /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength
+     * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
+     * the existing value of the litLength or matchLength by 0x10000.
+     */
+    ZSTD_longLengthType_e longLengthType;
+    U32                   longLengthPos;  /* Index of the sequence to apply long length modification to */
+} seqStore_t;
+
+typedef struct {   /* return type of ZSTD_getSequenceLength() */
+    U32 litLength;     /* literal length after any long-length adjustment */
+    U32 matchLength;   /* match length with MINMATCH added back */
+} ZSTD_sequenceLength;
+
+/**
+ * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences
+ * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength.
+ */
+MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq)
+{
+    ZSTD_sequenceLength seqLen;
+    seqLen.litLength = seq->litLength;
+    seqLen.matchLength = seq->mlBase + MINMATCH;
+    if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {   /* compares the index of `seq` within the store */
+        if (seqStore->longLengthType == ZSTD_llt_literalLength) {
+            seqLen.litLength += 0x10000;   /* restore the part that does not fit in the U16 field */
+        }
+        if (seqStore->longLengthType == ZSTD_llt_matchLength) {
+            seqLen.matchLength += 0x10000;
+        }
+    }   /* with ZSTD_llt_none neither adjustment applies, even when the index matches */
+    return seqLen;
+}
+
+/**
+ * Contains the compressed frame size and an upper-bound for the decompressed frame size.
+ * Note: before using `compressedSize`, check for errors using ZSTD_isError().
+ *       similarly, before using `decompressedBound`, check for errors using:
+ *          `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
+ */
+typedef struct {
+    size_t nbBlocks;
+    size_t compressedSize;                  /* may hold an error code; test with ZSTD_isError() */
+    unsigned long long decompressedBound;   /* ZSTD_CONTENTSIZE_ERROR signals failure */
+} ZSTD_frameSizeInfo;   /* decompress & legacy */
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx);   /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {   /* filled through the bpPtr argument of ZSTD_getcBlockSize() */
+    blockType_e blockType;
+    U32 lastBlock;   /* presumably non-zero for the final block of a frame - confirm */
+    U32 origSize;
+} blockProperties_t;   /* declared here for decompress and fullbench */
+
+/*! ZSTD_getcBlockSize() :
+ *  Provides the size of compressed block from block header `src` */
+/*  Used by: decompress, fullbench */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr);
+
+/*! ZSTD_decodeSeqHeaders() :
+ *  decode sequence header from src */
+/*  Used by: zstd_decompress_block, fullbench */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                       const void* src, size_t srcSize);
+
+/**
+ * @returns true iff the CPU supports dynamic BMI2 dispatch.
+ */
+MEM_STATIC int ZSTD_cpuSupportsBmi2(void)
+{
+    ZSTD_cpuid_t cpuid = ZSTD_cpuid();   /* queried on every call; the result is not cached here */
+    return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);   /* requires both the BMI1 and the BMI2 feature flags */
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
diff --git a/src/zstd/zstd_trace.h b/src/zstd/common/zstd_trace.h
similarity index 63%
rename from src/zstd/zstd_trace.h
rename to src/zstd/common/zstd_trace.h
index e1beb8d58..da20534eb 100644
--- a/src/zstd/zstd_trace.h
+++ b/src/zstd/common/zstd_trace.h
@@ -11,7 +11,7 @@
 #ifndef ZSTD_TRACE_H
 #define ZSTD_TRACE_H
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
@@ -25,24 +25,24 @@ extern "C" {
  * Also, explicitly disable on platforms known not to work so they aren't
  * forgotten in the future.
  */
-#if !defined(ZSTD_HAVE_WEAK_SYMBOLS) &&                                                                          \
-    defined(__GNUC__) && defined(__ELF__) &&                                                                     \
+#if !defined(ZSTD_HAVE_WEAK_SYMBOLS) && \
+    defined(__GNUC__) && defined(__ELF__) && \
     (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) || defined(__aarch64__)) && \
-    !defined(__APPLE__) && !defined(_WIN32) && !defined(__MINGW32__) &&                                          \
+    !defined(__APPLE__) && !defined(_WIN32) && !defined(__MINGW32__) && \
     !defined(__CYGWIN__) && !defined(_AIX)
-#define ZSTD_HAVE_WEAK_SYMBOLS 1
+#  define ZSTD_HAVE_WEAK_SYMBOLS 1
 #else
-#define ZSTD_HAVE_WEAK_SYMBOLS 0
+#  define ZSTD_HAVE_WEAK_SYMBOLS 0
 #endif
 #if ZSTD_HAVE_WEAK_SYMBOLS
-#define ZSTD_WEAK_ATTR __attribute__((__weak__))
+#  define ZSTD_WEAK_ATTR __attribute__((__weak__))
 #else
-#define ZSTD_WEAK_ATTR
+#  define ZSTD_WEAK_ATTR
 #endif
 
 /* Only enable tracing when weak symbols are available. */
 #ifndef ZSTD_TRACE
-#define ZSTD_TRACE ZSTD_HAVE_WEAK_SYMBOLS
+#  define ZSTD_TRACE ZSTD_HAVE_WEAK_SYMBOLS
 #endif
 
 #if ZSTD_TRACE
@@ -52,52 +52,52 @@ struct ZSTD_DCtx_s;
 struct ZSTD_CCtx_params_s;
 
 typedef struct {
-	/**
-	 * ZSTD_VERSION_NUMBER
-	 *
-	 * This is guaranteed to be the first member of ZSTD_trace.
-	 * Otherwise, this struct is not stable between versions. If
-	 * the version number does not match your expectation, you
-	 * should not interpret the rest of the struct.
-	 */
-	unsigned version;
-	/**
-	 * Non-zero if streaming (de)compression is used.
-	 */
-	unsigned streaming;
-	/**
-	 * The dictionary ID.
-	 */
-	unsigned dictionaryID;
-	/**
-	 * Is the dictionary cold?
-	 * Only set on decompression.
-	 */
-	unsigned dictionaryIsCold;
-	/**
-	 * The dictionary size or zero if no dictionary.
-	 */
-	size_t dictionarySize;
-	/**
-	 * The uncompressed size of the data.
-	 */
-	size_t uncompressedSize;
-	/**
-	 * The compressed size of the data.
-	 */
-	size_t compressedSize;
-	/**
-	 * The fully resolved CCtx parameters (NULL on decompression).
-	 */
-	struct ZSTD_CCtx_params_s const * params;
-	/**
-	 * The ZSTD_CCtx pointer (NULL on decompression).
-	 */
-	struct ZSTD_CCtx_s const * cctx;
-	/**
-	 * The ZSTD_DCtx pointer (NULL on compression).
-	 */
-	struct ZSTD_DCtx_s const * dctx;
+    /**
+     * ZSTD_VERSION_NUMBER
+     *
+     * This is guaranteed to be the first member of ZSTD_trace.
+     * Otherwise, this struct is not stable between versions. If
+     * the version number does not match your expectation, you
+     * should not interpret the rest of the struct.
+     */
+    unsigned version;
+    /**
+     * Non-zero if streaming (de)compression is used.
+     */
+    unsigned streaming;
+    /**
+     * The dictionary ID.
+     */
+    unsigned dictionaryID;
+    /**
+     * Is the dictionary cold?
+     * Only set on decompression.
+     */
+    unsigned dictionaryIsCold;
+    /**
+     * The dictionary size or zero if no dictionary.
+     */
+    size_t dictionarySize;
+    /**
+     * The uncompressed size of the data.
+     */
+    size_t uncompressedSize;
+    /**
+     * The compressed size of the data.
+     */
+    size_t compressedSize;
+    /**
+     * The fully resolved CCtx parameters (NULL on decompression).
+     */
+    struct ZSTD_CCtx_params_s const* params;
+    /**
+     * The ZSTD_CCtx pointer (NULL on decompression).
+     */
+    struct ZSTD_CCtx_s const* cctx;
+    /**
+     * The ZSTD_DCtx pointer (NULL on compression).
+     */
+    struct ZSTD_DCtx_s const* dctx;
 } ZSTD_Trace;
 
 /**
@@ -124,7 +124,7 @@ typedef unsigned long long ZSTD_TraceCtx;
  *          passed to ZSTD_trace_compress_end().
  */
 ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_compress_begin(
-    struct ZSTD_CCtx_s const * cctx);
+    struct ZSTD_CCtx_s const* cctx);
 
 /**
  * Trace the end of a compression call.
@@ -133,7 +133,7 @@ ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_compress_begin(
  */
 ZSTD_WEAK_ATTR void ZSTD_trace_compress_end(
     ZSTD_TraceCtx ctx,
-    ZSTD_Trace const * trace);
+    ZSTD_Trace const* trace);
 
 /**
  * Trace the beginning of a decompression call.
@@ -143,7 +143,7 @@ ZSTD_WEAK_ATTR void ZSTD_trace_compress_end(
  *          passed to ZSTD_trace_compress_end().
  */
 ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_decompress_begin(
-    struct ZSTD_DCtx_s const * dctx);
+    struct ZSTD_DCtx_s const* dctx);
 
 /**
  * Trace the end of a decompression call.
@@ -152,11 +152,11 @@ ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_decompress_begin(
  */
 ZSTD_WEAK_ATTR void ZSTD_trace_decompress_end(
     ZSTD_TraceCtx ctx,
-    ZSTD_Trace const * trace);
+    ZSTD_Trace const* trace);
 
 #endif /* ZSTD_TRACE */
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
 
diff --git a/src/zstd/compiler.h b/src/zstd/compiler.h
deleted file mode 100644
index f99f85343..000000000
--- a/src/zstd/compiler.h
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_COMPILER_H
-#define ZSTD_COMPILER_H
-
-#include "portability_macros.h"
-
-/*-*******************************************************
- *  Compiler specifics
- *********************************************************/
-/* force inlining */
-
-#if !defined(ZSTD_NO_INLINE)
-#if(defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
-#define INLINE_KEYWORD inline
-#else
-#define INLINE_KEYWORD
-#endif
-
-#if defined(__GNUC__) || defined(__ICCARM__)
-#define FORCE_INLINE_ATTR __attribute__((always_inline))
-#elif defined(_MSC_VER)
-#define FORCE_INLINE_ATTR __forceinline
-#else
-#define FORCE_INLINE_ATTR
-#endif
-
-#else
-
-#define INLINE_KEYWORD
-#define FORCE_INLINE_ATTR
-
-#endif
-
-/**
-  On MSVC qsort requires that functions passed into it use the __cdecl calling conversion(CC).
-  This explicitly marks such functions as __cdecl so that the code will still compile
-  if a CC other than __cdecl has been made the default.
-*/
-#if defined(_MSC_VER)
-#define WIN_CDECL __cdecl
-#else
-#define WIN_CDECL
-#endif
-
-/**
- * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
- * parameters. They must be inlined for the compiler to eliminate the constant
- * branches.
- */
-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
-/**
- * HINT_INLINE is used to help the compiler generate better code. It is *not*
- * used for "templates", so it can be tweaked based on the compilers
- * performance.
- *
- * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
- * always_inline attribute.
- *
- * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline
- * attribute.
- */
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
-#define HINT_INLINE static INLINE_KEYWORD
-#else
-#define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
-#endif
-
-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
-#if defined(__GNUC__)
-#define UNUSED_ATTR __attribute__((unused))
-#else
-#define UNUSED_ATTR
-#endif
-
-/* force no inlining */
-#ifdef _MSC_VER
-#define FORCE_NOINLINE static __declspec(noinline)
-#else
-#if defined(__GNUC__) || defined(__ICCARM__)
-#define FORCE_NOINLINE static __attribute__((__noinline__))
-#else
-#define FORCE_NOINLINE static
-#endif
-#endif
-
-/* target attribute */
-#if defined(__GNUC__) || defined(__ICCARM__)
-#define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
-#else
-#define TARGET_ATTRIBUTE(target)
-#endif
-
-/* Target attribute for BMI2 dynamic dispatch.
- * Enable lzcnt, bmi, and bmi2.
- * We test for bmi1 & bmi2. lzcnt is included in bmi1.
- */
-#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2")
-
-/* prefetch
- * can be disabled, by declaring NO_PREFETCH build macro */
-#if defined(NO_PREFETCH)
-#define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
-#define PREFETCH_L2(ptr) (void)(ptr) /* disabled */
-#else
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
-#include <mmintrin.h>                                         /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
-#define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
-#elif defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
-#define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-#define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
-#elif defined(__aarch64__)
-#define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
-#define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
-#else
-#define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
-#define PREFETCH_L2(ptr) (void)(ptr) /* disabled */
-#endif
-#endif /* NO_PREFETCH */
-
-#define CACHELINE_SIZE 64
-
-#define PREFETCH_AREA(p, s)                                   \
-	{                                                         \
-		const char* const _ptr = (const char*)(p);            \
-		size_t const _size = (size_t)(s);                     \
-		size_t _pos;                                          \
-		for(_pos = 0; _pos < _size; _pos += CACHELINE_SIZE) { \
-			PREFETCH_L2(_ptr + _pos);                         \
-		}                                                     \
-	}
-
-/* vectorization
- * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax,
- * and some compilers, like Intel ICC and MCST LCC, do not support it at all. */
-#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__)
-#if(__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
-#define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
-#else
-#define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")")
-#endif
-#else
-#define DONT_VECTORIZE
-#endif
-
-/* Tell the compiler that a branch is likely or unlikely.
- * Only use these macros if it causes the compiler to generate better code.
- * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
- * and clang, please do.
- */
-#if defined(__GNUC__)
-#define LIKELY(x) (__builtin_expect((x), 1))
-#define UNLIKELY(x) (__builtin_expect((x), 0))
-#else
-#define LIKELY(x) (x)
-#define UNLIKELY(x) (x)
-#endif
-
-#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
-#define ZSTD_UNREACHABLE \
-	{ assert(0), __builtin_unreachable(); }
-#else
-#define ZSTD_UNREACHABLE \
-	{ assert(0); }
-#endif
-
-/* disable warnings */
-#ifdef _MSC_VER                 /* Visual Studio */
-#include <intrin.h>             /* For Visual 2005 */
-#pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */
-#pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
-#pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
-#pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */
-#pragma warning(disable : 4324) /* disable: C4324: padded structure */
-#endif
-
-/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/
-#ifndef STATIC_BMI2
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))
-#ifdef __AVX2__ // MSVC does not have a BMI2 specific flag, but every CPU that supports AVX2 also supports BMI2
-#define STATIC_BMI2 1
-#endif
-#elif defined(__BMI2__) && defined(__x86_64__) && defined(__GNUC__)
-#define STATIC_BMI2 1
-#endif
-#endif
-
-#ifndef STATIC_BMI2
-#define STATIC_BMI2 0
-#endif
-
-/* compile time determination of SIMD support */
-#if !defined(ZSTD_NO_INTRINSICS)
-#if defined(__SSE2__) || defined(_M_AMD64) || (defined(_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
-#define ZSTD_ARCH_X86_SSE2
-#endif
-#if defined(__ARM_NEON) || defined(_M_ARM64)
-#define ZSTD_ARCH_ARM_NEON
-#endif
-#
-#if defined(ZSTD_ARCH_X86_SSE2)
-#include <emmintrin.h>
-#elif defined(ZSTD_ARCH_ARM_NEON)
-#include <arm_neon.h>
-#endif
-#endif
-
-/* C-language Attributes are added in C23. */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
-#define ZSTD_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
-#else
-#define ZSTD_HAS_C_ATTRIBUTE(x) 0
-#endif
-
-/* Only use C++ attributes in C++. Some compilers report support for C++
- * attributes when compiling with C.
- */
-#if defined(__cplusplus) && defined(__has_cpp_attribute)
-#define ZSTD_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#else
-#define ZSTD_HAS_CPP_ATTRIBUTE(x) 0
-#endif
-
-/* Define ZSTD_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute.
- * - C23: https://en.cppreference.com/w/c/language/attributes/fallthrough
- * - CPP17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough
- * - Else: __attribute__((__fallthrough__))
- */
-#ifndef ZSTD_FALLTHROUGH
-#if ZSTD_HAS_C_ATTRIBUTE(fallthrough)
-#define ZSTD_FALLTHROUGH [[fallthrough]]
-#elif ZSTD_HAS_CPP_ATTRIBUTE(fallthrough)
-#define ZSTD_FALLTHROUGH [[fallthrough]]
-#elif __has_attribute(__fallthrough__)
-/* Leading semicolon is to satisfy gcc-11 with -pedantic. Without the semicolon
- * gcc complains about: a label can only be part of a statement and a declaration is not a statement.
- */
-#define ZSTD_FALLTHROUGH \
-	;                    \
-	__attribute__((__fallthrough__))
-#else
-#define ZSTD_FALLTHROUGH
-#endif
-#endif
-
-/*-**************************************************************
- *  Alignment check
- *****************************************************************/
-
-/* this test was initially positioned in mem.h,
- * but this file is removed (or replaced) for linux kernel
- * so it's now hosted in compiler.h,
- * which remains valid for both user & kernel spaces.
- */
-
-#ifndef ZSTD_ALIGNOF
-#if defined(__GNUC__) || defined(_MSC_VER)
-/* covers gcc, clang & MSVC */
-/* note : this section must come first, before C11,
- * due to a limitation in the kernel source generator */
-#define ZSTD_ALIGNOF(T) __alignof(T)
-
-#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
-/* C11 support */
-#include <stdalign.h>
-#define ZSTD_ALIGNOF(T) alignof(T)
-
-#else
-/* No known support for alignof() - imperfect backup */
-#define ZSTD_ALIGNOF(T) (sizeof(void*) < sizeof(T) ? sizeof(void*) : sizeof(T))
-
-#endif
-#endif /* ZSTD_ALIGNOF */
-
-/*-**************************************************************
- *  Sanitizer
- *****************************************************************/
-
-/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
- * abundance of caution, disable our custom poisoning on mingw. */
-#ifdef __MINGW32__
-#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
-#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
-#endif
-#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
-#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
-#endif
-#endif
-
-#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
-/* Not all platforms that support msan provide sanitizers/msan_interface.h.
- * We therefore declare the functions we need ourselves, rather than trying to
- * include the header file... */
-#include <stddef.h> /* size_t */
-#define ZSTD_DEPS_NEED_STDINT
-#include "zstd_deps.h" /* intptr_t */
-
-/* Make memory region fully initialized (without changing its contents). */
-void __msan_unpoison(const volatile void* a, size_t size);
-
-/* Make memory region fully uninitialized (without changing its contents).
-   This is a legacy interface that does not update origin information. Use
-   __msan_allocated_memory() instead. */
-void __msan_poison(const volatile void* a, size_t size);
-
-/* Returns the offset of the first (at least partially) poisoned byte in the
-   memory range, or -1 if the whole range is good. */
-intptr_t __msan_test_shadow(const volatile void* x, size_t size);
-#endif
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-/* Not all platforms that support asan provide sanitizers/asan_interface.h.
- * We therefore declare the functions we need ourselves, rather than trying to
- * include the header file... */
-#include <stddef.h> /* size_t */
-
-/**
- * Marks a memory region (<c>[addr, addr+size)</c>) as unaddressable.
- *
- * This memory must be previously allocated by your program. Instrumented
- * code is forbidden from accessing addresses in this region until it is
- * unpoisoned. This function is not guaranteed to poison the entire region -
- * it could poison only a subregion of <c>[addr, addr+size)</c> due to ASan
- * alignment restrictions.
- *
- * \note This function is not thread-safe because no two threads can poison or
- * unpoison memory in the same memory region simultaneously.
- *
- * \param addr Start of memory region.
- * \param size Size of memory region. */
-void __asan_poison_memory_region(void const volatile * addr, size_t size);
-
-/**
- * Marks a memory region (<c>[addr, addr+size)</c>) as addressable.
- *
- * This memory must be previously allocated by your program. Accessing
- * addresses in this region is allowed until this region is poisoned again.
- * This function could unpoison a super-region of <c>[addr, addr+size)</c> due
- * to ASan alignment restrictions.
- *
- * \note This function is not thread-safe because no two threads can
- * poison or unpoison memory in the same memory region simultaneously.
- *
- * \param addr Start of memory region.
- * \param size Size of memory region. */
-void __asan_unpoison_memory_region(void const volatile * addr, size_t size);
-#endif
-
-#endif /* ZSTD_COMPILER_H */
diff --git a/src/zstd/compress/clevels.h b/src/zstd/compress/clevels.h
new file mode 100644
index 000000000..c18da465f
--- /dev/null
+++ b/src/zstd/compress/clevels.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CLEVELS_H
+#define ZSTD_CLEVELS_H
+
+#define ZSTD_STATIC_LINKING_ONLY  /* ZSTD_compressionParameters  */
+#include "../zstd.h"
+
+/*-=====  Pre-defined compression levels  =====-*/
+
+#define ZSTD_MAX_CLEVEL     22
+
+#ifdef __GNUC__
+__attribute__((__unused__))
+#endif
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {   /* [srcSize class][level] ; entry 0 of each class is the base for negative levels */
+{   /* "default" - for any srcSize > 256 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 19, 12, 13,  1,  6,  1, ZSTD_fast    },  /* base for negative levels */
+    { 19, 13, 14,  1,  7,  0, ZSTD_fast    },  /* level  1 */
+    { 20, 15, 16,  1,  6,  0, ZSTD_fast    },  /* level  2 */
+    { 21, 16, 17,  1,  5,  0, ZSTD_dfast   },  /* level  3 */
+    { 21, 18, 18,  1,  5,  0, ZSTD_dfast   },  /* level  4 */
+    { 21, 18, 19,  3,  5,  2, ZSTD_greedy  },  /* level  5 */
+    { 21, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6 */
+    { 21, 19, 20,  4,  5,  8, ZSTD_lazy    },  /* level  7 */
+    { 21, 19, 20,  4,  5, 16, ZSTD_lazy2   },  /* level  8 */
+    { 22, 20, 21,  4,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 22, 21, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 10 */
+    { 22, 21, 22,  6,  5, 16, ZSTD_lazy2   },  /* level 11 */
+    { 22, 22, 23,  6,  5, 32, ZSTD_lazy2   },  /* level 12 */
+    { 22, 22, 22,  4,  5, 32, ZSTD_btlazy2 },  /* level 13 */
+    { 22, 22, 23,  5,  5, 32, ZSTD_btlazy2 },  /* level 14 */
+    { 22, 23, 23,  6,  5, 32, ZSTD_btlazy2 },  /* level 15 */
+    { 22, 22, 22,  5,  5, 48, ZSTD_btopt   },  /* level 16 */
+    { 23, 23, 22,  5,  4, 64, ZSTD_btopt   },  /* level 17 */
+    { 23, 23, 22,  6,  3, 64, ZSTD_btultra },  /* level 18 */
+    { 23, 24, 22,  7,  3,256, ZSTD_btultra2},  /* level 19 */
+    { 25, 25, 23,  7,  3,256, ZSTD_btultra2},  /* level 20 */
+    { 26, 26, 24,  7,  3,512, ZSTD_btultra2},  /* level 21 */
+    { 27, 27, 25,  9,  3,999, ZSTD_btultra2},  /* level 22 */
+},
+{   /* for srcSize <= 256 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 18, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 18, 13, 14,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 18, 14, 14,  1,  5,  0, ZSTD_dfast   },  /* level  2 */
+    { 18, 16, 16,  1,  4,  0, ZSTD_dfast   },  /* level  3 */
+    { 18, 16, 17,  3,  5,  2, ZSTD_greedy  },  /* level  4.*/
+    { 18, 17, 18,  5,  5,  2, ZSTD_greedy  },  /* level  5.*/
+    { 18, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6.*/
+    { 18, 18, 19,  4,  4,  4, ZSTD_lazy    },  /* level  7 */
+    { 18, 18, 19,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 18, 18, 19,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 18, 18, 19,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 18, 18, 19,  5,  4, 12, ZSTD_btlazy2 },  /* level 11.*/
+    { 18, 19, 19,  7,  4, 12, ZSTD_btlazy2 },  /* level 12.*/
+    { 18, 18, 19,  4,  4, 16, ZSTD_btopt   },  /* level 13 */
+    { 18, 18, 19,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/
+    { 18, 18, 19,  6,  3,128, ZSTD_btopt   },  /* level 15.*/
+    { 18, 19, 19,  6,  3,128, ZSTD_btultra },  /* level 16.*/
+    { 18, 19, 19,  8,  3,256, ZSTD_btultra },  /* level 17.*/
+    { 18, 19, 19,  6,  3,128, ZSTD_btultra2},  /* level 18.*/
+    { 18, 19, 19,  8,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 18, 19, 19, 10,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 18, 19, 19, 12,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 18, 19, 19, 13,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+{   /* for srcSize <= 128 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 17, 12, 13,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 17, 13, 15,  1,  5,  0, ZSTD_fast    },  /* level  2 */
+    { 17, 15, 16,  2,  5,  0, ZSTD_dfast   },  /* level  3 */
+    { 17, 17, 17,  2,  4,  0, ZSTD_dfast   },  /* level  4 */
+    { 17, 16, 17,  3,  4,  2, ZSTD_greedy  },  /* level  5 */
+    { 17, 16, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */
+    { 17, 16, 17,  3,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 17, 16, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 17, 16, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 17, 16, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 17, 17, 17,  5,  4,  8, ZSTD_btlazy2 },  /* level 11 */
+    { 17, 18, 17,  7,  4, 12, ZSTD_btlazy2 },  /* level 12 */
+    { 17, 18, 17,  3,  4, 12, ZSTD_btopt   },  /* level 13.*/
+    { 17, 18, 17,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/
+    { 17, 18, 17,  6,  3,256, ZSTD_btopt   },  /* level 15.*/
+    { 17, 18, 17,  6,  3,128, ZSTD_btultra },  /* level 16.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btultra },  /* level 17.*/
+    { 17, 18, 17, 10,  3,512, ZSTD_btultra },  /* level 18.*/
+    { 17, 18, 17,  5,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 17, 18, 17,  7,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 17, 18, 17,  9,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 17, 18, 17, 11,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+{   /* for srcSize <= 16 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 14, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 14, 14, 15,  1,  5,  0, ZSTD_fast    },  /* level  1 */
+    { 14, 14, 15,  1,  4,  0, ZSTD_fast    },  /* level  2 */
+    { 14, 14, 15,  2,  4,  0, ZSTD_dfast   },  /* level  3 */
+    { 14, 14, 14,  4,  4,  2, ZSTD_greedy  },  /* level  4 */
+    { 14, 14, 14,  3,  4,  4, ZSTD_lazy    },  /* level  5.*/
+    { 14, 14, 14,  4,  4,  8, ZSTD_lazy2   },  /* level  6 */
+    { 14, 14, 14,  6,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 14, 14, 14,  8,  4,  8, ZSTD_lazy2   },  /* level  8.*/
+    { 14, 15, 14,  5,  4,  8, ZSTD_btlazy2 },  /* level  9.*/
+    { 14, 15, 14,  9,  4,  8, ZSTD_btlazy2 },  /* level 10.*/
+    { 14, 15, 14,  3,  4, 12, ZSTD_btopt   },  /* level 11.*/
+    { 14, 15, 14,  4,  3, 24, ZSTD_btopt   },  /* level 12.*/
+    { 14, 15, 14,  5,  3, 32, ZSTD_btultra },  /* level 13.*/
+    { 14, 15, 15,  6,  3, 64, ZSTD_btultra },  /* level 14.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btultra },  /* level 15.*/
+    { 14, 15, 15,  5,  3, 48, ZSTD_btultra2},  /* level 16.*/
+    { 14, 15, 15,  6,  3,128, ZSTD_btultra2},  /* level 17.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btultra2},  /* level 18.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 14, 15, 15,  8,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 14, 15, 15,  9,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 14, 15, 15, 10,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+};
+
+
+
+#endif  /* ZSTD_CLEVELS_H */
diff --git a/src/zstd/compress/fse_compress.c b/src/zstd/compress/fse_compress.c
new file mode 100644
index 000000000..1ce3cf16a
--- /dev/null
+++ b/src/zstd/compress/fse_compress.c
@@ -0,0 +1,625 @@
+/* ******************************************************************
+ * FSE : Finite State Entropy encoder
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "../common/compiler.h"
+#include "../common/mem.h"        /* U32, U16, etc. */
+#include "../common/debug.h"      /* assert, DEBUGLOG */
+#include "hist.h"       /* HIST_count_wksp */
+#include "../common/bitstream.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "../common/fse.h"
+#include "../common/error_private.h"
+#define ZSTD_DEPS_NEED_MALLOC
+#define ZSTD_DEPS_NEED_MATH64
+#include "../common/zstd_deps.h"  /* ZSTD_memset */
+#include "../common/bits.h" /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError   /* FSE_isError is a plain alias of ERR_isError within this file */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks : both template parameters must already be defined at this point, otherwise compilation stops */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names : built by token pasting */
+#define FSE_CAT(X,Y) X##Y   /* raw paste : X and Y are NOT macro-expanded here */
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)   /* extra level of indirection, so X and Y get macro-expanded before pasting */
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)   /* same expansion-then-paste scheme, intended for type names */
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize must be at least FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog), otherwise ERROR(tableLog_tooLarge) is returned.
+ * workSpace must be at least 2-bytes aligned (asserted), since it is accessed through a U16*. @return : 0 on success.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+                      const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                            void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;   /* skips the two U16 header fields (tableLog, maxSymbolValue) written below at [-2] and [-1] */
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 const maxSV1 = maxSymbolValue+1;
+
+    U16* cumul = (U16*)workSpace;   /* size = maxSV1+1 : cumul[0..maxSV1] are all written below */
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1));  /* size = tableSize */
+
+    U32 highThreshold = tableSize-1;   /* low-proba symbols get stacked downward from the last cell of tableSymbol */
+
+    assert(((size_t)workSpace & 1) == 0);  /* Must be 2 bytes-aligned */
+    if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge);
+    /* CTable header */
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+    assert(tableLog < 16);   /* required for threshold strategy to work */
+
+    /* For explanations on how to distribute symbol values over the table :
+     * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+     #ifdef __clang_analyzer__
+     ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+     #endif
+
+    /* symbol start positions : cumul[u] = running sum of the counts of symbols 0..u-1, a count of -1 weighing 1 */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u <= maxSV1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                assert(normalizedCounter[u-1] >= 0);
+                cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1];
+                assert(cumul[u] >= cumul[u-1]);  /* no overflow */
+        }   }
+        cumul[maxSV1] = (U16)(tableSize+1);
+    }
+
+    /* Spread symbols */
+    if (highThreshold == tableSize - 1) {
+        /* Case for no low prob count symbols. Lay down 8 bytes at a time
+         * to reduce branch misses since we are operating on a small block
+         */
+        BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */
+        {   U64 const add = 0x0101010101010101ull;   /* adding it increments each of the 8 bytes of sv by one */
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                assert(n>=0);
+                pos += (size_t)n;
+            }
+        }
+        /* Spread symbols across the table. Lack of lowprob symbols means that
+         * we don't need variable sized inner loop, so we can unroll the loop and
+         * reduce branch misses.
+         */
+        {   size_t position = 0;
+            size_t s;
+            size_t const unroll = 2; /* Experimentally determined optimal unroll */
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableSymbol[uPosition] = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);   /* Must have initialized all positions */
+        }
+    } else {
+        U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<maxSV1; symbol++) {
+            int nbOccurrences;
+            int const freq = normalizedCounter[symbol];   /* freq <= 0 (absent or low-proba symbol) : the loop below does not run */
+            for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+        assert(position==0);  /* Must have initialized all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0:
+                /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+                symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+                break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                assert(total <= INT_MAX);
+                symbolTT[s].deltaFindState = (int)(total - 1);
+                total ++;
+                break;
+            default :
+                assert(normalizedCounter[s] > 1);
+                {   U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
+                    U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
+                    total +=  (unsigned)normalizedCounter[s];
+    }   }   }   }
+
+#if 0  /* debug : symbol costs */
+    DEBUGLOG(5, "\n --- table statistics : ");
+    {   U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            DEBUGLOG(5, "%3u: w=%3i,   maxBits=%u, fracBits=%.2f",
+                symbol, normalizedCounter[symbol],
+                FSE_getMaxNbBits(symbolTT, symbol),
+                (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
+    }   }
+#endif
+
+    return 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-**************************************************************
+*  FSE NCount encoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* bits : symbols + 4 (bitCount initialized at 4) + 2 (first two symbols may use one additional bit each) */
+    unsigned const headerBits = (maxSymbolValue+1) * tableLog + 4 + 2;
+    /* bytes : +1 rounds up to whole nb bytes, +2 for the bitstream flush */
+    size_t const maxHeaderSize = (headerBits / 8) + 1 + 2;
+    if (maxSymbolValue == 0) return FSE_NCOUNTBOUND;   /* maxSymbolValue==0 ? use default */
+    return maxHeaderSize;
+}
+/* FSE_writeNCount_generic() : serializes normalizedCounter[] as a bitstream into `header`. Output bounds are checked only when writeIsSafe==0. @return : nb of bytes written, or an error code */
+static size_t
+FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                   const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                         unsigned writeIsSafe)
+{
+    BYTE* const ostart = (BYTE*) header;
+    BYTE* out = ostart;
+    BYTE* const oend = ostart + headerBufferSize;
+    int nbBits;
+    const int tableSize = 1 << tableLog;
+    int remaining;
+    int threshold;
+    U32 bitStream = 0;   /* bit accumulator, emitted 16 bits at a time, low byte first */
+    int bitCount = 0;   /* nb of pending bits held in bitStream */
+    unsigned symbol = 0;
+    unsigned const alphabetSize = maxSymbolValue + 1;
+    int previousIs0 = 0;   /* set when the last written count was 0 : a zero-run length is written next */
+
+    /* Table Size : stored in the first 4 bits, relative to FSE_MIN_TABLELOG */
+    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+    bitCount  += 4;
+
+    /* Init */
+    remaining = tableSize+1;   /* +1 for extra accuracy */
+    threshold = tableSize;
+    nbBits = (int)tableLog+1;
+
+    while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
+        if (previousIs0) {
+            unsigned start = symbol;
+            while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;   /* skip the run of zero counts */
+            if (symbol == alphabetSize) break;   /* incorrect distribution */
+            while (symbol >= start+24) {   /* each group of 24 zeroes : 16 bits set to 1, flushed immediately (bitCount unchanged) */
+                start+=24;
+                bitStream += 0xFFFFU << bitCount;
+                if ((!writeIsSafe) && (out > oend-2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE) bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out+=2;
+                bitStream>>=16;
+            }
+            while (symbol >= start+3) {   /* each group of 3 zeroes : 2-bit value 3 */
+                start+=3;
+                bitStream += 3U << bitCount;
+                bitCount += 2;
+            }
+            bitStream += (symbol-start) << bitCount;   /* remaining 0..2 zeroes, on 2 bits */
+            bitCount += 2;
+            if (bitCount>16) {
+                if ((!writeIsSafe) && (out > oend - 2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE)bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out += 2;
+                bitStream >>= 16;
+                bitCount -= 16;
+        }   }
+        {   int count = normalizedCounter[symbol++];
+            int const max = (2*threshold-1) - remaining;
+            remaining -= count < 0 ? -count : count;   /* a negative count consumes its absolute value */
+            count++;   /* +1 for extra accuracy */
+            if (count>=threshold)
+                count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            bitStream += (U32)count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);   /* values below max take one bit less */
+            previousIs0  = (count==1);   /* count==1 after the +1 means the normalized count was 0 */
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) { nbBits--; threshold>>=1; }   /* shrink the field width as less remains to distribute */
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
+    /* flush remaining bitStream : 2 bytes are stored, but `out` only advances over the bytes holding pending bits */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    assert(out >= ostart);
+    return (size_t)(out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* reject table sizes outside [FSE_MIN_TABLELOG, FSE_MAX_TABLELOG] */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
+    /* a buffer at least as large as the worst-case bound is written without per-write checks */
+    if (bufferSize >= FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+}
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+
+/* smallest tableLog able to safely represent a distribution : the lower of a source-size bound and an alphabet-size bound */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 const bitsForSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
+    U32 const bitsForAlphabet = ZSTD_highbit32(maxSymbolValue) + 2;
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (bitsForSrc < bitsForAlphabet) return bitsForSrc;
+    return bitsForAlphabet;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    /* upper limit derived from the source size, lower limit from FSE_minTableLog() */
+    U32 const srcBitsCap = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
+    U32 const safeFloor = FSE_minTableLog(srcSize, maxSymbolValue);
+    U32 chosen = (maxTableLog == 0) ? FSE_DEFAULT_TABLELOG : maxTableLog;   /* 0 selects the default */
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (chosen > srcBitsCap) chosen = srcBitsCap;   /* Accuracy can be reduced */
+    if (chosen < safeFloor) chosen = safeFloor;   /* Need a minimum to safely represent all symbol values */
+    if (chosen < FSE_MIN_TABLELOG) chosen = FSE_MIN_TABLELOG;
+    if (chosen > FSE_MAX_TABLELOG) chosen = FSE_MAX_TABLELOG;
+    return chosen;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{   unsigned const minus = 2;   /* forwarded as the `minus` argument of the internal variant */
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, minus);
+}
+
+/* Secondary normalization method.
+   To be used when primary method fails.
+   @return : 0 on success, or an error code when a not-yet-assigned symbol would end up with a weight < 1 */
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue, short lowProbCount)
+{
+    short const NOT_YET_ASSIGNED = -2;   /* marker for symbols whose weight is decided in a later pass */
+    U32 s;
+    U32 distributed = 0;   /* nb of table slots already handed out */
+    U32 ToDistribute;
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);   /* total / tableSize */
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));   /* 1.5 * total / tableSize */
+
+    for (s=0; s<=maxSymbolValue; s++) {   /* first pass : settle absent and low-count symbols */
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = lowProbCount;
+            distributed++;
+            total -= count[s];   /* `total` now only tracks the counts still to be scaled */
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if (ToDistribute == 0)
+        return 0;
+
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));   /* raise the weight-1 limit to 1.5 * total / ToDistribute */
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+                continue;
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* all of the symbols were low enough for the lowOne or lowThreshold */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))   /* round-robin over symbols with a positive weight */
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;   /* fixed-point scale used for the cumulative positions below */
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ZSTD_div64((((U64)1<<vStepLog) * ToDistribute) + mid, (U32)total);   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;   /* nb of whole slots covered by this symbol's cumulative interval */
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+/* FSE_normalizeCount() : fills normalizedCounter[] from count[]. @return : tableLog on success, 0 as soon as one symbol accounts for the whole `total` (rle), or an error code */
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue, unsigned useLowProbCount)
+{
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;   /* 0 selects the default */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   static U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };   /* per-proba remainder limits, indexed by proba (0..7) */
+        short const lowProbCount = useLowProbCount ? -1 : 1;
+        U64 const scale = 62 - tableLog;
+        U64 const step = ZSTD_div64((U64)1<<62, (U32)total);   /* <== here, one division ! */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;   /* table slots not yet handed out; may go negative */
+        unsigned s;
+        unsigned largest=0;   /* symbol holding the largest normalized value so far */
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = lowProbCount;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;   /* round up when the fractional remainder exceeds the limit */
+                }
+                if (proba > largestP) { largestP=proba; largest=s; }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;   /* the largest symbol absorbs the rounding difference */
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+/* Builds a fake FSE_CTable for rle input (always same symbol); always returns 0 */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* const base = ct;
+    U16* const cells = (U16*)base;   /* cells[0..1] : header, cells[2..3] : table */
+    void* const transformsStart = (U32*)base + 2;
+    FSE_symbolCompressionTransform* const entry = (FSE_symbolCompressionTransform*)transformsStart + symbolValue;
+
+    /* header */
+    cells[0] = (U16) 0;
+    cells[1] = (U16) symbolValue;
+
+    /* Build table */
+    cells[2] = 0;
+    cells[3] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table : only the entry of symbolValue is written */
+    entry->deltaNbBits = 0;
+    entry->deltaFindState = 0;
+
+    return 0;
+}
+
+/* FSE_compress_usingCTable_generic() : encodes `src` backwards (last byte first) with two alternating states. @return : result of BIT_closeCStream(), or 0 when srcSize<=2 or the bitstream cannot be initialized */
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;   /* read cursor, walks from the end of src down to istart */
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {   /* odd size : both states are initialized, then one more symbol is encoded */
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* tests the bit of value 2 (bit index 1) */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);   /* final states are written after the last symbol, CState2 first */
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    /* dst smaller than FSE_BLOCKBOUND(srcSize) : use the BIT_flushBits() variant (fast==0) */
+    if (dstSize < FSE_BLOCKBOUND(srcSize))
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+
+    /* dst holds at least FSE_BLOCKBOUND(srcSize) bytes : use the BIT_flushBitsFast() variant (fast==1) */
+    return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }   /* function form of the FSE_COMPRESSBOUND() macro */
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/src/zstd/compress/hist.c b/src/zstd/compress/hist.c
new file mode 100644
index 000000000..e2fb431f0
--- /dev/null
+++ b/src/zstd/compress/hist.c
@@ -0,0 +1,181 @@
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+#include "../common/mem.h"             /* U32, BYTE, etc. */
+#include "../common/debug.h"           /* assert, DEBUGLOG */
+#include "../common/error_private.h"   /* ERROR */
+#include "hist.h"
+
+
+/* --- Error management : HIST_isError() forwards `code` to ERR_isError() and returns its result --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+ *  Histogram functions
+ ****************************************************************/
+/* HIST_count_simple() : byte histogram of `src` into `count`; @return the largest count, and sets *maxSymbolValuePtr to the highest symbol present */
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize)
+{
+    const BYTE* const bytes = (const BYTE*)src;
+    unsigned lastSymbol = *maxSymbolValuePtr;
+    unsigned peak = 0;
+    size_t pos;
+    U32 sym;
+
+    ZSTD_memset(count, 0, (lastSymbol+1) * sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    /* tally each input byte; the range check only exists in assert-enabled builds */
+    for (pos = 0; pos < srcSize; pos++) {
+        assert(bytes[pos] <= lastSymbol);
+        count[bytes[pos]]++;
+    }
+
+    /* shrink the reported alphabet down to the highest symbol actually present */
+    while (count[lastSymbol] == 0) lastSymbol--;
+    *maxSymbolValuePtr = lastSymbol;
+    for (sym = 0; sym <= lastSymbol; sym++)
+        if (peak < count[sym]) peak = count[sym];
+    return peak;
+}
+
+typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;   /* checkMaxSymbolValue : HIST_count_parallel_wksp() errors out when a symbol above *maxSymbolValuePtr is present */
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` must be a U32 table of size >= HIST_WKSP_SIZE_U32.
+ * Sources shorter than 20 bytes skip the striped pass and are counted byte by byte.
+ * @return : largest histogram frequency, or an error code (notably when histogram's alphabet is larger than *maxSymbolValuePtr) */
+static size_t HIST_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                HIST_checkInput_e check,
+                                U32* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    size_t const countSize = (*maxSymbolValuePtr + 1) * sizeof(*count);
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    /* safety checks */
+    assert(*maxSymbolValuePtr <= 255);
+    if (!sourceSize) {
+        ZSTD_memset(count, 0, countSize);
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    ZSTD_memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* by stripes of 16 bytes; needs >= 20 bytes for one iteration, so shorter inputs skip the 4-byte preload (no read past the source) */
+    if (sourceSize >= 20) {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;   /* the last preloaded word was not counted : step back over it */
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    {   U32 s;
+        for (s=0; s<256; s++) {   /* merge the 4 partial tables into Counting1 and track the largest count */
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s] > max) max = Counting1[s];
+    }   }
+
+    {   unsigned maxSymbolValue = 255;
+        while (!Counting1[maxSymbolValue]) maxSymbolValue--;   /* terminates : sourceSize > 0 guarantees a non-zero entry */
+        if (check && maxSymbolValue > *maxSymbolValuePtr) return ERROR(maxSymbolValue_tooSmall);
+        *maxSymbolValuePtr = maxSymbolValue;
+        ZSTD_memmove(count, Counting1, countSize);   /* in case count & Counting1 are overlapping */
+    }
+    return (size_t)max;
+}
+
+/* HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be 4-bytes aligned and `workSpaceSize` >= HIST_WKSP_SIZE;
+ * both are only verified for sources of 1500 bytes or more (smaller ones go through HIST_count_simple()). */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                          const void* source, size_t sourceSize,
+                          void* workSpace, size_t workSpaceSize)
+{
+    if (sourceSize >= 1500) {   /* heuristic threshold */
+        if ((size_t)workSpace & 3) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+        if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+        return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace);
+    }
+    return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+}
+
+/* HIST_count_wksp() : same as HIST_count(), but using an externally provided scratch buffer
+ * (`workSpace` must be 4-bytes aligned, `workSpaceSize` must be >= HIST_WKSP_SIZE). */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                       const void* source, size_t sourceSize,
+                       void* workSpace, size_t workSpaceSize)
+{
+    if ((size_t)workSpace & 3) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+    if (*maxSymbolValuePtr >= 255) {   /* full byte alphabet : no symbol check needed */
+        *maxSymbolValuePtr = 255;
+        return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize);
+    }
+    return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace);
+}
+
+#ifndef ZSTD_NO_UNUSED_FUNCTIONS
+/* HIST_countFast() : unsafe fast variant, won't check if src contains values beyond count[] limit; uses an on-stack workspace */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    unsigned stackWksp[HIST_WKSP_SIZE_U32];
+    return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, stackWksp, sizeof(stackWksp));
+}
+
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* src, size_t srcSize)
+{
+    unsigned stackWksp[HIST_WKSP_SIZE_U32];   /* on-stack workspace handed to HIST_count_wksp() */
+    return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, stackWksp, sizeof(stackWksp));
+}
+#endif
diff --git a/src/zstd/hist.h b/src/zstd/compress/hist.h
similarity index 92%
rename from src/zstd/hist.h
rename to src/zstd/compress/hist.h
index 08f68917c..887896b81 100644
--- a/src/zstd/hist.h
+++ b/src/zstd/compress/hist.h
@@ -11,10 +11,11 @@
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  * in the COPYING file in the root directory of this source tree).
  * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
+****************************************************************** */
 
 /* --- dependencies --- */
-#include "zstd_deps.h" /* size_t */
+#include "../common/zstd_deps.h"   /* size_t */
+
 
 /* --- simple histogram functions --- */
 
@@ -29,12 +30,13 @@
 size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
                   const void* src, size_t srcSize);
 
-unsigned HIST_isError(size_t code); /**< tells if a return value is an error code */
+unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */
+
 
 /* --- advanced histogram functions --- */
 
 #define HIST_WKSP_SIZE_U32 1024
-#define HIST_WKSP_SIZE (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
+#define HIST_WKSP_SIZE    (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
 /** HIST_count_wksp() :
  *  Same as HIST_count(), but using an externally provided scratch buffer.
  *  Benefit is this function will use very little stack space.
diff --git a/src/zstd/compress/huf_compress.c b/src/zstd/compress/huf_compress.c
new file mode 100644
index 000000000..ea0007232
--- /dev/null
+++ b/src/zstd/compress/huf_compress.c
@@ -0,0 +1,1464 @@
+/* ******************************************************************
+ * Huffman encoder, part of New Generation Entropy library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "../common/zstd_deps.h"     /* ZSTD_memcpy, ZSTD_memset */
+#include "../common/compiler.h"
+#include "../common/bitstream.h"
+#include "hist.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
+#include "../common/fse.h"        /* header compression */
+#include "../common/huf.h"
+#include "../common/error_private.h"
+#include "../common/bits.h"       /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Required declarations
+****************************************************************/
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;   /* printed by showHNodeSymbols(); presumably the symbol value - TODO confirm */
+    BYTE nbBits;   /* printed by showHNodeBits(); presumably the code length in bits - TODO confirm */
+} nodeElt;
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+
+#if DEBUGLEVEL >= 2
+
+static size_t showU32(const U32* arr, size_t size)   /* debug helper : logs `size` values of arr at level 6, returns size */
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", arr[u]); (void)arr;   /* (void)arr keeps the parameter referenced outside of RAWLOG */
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t HUF_getNbBits(HUF_CElt elt);
+
+static size_t showCTableBits(const HUF_CElt* ctable, size_t size)   /* debug helper : logs HUF_getNbBits() of `size` entries at level 6, returns size */
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;   /* (void)ctable keeps the parameter referenced outside of RAWLOG */
+    }
+    RAWLOG(6, " \n");
+    return size;
+
+}
+
+static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)   /* debug helper : logs the .byte field of `size` nodes at level 6, returns size */
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].byte); (void)hnode;   /* (void)hnode keeps the parameter referenced outside of RAWLOG */
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t showHNodeBits(const nodeElt* hnode, size_t size)   /* debug helper : logs the .nbBits field of `size` nodes at level 6, returns size */
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;   /* (void)hnode keeps the parameter referenced outside of RAWLOG */
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+#endif
+
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+#define HUF_WORKSPACE_MAX_ALIGNMENT 8
+
+/* Rounds `workspace` up to the next multiple of `align` and deducts the padding from *workspaceSizePtr; returns NULL (size set to 0) when the padding does not fit */
+static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
+{
+    size_t const lowBits = align - 1;
+    size_t const misalign = (size_t)workspace & lowBits;
+    size_t const padding = (align - misalign) & lowBits;   /* 0 when already aligned */
+    BYTE* const result = (BYTE*)workspace + padding;
+    assert((align & (align - 1)) == 0); /* pow 2 */
+    assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
+    if (*workspaceSizePtr < padding) {   /* not enough room for the padding itself */
+        *workspaceSizePtr = 0;
+        return NULL;
+    }
+    assert(padding < align);
+    assert(((size_t)result & lowBits) == 0);
+    *workspaceSizePtr -= padding;
+    return result;
+}
+
+
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+
+typedef struct {   /* workspace layout used by HUF_compressWeights() */
+    FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];   /* filled by FSE_buildCTable_wksp() */
+    U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)];   /* workspace passed to FSE_buildCTable_wksp() */
+    unsigned count[HUF_TABLELOG_MAX+1];   /* histogram of the weights, filled by HIST_count_simple() */
+    S16 norm[HUF_TABLELOG_MAX+1];   /* normalized counts, filled by FSE_normalizeCount() */
+} HUF_CompressWeightsWksp;
+
+static size_t
+HUF_compressWeights(void* dst, size_t dstSize,
+              const void* weightTable, size_t wtSize,
+                    void* workspace, size_t workspaceSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+    /* @return : nb of bytes written into dst; 0 when not compressible; 1 (nothing written) for a single repeated weight; or an error code */
+    unsigned maxSymbolValue = HUF_TABLELOG_MAX;
+    U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+    HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
+
+    if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);   /* also covers a NULL wksp : alignment failure sets workspaceSize to 0 */
+
+    /* init conditions */
+    if (wtSize <= 1) return 0;  /* Not compressible */
+
+    /* Scan input and build symbol stats */
+    {   unsigned const maxCount = HIST_count_simple(wksp->count, &maxSymbolValue, weightTable, wtSize);   /* never fails */
+        if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;        /* each symbol present maximum once => not compressible */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(wksp->norm, tableLog, wksp->count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) );
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), wksp->norm, maxSymbolValue, tableLog) );
+        op += hSize;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(wksp->CTable, wksp->norm, maxSymbolValue, tableLog, wksp->scratchBuffer, sizeof(wksp->scratchBuffer)) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, wksp->CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    return (size_t)(op-ostart);   /* header size + compressed payload size */
+}
+
+static size_t HUF_getNbBits(HUF_CElt elt)
+{
+    return elt & 0xFF;   /* nbBits is read from the low 8 bits of the element */
+}
+
+static size_t HUF_getNbBitsFast(HUF_CElt elt)
+{
+    return elt;   /* unmasked : bits above the low 8 are noise that the caller has to mask off later */
+}
+
+static size_t HUF_getValue(HUF_CElt elt)
+{
+    return elt & ~(size_t)0xFF;   /* clears the low 8 bits (nbBits), keeping everything above them */
+}
+
+static size_t HUF_getValueFast(HUF_CElt elt)
+{
+    return elt;   /* unmasked : the low bits still contain nbBits */
+}
+
+static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
+{
+    assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
+    *elt = nbBits;   /* plain assignment : whatever *elt held before is discarded */
+}
+
+static void HUF_setValue(HUF_CElt* elt, size_t value)
+{
+    size_t const nbBits = HUF_getNbBits(*elt);   /* nbBits must already have been stored in *elt */
+    if (nbBits > 0) {   /* nbBits == 0 : nothing is stored (the shift count below would be the full width of HUF_CElt) */
+        assert((value >> nbBits) == 0);
+        *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);   /* value lands in the top nbBits bits */
+    }
+}
+
+HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable)
+{
+    HUF_CTableHeader header;
+    ZSTD_memcpy(&header, ctable, sizeof(header));   /* header is copied out of the first bytes of ctable */
+    return header;
+}
+
+static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue)
+{
+    HUF_CTableHeader header;
+    HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header));   /* the header fills exactly element 0 */
+    ZSTD_memset(&header, 0, sizeof(header));                  /* zero every header byte before setting the two fields */
+    assert(tableLog < 256);
+    header.tableLog = (BYTE)tableLog;
+    assert(maxSymbolValue < 256);
+    header.maxSymbolValue = (BYTE)maxSymbolValue;
+    ZSTD_memcpy(ctable, &header, sizeof(header));
+}
+
+typedef struct {   /* scratch memory used by HUF_writeCTable_wksp() */
+    HUF_CompressWeightsWksp wksp;   /* passed through to HUF_compressWeights() */
+    BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX];   /* one weight per symbol, in symbol order */
+} HUF_WriteCTableWksp;
+
+size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
+                            const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
+                            void* workspace, size_t workspaceSize)
+{   /* Writes the symbol weights of @CTable into @dst. @return : nb of bytes written, or an error code. */
+    HUF_CElt const* const ct = CTable + 1;   /* element 0 holds the header */
+    BYTE* op = (BYTE*)dst;
+    U32 n;
+    HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
+
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
+
+    assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue);
+    assert(HUF_readCTableHeader(CTable).tableLog == huffLog);
+
+    /* check conditions */
+    if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+    /* convert to weight */
+    wksp->bitsToWeight[0] = 0;
+    for (n=1; n<huffLog+1; n++)   /* NOTE(review): no runtime check that huffLog fits the HUF_TABLELOG_MAX+1 entries of bitsToWeight - confirm callers guarantee it */
+        wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+    for (n=0; n<maxSymbolValue; n++)   /* the weight of the last symbol (== maxSymbolValue) is not converted */
+        wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];
+
+    /* attempt weights compression by FSE */
+    if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
+    {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) );
+        if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
+            op[0] = (BYTE)hSize;   /* first byte = size of the FSE-compressed weights */
+            return hSize+1;
+    }   }
+
+    /* write raw values as 4-bits (max : 15) */
+    if (maxSymbolValue > (256-128)) return ERROR(GENERIC);   /* should not happen : likely means source cannot be compressed */
+    if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall);   /* not enough space within dst buffer */
+    op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+    wksp->huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause msan issue in final combination */
+    for (n=0; n<maxSymbolValue; n+=2)   /* two weights per output byte : even index in the high nibble */
+        op[(n/2)+1] = (BYTE)((wksp->huffWeight[n] << 4) + wksp->huffWeight[n+1]);
+    return ((maxSymbolValue+1)/2) + 1;
+}
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
+{   /* Rebuilds @CTable from the weights stored in @src. @return : value reported by HUF_readStats(), or an error code. */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* init not required, even though some static analyzer may complain */
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    HUF_CElt* const ct = CTable + 1;   /* element 0 holds the header */
+
+    /* get symbol weights */
+    CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+    *hasZeroWeights = (rankVal[0] > 0);
+
+    /* check result */
+    if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+    *maxSymbolValuePtr = nbSymbols - 1;   /* report the alphabet actually present in the header */
+
+    HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr);
+
+    /* Prepare base value per rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<=tableLog; n++) {
+            U32 curr = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = curr;
+    }   }   /* NOTE(review): rankVal is not read again in this function after this loop */
+
+    /* fill nbBits */
+    {   U32 n; for (n=0; n<nbSymbols; n++) {
+            const U32 w = huffWeight[n];
+            HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));   /* nbBits = tableLog+1-w, forced to 0 when w == 0 */
+    }   }
+
+    /* fill val */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
+        U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
+        /* determine starting value per rank */
+        valPerRank[tableLog+1] = 0;   /* for w==0 */
+        {   U16 min = 0;
+            U32 n; for (n=tableLog; n>0; n--) {  /* start at n=tablelog <-> w=1 */
+                valPerRank[n] = min;     /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order */
+        { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
+    }
+
+    return readSize;
+}
+
+U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
+{   /* @return : nbBits stored for @symbolValue, or 0 when the symbol is beyond the table's maxSymbolValue */
+    const HUF_CElt* const ct = CTable + 1;   /* element 0 holds the header */
+    assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+    if (symbolValue > HUF_readCTableHeader(CTable).maxSymbolValue)
+        return 0;
+    return (U32)HUF_getNbBits(ct[symbolValue]);
+}
+
+
+/**
+ * HUF_setMaxHeight():
+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
+ *
+ * It attempts to convert all nodes with nbBits > @targetNbBits
+ * to employ @targetNbBits instead. Then it adjusts the tree
+ * so that it remains a valid canonical Huffman tree.
+ *
+ * @pre               The sum of the ranks of each symbol == 2^largestBits,
+ *                    where largestBits == huffNode[lastNonNull].nbBits.
+ * @post              The sum of the ranks of each symbol == 2^largestBits,
+ *                    where largestBits is the return value (expected <= targetNbBits).
+ *
+ * @param huffNode    The Huffman tree modified in place to enforce targetNbBits.
+ *                    It's presumed sorted, from most frequent to rarest symbol.
+ * @param lastNonNull The symbol with the lowest count in the Huffman tree.
+ * @param targetNbBits  The allowed number of bits, which the Huffman tree
+ *                    may not respect. After this function the Huffman tree will
+ *                    respect targetNbBits.
+ * @return            The maximum number of bits of the Huffman tree after adjustment.
+ */
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits)
+{
+    const U32 largestBits = huffNode[lastNonNull].nbBits;
+    /* early exit : no elt > targetNbBits, so the tree is already valid. */
+    if (largestBits <= targetNbBits) return largestBits;
+
+    DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);
+
+    /* there are several too large elements (at least >= 2) */
+    {   int totalCost = 0;
+        const U32 baseCost = 1 << (largestBits - targetNbBits);
+        int n = (int)lastNonNull;
+
+        /* Adjust any ranks > targetNbBits to targetNbBits.
+         * Compute totalCost, which is how far the sum of the ranks
+         * exceeds 2^largestBits after adjusting the offending ranks.
+         */
+        while (huffNode[n].nbBits > targetNbBits) {
+            totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+            huffNode[n].nbBits = (BYTE)targetNbBits;
+            n--;
+        }
+        /* n stops at huffNode[n].nbBits <= targetNbBits */
+        assert(huffNode[n].nbBits <= targetNbBits);
+        /* n end at index of smallest symbol using < targetNbBits */
+        while (huffNode[n].nbBits == targetNbBits) --n;
+
+        /* renorm totalCost from 2^largestBits to 2^targetNbBits
+         * note : totalCost is necessarily a multiple of baseCost */
+        assert(((U32)totalCost & (baseCost - 1)) == 0);
+        totalCost >>= (largestBits - targetNbBits);
+        assert(totalCost > 0);
+
+        /* repay normalized cost */
+        {   U32 const noSymbol = 0xF0F0F0F0;
+            U32 rankLast[HUF_TABLELOG_MAX+2];
+
+            /* Get pos of last (smallest = lowest cum. count) symbol per rank */
+            ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));   /* every entry becomes 0xF0F0F0F0 == noSymbol */
+            {   U32 currentNbBits = targetNbBits;
+                int pos;
+                for (pos=n ; pos >= 0; pos--) {
+                    if (huffNode[pos].nbBits >= currentNbBits) continue;
+                    currentNbBits = huffNode[pos].nbBits;   /* < targetNbBits */
+                    rankLast[targetNbBits-currentNbBits] = (U32)pos;
+            }   }
+
+            while (totalCost > 0) {
+                /* Try to reduce the next power of 2 above totalCost because we
+                 * gain back half the rank.
+                 */
+                U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+                    U32 const highPos = rankLast[nBitsToDecrease];
+                    U32 const lowPos = rankLast[nBitsToDecrease-1];
+                    if (highPos == noSymbol) continue;
+                    /* Decrease highPos if no symbols of lowPos or if it is
+                     * not cheaper to remove 2 lowPos than highPos.
+                     */
+                    if (lowPos == noSymbol) break;
+                    {   U32 const highTotal = huffNode[highPos].count;
+                        U32 const lowTotal = 2 * huffNode[lowPos].count;
+                        if (highTotal <= lowTotal) break;
+                }   }
+                /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
+                assert(rankLast[nBitsToDecrease] != noSymbol || nBitsToDecrease == 1);
+                /* HUF_TABLELOG_MAX test just to please gcc 5+; but it should not be necessary */
+                while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
+                    nBitsToDecrease++;
+                assert(rankLast[nBitsToDecrease] != noSymbol);
+                /* Increase the number of bits to gain back half the rank cost. */
+                totalCost -= 1 << (nBitsToDecrease-1);
+                huffNode[rankLast[nBitsToDecrease]].nbBits++;
+
+                /* Fix up the new rank.
+                 * If the new rank was empty, this symbol is now its smallest.
+                 * Otherwise, this symbol will be the largest in the new rank so no adjustment.
+                 */
+                if (rankLast[nBitsToDecrease-1] == noSymbol)
+                    rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease];
+                /* Fix up the old rank.
+                 * If the symbol was at position 0, meaning it was the highest weight symbol in the tree,
+                 * it must be the only symbol in its rank, so the old rank now has no symbols.
+                 * Otherwise, since the Huffman nodes are sorted by count, the previous position is now
+                 * the smallest node in the rank. If the previous position belongs to a different rank,
+                 * then the rank is now empty.
+                 */
+                if (rankLast[nBitsToDecrease] == 0)    /* special case, reached largest symbol */
+                    rankLast[nBitsToDecrease] = noSymbol;
+                else {
+                    rankLast[nBitsToDecrease]--;
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease)
+                        rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
+                }
+            }   /* while (totalCost > 0) */
+
+            /* If we've removed too much weight, then we have to add it back.
+             * To avoid overshooting again, we only adjust the smallest rank.
+             * We take the largest nodes from the lowest rank 0 and move them
+             * to rank 1. There's guaranteed to be enough rank 0 symbols because
+             * TODO.
+             */
+            while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
+                /* special case : no rank 1 symbol (using targetNbBits-1);
+                 * let's create one from largest rank 0 (using targetNbBits).
+                 */
+                if (rankLast[1] == noSymbol) {
+                    while (huffNode[n].nbBits == targetNbBits) n--;
+                    huffNode[n+1].nbBits--;
+                    assert(n >= 0);   /* NOTE(review): evaluated only after n was already used as an index in the two lines above */
+                    rankLast[1] = (U32)(n+1);
+                    totalCost++;
+                    continue;
+                }
+                huffNode[ rankLast[1] + 1 ].nbBits--;
+                rankLast[1]++;
+                totalCost ++;
+            }
+        }   /* repay normalized cost */
+    }   /* there are several too large elements (at least >= 2) */
+
+    return targetNbBits;
+}
+
+typedef struct {   /* one bucket of HUF_sort() */
+    U16 base;   /* index in the output array where this bucket starts */
+    U16 curr;   /* next free output index; starts equal to base and grows as symbols are inserted */
+} rankPos;
+
+typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];
+
+/* Number of buckets available for HUF_sort() */
+#define RANK_POSITION_TABLE_SIZE 192
+
+typedef struct {   /* workspace layout used by HUF_buildCTable_wksp() */
+  huffNodeTable huffNodeTbl;   /* node storage, zeroed by HUF_buildCTable_wksp() before use */
+  rankPos rankPosition[RANK_POSITION_TABLE_SIZE];   /* scratch buckets handed to HUF_sort() */
+} HUF_buildCTable_wksp_tables;
+
+/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
+ * Strategy is to use as many buckets as possible for representing distinct
+ * counts while using the remainder to represent all "large" counts.
+ *
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let buckets 0-166 represent distinct counts of [0, 166]
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
+ */
+#define RANK_POSITION_MAX_COUNT_LOG 32
+#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)
+
+/* Return the appropriate bucket index for a given count. See definition of
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
+ */
+static U32 HUF_getIndex(U32 const count) {
+    return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
+        ? count   /* small counts : one bucket per distinct count */
+        : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;   /* large counts : one bucket per ZSTD_highbit32() value */
+}
+
+/* Helper swap function for HUF_quickSortPartition() */
+static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
+	nodeElt tmp = *a;   /* whole-struct copy, so every member of the two nodes is exchanged */
+	*a = *b;
+	*b = tmp;
+}
+
+/* Returns 0 if the huffNode array is not sorted by descending count */
+MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) {
+    U32 i;
+    for (i = 1; i < maxSymbolValue1; ++i) {   /* maxSymbolValue1 == number of entries to check */
+        if (huffNode[i].count > huffNode[i-1].count) {   /* an increase breaks the descending order */
+            return 0;
+        }
+    }
+    return 1;   /* equal neighbours are accepted */
+}
+
+/* Insertion sort by descending order */
+HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) {
+    int i;
+    int const size = high-low+1;   /* [low, high] is an inclusive range */
+    huffNode += low;               /* re-base so the loops below index from 0 */
+    for (i = 1; i < size; ++i) {
+        nodeElt const key = huffNode[i];
+        int j = i - 1;
+        while (j >= 0 && huffNode[j].count < key.count) {   /* strict '<' : key never moves past an entry with an equal count */
+            huffNode[j + 1] = huffNode[j];
+            j--;
+        }
+        huffNode[j + 1] = key;
+    }
+}
+
+/* Pivot helper function for quicksort. */
+static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) {
+    /* Simply select rightmost element as pivot. "Better" selectors like
+     * median-of-three don't experimentally appear to have any benefit.
+     */
+    U32 const pivot = arr[high].count;
+    int i = low - 1;   /* last index of the "count > pivot" prefix; low-1 while that prefix is empty */
+    int j = low;
+    for ( ; j < high; j++) {
+        if (arr[j].count > pivot) {
+            i++;
+            HUF_swapNodes(&arr[i], &arr[j]);
+        }
+    }
+    HUF_swapNodes(&arr[i + 1], &arr[high]);   /* place the pivot right after that prefix */
+    return i + 1;                             /* final index of the pivot */
+}
+
+/* Classic quicksort by descending with partially iterative calls
+ * to reduce worst case callstack size.
+ */
+static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
+    int const kInsertionSortThreshold = 8;
+    if (high - low < kInsertionSortThreshold) {   /* short range : insertion sort, then done */
+        HUF_insertionSort(arr, low, high);
+        return;
+    }
+    while (low < high) {
+        int const idx = HUF_quickSortPartition(arr, low, high);
+        if (idx - low < high - idx) {   /* recurse into the shorter side, keep looping on the longer one */
+            HUF_simpleQuickSort(arr, low, idx - 1);
+            low = idx + 1;
+        } else {
+            HUF_simpleQuickSort(arr, idx + 1, high);
+            high = idx - 1;
+        }
+    }
+}
+
+/**
+ * HUF_sort():
+ * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
+ *
+ * @param[out] huffNode       Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
+ *                            Must have (maxSymbolValue + 1) entries.
+ * @param[in]  count          Histogram of the symbols.
+ * @param[in]  maxSymbolValue Maximum symbol value.
+ * @param      rankPosition   This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
+ */
+static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
+    U32 n;
+    U32 const maxSymbolValue1 = maxSymbolValue+1;   /* number of symbols to sort */
+
+    /* Compute base and set curr to base.
+     * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
+     * See HUF_getIndex to see bucketing strategy.
+     * We attribute each symbol to lowerRank's base value, because we want to know where
+     * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
+     */
+    ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
+    for (n = 0; n < maxSymbolValue1; ++n) {
+        U32 lowerRank = HUF_getIndex(count[n]);
+        assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1);
+        rankPosition[lowerRank].base++;
+    }
+
+    assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
+    /* Set up the rankPosition table */
+    for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {   /* suffix sum, from the last bucket down to the first */
+        rankPosition[n-1].base += rankPosition[n].base;
+        rankPosition[n-1].curr = rankPosition[n-1].base;
+    }
+
+    /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
+    for (n = 0; n < maxSymbolValue1; ++n) {
+        U32 const c = count[n];
+        U32 const r = HUF_getIndex(c) + 1;
+        U32 const pos = rankPosition[r].curr++;
+        assert(pos < maxSymbolValue1);
+        huffNode[pos].count = c;
+        huffNode[pos].byte  = (BYTE)n;
+    }
+
+    /* Sort each bucket. */
+    for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {   /* buckets below the cutoff are not visited */
+        int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
+        U32 const bucketStartIdx = rankPosition[n].base;
+        if (bucketSize > 1) {
+            assert(bucketStartIdx < maxSymbolValue1);
+            HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1);
+        }
+    }
+
+    assert(HUF_isSorted(huffNode, maxSymbolValue1));
+}
+
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+
+/* HUF_buildTree():
+ * Takes the huffNode array sorted by HUF_sort() and builds an unlimited-depth Huffman tree.
+ *
+ * @param huffNode        The array sorted by HUF_sort(). Builds the Huffman tree in this array.
+ * @param maxSymbolValue  The maximum symbol value.
+ * @return                The smallest node in the Huffman tree (by count).
+ */
+static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
+{
+    nodeElt* const huffNode0 = huffNode - 1;   /* the slot just before huffNode is used as a sentinel below */
+    int nonNullRank;
+    int lowS, lowN;
+    int nodeNb = STARTNODE;   /* parent nodes are created from index STARTNODE upward */
+    int n, nodeRoot;
+    DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
+    /* init for parents */
+    nonNullRank = (int)maxSymbolValue;
+    while(huffNode[nonNullRank].count == 0) nonNullRank--;   /* NOTE(review): no lower bound on nonNullRank here - presumably callers guarantee a non-zero count; confirm */
+    lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+    huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;   /* NOTE(review): reads lowS-1, so this expects at least two non-zero symbols - confirm */
+    huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb;
+    nodeNb++; lowS-=2;
+    for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);   /* not-yet-created parents get a large count */
+    huffNode0[0].count = (U32)(1U<<31);  /* fake entry, strong barrier */
+
+    /* create parents */
+    while (nodeNb <= nodeRoot) {
+        int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;   /* take the smaller of next leaf / next parent */
+        int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb;
+        nodeNb++;
+    }
+
+    /* distribute weights (unlimited tree height) */
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)   /* internal nodes : one bit more than their parent */
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)   /* leaves : same rule */
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
+
+    return nonNullRank;
+}
+
+/**
+ * HUF_buildCTableFromTree():
+ * Build the CTable given the Huffman tree in huffNode.
+ *
+ * @param[out] CTable         The output Huffman CTable.
+ * @param      huffNode       The Huffman tree.
+ * @param      nonNullRank    The last and smallest node in the Huffman tree.
+ * @param      maxSymbolValue The maximum symbol value.
+ * @param      maxNbBits      The exact maximum number of bits used in the Huffman tree.
+ */
+static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
+{
+    HUF_CElt* const ct = CTable + 1;   /* element 0 holds the header, written last */
+    /* fill result into ctable (val, nbBits) */
+    int n;
+    U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+    U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+    int const alphabetSize = (int)(maxSymbolValue + 1);
+    for (n=0; n<=nonNullRank; n++)   /* count how many symbols use each nbBits */
+        nbPerRank[huffNode[n].nbBits]++;
+    /* determine starting value per rank */
+    {   U16 min = 0;
+        for (n=(int)maxNbBits; n>0; n--) {
+            valPerRank[n] = min;      /* get starting value within each rank */
+            min += nbPerRank[n];
+            min >>= 1;
+    }   }
+    for (n=0; n<alphabetSize; n++)
+        HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits);   /* push nbBits per symbol, symbol order */
+    for (n=0; n<alphabetSize; n++)
+        HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++);   /* assign value within rank, symbol order */
+
+    HUF_writeCTableHeader(CTable, maxNbBits, maxSymbolValue);
+}
+
+size_t
+HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+                     void* workSpace, size_t wkspSize)
+{   /* @return : the maximum nb of bits actually used by the table, or an error code */
+    HUF_buildCTable_wksp_tables* const wksp_tables =
+        (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
+    nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
+    nodeElt* const huffNode = huffNode0+1;   /* huffNode[-1] stays addressable : HUF_buildTree() uses it as a sentinel */
+    int nonNullRank;
+
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
+
+    DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
+
+    /* safety checks */
+    if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
+        return ERROR(workSpace_tooSmall);
+    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;   /* 0 selects the default table log */
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+        return ERROR(maxSymbolValue_tooLarge);
+    ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
+
+    /* sort, decreasing order */
+    HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+    DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));
+
+    /* build tree */
+    nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
+
+    /* determine and enforce maxTableLog */
+    maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
+    if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
+
+    HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
+
+    return maxNbBits;
+}
+
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{   /* @return : sum over symbols of nbBits * count, converted to whole bytes (rounded down) */
+    HUF_CElt const* ct = CTable + 1;   /* element 0 holds the header */
+    size_t nbBits = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        nbBits += HUF_getNbBits(ct[s]) * count[s];
+    }
+    return nbBits >> 3;   /* bits -> bytes */
+}
+
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+    HUF_CTableHeader header = HUF_readCTableHeader(CTable);
+    HUF_CElt const* ct = CTable + 1;   /* element 0 holds the header */
+    int bad = 0;
+    int s;
+
+    assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX);
+
+    if (header.maxSymbolValue < maxSymbolValue)   /* table covers fewer symbols than requested */
+        return 0;
+
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);   /* a symbol that occurs but has no code */
+    }
+    return !bad;   /* 1 when every symbol with a non-zero count has nbBits > 0 */
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }   /* function form of the HUF_COMPRESSBOUND() macro */
+
+/** HUF_CStream_t:
+ * Huffman uses its own BIT_CStream_t implementation.
+ * There are three major differences from BIT_CStream_t:
+ *   1. HUF_addBits() takes a HUF_CElt (size_t) which is
+ *      the pair (nbBits, value) in the format:
+ *      format:
+ *        - Bits [0, 4)            = nbBits
+ *        - Bits [4, 64 - nbBits)  = 0
+ *        - Bits [64 - nbBits, 64) = value
+ *   2. The bitContainer is built from the upper bits and
+ *      right shifted. E.g. to add a new value of N bits
+ *      you right shift the bitContainer by N, then or in
+ *      the new value into the N upper bits.
+ *   3. The bitstream has two bit containers. You can add
+ *      bits to the second container and merge them into
+ *      the first container.
+ */
+
+#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
+
+typedef struct {
+    size_t bitContainer[2];   /* index 0 is the one flushed to memory; index 1 is merged into it by HUF_mergeIndex1() */
+    size_t bitPos[2];         /* only the low 8 bits are meaningful; higher bits may hold noise */
+
+    BYTE* startPtr;   /* start of the destination buffer */
+    BYTE* ptr;        /* next write position */
+    BYTE* endPtr;     /* startPtr + dstCapacity - sizeof(size_t), as set by HUF_initCStream() */
+} HUF_CStream_t;
+
+/**! HUF_initCStream():
+ * Initializes the bitstream over [startPtr, startPtr + dstCapacity).
+ * @returns 0, or dstSize_tooSmall when dstCapacity <= sizeof(size_t) (endPtr is then left NULL).
+ */
+static size_t HUF_initCStream(HUF_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
+{
+    ZSTD_memset(bitC, 0, sizeof(*bitC));
+    bitC->startPtr = (BYTE*)startPtr;
+    bitC->ptr = bitC->startPtr;
+    if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall);   /* checked first : endPtr below would otherwise point before startPtr */
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);   /* keep room for one full-container store */
+    return 0;
+}
+
+/*! HUF_addBits():
+ * Adds the symbol stored in HUF_CElt elt to the bitstream.
+ *
+ * @param elt   The element we're adding. This is a (nbBits, value) pair.
+ *              See the HUF_CStream_t docs for the format.
+ * @param idx   Insert into the bitstream at this idx.
+ * @param kFast This is a template parameter. If the bitstream is guaranteed
+ *              to have at least 4 unused bits after this call it may be 1,
+ *              otherwise it must be 0. HUF_addBits() is faster when fast is set.
+ */
+FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast)
+{
+    assert(idx <= 1);
+    assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
+    /* This is efficient on x86-64 with BMI2 because shrx
+     * only reads the low 6 bits of the register. The compiler
+     * knows this and elides the mask. When fast is set,
+     * every operation can use the same value loaded from elt.
+     */
+    bitC->bitContainer[idx] >>= HUF_getNbBits(elt);   /* make room at the top for the new bits */
+    bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
+    /* We only read the low 8 bits of bitC->bitPos[idx] so it
+     * doesn't matter that the high bits have noise from the value.
+     */
+    bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
+    assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+    /* The last 4-bits of elt are dirty if fast is set,
+     * so we must not be overwriting bits that have already been
+     * inserted into the bit container.
+     */
+#if DEBUGLEVEL >= 1
+    {
+        size_t const nbBits = HUF_getNbBits(elt);
+        size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;   /* nb of low bits needed to encode nbBits */
+        (void)dirtyBits;
+        /* Middle bits are 0. */
+        assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
+        /* We didn't overwrite any bits in the bit container. */
+        assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+        (void)dirtyBits;
+    }
+#endif
+}
+
+FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC)
+{   /* Resets the second bit container and its position; index 0 is untouched. */
+    bitC->bitContainer[1] = 0;
+    bitC->bitPos[1] = 0;
+}
+
+/*! HUF_mergeIndex1() :
+ * Merges the bit container @ index 1 into the bit container @ index 0
+ * and zeros the bit container @ index 1.
+ */
+FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC)
+{   /* NOTE(review): index 1 is not cleared in this body; see HUF_zeroIndex1() */
+    assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
+    bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);   /* make room at the top for the bits of container 1 */
+    bitC->bitContainer[0] |= bitC->bitContainer[1];
+    bitC->bitPos[0] += bitC->bitPos[1];
+    assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+}
+
+/*! HUF_flushBits() :
+* Flushes the bits in the bit container @ index 0.
+*
+* @post bitPos will be < 8.
+* @param kFast If kFast is set then we must know a-priori that
+*              the bit container will not overflow.
+*/
+FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast)
+{
+    /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
+    size_t const nbBits = bitC->bitPos[0] & 0xFF;
+    size_t const nbBytes = nbBits >> 3;   /* whole bytes that can be committed */
+    /* The top nbBits bits of bitContainer are the ones we need. */
+    size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
+    /* Mask bitPos to account for the bytes we consumed. */
+    bitC->bitPos[0] &= 7;
+    assert(nbBits > 0);
+    assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
+    assert(bitC->ptr <= bitC->endPtr);
+    MEM_writeLEST(bitC->ptr, bitContainer);   /* stores the whole container value; only nbBytes of it are committed below */
+    bitC->ptr += nbBytes;
+    assert(!kFast || bitC->ptr <= bitC->endPtr);
+    if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;   /* clamp; HUF_closeCStream() treats ptr >= endPtr as overflow */
+    /* bitContainer doesn't need to be modified because the leftover
+     * bits are already the top bitPos bits. And we don't care about
+     * noise in the lower values.
+     */
+}
+
+/*! HUF_endMark()
+ * Builds the terminator that HUF_closeCStream() appends to a stream.
+ * @returns a code element of length 1 bit whose value is 1. */
+static HUF_CElt HUF_endMark(void)
+{
+    HUF_CElt terminator;
+    HUF_setNbBits(&terminator, 1);   /* length first ... */
+    HUF_setValue(&terminator, 1);    /* ... then the bit itself */
+    return terminator;
+}
+
+/*! HUF_closeCStream() :
+ *  Appends the end mark, flushes, and reports the final stream size.
+ *  @return Size of CStream, in bytes,
+ *          or 0 if it could not fit into dstBuffer */
+static size_t HUF_closeCStream(HUF_CStream_t* bitC)
+{
+    size_t leftoverBits;
+    HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
+    HUF_flushBits(bitC, /* kFast */ 0);
+    if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+    leftoverBits = bitC->bitPos[0] & 0xFF;   /* bits still pending after the flush (< 8) */
+    return (size_t)(bitC->ptr - bitC->startPtr) + (leftoverBits != 0 ? 1u : 0u);   /* a partial byte counts as one */
+}
+
+FORCE_INLINE_TEMPLATE void HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast)
+{
+    HUF_CElt const code = CTable[symbol];   /* table entry for this symbol */
+    HUF_addBits(bitCPtr, code, idx, fast);  /* append it to bit container `idx` */
+}
+
+FORCE_INLINE_TEMPLATE void
+HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
+                                   const BYTE* ip, size_t srcSize,
+                                   const HUF_CElt* ct,
+                                   int kUnroll, int kFastFlush, int kLastFast)
+{
+    /* Symbols are encoded from the end of ip towards its start. Join to kUnroll : encode the (srcSize % kUnroll) last symbols */
+    int n = (int)srcSize;
+    int rem = n % kUnroll;
+    if (rem > 0) {
+        for (; rem > 0; --rem) {
+            HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
+        }
+        HUF_flushBits(bitC, kFastFlush);
+    }
+    assert(n % kUnroll == 0);
+
+    /* Join to 2 * kUnroll : encode one extra group of kUnroll symbols if needed */
+    if (n % (2 * kUnroll)) {
+        int u;
+        for (u = 1; u < kUnroll; ++u) {
+            HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
+        }
+        HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);   /* last symbol of the group uses kLastFast */
+        HUF_flushBits(bitC, kFastFlush);
+        n -= kUnroll;
+    }
+    assert(n % (2 * kUnroll) == 0);
+
+    for (; n>0; n-= 2 * kUnroll) {   /* main loop : 2 groups of kUnroll symbols per iteration */
+        /* Encode kUnroll symbols into the bitstream @ index 0. */
+        int u;
+        for (u = 1; u < kUnroll; ++u) {
+            HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
+        }
+        HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
+        HUF_flushBits(bitC, kFastFlush);
+        /* Encode kUnroll symbols into the bitstream @ index 1.
+         * This allows us to start filling the bit container
+         * without any data dependencies.
+         */
+        HUF_zeroIndex1(bitC);
+        for (u = 1; u < kUnroll; ++u) {
+            HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
+        }
+        HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
+        /* Merge bitstream @ index 1 into the bitstream @ index 0 */
+        HUF_mergeIndex1(bitC);
+        HUF_flushBits(bitC, kFastFlush);
+    }
+    assert(n == 0);   /* every symbol has been consumed */
+
+}
+
+/**
+ * Tight upper bound on Huffman output: srcSize symbols of at most tableLog bits each,
+ * plus 8 bytes of slack for over-writes. With at least this much dst, bounds checks can be skipped.
+ */
+static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog)
+{
+    size_t const payloadBytes = (srcSize * tableLog) / 8;   /* whole bytes, rounded down */
+    return payloadBytes + 8;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    U32 const tableLog = HUF_readCTableHeader(CTable).tableLog;
+    HUF_CElt const* ct = CTable + 1;   /* element 0 is the table header; codes start at element 1 */
+    const BYTE* ip = (const BYTE*) src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    HUF_CStream_t bitC;
+    /* Encodes src with CTable into dst as a single bitstream; returns the stream size, or 0 when it does not fit. */
+    /* init */
+    if (dstSize < 8) return 0;   /* not enough space to compress */
+    { BYTE* op = ostart;
+      size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
+      if (HUF_isError(initErr)) return 0; }
+
+    if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
+        HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFastFlush */ 0, /* kLastFast */ 0);
+    else {   /* dst is large enough for the tight bound : flushes can skip the ptr clamp */
+        if (MEM_32bits()) {
+            switch (tableLog) {
+            case 11:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 10: ZSTD_FALLTHROUGH;
+            case 9: ZSTD_FALLTHROUGH;
+            case 8:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            case 7: ZSTD_FALLTHROUGH;
+            default:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            }
+        } else {
+            switch (tableLog) {
+            case 11:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 10:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            case 9:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 8:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 7:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 6: ZSTD_FALLTHROUGH;
+            default:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            }
+        }
+    }
+    assert(bitC.ptr <= bitC.endPtr);
+
+    return HUF_closeCStream(&bitC);   /* appends the end mark; yields 0 on overflow */
+}
+
+#if DYNAMIC_BMI2
+
+/* Encoder body instantiated with the BMI2 target attribute. */
+static BMI2_TARGET_ATTRIBUTE size_t
+HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+/* Encoder body instantiated without any extra target attribute. */
+static size_t
+HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, const void* src, size_t srcSize,
+                                      const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+/* Runtime dispatch: the HUF_flags_bmi2 bit of `flags` selects the BMI2 instantiation. */
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int flags)
+{
+    int const wantBmi2 = (flags & HUF_flags_bmi2) != 0;
+    if (!wantBmi2)
+        return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+    return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
+}
+
+#else
+
+/* No dynamic dispatch in this configuration: `flags` is ignored. */
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int flags)
+{
+    (void)flags;
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+#endif
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
+{   size_t const cSize = HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
+    return cSize;   /* single-stream result, forwarded untouched */
+}
+
+/* HUF_compress4X_usingCTable_internal() :
+ * Cuts src into 4 segments and encodes each one as an independent
+ * single-stream block, stored back-to-back after a 6-byte jump table.
+ * The first 3 segments are (srcSize+3)/4 bytes long; the 4th takes the rest.
+ * The jump table holds the compressed sizes of streams 1-3 as little-endian
+ * 16-bit values; the size of the 4th stream is not stored.
+ * @return total bytes written, jump table included;
+ *         0 when dst is smaller than 17 bytes, src is shorter than 12 bytes,
+ *         or any stream comes out empty or larger than 65535 bytes.
+ *         Errors from the single-stream encoder go through CHECK_V_F
+ *         (presumably an early return of the error code). */
+static size_t
+HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, int flags)
+{
+    size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+    int streamIdx;
+
+    if (dstSize < 6 + 1 + 1 + 1 + 8) return 0;   /* minimum space to compress successfully */
+    if (srcSize < 12) return 0;   /* no saving possible : too small input */
+    op += 6;   /* jumpTable */
+
+    /* streams 1-3 : fixed-size segments, each compressed size recorded in the jump table */
+    for (streamIdx = 0; streamIdx < 3; ++streamIdx) {
+        assert(op <= oend);
+        {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
+            if (cSize == 0 || cSize > 65535) return 0;
+            MEM_writeLE16(ostart + 2*streamIdx, (U16)cSize);
+            op += cSize;
+        }
+        ip += segmentSize;
+    }
+
+    /* stream 4 : whatever input remains, no jump table entry */
+    assert(op <= oend);
+    assert(ip <= iend);
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) );
+        if (cSize == 0 || cSize > 65535) return 0;
+        op += cSize;
+    }
+
+    return (size_t)(op-ostart);
+}
+
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
+{   size_t const cSize = HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
+    return cSize;   /* four-stream result, forwarded untouched */
+}
+
+typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;   /* selects the 1X or 4X encoder in HUF_compressCTable_internal() */
+
+static size_t HUF_compressCTable_internal(
+                BYTE* const ostart, BYTE* op, BYTE* const oend,
+                const void* src, size_t srcSize,
+                HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags)
+{   /* Encodes src at `op` with CTable; returns bytes used since ostart, 0 if not worthwhile, or an error code. */
+    size_t const dstCapacity = (size_t)(oend - op);
+    size_t cSize;
+    if (nbStreams == HUF_singleStream)
+        cSize = HUF_compress1X_usingCTable_internal(op, dstCapacity, src, srcSize, CTable, flags);
+    else
+        cSize = HUF_compress4X_usingCTable_internal(op, dstCapacity, src, srcSize, CTable, flags);
+    if (HUF_isError(cSize) || cSize == 0) return cSize;   /* error code, or 0 == uncompressible */
+    op += cSize;
+    assert(op >= ostart);
+    return ((size_t)(op-ostart) >= srcSize-1) ? 0 : (size_t)(op-ostart);   /* check compressibility */
+}
+
+typedef struct {   /* layout that HUF_compress_internal() imposes on its caller-provided workspace */
+    unsigned count[HUF_SYMBOLVALUE_MAX + 1];   /* symbol histogram filled by the HIST_count_* calls */
+    HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];   /* table built for the current input */
+    union {   /* scratch areas sharing storage; each is handed to a different helper */
+        HUF_buildCTable_wksp_tables buildCTable_wksp;   /* for HUF_buildCTable_wksp() */
+        HUF_WriteCTableWksp writeCTable_wksp;           /* for HUF_writeCTable_wksp() */
+        U32 hist_wksp[HIST_WKSP_SIZE_U32];              /* for HIST_count_wksp() */
+    } wksps;
+} HUF_compress_tables_t;
+
+#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096  /* bytes histogrammed at the start and again at the end of the input */
+#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10  /* Must be >= 2 ; sampling only runs when srcSize >= SIZE * RATIO */
+
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue)
+{   /* Number of symbols in [0, maxSymbolValue] whose count is non-zero. */
+    unsigned const nbSymbols = maxSymbolValue + 1;
+    unsigned nbPresent = 0;
+    unsigned s;
+
+    for (s = 0; s < nbSymbols; s++) {
+        nbPresent += (count[s] != 0);
+    }
+    return nbPresent;
+}
+
+unsigned HUF_minTableLog(unsigned symbolCardinality)
+{
+    /* ZSTD_highbit32() of the symbol count, plus one */
+    return ZSTD_highbit32(symbolCardinality) + 1;
+}
+
+unsigned HUF_optimalTableLog(
+            unsigned maxTableLog,
+            size_t srcSize,
+            unsigned maxSymbolValue,
+            void* workSpace, size_t wkspSize,
+            HUF_CElt* table,
+      const unsigned* count,
+            int flags)
+{
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
+    /* Picks the table log to build with: a cheap FSE-based estimate by default, a trial-build search with HUF_flags_optimalDepth. */
+    if (!(flags & HUF_flags_optimalDepth)) {
+        /* cheap evaluation, based on FSE */
+        return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+    }
+
+    {   BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);   /* scratch output for the trial table headers */
+        size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
+        size_t hSize, newSize;
+        const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
+        const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
+        size_t optSize = ((size_t) ~0) - 1;   /* max - 1, so that `optSize + 1` below cannot wrap */
+        unsigned optLog = maxTableLog, optLogGuess;   /* maxTableLog is returned if no candidate is ever accepted */
+
+        DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
+
+        /* Search until size increases */
+        for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
+            DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
+
+            {   size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
+                if (ERR_isError(maxBits)) continue;   /* this depth cannot be built : try the next one */
+
+                if (maxBits < optLogGuess && optLogGuess > minTableLog) break;   /* the built table is already shallower than the limit being tried */
+
+                hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
+            }
+
+            if (ERR_isError(hSize)) continue;
+
+            newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;   /* estimated payload + header */
+
+            if (newSize > optSize + 1) {
+                break;
+            }
+
+            if (newSize < optSize) {
+                optSize = newSize;
+                optLog = optLogGuess;
+            }
+        }
+        assert(optLog <= HUF_TABLELOG_MAX);
+        return optLog;
+    }
+}
+
+/* HUF_compress_internal() :
+ * `workSpace` is aligned up internally (to ZSTD_ALIGNOF(size_t)); what remains after
+ * alignment must hold a HUF_compress_tables_t, otherwise workSpace_tooSmall is returned. */
+static size_t
+HUF_compress_internal (void* dst, size_t dstSize,
+                 const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned huffLog,
+                       HUF_nbStreams_e nbStreams,
+                       void* workSpace, size_t wkspSize,
+                       HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags)
+{
+    HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
+    HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
+
+    /* checks & inits */
+    if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
+    if (!srcSize) return 0;  /* Uncompressed */
+    if (!dstSize) return 0;  /* cannot fit anything within dst budget */
+    if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);   /* current block size limit */
+    if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+    if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;   /* 0 selects the default */
+    if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;                /* 0 selects the default */
+
+    /* Heuristic : If old table is valid, use it for small inputs */
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           nbStreams, oldHufTable, flags);
+    }
+
+    /* If uncompressible data is suspected, do a smaller sampling first */
+    DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
+    if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+        size_t largestTotal = 0;   /* sum of the largest symbol counts of the two samples */
+        DEBUGLOG(5, "input suspected incompressible : sampling to check");
+        {   unsigned maxSymbolValueBegin = maxSymbolValue;
+            CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+            largestTotal += largestBegin;
+        }
+        {   unsigned maxSymbolValueEnd = maxSymbolValue;
+            CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+            largestTotal += largestEnd;
+        }
+        if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
+    }
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) );
+        if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
+        if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
+    }
+    DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1));
+
+    /* Check validity of previous table */
+    if ( repeat
+      && *repeat == HUF_repeat_check
+      && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
+        *repeat = HUF_repeat_none;
+    }
+    /* Heuristic : use existing table for small inputs */
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           nbStreams, oldHufTable, flags);
+    }
+
+    /* Build Huffman Tree */
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
+    {   size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
+                                            maxSymbolValue, huffLog,
+                                            &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
+        CHECK_F(maxBits);
+        huffLog = (U32)maxBits;   /* continue with the depth the builder actually reported */
+        DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
+    }
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog,
+                                              &table->wksps.writeCTable_wksp, sizeof(table->wksps.writeCTable_wksp)) );
+        /* Check if using previous huffman table is beneficial */
+        if (repeat && *repeat != HUF_repeat_none) {
+            size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
+            size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
+            if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+                return HUF_compressCTable_internal(ostart, op, oend,
+                                                   src, srcSize,
+                                                   nbStreams, oldHufTable, flags);   /* op is still ostart : the header just written is overwritten */
+        }   }
+
+        /* Use the new huffman table */
+        if (hSize + 12ul >= srcSize) { return 0; }
+        op += hSize;   /* keep the header; the payload follows it */
+        if (repeat) { *repeat = HUF_repeat_none; }
+        if (oldHufTable)
+            ZSTD_memcpy(oldHufTable, table->CTable, sizeof(table->CTable));  /* Save new table */
+    }
+    return HUF_compressCTable_internal(ostart, op, oend,
+                                       src, srcSize,
+                                       nbStreams, table->CTable, flags);
+}
+
+size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
+{   /* Single-stream variant of the table-reusing compressor. */
+    HUF_nbStreams_e const streams = HUF_singleStream;
+    DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog,
+                                 streams, workSpace, wkspSize,
+                                 hufTable, repeat, flags);
+}
+
+/* HUF_compress4X_repeat():
+ * Four-stream variant of the table-reusing compressor.
+ * May give up early on input judged not compressible enough, and may reuse
+ * the Huffman table passed in `hufTable`, depending on `*repeat` and `flags`. */
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
+{
+    HUF_nbStreams_e const streams = HUF_fourStreams;
+    DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog,
+                                 streams, workSpace, wkspSize,
+                                 hufTable, repeat, flags);
+}
diff --git a/src/zstd/compress/zstd_compress.c b/src/zstd/compress/zstd_compress.c
new file mode 100644
index 000000000..9284e2a48
--- /dev/null
+++ b/src/zstd/compress/zstd_compress.c
@@ -0,0 +1,7153 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
+#include "../common/zstd_deps.h"  /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
+#include "../common/mem.h"
+#include "hist.h"           /* HIST_countFast_wksp */
+#define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
+#include "../common/fse.h"
+#include "../common/huf.h"
+#include "zstd_compress_internal.h"
+#include "zstd_compress_sequences.h"
+#include "zstd_compress_literals.h"
+#include "zstd_fast.h"
+#include "zstd_double_fast.h"
+#include "zstd_lazy.h"
+#include "zstd_opt.h"
+#include "zstd_ldm.h"
+#include "zstd_compress_superblock.h"
+#include  "../common/bits.h"      /* ZSTD_highbit32, ZSTD_rotateRight_U64 */
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * ZSTD_COMPRESS_HEAPMODE :
+ * Select how default compression function ZSTD_compress() allocates its context,
+ * on stack (0, default), or into heap (1).
+ * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected.
+ */
+#ifndef ZSTD_COMPRESS_HEAPMODE
+#  define ZSTD_COMPRESS_HEAPMODE 0
+#endif
+
+/*!
+ * ZSTD_HASHLOG3_MAX :
+ * Maximum size of the hash table dedicated to find 3-bytes matches,
+ * in log format, aka 17 => 1 << 17 == 128Ki positions.
+ * This structure is only used in zstd_opt.
+ * Since allocation is centralized for all strategies, it has to be known here.
+ * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3,
+ * so that zstd_opt.c doesn't need to know about this constant.
+ */
+#ifndef ZSTD_HASHLOG3_MAX
+#  define ZSTD_HASHLOG3_MAX 17
+#endif
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/* ZSTD_compressBound()
+ * Worst-case compressed size for a one-pass compression of srcSize bytes.
+ * The bound does not hold for streaming mode:
+ * when frequent flushes change the size of blocks,
+ * the extra block headers can push the output above the value returned here.
+ * @return the bound, or ERROR(srcSize_wrong) when ZSTD_COMPRESSBOUND()
+ *         evaluates to 0 for this srcSize.
+ */
+size_t ZSTD_compressBound(size_t srcSize) {
+    size_t const bound = ZSTD_COMPRESSBOUND(srcSize);
+    /* the macro yields 0 when it cannot produce a bound for srcSize */
+    return (bound != 0) ? bound : ERROR(srcSize_wrong);
+}
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+struct ZSTD_CDict_s {
+    const void* dictContent;   /* presumably the dictionary bytes; ownership is not visible here - TODO confirm */
+    size_t dictContentSize;    /* presumably the byte length of dictContent - TODO confirm */
+    ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */
+    U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+    ZSTD_cwksp workspace;
+    ZSTD_matchState_t matchState;
+    ZSTD_compressedBlockState_t cBlockState;
+    ZSTD_customMem customMem;
+    U32 dictID;
+    int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */
+    ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use
+                                           * row-based matchfinder. Unless the cdict is reloaded, we will use
+                                           * the same greedy/lazy matchfinder at compression time.
+                                           */
+};  /* typedef'd to ZSTD_CDict within "zstd.h" */
+
+ZSTD_CCtx* ZSTD_createCCtx(void)
+{   ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(ZSTD_defaultCMem);
+    return cctx;   /* NULL when the context could not be allocated */
+}
+
+static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
+{
+    size_t resetResult;
+    assert(cctx != NULL);
+    ZSTD_memset(cctx, 0, sizeof(*cctx));   /* start from an all-zero context */
+    cctx->bmi2 = ZSTD_cpuSupportsBmi2();
+    cctx->customMem = memManager;
+    resetResult = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);
+    assert(!ZSTD_isError(resetResult));    /* only checked in debug builds */
+    (void)resetResult;
+}
+
+ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+    ZSTD_CCtx* cctx;
+    ZSTD_STATIC_ASSERT(zcss_init==0);
+    ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1));
+    /* customAlloc and customFree must be provided together, or both left NULL */
+    if ((customMem.customAlloc == NULL) != (customMem.customFree == NULL)) return NULL;
+    cctx = (ZSTD_CCtx*)ZSTD_customMalloc(sizeof(ZSTD_CCtx), customMem);
+    if (cctx != NULL) ZSTD_initCCtx(cctx, customMem);
+    return cctx;
+}
+
+ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize)
+{   /* Builds a CCtx inside caller-provided memory; returns NULL when the buffer is too small or not 8-byte aligned. */
+    ZSTD_cwksp ws;
+    ZSTD_CCtx* cctx;
+    if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL;  /* minimum size */
+    if ((size_t)workspace & 7) return NULL;  /* must be 8-aligned */
+    ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc);
+
+    cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx));   /* the context itself is the first object */
+    if (cctx == NULL) return NULL;
+
+    ZSTD_memset(cctx, 0, sizeof(ZSTD_CCtx));
+    ZSTD_cwksp_move(&cctx->workspace, &ws);   /* the context now carries the workspace bookkeeping */
+    cctx->staticSize = workspaceSize;         /* non-zero marks the context as static (ZSTD_freeCCtx() refuses it) */
+
+    /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
+    if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL;
+    cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
+    cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
+    cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE);
+    cctx->bmi2 = ZSTD_cpuSupportsBmi2();   /* same detection helper as ZSTD_initCCtx(), so static and heap contexts agree */
+    return cctx;
+}
+
+/**
+ * Releases the CCtx's local dictionary (buffer and CDict), then forgets
+ * the prefix dictionary and the referenced CDict pointer. */
+static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx)
+{
+    ZSTD_customFree(cctx->localDict.dictBuffer, cctx->customMem);
+    ZSTD_freeCDict(cctx->localDict.cdict);
+    cctx->cdict = NULL;
+    ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));
+    ZSTD_memset(&cctx->localDict, 0, sizeof(cctx->localDict));
+}
+
+static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict)
+{   /* Size of the CDict, plus dictSize when a dictBuffer is attached. */
+    size_t total = ZSTD_sizeof_CDict(dict.cdict);
+    if (dict.dictBuffer != NULL) total += dict.dictSize;
+    return total;
+}
+
+static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
+{
+    assert(cctx != NULL);
+    assert(cctx->staticSize == 0);   /* not meant for contexts built by ZSTD_initStaticCCtx() */
+    ZSTD_clearAllDicts(cctx);        /* dictionaries first ... */
+#ifdef ZSTD_MULTITHREAD
+    ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL;   /* ... then the multithreading context, if compiled in ... */
+#endif
+    ZSTD_cwksp_free(&cctx->workspace, cctx->customMem);  /* ... and finally the workspace; cctx itself is not freed here */
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+{
+    int ownedByWorkspace;
+    DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx);
+    if (cctx==NULL) return 0;   /* support free on NULL */
+    RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+                    "not compatible with static CCtx");
+    ownedByWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);   /* ask before the workspace is freed below */
+    ZSTD_freeCCtxContent(cctx);
+    if (ownedByWorkspace == 0) ZSTD_customFree(cctx, cctx->customMem);
+    return 0;
+}
+
+
+static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx)
+{
+#ifndef ZSTD_MULTITHREAD
+    (void)cctx;
+    return 0;
+#else
+    return ZSTDMT_sizeof_CCtx(cctx->mtctx);
+#endif
+}
+
+
+size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx)
+{
+    size_t total;
+    if (cctx==NULL) return 0;   /* support sizeof on NULL */
+    total  = (cctx->workspace.workspace == cctx) ? 0 : sizeof(*cctx);   /* cctx may be in the workspace */
+    total += ZSTD_cwksp_sizeof(&cctx->workspace);
+    total += ZSTD_sizeof_localDict(cctx->localDict) + ZSTD_sizeof_mtctx(cctx);
+    return total;
+}
+
+size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
+{   size_t const total = ZSTD_sizeof_CCtx(zcs);  /* same object */
+    return total;
+}
+
+/* Private API, for dictBuilder only: returns a read-only pointer to the context's seqStore. */
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { const seqStore_t* const store = &ctx->seqStore; return store; }
+
+/* Nonzero only for strategies in the range [ZSTD_greedy, ZSTD_lazy2], the ones that can use a row based matchfinder */
+static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) {
+    return !(strategy < ZSTD_greedy || strategy > ZSTD_lazy2);
+}
+
+/* Returns 1 when the row based matchfinder will actually be used: `mode` must be
+ * ZSTD_ps_enable and `strategy` must support it. `mode` must already be resolved (not auto). */
+static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) {
+    assert(mode != ZSTD_ps_auto);
+    if (mode != ZSTD_ps_enable) return 0;
+    return ZSTD_rowMatchFinderSupported(strategy) != 0;
+}
+
+/* Returns row matchfinder usage given an initial mode and cParams.
+ *  - an explicit request (anything but ZSTD_ps_auto) is returned unchanged,
+ *    so an explicit enable is honored even when no SIMD is available;
+ *  - in auto mode, strategies without row matchfinder support get ZSTD_ps_disable;
+ *  - otherwise the row matchfinder is enabled only above a windowLog threshold,
+ *    which is lower when 128-bit SIMD (SSE2 or NEON) is available.
+ */
+static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode,
+                                                         const ZSTD_compressionParameters* const cParams) {
+#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON)
+    int const windowLogFloor = 14;   /* 128-bit SIMD available */
+#else
+    int const windowLogFloor = 17;   /* no 128-bit SIMD */
+#endif
+    if (mode != ZSTD_ps_auto) return mode;
+    if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return ZSTD_ps_disable;
+    return (cParams->windowLog > windowLogFloor) ? ZSTD_ps_enable : ZSTD_ps_disable;
+}
+
+/* Resolves the block splitter switch: explicit modes pass through; auto enables it iff strategy >= ZSTD_btopt and windowLog >= 17 */
+static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode,
+                                                        const ZSTD_compressionParameters* const cParams) {
+    if (mode == ZSTD_ps_auto) mode = (cParams->strategy < ZSTD_btopt || cParams->windowLog < 17) ? ZSTD_ps_disable : ZSTD_ps_enable;
+    return mode;
+}
+
+/* Decides whether a chainTable has to be allocated: returns 1 if so, 0 otherwise.
+ * useRowMatchFinder is expected to be resolved already (asserted != ZSTD_ps_auto). */
+static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
+                                   const ZSTD_paramSwitch_e useRowMatchFinder,
+                                   const U32 forDDSDict) {
+    assert(useRowMatchFinder != ZSTD_ps_auto);
+    if (forDDSDict) return 1;              /* a DDS dictionary matchstate always gets a chaintable */
+    if (strategy == ZSTD_fast) return 0;   /* ZSTD_fast does not get one */
+    return !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder);   /* nor does the row-based matchfinder */
+}
+
+/* Resolves the long distance matching switch: an explicit `mode` is returned unchanged;
+ * ZSTD_ps_auto becomes ZSTD_ps_enable iff strategy >= ZSTD_btopt and windowLog >= 27,
+ * and ZSTD_ps_disable otherwise. */
+static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
+                                 const ZSTD_compressionParameters* const cParams) {
+    if (mode != ZSTD_ps_auto) return mode;
+    if (cParams->strategy < ZSTD_btopt || cParams->windowLog < 27) return ZSTD_ps_disable;
+    return ZSTD_ps_enable;
+}
+
+static int ZSTD_resolveExternalSequenceValidation(int mode) {
+    int const unchanged = mode; return unchanged;   /* identity: the requested value is used as-is */
+}
+
+/* Substitutes the default block size limit when none was requested.
+ * @param maxBlockSize requested limit; 0 means "not set".
+ * @return ZSTD_BLOCKSIZE_MAX when maxBlockSize is 0, otherwise maxBlockSize unchanged.
+ */
+static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
+    size_t const fallback = ZSTD_BLOCKSIZE_MAX;
+    return maxBlockSize ? maxBlockSize : fallback;
+}
+
+/* Resolves the "search for external repcodes" switch.
+ * An explicit `value` is returned unchanged; ZSTD_ps_auto turns into
+ * ZSTD_ps_enable for cLevel >= 10 and into ZSTD_ps_disable below that. */
+static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
+    if (value == ZSTD_ps_auto)
+        value = (cLevel >= 10) ? ZSTD_ps_enable : ZSTD_ps_disable;
+    return value;
+}
+
+/* Returns 1 when CDict hashtable/chaintable indices are tagged (ZSTD_fast, ZSTD_dfast); such tags must be removed in ZSTD_resetCCtx_byCopyingCDict. */
+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
+    ZSTD_strategy const strat = cParams->strategy;
+    return (strat == ZSTD_fast) || (strat == ZSTD_dfast);
+}
+
+/* Builds a complete ZSTD_CCtx_params around `cParams`: defaults first, then each advanced switch is resolved. */
+static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+        ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params cctxParams;
+    /* the level passed here should not matter: all cParams are presumed properly defined */
+    ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT);
+    cctxParams.cParams = cParams;
+    /* LDM goes first: when it ends up enabled, its parameters get adjusted to cParams */
+    cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams);
+    if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) {
+        ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams);
+        assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog);
+        assert(cctxParams.ldmParams.hashRateLog < 32);
+    }
+    cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
+    cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
+    cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
+    cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
+    cctxParams.searchForExternalRepcodes =
+        ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, cctxParams.compressionLevel);
+    assert(!ZSTD_checkCParams(cParams));
+    return cctxParams;
+}
+
+/* Allocates and default-initializes a ZSTD_CCtx_params via `customMem`; NULL if exactly one of customAlloc/customFree is set, or on allocation failure. */
+static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced(ZSTD_customMem customMem)
+{
+    int const hasAlloc = (customMem.customAlloc != NULL);
+    int const hasFree = (customMem.customFree != NULL);
+    ZSTD_CCtx_params* created = NULL;
+    if (hasAlloc == hasFree) created = (ZSTD_CCtx_params*)ZSTD_customCalloc(sizeof(ZSTD_CCtx_params), customMem);
+    if (created == NULL) return NULL;
+    ZSTD_CCtxParams_init(created, ZSTD_CLEVEL_DEFAULT);
+    created->customMem = customMem;   /* remembered so that the matching free function can be used later */
+    return created;
+}
+
+ZSTD_CCtx_params* ZSTD_createCCtxParams(void)
+{   ZSTD_CCtx_params* const created = ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem);   /* allocation goes through ZSTD_defaultCMem */
+    return created;
+}
+
+size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params)
+{   /* Releases `params` through the customMem stored inside it; NULL is accepted. Always returns 0. */
+    if (params != NULL)
+        ZSTD_customFree(params, params->customMem);
+    return 0;
+}
+
+size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params)
+{   size_t const initResult = ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);   /* reset == re-init at the default level */
+    return initResult;   /* whatever ZSTD_CCtxParams_init() reported */
+}
+
+size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) {
+    RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+    ZSTD_memset(cctxParams, 0, sizeof(ZSTD_CCtx_params));   /* every field starts out as 0 ... */
+    cctxParams->fParams.contentSizeFlag = 1;                /* ... except the two set explicitly here */
+    cctxParams->compressionLevel = compressionLevel;
+    return 0;   /* success */
+}
+
+#define ZSTD_NO_CLEVEL 0   /* compressionLevel value recorded when params are not derived from a compression level */
+
+/**
+ * Resets `cctxParams`, then fills it from `params` and `compressionLevel`, resolving each advanced switch.
+ * @param compressionLevel The level `params` were derived from, or ZSTD_NO_CLEVEL when they were not derived from a level.
+ */
+static void
+ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
+                        const ZSTD_parameters* params,
+                              int compressionLevel)
+{
+    const ZSTD_compressionParameters* const cParams = &params->cParams;
+    assert(!ZSTD_checkCParams(params->cParams));
+    ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
+    cctxParams->fParams = params->fParams;
+    cctxParams->cParams = *cParams;
+    /* cParams are presumed properly defined, so the level should not matter; it is stored for tracing. */
+    cctxParams->compressionLevel = compressionLevel;
+    /* every switch below starts from its zeroed value and is resolved from there */
+    cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, cParams);
+    cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, cParams);
+    cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, cParams);
+    cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
+    cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
+    cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
+    DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
+                cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
+}
+
+size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+{   /* Initializes *cctxParams from an explicit `params` set; the level is recorded as ZSTD_NO_CLEVEL. */
+    RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+    FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , "");   /* cParams are checked before being used */
+    ZSTD_CCtxParams_init_internal(cctxParams, &params, ZSTD_NO_CLEVEL);
+    return 0;
+}
+
+/**
+ * Overwrites cctxParams' cParams and fParams with those of `params` and records the
+ * compression level as ZSTD_NO_CLEVEL; every other field of cctxParams is left untouched.
+ * @param params Zstd parameters, expected to be validated already (asserted).
+ */
+static void ZSTD_CCtxParams_setZstdParams(
+        ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
+{
+    assert(!ZSTD_checkCParams(params->cParams));
+    /* The level should not matter, since all cParams are presumed properly defined;
+     * it is recorded for tracing anyway. */
+    cctxParams->compressionLevel = ZSTD_NO_CLEVEL;
+    cctxParams->fParams = params->fParams;
+    cctxParams->cParams = params->cParams;
+}
+
+ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
+{   /* Reports the [lowerBound, upperBound] range of values associated with `param`. */
+    ZSTD_bounds bounds = { 0, 0, 0 };
+    /* bounds.error stays 0 for every parameter listed below; only the default case sets it */
+    switch(param)
+    {
+    case ZSTD_c_compressionLevel:
+        bounds.lowerBound = ZSTD_minCLevel();
+        bounds.upperBound = ZSTD_maxCLevel();
+        return bounds;
+
+    case ZSTD_c_windowLog:
+        bounds.lowerBound = ZSTD_WINDOWLOG_MIN;
+        bounds.upperBound = ZSTD_WINDOWLOG_MAX;
+        return bounds;
+
+    case ZSTD_c_hashLog:
+        bounds.lowerBound = ZSTD_HASHLOG_MIN;
+        bounds.upperBound = ZSTD_HASHLOG_MAX;
+        return bounds;
+
+    case ZSTD_c_chainLog:
+        bounds.lowerBound = ZSTD_CHAINLOG_MIN;
+        bounds.upperBound = ZSTD_CHAINLOG_MAX;
+        return bounds;
+
+    case ZSTD_c_searchLog:
+        bounds.lowerBound = ZSTD_SEARCHLOG_MIN;
+        bounds.upperBound = ZSTD_SEARCHLOG_MAX;
+        return bounds;
+
+    case ZSTD_c_minMatch:
+        bounds.lowerBound = ZSTD_MINMATCH_MIN;
+        bounds.upperBound = ZSTD_MINMATCH_MAX;
+        return bounds;
+
+    case ZSTD_c_targetLength:
+        bounds.lowerBound = ZSTD_TARGETLENGTH_MIN;
+        bounds.upperBound = ZSTD_TARGETLENGTH_MAX;
+        return bounds;
+
+    case ZSTD_c_strategy:
+        bounds.lowerBound = ZSTD_STRATEGY_MIN;
+        bounds.upperBound = ZSTD_STRATEGY_MAX;
+        return bounds;
+
+    case ZSTD_c_contentSizeFlag:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_checksumFlag:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_dictIDFlag:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_nbWorkers:
+        bounds.lowerBound = 0;
+#ifdef ZSTD_MULTITHREAD
+        bounds.upperBound = ZSTDMT_NBWORKERS_MAX;
+#else
+        bounds.upperBound = 0;   /* without ZSTD_MULTITHREAD the range collapses to {0} */
+#endif
+        return bounds;
+
+    case ZSTD_c_jobSize:
+        bounds.lowerBound = 0;
+#ifdef ZSTD_MULTITHREAD
+        bounds.upperBound = ZSTDMT_JOBSIZE_MAX;
+#else
+        bounds.upperBound = 0;   /* without ZSTD_MULTITHREAD the range collapses to {0} */
+#endif
+        return bounds;
+
+    case ZSTD_c_overlapLog:
+#ifdef ZSTD_MULTITHREAD
+        bounds.lowerBound = ZSTD_OVERLAPLOG_MIN;
+        bounds.upperBound = ZSTD_OVERLAPLOG_MAX;
+#else
+        bounds.lowerBound = 0;   /* without ZSTD_MULTITHREAD the range collapses to {0} */
+        bounds.upperBound = 0;
+#endif
+        return bounds;
+
+    case ZSTD_c_enableDedicatedDictSearch:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_enableLongDistanceMatching:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_ldmHashLog:
+        bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN;
+        bounds.upperBound = ZSTD_LDM_HASHLOG_MAX;
+        return bounds;
+
+    case ZSTD_c_ldmMinMatch:
+        bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN;
+        bounds.upperBound = ZSTD_LDM_MINMATCH_MAX;
+        return bounds;
+
+    case ZSTD_c_ldmBucketSizeLog:
+        bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN;
+        bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX;
+        return bounds;
+
+    case ZSTD_c_ldmHashRateLog:
+        bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN;
+        bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX;
+        return bounds;
+
+    /* experimental parameters */
+    case ZSTD_c_rsyncable:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_forceMaxWindow :
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_format:
+        ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
+        bounds.lowerBound = ZSTD_f_zstd1;
+        bounds.upperBound = ZSTD_f_zstd1_magicless;   /* note: how can we ensure at compile time that this is the highest enum value? */
+        return bounds;
+
+    case ZSTD_c_forceAttachDict:
+        ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceLoad);
+        bounds.lowerBound = ZSTD_dictDefaultAttach;
+        bounds.upperBound = ZSTD_dictForceLoad;       /* note: how can we ensure at compile time that this is the highest enum value? */
+        return bounds;
+
+    case ZSTD_c_literalCompressionMode:
+        ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable);
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_targetCBlockSize:
+        bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN;
+        bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX;
+        return bounds;
+
+    case ZSTD_c_srcSizeHint:
+        bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN;
+        bounds.upperBound = ZSTD_SRCSIZEHINT_MAX;
+        return bounds;
+
+    case ZSTD_c_stableInBuffer:    /* input and output buffer modes share the same range */
+    case ZSTD_c_stableOutBuffer:
+        bounds.lowerBound = (int)ZSTD_bm_buffered;
+        bounds.upperBound = (int)ZSTD_bm_stable;
+        return bounds;
+
+    case ZSTD_c_blockDelimiters:
+        bounds.lowerBound = (int)ZSTD_sf_noBlockDelimiters;
+        bounds.upperBound = (int)ZSTD_sf_explicitBlockDelimiters;
+        return bounds;
+
+    case ZSTD_c_validateSequences:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_useBlockSplitter:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_useRowMatchFinder:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_deterministicRefPrefix:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_prefetchCDictTables:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_maxBlockSize:
+        bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
+        bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
+        return bounds;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    default:   /* unknown parameter: lower/upper bounds stay 0, the error field carries the failure */
+        bounds.error = ERROR(parameter_unsupported);
+        return bounds;
+    }
+}
+
+/* ZSTD_cParam_clampBounds:
+ * Forces *value into the [lowerBound, upperBound] range reported for cParam.
+ * Returns 0, or the error reported by ZSTD_cParam_getBounds() (*value is then left untouched). */
+static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value)
+{
+    ZSTD_bounds const range = ZSTD_cParam_getBounds(cParam);
+    if (ZSTD_isError(range.error)) return range.error;
+    *value = (*value < range.lowerBound) ? range.lowerBound : *value;
+    *value = (*value > range.upperBound) ? range.upperBound : *value;
+    return 0;
+}
+
+#define BOUNDCHECK(cParam, val)                                       \
+    do {                                                              \
+        RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val),        \
+                        parameter_outOfBound, "Param out of bounds"); \
+    } while (0)   /* rejects (rather than clamps) a `val` outside cParam's bounds; presumably RETURN_ERROR_IF leaves the enclosing function */
+
+
+static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+{   int authorized = 0;   /* becomes 1 only for the parameters of the first group below */
+    switch(param)
+    {
+    case ZSTD_c_compressionLevel:
+    case ZSTD_c_hashLog:
+    case ZSTD_c_chainLog:
+    case ZSTD_c_searchLog:
+    case ZSTD_c_minMatch:
+    case ZSTD_c_targetLength:
+    case ZSTD_c_strategy:
+        authorized = 1; break;
+    /* every other parameter, listed or not, leaves `authorized` at 0 */
+    case ZSTD_c_format:
+    case ZSTD_c_windowLog:
+    case ZSTD_c_contentSizeFlag:
+    case ZSTD_c_checksumFlag:
+    case ZSTD_c_dictIDFlag:
+    case ZSTD_c_forceMaxWindow :
+    case ZSTD_c_nbWorkers:
+    case ZSTD_c_jobSize:
+    case ZSTD_c_overlapLog:
+    case ZSTD_c_rsyncable:
+    case ZSTD_c_enableDedicatedDictSearch:
+    case ZSTD_c_enableLongDistanceMatching:
+    case ZSTD_c_ldmHashLog:
+    case ZSTD_c_ldmMinMatch:
+    case ZSTD_c_ldmBucketSizeLog:
+    case ZSTD_c_ldmHashRateLog:
+    case ZSTD_c_forceAttachDict:
+    case ZSTD_c_literalCompressionMode:
+    case ZSTD_c_targetCBlockSize:
+    case ZSTD_c_srcSizeHint:
+    case ZSTD_c_stableInBuffer:
+    case ZSTD_c_stableOutBuffer:
+    case ZSTD_c_blockDelimiters:
+    case ZSTD_c_validateSequences:
+    case ZSTD_c_useBlockSplitter:
+    case ZSTD_c_useRowMatchFinder:
+    case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
+    default: break;
+    }
+    return authorized;
+}
+
+size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+{   /* Context-level checks (stage, static allocation, known parameter), then stores into cctx->requestedParams. */
+    DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value);
+    if (cctx->streamStage != zcss_init) {
+        if (!ZSTD_isUpdateAuthorized(param)) {
+            RETURN_ERROR(stage_wrong, "can only set params in cctx init stage");
+        }
+        cctx->cParamsChanged = 1;   /* an authorized parameter is being set outside the init stage */
+    }
+
+    switch(param)
+    {
+    case ZSTD_c_nbWorkers:
+        RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported,
+                        "MT not compatible with static alloc");
+        break;
+    /* the parameters below need no context-level check */
+    case ZSTD_c_compressionLevel:
+    case ZSTD_c_windowLog:
+    case ZSTD_c_hashLog:
+    case ZSTD_c_chainLog:
+    case ZSTD_c_searchLog:
+    case ZSTD_c_minMatch:
+    case ZSTD_c_targetLength:
+    case ZSTD_c_strategy:
+    case ZSTD_c_ldmHashRateLog:
+    case ZSTD_c_format:
+    case ZSTD_c_contentSizeFlag:
+    case ZSTD_c_checksumFlag:
+    case ZSTD_c_dictIDFlag:
+    case ZSTD_c_forceMaxWindow:
+    case ZSTD_c_forceAttachDict:
+    case ZSTD_c_literalCompressionMode:
+    case ZSTD_c_jobSize:
+    case ZSTD_c_overlapLog:
+    case ZSTD_c_rsyncable:
+    case ZSTD_c_enableDedicatedDictSearch:
+    case ZSTD_c_enableLongDistanceMatching:
+    case ZSTD_c_ldmHashLog:
+    case ZSTD_c_ldmMinMatch:
+    case ZSTD_c_ldmBucketSizeLog:
+    case ZSTD_c_targetCBlockSize:
+    case ZSTD_c_srcSizeHint:
+    case ZSTD_c_stableInBuffer:
+    case ZSTD_c_stableOutBuffer:
+    case ZSTD_c_blockDelimiters:
+    case ZSTD_c_validateSequences:
+    case ZSTD_c_useBlockSplitter:
+    case ZSTD_c_useRowMatchFinder:
+    case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
+        break;
+
+    default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+    }
+    return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value);
+}
+
+size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+                                    ZSTD_cParameter param, int value)
+{   /* Validates `value` for `param` (bound-checked or clamped, depending on the parameter) and stores it into
+     * CCtxParams. Returns the stored value (0 for a negative compression level), or an error code. */
+    DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value);
+    switch(param) {
+    case ZSTD_c_format :
+        BOUNDCHECK(ZSTD_c_format, value);
+        CCtxParams->format = (ZSTD_format_e)value;
+        return (size_t)CCtxParams->format;
+
+    case ZSTD_c_compressionLevel : {
+        FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+        if (value == 0)
+            CCtxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* 0 == default */
+        else
+            CCtxParams->compressionLevel = value;
+        if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel;
+        return 0;  /* return type (size_t) cannot represent negative values */
+    }
+
+    case ZSTD_c_windowLog :
+        if (value!=0)   /* 0 => use default */
+            BOUNDCHECK(ZSTD_c_windowLog, value);
+        CCtxParams->cParams.windowLog = (U32)value;
+        return CCtxParams->cParams.windowLog;
+
+    case ZSTD_c_hashLog :
+        if (value!=0)   /* 0 => use default */
+            BOUNDCHECK(ZSTD_c_hashLog, value);
+        CCtxParams->cParams.hashLog = (U32)value;
+        return CCtxParams->cParams.hashLog;
+
+    case ZSTD_c_chainLog :
+        if (value!=0)   /* 0 => use default */
+            BOUNDCHECK(ZSTD_c_chainLog, value);
+        CCtxParams->cParams.chainLog = (U32)value;
+        return CCtxParams->cParams.chainLog;
+
+    case ZSTD_c_searchLog :
+        if (value!=0)   /* 0 => use default */
+            BOUNDCHECK(ZSTD_c_searchLog, value);
+        CCtxParams->cParams.searchLog = (U32)value;
+        return (size_t)value;
+
+    case ZSTD_c_minMatch :
+        if (value!=0)   /* 0 => use default */
+            BOUNDCHECK(ZSTD_c_minMatch, value);
+        CCtxParams->cParams.minMatch = (U32)value;
+        return CCtxParams->cParams.minMatch;
+
+    case ZSTD_c_targetLength :
+        BOUNDCHECK(ZSTD_c_targetLength, value);
+        CCtxParams->cParams.targetLength = (U32)value;
+        return CCtxParams->cParams.targetLength;
+
+    case ZSTD_c_strategy :
+        if (value!=0)   /* 0 => use default */
+            BOUNDCHECK(ZSTD_c_strategy, value);
+        CCtxParams->cParams.strategy = (ZSTD_strategy)value;
+        return (size_t)CCtxParams->cParams.strategy;
+
+    case ZSTD_c_contentSizeFlag :
+        /* Content size written in frame header _when known_ (default:1) */
+        DEBUGLOG(4, "set content size flag = %u", (value!=0));
+        CCtxParams->fParams.contentSizeFlag = value != 0;
+        return (size_t)CCtxParams->fParams.contentSizeFlag;
+
+    case ZSTD_c_checksumFlag :
+        /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
+        CCtxParams->fParams.checksumFlag = value != 0;
+        return (size_t)CCtxParams->fParams.checksumFlag;
+
+    case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
+        DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
+        CCtxParams->fParams.noDictIDFlag = !value;
+        return !CCtxParams->fParams.noDictIDFlag;
+
+    case ZSTD_c_forceMaxWindow :
+        CCtxParams->forceWindow = (value != 0);
+        return (size_t)CCtxParams->forceWindow;
+
+    case ZSTD_c_forceAttachDict : {
+        const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
+        BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
+        CCtxParams->attachDictPref = pref;
+        return CCtxParams->attachDictPref;
+    }
+
+    case ZSTD_c_literalCompressionMode : {
+        const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
+        BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
+        CCtxParams->literalCompressionMode = lcm;
+        return CCtxParams->literalCompressionMode;
+    }
+
+    case ZSTD_c_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+        RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+        return 0;
+#else
+        FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+        CCtxParams->nbWorkers = value;
+        return (size_t)(CCtxParams->nbWorkers);
+#endif
+
+    case ZSTD_c_jobSize :
+#ifndef ZSTD_MULTITHREAD
+        RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+        return 0;
+#else
+        /* Adjust to the minimum non-default value. */
+        if (value != 0 && value < ZSTDMT_JOBSIZE_MIN)
+            value = ZSTDMT_JOBSIZE_MIN;
+        FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+        assert(value >= 0);
+        CCtxParams->jobSize = value;
+        return CCtxParams->jobSize;
+#endif
+
+    case ZSTD_c_overlapLog :
+#ifndef ZSTD_MULTITHREAD
+        RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+        return 0;
+#else
+        FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
+        CCtxParams->overlapLog = value;
+        return (size_t)CCtxParams->overlapLog;
+#endif
+
+    case ZSTD_c_rsyncable :
+#ifndef ZSTD_MULTITHREAD
+        RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+        return 0;
+#else
+        FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_rsyncable, &value), "");   /* clamp with rsyncable's own [0,1] range, not overlapLog's */
+        CCtxParams->rsyncable = value;
+        return (size_t)CCtxParams->rsyncable;
+#endif
+
+    case ZSTD_c_enableDedicatedDictSearch :
+        CCtxParams->enableDedicatedDictSearch = (value!=0);
+        return (size_t)CCtxParams->enableDedicatedDictSearch;
+
+    case ZSTD_c_enableLongDistanceMatching :
+        BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value);
+        CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->ldmParams.enableLdm;
+
+    case ZSTD_c_ldmHashLog :
+        if (value!=0)   /* 0 ==> auto */
+            BOUNDCHECK(ZSTD_c_ldmHashLog, value);
+        CCtxParams->ldmParams.hashLog = (U32)value;
+        return CCtxParams->ldmParams.hashLog;
+
+    case ZSTD_c_ldmMinMatch :
+        if (value!=0)   /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
+        CCtxParams->ldmParams.minMatchLength = (U32)value;
+        return CCtxParams->ldmParams.minMatchLength;
+
+    case ZSTD_c_ldmBucketSizeLog :
+        if (value!=0)   /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
+        CCtxParams->ldmParams.bucketSizeLog = (U32)value;
+        return CCtxParams->ldmParams.bucketSizeLog;
+
+    case ZSTD_c_ldmHashRateLog :
+        if (value!=0)   /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
+        CCtxParams->ldmParams.hashRateLog = (U32)value;
+        return CCtxParams->ldmParams.hashRateLog;
+
+    case ZSTD_c_targetCBlockSize :
+        if (value!=0) {  /* 0 ==> default */
+            value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN);
+            BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
+        }
+        CCtxParams->targetCBlockSize = (U32)value;
+        return CCtxParams->targetCBlockSize;
+
+    case ZSTD_c_srcSizeHint :
+        if (value!=0)    /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_srcSizeHint, value);
+        CCtxParams->srcSizeHint = value;
+        return (size_t)CCtxParams->srcSizeHint;
+
+    case ZSTD_c_stableInBuffer:
+        BOUNDCHECK(ZSTD_c_stableInBuffer, value);
+        CCtxParams->inBufferMode = (ZSTD_bufferMode_e)value;
+        return CCtxParams->inBufferMode;
+
+    case ZSTD_c_stableOutBuffer:
+        BOUNDCHECK(ZSTD_c_stableOutBuffer, value);
+        CCtxParams->outBufferMode = (ZSTD_bufferMode_e)value;
+        return CCtxParams->outBufferMode;
+
+    case ZSTD_c_blockDelimiters:
+        BOUNDCHECK(ZSTD_c_blockDelimiters, value);
+        CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value;
+        return CCtxParams->blockDelimiters;
+
+    case ZSTD_c_validateSequences:
+        BOUNDCHECK(ZSTD_c_validateSequences, value);
+        CCtxParams->validateSequences = value;
+        return (size_t)CCtxParams->validateSequences;
+
+    case ZSTD_c_useBlockSplitter:
+        BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
+        CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->useBlockSplitter;
+
+    case ZSTD_c_useRowMatchFinder:
+        BOUNDCHECK(ZSTD_c_useRowMatchFinder, value);
+        CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->useRowMatchFinder;
+
+    case ZSTD_c_deterministicRefPrefix:
+        BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
+        CCtxParams->deterministicRefPrefix = !!value;
+        return (size_t)CCtxParams->deterministicRefPrefix;
+
+    case ZSTD_c_prefetchCDictTables:
+        BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
+        CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->prefetchCDictTables;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
+        CCtxParams->enableMatchFinderFallback = value;
+        return (size_t)CCtxParams->enableMatchFinderFallback;
+
+    case ZSTD_c_maxBlockSize:
+        if (value!=0)    /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_maxBlockSize, value);
+        CCtxParams->maxBlockSize = value;
+        return CCtxParams->maxBlockSize;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
+        CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->searchForExternalRepcodes;
+
+    default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+    }
+}
+
+size_t ZSTD_CCtx_getParameter(ZSTD_CCtx const* cctx, ZSTD_cParameter param, int* value)
+{   ZSTD_CCtx_params const* const requested = &cctx->requestedParams;   /* the context's requestedParams are what gets queried */
+    return ZSTD_CCtxParams_getParameter(requested, param, value);
+}
+
+size_t ZSTD_CCtxParams_getParameter(
+        ZSTD_CCtx_params const* CCtxParams, ZSTD_cParameter param, int* value)
+{   /* Writes the setting stored for `param` into *value. Returns 0 on success; unknown parameters go through RETURN_ERROR. */
+    switch(param)
+    {
+    case ZSTD_c_format :
+        *value = CCtxParams->format;
+        break;
+    case ZSTD_c_compressionLevel :
+        *value = CCtxParams->compressionLevel;
+        break;
+    case ZSTD_c_windowLog :
+        *value = (int)CCtxParams->cParams.windowLog;
+        break;
+    case ZSTD_c_hashLog :
+        *value = (int)CCtxParams->cParams.hashLog;
+        break;
+    case ZSTD_c_chainLog :
+        *value = (int)CCtxParams->cParams.chainLog;
+        break;
+    case ZSTD_c_searchLog :
+        *value = CCtxParams->cParams.searchLog;
+        break;
+    case ZSTD_c_minMatch :
+        *value = CCtxParams->cParams.minMatch;
+        break;
+    case ZSTD_c_targetLength :
+        *value = CCtxParams->cParams.targetLength;
+        break;
+    case ZSTD_c_strategy :
+        *value = (unsigned)CCtxParams->cParams.strategy;
+        break;
+    case ZSTD_c_contentSizeFlag :
+        *value = CCtxParams->fParams.contentSizeFlag;
+        break;
+    case ZSTD_c_checksumFlag :
+        *value = CCtxParams->fParams.checksumFlag;
+        break;
+    case ZSTD_c_dictIDFlag :
+        *value = !CCtxParams->fParams.noDictIDFlag;   /* the flag is stored inverted, as noDictIDFlag */
+        break;
+    case ZSTD_c_forceMaxWindow :
+        *value = CCtxParams->forceWindow;
+        break;
+    case ZSTD_c_forceAttachDict :
+        *value = CCtxParams->attachDictPref;
+        break;
+    case ZSTD_c_literalCompressionMode :
+        *value = CCtxParams->literalCompressionMode;
+        break;
+    case ZSTD_c_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+        assert(CCtxParams->nbWorkers == 0);   /* the setter only accepts 0 in single-threaded builds */
+#endif
+        *value = CCtxParams->nbWorkers;
+        break;
+    case ZSTD_c_jobSize :
+#ifndef ZSTD_MULTITHREAD
+        RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+        assert(CCtxParams->jobSize <= INT_MAX);   /* guards the narrowing cast to int below */
+        *value = (int)CCtxParams->jobSize;
+        break;
+#endif
+    case ZSTD_c_overlapLog :
+#ifndef ZSTD_MULTITHREAD
+        RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+        *value = CCtxParams->overlapLog;
+        break;
+#endif
+    case ZSTD_c_rsyncable :
+#ifndef ZSTD_MULTITHREAD
+        RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+        *value = CCtxParams->rsyncable;
+        break;
+#endif
+    case ZSTD_c_enableDedicatedDictSearch :
+        *value = CCtxParams->enableDedicatedDictSearch;
+        break;
+    case ZSTD_c_enableLongDistanceMatching :
+        *value = CCtxParams->ldmParams.enableLdm;
+        break;
+    case ZSTD_c_ldmHashLog :
+        *value = CCtxParams->ldmParams.hashLog;
+        break;
+    case ZSTD_c_ldmMinMatch :
+        *value = CCtxParams->ldmParams.minMatchLength;
+        break;
+    case ZSTD_c_ldmBucketSizeLog :
+        *value = CCtxParams->ldmParams.bucketSizeLog;
+        break;
+    case ZSTD_c_ldmHashRateLog :
+        *value = CCtxParams->ldmParams.hashRateLog;
+        break;
+    case ZSTD_c_targetCBlockSize :
+        *value = (int)CCtxParams->targetCBlockSize;
+        break;
+    case ZSTD_c_srcSizeHint :
+        *value = (int)CCtxParams->srcSizeHint;
+        break;
+    case ZSTD_c_stableInBuffer :
+        *value = (int)CCtxParams->inBufferMode;
+        break;
+    case ZSTD_c_stableOutBuffer :
+        *value = (int)CCtxParams->outBufferMode;
+        break;
+    case ZSTD_c_blockDelimiters :
+        *value = (int)CCtxParams->blockDelimiters;
+        break;
+    case ZSTD_c_validateSequences :
+        *value = (int)CCtxParams->validateSequences;
+        break;
+    case ZSTD_c_useBlockSplitter :
+        *value = (int)CCtxParams->useBlockSplitter;
+        break;
+    case ZSTD_c_useRowMatchFinder :
+        *value = (int)CCtxParams->useRowMatchFinder;
+        break;
+    case ZSTD_c_deterministicRefPrefix:
+        *value = (int)CCtxParams->deterministicRefPrefix;
+        break;
+    case ZSTD_c_prefetchCDictTables:
+        *value = (int)CCtxParams->prefetchCDictTables;
+        break;
+    case ZSTD_c_enableSeqProducerFallback:
+        *value = CCtxParams->enableMatchFinderFallback;   /* the field keeps the "MatchFinder" naming */
+        break;
+    case ZSTD_c_maxBlockSize:
+        *value = (int)CCtxParams->maxBlockSize;
+        break;
+    case ZSTD_c_searchForExternalRepcodes:
+        *value = (int)CCtxParams->searchForExternalRepcodes;
+        break;
+    default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+    }
+    return 0;
+}
+
+/** ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Copies `*params` as a whole into cctx->requestedParams.
+ *  No other action is performed : parameters are merely stored.
+ *  The request is refused (stage_wrong) when the cctx is not in init stage,
+ *  or when a cdict is attached to it, since some parameters
+ *  must then be inherited from the cdict.
+ * @return : 0 on success, or an error code. */
+size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams");
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "The context is in the wrong stage!");
+    RETURN_ERROR_IF(cctx->cdict, stage_wrong,
+                    "Can't override parameters with cdict attached (some must "
+                    "be inherited from the cdict).");
+
+    cctx->requestedParams = *params;   /* plain struct copy, `params` is not retained */
+    return 0;
+}
+
+size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setCParams");
+    /* Range-check the whole set first, so that out-of-range values are rejected before any field is stored. */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), "");
+    return 0;   /* every one of the 7 fields was forwarded to ZSTD_CCtx_setParameter() */
+}
+
+size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setFParams");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), "");   /* flags are normalized to 0/1 */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), "");   /* the parameter has the opposite polarity of the struct field */
+    return 0;
+}
+
+size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setParams");
+    /* 1. Validate cParams before touching anything : the goal is to update all parameters or none. */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+    /* 2. Set fParams next : this is the step that could fail if the cctx isn't in init stage. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), "");
+    /* 3. Set cParams last : their range was checked in step 1, so this should succeed. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), "");
+    return 0;
+}
+
+size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize);
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't set pledgedSrcSize when not in init stage.");
+    cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;   /* kept with a +1 bias; presumably 0 stands for "not set" (it is the value ZSTD_CCtx_reset() stores) - to confirm */
+    return 0;
+}
+
+static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(
+        int const compressionLevel,
+        size_t const dictSize);
+static int ZSTD_dedicatedDictSearch_isSupported(
+        const ZSTD_compressionParameters* cParams);
+static void ZSTD_dedicatedDictSearch_revertCParams(
+        ZSTD_compressionParameters* cParams);
+
+/**
+ * Initializes the local dictionary using requested parameters.
+ * NOTE: Initialization does not employ the pledged src size,
+ * because the dictionary may be used for multiple compressions.
+ */
+static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+{
+    ZSTD_localDict* const local = &cctx->localDict;
+
+    /* Case 1 : no local dictionary was loaded, nothing to build. */
+    if (local->dict == NULL) {
+        assert(local->dictBuffer == NULL);
+        assert(local->cdict == NULL);
+        assert(local->dictSize == 0);
+        return 0;
+    }
+
+    /* Case 2 : the local cdict already exists, and is the one in use. */
+    if (local->cdict != NULL) {
+        assert(cctx->cdict == local->cdict);
+        return 0;
+    }
+
+    /* Case 3 : first use. Build a cdict from the loaded content, which is
+     * referenced (ZSTD_dlm_byRef), using the currently requested parameters. */
+    assert(local->dictSize > 0);
+    assert(cctx->cdict == NULL);
+    assert(cctx->prefixDict.dict == NULL);
+    local->cdict = ZSTD_createCDict_advanced2(
+            local->dict, local->dictSize, ZSTD_dlm_byRef, local->dictContentType,
+            &cctx->requestedParams, cctx->customMem);
+    RETURN_ERROR_IF(!local->cdict, memory_allocation, "ZSTD_createCDict_advanced failed");
+    cctx->cdict = local->cdict;
+    return 0;
+}
+
+size_t ZSTD_CCtx_loadDictionary_advanced(
+        ZSTD_CCtx* cctx,
+        const void* dict, size_t dictSize,
+        ZSTD_dictLoadMethod_e dictLoadMethod,
+        ZSTD_dictContentType_e dictContentType)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't load a dictionary when cctx is not in init stage.");
+    /* Any previously set dictionary is erased first, so that a NULL or
+     * empty `dict` simply leaves the cctx without dictionary. */
+    ZSTD_clearAllDicts(cctx);
+    if (dict == NULL || dictSize == 0) return 0;
+    if (dictLoadMethod != ZSTD_dlm_byRef) {
+        /* by copy : duplicate the content inside the CCtx, which then owns its lifetime */
+        void* ownedCopy;
+        RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+                        "static CCtx can't allocate for an internal copy of dictionary");
+        ownedCopy = ZSTD_customMalloc(dictSize, cctx->customMem);
+        RETURN_ERROR_IF(ownedCopy==NULL, memory_allocation,
+                        "allocation failed for dictionary content");
+        ZSTD_memcpy(ownedCopy, dict, dictSize);
+        cctx->localDict.dictBuffer = ownedCopy;  /* owned pointer, to be freed later */
+        dict = ownedCopy;                        /* from here on, reference the copy */
+    }
+    cctx->localDict.dict = dict;                 /* read-only reference */
+    cctx->localDict.dictSize = dictSize;
+    cctx->localDict.dictContentType = dictContentType;
+    return 0;
+}
+
+size_t ZSTD_CCtx_loadDictionary_byReference(
+      ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{   /* shortcut for the advanced variant, with ZSTD_dlm_byRef (content is referenced, not copied) and ZSTD_dct_auto */
+    return ZSTD_CCtx_loadDictionary_advanced(
+            cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{   /* shortcut for the advanced variant, with ZSTD_dlm_byCopy (content is duplicated inside the cctx) and ZSTD_dct_auto */
+    return ZSTD_CCtx_loadDictionary_advanced(
+            cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
+
+
+size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't ref a dict when ctx not in init stage.");
+    /* Free the existing local cdict (if any) to save memory. */
+    ZSTD_clearAllDicts(cctx);
+    cctx->cdict = cdict;   /* only the pointer is stored here; a NULL cdict leaves the cctx without dictionary */
+    return 0;
+}
+
+size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool)
+{
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't ref a pool when ctx not in init stage.");
+    cctx->pool = pool;   /* only the pointer is stored; nothing is created or released here */
+    return 0;
+}
+
+size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize)
+{   /* shortcut for the advanced variant, with the prefix declared as ZSTD_dct_rawContent */
+    return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent);
+}
+
+size_t ZSTD_CCtx_refPrefix_advanced(
+        ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't ref a prefix when ctx not in init stage.");
+    ZSTD_clearAllDicts(cctx);   /* a new prefix replaces anything set earlier */
+    /* NULL or empty prefix : nothing to reference, the cctx is left as cleared */
+    if (prefix == NULL || prefixSize == 0) return 0;
+    cctx->prefixDict.dict = prefix;   /* referenced only, no copy is made here */
+    cctx->prefixDict.dictSize = prefixSize;
+    cctx->prefixDict.dictContentType = dictContentType;
+    return 0;
+}
+
+/*! ZSTD_CCtx_reset() :
+ *  Resets the session, the parameters, or both. Dictionaries are dumped whenever parameters are reset. */
+size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+{
+    int const both         = (reset == ZSTD_reset_session_and_parameters);
+    int const resetSession = both || (reset == ZSTD_reset_session_only);
+    int const resetParams  = both || (reset == ZSTD_reset_parameters);
+    if (resetSession) {   /* back to init stage, pledged size forgotten */
+        cctx->streamStage = zcss_init;
+        cctx->pledgedSrcSizePlusOne = 0;
+    }
+    if (!resetParams) return 0;
+    /* parameters can only be reset from init stage (always true right after a session reset) */
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Reset parameters is only possible during init stage.");
+    ZSTD_clearAllDicts(cctx);
+    return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+}
+
+
+/** ZSTD_checkCParams() :
+    verifies that each of the 7 fields of `cParams` lies within its authorized range.
+    @return : 0, or an error code if one value is beyond authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+    BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog);
+    BOUNDCHECK(ZSTD_c_chainLog,  (int)cParams.chainLog);
+    BOUNDCHECK(ZSTD_c_hashLog,   (int)cParams.hashLog);
+    BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog);
+    BOUNDCHECK(ZSTD_c_minMatch,  (int)cParams.minMatch);
+    BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength);
+    BOUNDCHECK(ZSTD_c_strategy,  cParams.strategy);   /* enum field, passed without a cast */
+    return 0;
+}
+
+/** ZSTD_clampCParams() :
+ *  brings every CParam value back within the bounds reported by ZSTD_cParam_getBounds().
+ *  @return : valid CParams */
+static ZSTD_compressionParameters
+ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+{
+#   define CLAMP_TYPE(cParam, val, type)                                      \
+        do {                                                                  \
+            ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
+            if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound;      \
+            else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+        } while (0)
+#   define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+    CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+    CLAMP(ZSTD_c_chainLog,  cParams.chainLog);
+    CLAMP(ZSTD_c_hashLog,   cParams.hashLog);
+    CLAMP(ZSTD_c_searchLog, cParams.searchLog);
+    CLAMP(ZSTD_c_minMatch,  cParams.minMatch);
+    CLAMP(ZSTD_c_targetLength,cParams.targetLength);
+    CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy);   /* enum field : the bound is cast back to ZSTD_strategy rather than unsigned */
+    return cParams;   /* note : CLAMP and CLAMP_TYPE remain defined past this function */
+}
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+    /* strategies from ZSTD_btlazy2 upward get one bit less, all others get hashLog unchanged */
+    if ((U32)strat >= (U32)ZSTD_btlazy2) return hashLog - 1;
+    return hashLog;
+}
+
+/** ZSTD_dictAndWindowLog() :
+ * Returns an adjusted window log that is large enough to fit the source and the dictionary.
+ * The zstd format says that the entire dictionary is valid if one byte of the dictionary
+ * is within the window. So the hashLog and chainLog should be large enough to reference both
+ * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing
+ * the hashLog and windowLog.
+ * NOTE: srcSize must not be ZSTD_CONTENTSIZE_UNKNOWN.
+ */
+static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize)
+{
+    const U64 windowSizeCap = 1ULL << ZSTD_WINDOWLOG_MAX;
+
+    /* No dictionary ==> nothing to adjust */
+    if (dictSize == 0) return windowLog;
+
+    assert(windowLog <= ZSTD_WINDOWLOG_MAX);
+    assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */
+    {   U64 const windowSize = 1ULL << windowLog;
+        U64 const combinedSize = dictSize + windowSize;
+
+        /* 1. The window already covers dictionary + source : keep it as is. */
+        if (windowSize >= dictSize + srcSize) return windowLog;
+
+        /* 2. Dictionary + window reach the largest allowed window : saturate. */
+        if (combinedSize >= windowSizeCap) return ZSTD_WINDOWLOG_MAX;
+
+        /* 3. Otherwise derive the log from combinedSize, which is known to be
+         *    < (1 << ZSTD_WINDOWLOG_MAX) at this point, hence the narrowing to U32.
+         *    (presumably a ceil(log2) : depends on ZSTD_highbit32() returning the
+         *    position of the highest set bit) */
+        return ZSTD_highbit32((U32)combinedSize - 1) + 1;
+    }
+}
+
+/** ZSTD_adjustCParams_internal() :
+ *  optimize `cPar` for a specified input (`srcSize` and `dictSize`).
+ *  mostly downsize to reduce memory consumption and initialization latency.
+ * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known.
+ * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`.
+ *  note : `srcSize==0` means 0!
+ *  condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */
+static ZSTD_compressionParameters
+ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+                            unsigned long long srcSize,
+                            size_t dictSize,
+                            ZSTD_cParamMode_e mode,
+                            ZSTD_paramSwitch_e useRowMatchFinder)
+{
+    const U64 minSrcSize = 513; /* (1<<9) + 1 */
+    const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);   /* above this size, windowLog is not downsized */
+    assert(ZSTD_checkCParams(cPar)==0);
+
+    /* When the block compressor of the selected strategy is excluded from this
+     * binary, fall back, one step at a time, to the next lower strategy. */
+#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_btultra2) {
+        cPar.strategy = ZSTD_btultra;
+    }
+    if (cPar.strategy == ZSTD_btultra) {
+        cPar.strategy = ZSTD_btopt;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_btopt) {
+        cPar.strategy = ZSTD_btlazy2;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_btlazy2) {
+        cPar.strategy = ZSTD_lazy2;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_lazy2) {
+        cPar.strategy = ZSTD_lazy;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_lazy) {
+        cPar.strategy = ZSTD_greedy;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_greedy) {
+        cPar.strategy = ZSTD_dfast;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_dfast) {
+        cPar.strategy = ZSTD_fast;
+        cPar.targetLength = 0;
+    }
+#endif
+
+    switch (mode) {
+    case ZSTD_cpm_unknown:
+    case ZSTD_cpm_noAttachDict:
+        /* If we don't know the source size, don't make any
+         * assumptions about it. We will already have selected
+         * smaller parameters if a dictionary is in use.
+         */
+        break;
+    case ZSTD_cpm_createCDict:
+        /* Assume a small source size when creating a dictionary
+         * with an unknown source size.
+         */
+        if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+            srcSize = minSrcSize;
+        break;
+    case ZSTD_cpm_attachDict:
+        /* Dictionary has its own dedicated parameters which have
+         * already been selected. We are selecting parameters
+         * for only the source.
+         */
+        dictSize = 0;
+        break;
+    default:
+        assert(0);
+        break;
+    }
+
+    /* resize windowLog if input is small enough, to use less memory */
+    if ( (srcSize <= maxWindowResize)
+      && (dictSize <= maxWindowResize) )  {
+        U32 const tSize = (U32)(srcSize + dictSize);   /* both terms are <= maxWindowResize here, per the condition above */
+        static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
+        U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
+                            ZSTD_highbit32(tSize-1) + 1;
+        if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
+    }
+    if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+        U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize);
+        U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
+        if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1;
+        if (cycleLog > dictAndWindowLog)
+            cPar.chainLog -= (cycleLog - dictAndWindowLog);   /* shrink chainLog by the excess of cycleLog */
+    }
+
+    if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
+        cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* minimum wlog required for valid frame header */
+
+    /* We can't use more than 32 bits of hash in total, so that means that we require:
+     * (hashLog + 8) <= 32 && (chainLog + 8) <= 32
+     */
+    if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
+        U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
+        if (cPar.hashLog > maxShortCacheHashLog) {
+            cPar.hashLog = maxShortCacheHashLog;
+        }
+        if (cPar.chainLog > maxShortCacheHashLog) {
+            cPar.chainLog = maxShortCacheHashLog;
+        }
+    }
+
+
+    /* At this point, we aren't 100% sure if we are using the row match finder.
+     * Unless it is explicitly disabled, conservatively assume that it is enabled.
+     * In this case it will only be disabled for small sources, so shrinking the
+     * hash log a little bit shouldn't result in any ratio loss.
+     */
+    if (useRowMatchFinder == ZSTD_ps_auto)
+        useRowMatchFinder = ZSTD_ps_enable;
+
+    /* We can't hash more than 32-bits in total. So that means that we require:
+     * (hashLog - rowLog + 8) <= 32
+     */
+    if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) {
+        /* rowLog follows searchLog, kept within [4, 6] */
+        U32 const rowLog = BOUNDED(4, cPar.searchLog, 6);
+        U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS;
+        U32 const maxHashLog = maxRowHashLog + rowLog;
+        assert(cPar.hashLog >= rowLog);
+        if (cPar.hashLog > maxHashLog) {
+            cPar.hashLog = maxHashLog;
+        }
+    }
+
+    return cPar;
+}
+
+ZSTD_compressionParameters
+ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
+                   unsigned long long srcSize,
+                   size_t dictSize)
+{
+    cPar = ZSTD_clampCParams(cPar);   /* resulting cPar is necessarily valid (all parameters within range) */
+    if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;   /* in this entry point, 0 is taken as "unknown" (unlike the internal variant) */
+    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto);
+}
+
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
+
+static void ZSTD_overrideCParams(
+              ZSTD_compressionParameters* cParams,
+        const ZSTD_compressionParameters* overrides)
+{   /* field-by-field merge : a zero field in `overrides` leaves the matching field of `cParams` untouched */
+    if (overrides->windowLog)    cParams->windowLog    = overrides->windowLog;
+    if (overrides->hashLog)      cParams->hashLog      = overrides->hashLog;
+    if (overrides->chainLog)     cParams->chainLog     = overrides->chainLog;
+    if (overrides->searchLog)    cParams->searchLog    = overrides->searchLog;
+    if (overrides->minMatch)     cParams->minMatch     = overrides->minMatch;
+    if (overrides->targetLength) cParams->targetLength = overrides->targetLength;
+    if (overrides->strategy)     cParams->strategy     = overrides->strategy;
+}
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode)
+{
+    ZSTD_compressionParameters cParams;
+    if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
+      srcSizeHint = CCtxParams->srcSizeHint;   /* size unknown : fall back on the hint stored in the parameters */
+    }
+    cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode);   /* start from what the compression level selects */
+    if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+    ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);   /* non-zero fields of CCtxParams->cParams take precedence */
+    assert(!ZSTD_checkCParams(cParams));
+    /* srcSizeHint == 0 means 0 */
+    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder);
+}
+
+static size_t
+ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+                       const ZSTD_paramSwitch_e useRowMatchFinder,
+                       const U32 enableDedicatedDictSearch,
+                       const U32 forCCtx)
+{
+    /* chain table size should be 0 for fast or row-hash strategies */
+    size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx)
+                                ? ((size_t)1 << cParams->chainLog)
+                                : 0;
+    size_t const hSize = ((size_t)1) << cParams->hashLog;
+    U32    const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+    size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
+    /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't
+     * surrounded by redzones in ASAN. */
+    size_t const tableSpace = chainSize * sizeof(U32)
+                            + hSize * sizeof(U32)
+                            + h3Size * sizeof(U32);
+    /* space for the optimal parser; only counted below when forCCtx is set and strategy >= ZSTD_btopt */
+    size_t const optPotentialSpace =
+        ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t))
+      + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
+    size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
+                                            ? ZSTD_cwksp_aligned_alloc_size(hSize)   /* hSize bytes, i.e. one byte per hash table entry */
+                                            : 0;
+    size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
+                                ? optPotentialSpace
+                                : 0;
+    size_t const slackSpace = ZSTD_cwksp_slack_space_required();
+
+    /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */
+    ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4);
+    assert(useRowMatchFinder != ZSTD_ps_auto);   /* the mode must already be resolved by the caller */
+
+    DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
+                (U32)chainSize, (U32)hSize, (U32)h3Size);
+    return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
+}
+
+/* Helper function for calculating memory requirements.
+ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
+static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
+    /* one sequence per 3 bytes when minMatch==3 or a sequence producer is in use, one per 4 bytes otherwise */
+    return blockSize / ((minMatch==3 || useSequenceProducer) ? 3u : 4u);
+}
+
+static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+        const ZSTD_compressionParameters* cParams,
+        const ldmParams_t* ldmParams,
+        const int isStatic,
+        const ZSTD_paramSwitch_e useRowMatchFinder,
+        const size_t buffInSize,
+        const size_t buffOutSize,
+        const U64 pledgedSrcSize,
+        int useSequenceProducer,
+        size_t maxBlockSize)
+{
+    size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);   /* presumably 2^windowLog, capped by pledgedSrcSize and at least 1 */
+    size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
+    size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
+    size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+                            + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
+                            + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+    size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE);
+    size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+    size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1);
+
+    size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
+    size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
+    size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ?
+        ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
+
+
+    size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize)
+                             + ZSTD_cwksp_alloc_size(buffOutSize);
+
+    size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;   /* the CCtx structure itself is only counted when isStatic is set */
+
+    size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+    size_t const externalSeqSpace = useSequenceProducer
+        ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
+        : 0;
+
+    size_t const neededSpace =
+        cctxSpace +
+        entropySpace +
+        blockStateSpace +
+        ldmSpace +
+        ldmSeqSpace +
+        matchStateSize +
+        tokenSpace +
+        bufferSpace +
+        externalSeqSpace;
+
+    DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);   /* note : logged value is truncated to 32 bits, the returned one is not */
+    return neededSpace;
+}
+
+size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+    ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
+                                                                               &cParams);   /* resolved against the final cParams */
+
+    RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+    /* estimateCCtxSize is for one-shot compression. So no buffers should
+     * be needed. However, we still allocate two 0-sized buffers, which can
+     * take space under ASAN. */
+    return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+        &cParams, &params->ldmParams, /* isStatic */ 1, useRowMatchFinder, /* buffInSize */ 0, /* buffOutSize */ 0, /* pledgedSrcSize */ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+}
+
+size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params probe = ZSTD_makeCCtxParamsFromCParams(cParams);
+    size_t withoutRow;
+    size_t withRow;
+
+    /* strategy cannot use the row-based matchfinder : a single estimation is enough */
+    if (!ZSTD_rowMatchFinderSupported(cParams.strategy))
+        return ZSTD_estimateCCtxSize_usingCCtxParams(&probe);
+    /* otherwise estimate with the row-based matchfinder off, then on, and keep the larger */
+    probe.useRowMatchFinder = ZSTD_ps_disable;
+    withoutRow = ZSTD_estimateCCtxSize_usingCCtxParams(&probe);
+    probe.useRowMatchFinder = ZSTD_ps_enable;
+    withRow = ZSTD_estimateCCtxSize_usingCCtxParams(&probe);
+    return MAX(withoutRow, withRow);
+}
+
+static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
+{
+    static const unsigned long long srcSizeTiers[4] = {16 KB, 128 KB, 256 KB, ZSTD_CONTENTSIZE_UNKNOWN};
+    size_t largestSize = 0;
+    int tier;
+    for (tier = 0; tier < 4; ++tier) {   /* keep the largest cctxSize over the cParams of every srcSize tier */
+        ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeTiers[tier], 0, ZSTD_cpm_noAttachDict);
+        size_t const tierSize = ZSTD_estimateCCtxSize_usingCParams(cParams);   /* computed once : passed straight to the MAX() macro, the call could be evaluated twice */
+        if (tierSize > largestSize) largestSize = tierSize;
+    }
+    return largestSize;
+}
+
+size_t ZSTD_estimateCCtxSize(int compressionLevel)
+{
+    /* Walk all levels from MIN(compressionLevel, 1) upward, so that memory usage is monotonically increasing with level */
+    int lvl = (compressionLevel < 1) ? compressionLevel : 1;
+    size_t budget = 0;
+    for ( ; lvl <= compressionLevel; lvl++) {
+        size_t const lvlSize = ZSTD_estimateCCtxSize_internal(lvl);
+        if (lvlSize > budget) budget = lvlSize;
+    }
+    return budget;
+}
+
+size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+    {   ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+        size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
+        size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
+                ? ((size_t)1 << cParams.windowLog) + blockSize : 0;
+        size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
+                ? ZSTD_compressBound(blockSize) + 1 : 0;
+        /* Resolve the row matchfinder from the final cParams, as ZSTD_estimateCCtxSize_usingCCtxParams() does,
+         * not from params->cParams, whose fields can be 0 (= not set) when parameters derive from the level. */
+        ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &cParams);
+
+        return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+            &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
+            ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+    }
+}
+
+size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params probe = ZSTD_makeCCtxParamsFromCParams(cParams);
+    size_t withoutRow;
+    size_t withRow;
+
+    /* strategy cannot use the row-based matchfinder : a single estimation is enough */
+    if (!ZSTD_rowMatchFinderSupported(cParams.strategy))
+        return ZSTD_estimateCStreamSize_usingCCtxParams(&probe);
+    /* otherwise estimate with the row-based matchfinder off, then on, and keep the larger */
+    probe.useRowMatchFinder = ZSTD_ps_disable;
+    withoutRow = ZSTD_estimateCStreamSize_usingCCtxParams(&probe);
+    probe.useRowMatchFinder = ZSTD_ps_enable;
+    withRow = ZSTD_estimateCStreamSize_usingCCtxParams(&probe);
+    return MAX(withoutRow, withRow);
+}
+
+static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
+{   /* estimation for the cParams of this level, with unknown source size and no dictionary */
+    return ZSTD_estimateCStreamSize_usingCParams(
+        ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict));
+}
+
+size_t ZSTD_estimateCStreamSize(int compressionLevel)
+{
+    int lvl = (compressionLevel < 1) ? compressionLevel : 1;
+    size_t budget = 0;
+    for ( ; lvl <= compressionLevel; lvl++) {   /* largest estimation over levels MIN(compressionLevel, 1)..compressionLevel */
+        size_t const lvlSize = ZSTD_estimateCStreamSize_internal(lvl);
+        if (lvlSize > budget) budget = lvlSize;
+    }
+    return budget;
+}
+
+/* ZSTD_getFrameProgression():
+ * reports how much input the current frame has ingested / consumed and how much output it has produced.
+ * when built with ZSTD_MULTITHREAD and nbWorkers > 0, the answer comes from ZSTDMT_getFrameProgression().
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        return ZSTDMT_getFrameProgression(cctx->mtctx);
+    }
+#endif
+    {   ZSTD_frameProgression progression;
+        size_t pendingInput = 0;   /* inBuffPos - inToCompress; stays 0 when there is no input buffer */
+        if (cctx->inBuff != NULL) pendingInput = cctx->inBuffPos - cctx->inToCompress;
+        if (pendingInput) assert(cctx->inBuffPos >= cctx->inToCompress);
+        assert(pendingInput <= ZSTD_BLOCKSIZE_MAX);
+        progression.currentJobID = 0;
+        progression.nbActiveWorkers = 0;
+        progression.consumed = cctx->consumedSrcSize;
+        progression.ingested = cctx->consumedSrcSize + pendingInput;
+        progression.produced = cctx->producedCSize;
+        progression.flushed  = cctx->producedCSize;   /* simplified; some data might still be left within streaming output buffer */
+        return progression;
+}   }
+
+/*! ZSTD_toFlushNow()
+ *  Only meaningful for multithreading scenarios at the moment (nbWorkers >= 1).
+ */
+size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx)
+{
+    /* 0 is an over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */
+    size_t toFlush = 0;
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) toFlush = ZSTDMT_toFlushNow(cctx->mtctx);
+#endif
+    (void)cctx;
+    return toFlush;
+}
+
+static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1,
+                                    ZSTD_compressionParameters cParams2)
+{   /* Debug-only comparison: asserts that the seven fields listed below are pairwise equal. */
+    /* The casts keep both parameters "used" when assert() expands to nothing. */
+    (void)cParams1; (void)cParams2;
+    assert(cParams1.windowLog    == cParams2.windowLog);
+    assert(cParams1.chainLog     == cParams2.chainLog);
+    assert(cParams1.hashLog      == cParams2.hashLog);
+    assert(cParams1.searchLog    == cParams2.searchLog);
+    assert(cParams1.minMatch     == cParams2.minMatch);
+    assert(cParams1.targetLength == cParams2.targetLength);
+    assert(cParams1.strategy     == cParams2.strategy);
+}
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+{   /* Reloads the repcodes from repStartValue[] and sets every entropy repeatMode to its "none" value. */
+    int repIdx = ZSTD_REP_NUM;
+    while (repIdx-- > 0)
+        bs->rep[repIdx] = repStartValue[repIdx];
+    bs->entropy.fse.litlength_repeatMode = FSE_repeat_none;
+    bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none;
+    bs->entropy.fse.offcode_repeatMode = FSE_repeat_none;
+    bs->entropy.huf.repeatMode = HUF_repeat_none;
+}
+
+/*! ZSTD_invalidateMatchState()
+ *  Invalidates every match held in the match finder tables.
+ *  Requires nextSrc and base to be set (can be NULL).
+ */
+static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
+{
+    ms->dictMatchState = NULL;
+    ms->loadedDictEnd = 0;
+    ms->opt.litLengthSum = 0;  /* force reset of btopt stats */
+    /* the window is cleared before its dictLimit is read */
+    ZSTD_window_clear(&ms->window);
+    ms->nextToUpdate = ms->window.dictLimit;
+}
+
+/**
+ * Table-cleaning policy for one matchState reset.
+ * ZSTDcrp_makeClean : tables are cleared / prepared for the coming compression.
+ * ZSTDcrp_leaveDirty : tables may stay unclean, because a subsequent operation
+ * overwrites the table space anyway (e.g., copying the matchState contents in
+ * from a CDict).
+ */
+typedef enum {
+    ZSTDcrp_makeClean = 0,
+    ZSTDcrp_leaveDirty = 1
+} ZSTD_compResetPolicy_e;
+
+/**
+ * Index policy for one matchState reset.
+ * ZSTDirp_continue : indexing carries on from where it left off.
+ * ZSTDirp_reset : indexing is restarted from zero.
+ */
+typedef enum {
+    ZSTDirp_continue = 0,
+    ZSTDirp_reset = 1
+} ZSTD_indexResetPolicy_e;
+
+typedef enum {
+    ZSTD_resetTarget_CDict = 0,  /* the matchState being reset belongs to a CDict */
+    ZSTD_resetTarget_CCtx = 1    /* the matchState being reset belongs to a CCtx */
+} ZSTD_resetTarget_e;
+
+/* Mixes the bits of a 64-bit value, based on XXH3_rrmxmx */
+static U64 ZSTD_bitmix(U64 val, U64 len) {
+    U64 const multiplier = 0x9FB21C651E98DF25ULL;
+    U64 mixed = val ^ ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
+    mixed *= multiplier;
+    mixed = (mixed ^ ((mixed >> 35) + len)) * multiplier;
+    return mixed ^ (mixed >> 28);
+}
+
+/* Derives the next hashSalt from the current hashSalt and hashSaltEntropy, each passed through ZSTD_bitmix() */
+static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
+    ms->hashSalt = ZSTD_bitmix((U64) ms->hashSaltEntropy, 4) ^ ZSTD_bitmix(ms->hashSalt, 8);
+}
+
+static size_t
+ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+                      ZSTD_cwksp* ws,
+                const ZSTD_compressionParameters* cParams,
+                const ZSTD_paramSwitch_e useRowMatchFinder,
+                const ZSTD_compResetPolicy_e crp,
+                const ZSTD_indexResetPolicy_e forceResetIndex,
+                const ZSTD_resetTarget_e forWho)
+{   /* Invalidates `ms` and re-reserves its tables from `ws` according to `cParams`; @return : 0, or a memory_allocation error */
+    /* disable chain table allocation for fast or row-based strategies */
+    size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder,
+                                                     ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict))
+                                ? ((size_t)1 << cParams->chainLog)
+                                : 0;
+    size_t const hSize = ((size_t)1) << cParams->hashLog;
+    U32    const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+    size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;   /* hashTable3 gets entries only for a CCtx with minMatch==3 */
+
+    DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
+    assert(useRowMatchFinder != ZSTD_ps_auto);
+    if (forceResetIndex == ZSTDirp_reset) {   /* restart indexing : window is re-initialized and tables are flagged dirty */
+        ZSTD_window_init(&ms->window);
+        ZSTD_cwksp_mark_tables_dirty(ws);
+    }
+
+    ms->hashLog3 = hashLog3;
+    ms->lazySkipping = 0;
+
+    ZSTD_invalidateMatchState(ms);
+
+    assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */
+
+    ZSTD_cwksp_clear_tables(ws);
+
+    DEBUGLOG(5, "reserving table space");
+    /* table Space : hash table, chain table, then 3-byte hash table; a failed reservation is detected right after */
+    ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32));
+    ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32));
+    ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32));
+    RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+                    "failed a workspace allocation in ZSTD_reset_matchState");
+
+    DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty);
+    if (crp!=ZSTDcrp_leaveDirty) {
+        /* reset tables only */
+        ZSTD_cwksp_clean_tables(ws);
+    }
+
+    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+        /* Row match finder needs an additional table of hashes ("tags"), one byte per hash table entry */
+        size_t const tagTableSize = hSize;
+        /* We want to generate a new salt in case we reset a Cctx, but we always want to use
+         * 0 when we reset a Cdict */
+        if(forWho == ZSTD_resetTarget_CCtx) {
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
+            ZSTD_advanceHashSalt(ms);
+        } else {
+            /* When we are not salting we want to always memset the memory */
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+            ZSTD_memset(ms->tagTable, 0, tagTableSize);
+            ms->hashSalt = 0;
+        }
+        {   /* rowLog is searchLog bounded to [4, 6] via BOUNDED(); rowHashLog is what remains of hashLog */
+            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+            assert(cParams->hashLog >= rowLog);
+            ms->rowHashLog = cParams->hashLog - rowLog;
+        }
+    }
+
+    /* opt parser space : only reserved for a CCtx using strategy ZSTD_btopt or above */
+    if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+        DEBUGLOG(4, "reserving optimal parser space");
+        ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+        ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+        ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+        ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t));
+        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
+    }
+
+    ms->cParams = *cParams;
+
+    RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+                    "failed a workspace allocation in ZSTD_reset_matchState");
+    return 0;
+}
+
+/* ZSTD_indexTooCloseToMax() :
+ * minor optimization : favors memset() over reduceIndex(),
+ * which is measurably slow in some circumstances (reported for Visual Studio).
+ * Helps when a context is re-used for a lot of smallish inputs :
+ * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN,
+ * memset() will be triggered before reduceIndex().
+ */
+#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB)
+static int ZSTD_indexTooCloseToMax(ZSTD_window_t w)
+{   size_t const currentIndex = (size_t)(w.nextSrc - w.base);
+    return currentIndex > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN);
+}
+
+/** ZSTD_dictTooBig():
+ * Tells whether a dictionary is larger than ZSTD_CHUNKSIZE_MAX. Such dictionaries
+ * can't be loaded in one go generically, so in that case the tables get reset
+ * to zero, letting as much of the dictionary as possible be loaded.
+ */
+static int ZSTD_dictTooBig(size_t const loadedDictSize)
+{
+    return (ZSTD_CHUNKSIZE_MAX < loadedDictSize) ? 1 : 0;
+}
+
+/*! ZSTD_resetCCtx_internal() :
+ * Copies `params` into zc->appliedParams, resizes the workspace when needed, and re-reserves the context's buffers and tables.
+ * @param loadedDictSize The size of the dictionary to be loaded into the context, if any.
+ * If no dictionary is used, or the dictionary is being attached / copied, then pass 0.
+ * note : `params` are assumed fully validated at this stage. @return : 0, or an error code.
+ */
+static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+                                      ZSTD_CCtx_params const* params,
+                                      U64 const pledgedSrcSize,
+                                      size_t const loadedDictSize,
+                                      ZSTD_compResetPolicy_e const crp,
+                                      ZSTD_buffered_policy_e const zbuff)
+{
+    ZSTD_cwksp* const ws = &zc->workspace;
+    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
+                (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
+    assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+
+    zc->isFirstBlock = 1;
+
+    /* Set applied params early so we can modify them for LDM,
+     * and point params at the applied params.
+     */
+    zc->appliedParams = *params;
+    params = &zc->appliedParams;
+
+    assert(params->useRowMatchFinder != ZSTD_ps_auto);
+    assert(params->useBlockSplitter != ZSTD_ps_auto);
+    assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+    assert(params->maxBlockSize != 0);
+    if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+        /* Adjust long distance matching parameters */
+        ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
+        assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog);
+        assert(params->ldmParams.hashRateLog < 32);
+    }
+
+    {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
+        size_t const blockSize = MIN(params->maxBlockSize, windowSize);
+        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params));
+        size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
+                ? ZSTD_compressBound(blockSize) + 1
+                : 0;
+        size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered)
+                ? windowSize + blockSize
+                : 0;
+        size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize);
+
+        int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window);
+        int const dictTooBig = ZSTD_dictTooBig(loadedDictSize);
+        ZSTD_indexResetPolicy_e needsIndexReset =
+            (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue;
+
+        size_t const neededSpace =
+            ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+                &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
+                buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+
+        FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
+
+        if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0);
+
+        {   /* Check if workspace is large enough, alloc a new one if needed */
+            int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
+            int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
+            int resizeWorkspace = workspaceTooSmall || workspaceWasteful;
+            DEBUGLOG(4, "Need %zu B workspace", neededSpace);
+            DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+            if (resizeWorkspace) {
+                DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB",
+                            ZSTD_cwksp_sizeof(ws) >> 10,
+                            neededSpace >> 10);
+
+                RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize");
+
+                needsIndexReset = ZSTDirp_reset;   /* the workspace is freed and re-created just below */
+
+                ZSTD_cwksp_free(ws, zc->customMem);
+                FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), "");
+
+                DEBUGLOG(5, "reserving object space");
+                /* Statically sized space.
+                 * entropyWorkspace never moves,
+                 * though prev/next block swap places */
+                assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t)));
+                zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+                RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock");
+                zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+                RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
+                zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE);
+                RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
+        }   }
+
+        ZSTD_cwksp_clear(ws);
+
+        /* init params */
+        zc->blockState.matchState.cParams = params->cParams;
+        zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
+        zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+        zc->consumedSrcSize = 0;
+        zc->producedCSize = 0;
+        if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)   /* unknown source size : contentSizeFlag is cleared */
+            zc->appliedParams.fParams.contentSizeFlag = 0;
+        DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+            (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+        zc->blockSize = blockSize;
+
+        XXH64_reset(&zc->xxhState, 0);
+        zc->stage = ZSTDcs_init;
+        zc->dictID = 0;
+        zc->dictContentSize = 0;
+
+        ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
+        FORWARD_IF_ERROR(ZSTD_reset_matchState(
+                &zc->blockState.matchState,
+                ws,
+                &params->cParams,
+                params->useRowMatchFinder,
+                crp,
+                needsIndexReset,
+                ZSTD_resetTarget_CCtx), "");
+
+        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+        /* ldm hash table */
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+            /* TODO: avoid memset? */
+            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+            zc->maxNbLdmSequences = maxNbLdmSeq;
+
+            ZSTD_window_init(&zc->ldmState.window);
+            zc->ldmState.loadedDictEnd = 0;
+        }
+
+        /* reserve space for block-level external sequences */
+        if (ZSTD_hasExtSeqProd(params)) {
+            size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+            zc->extSeqBufCapacity = maxNbExternalSeq;
+            zc->extSeqBuf =
+                (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
+        }
+
+        /* buffers */
+
+        /* ZSTD_wildcopy() is used to copy into the literals buffer,
+         * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+         */
+        zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
+        zc->seqStore.maxNbLit = blockSize;
+
+        zc->bufferedPolicy = zbuff;
+        zc->inBuffSize = buffInSize;
+        zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
+        zc->outBuffSize = buffOutSize;
+        zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize);
+
+        /* ldm bucketOffsets table */
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+            /* TODO: avoid memset? */
+            size_t const numBuckets =
+                  ((size_t)1) << (params->ldmParams.hashLog -
+                                  params->ldmParams.bucketSizeLog);
+            zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets);
+            ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets);
+        }
+
+        /* sequences storage */
+        ZSTD_referenceExternalSequences(zc, NULL, 0);
+        zc->seqStore.maxNbSeq = maxNbSeq;
+        zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+        zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+        zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+
+        DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
+
+        zc->initialized = 1;
+
+        return 0;
+    }
+}
+
+/* ZSTD_invalidateRepCodes() :
+ * zeroes the repcodes of prevCBlock, so that the next compression does not use repcodes from the previous block.
+ * Note : only works with regular variant; do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
+    int repIdx = 0;
+    while (repIdx < ZSTD_REP_NUM)
+        cctx->blockState.prevCBlock->rep[repIdx++] = 0;
+    assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+}
+
+/* These are the approximate sizes for each strategy past which copying the
+ * dictionary tables into the working context is faster than using them
+ * in-place. Indexed by strategy value (see ZSTD_shouldAttachDict()).
+ */
+static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = {
+    8 KB,  /* unused */
+    8 KB,  /* ZSTD_fast */
+    16 KB, /* ZSTD_dfast */
+    32 KB, /* ZSTD_greedy */
+    32 KB, /* ZSTD_lazy */
+    32 KB, /* ZSTD_lazy2 */
+    32 KB, /* ZSTD_btlazy2 */
+    32 KB, /* ZSTD_btopt */
+    8 KB,  /* ZSTD_btultra */
+    8 KB   /* ZSTD_btultra2 */
+};
+
+static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
+                                 const ZSTD_CCtx_params* params,
+                                 U64 pledgedSrcSize)
+{
+    size_t const cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
+    /* a CDict whose matchState has dedicatedDictSearch set is attached unconditionally */
+    if (cdict->matchState.dedicatedDictSearch) return 1;
+    /* dictMatchState isn't correctly handled in _enforceMaxDist */
+    if (params->forceWindow) return 0;
+    /* explicit preferences are honored next; the size heuristic is the fallback */
+    if (params->attachDictPref == ZSTD_dictForceCopy) return 0;
+    if (params->attachDictPref == ZSTD_dictForceAttach) return 1;
+    return (pledgedSrcSize <= cutoff) || (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+static size_t
+ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+                        const ZSTD_CDict* cdict,
+                        ZSTD_CCtx_params params,
+                        U64 pledgedSrcSize,
+                        ZSTD_buffered_policy_e zbuff)
+{   /* Resets cctx, then references cdict's matchState in place (tables are not copied); @return : 0, or an error code */
+    DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu",
+                (unsigned long long)pledgedSrcSize);
+    {
+        ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams;
+        unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
+        /* Resize working context table params for input only, since the dict
+         * has its own tables. */
+        /* pledgedSrcSize == 0 means 0! */
+
+        if (cdict->matchState.dedicatedDictSearch) {
+            ZSTD_dedicatedDictSearch_revertCParams(&adjusted_cdict_cParams);
+        }
+
+        params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
+                                                     cdict->dictContentSize, ZSTD_cpm_attachDict,
+                                                     params.useRowMatchFinder);
+        params.cParams.windowLog = windowLog;   /* the windowLog requested by the caller is preserved */
+        params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
+        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+                                                 /* loadedDictSize */ 0,
+                                                 ZSTDcrp_makeClean, zbuff), "");
+        assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy);
+    }
+
+    {   const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc
+                                  - cdict->matchState.window.base);
+        const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit;
+        if (cdictLen == 0) {
+            /* don't even attach dictionaries with no contents */
+            DEBUGLOG(4, "skipping attaching empty dictionary");
+        } else {
+            DEBUGLOG(4, "attaching dictionary into context");
+            cctx->blockState.matchState.dictMatchState = &cdict->matchState;
+
+            /* prep working match state so dict matches never have negative indices
+             * when they are translated to the working context's index space. */
+            if (cctx->blockState.matchState.window.dictLimit < cdictEnd) {
+                cctx->blockState.matchState.window.nextSrc =
+                    cctx->blockState.matchState.window.base + cdictEnd;
+                ZSTD_window_clear(&cctx->blockState.matchState.window);
+            }
+            /* loadedDictEnd is expressed within the referential of the active context */
+            cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
+    }   }
+
+    cctx->dictID = cdict->dictID;
+    cctx->dictContentSize = cdict->dictContentSize;
+
+    /* copy block state : cdict's cBlockState becomes the context's prevCBlock */
+    ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+    return 0;
+}
+
+/* Copies a CDict index table of `tableSize` entries from `src` into `dst`. */
+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
+                                        ZSTD_compressionParameters const* cParams) {
+    size_t pos;
+    if (!ZSTD_CDictIndicesAreTagged(cParams)) {
+        /* Untagged indices are copied verbatim. */
+        ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
+        return;
+    }
+    /* Tagged indices: shift the tag bits out of every entry while copying.
+     * See docs on "short cache" in zstd_compress_internal.h for context. */
+    for (pos = 0; pos < tableSize; pos++) {
+        dst[pos] = src[pos] >> ZSTD_SHORT_CACHE_TAG_BITS;
+    }
+}
+
+static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            ZSTD_CCtx_params params,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{   /* Resets cctx with cdict's table parameters, then copies cdict's tables and state into it; @return : 0, or an error code */
+    const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+
+    assert(!cdict->matchState.dedicatedDictSearch);
+    DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu",
+                (unsigned long long)pledgedSrcSize);
+
+    {   unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
+        /* Copy only compression parameters related to tables. */
+        params.cParams = *cdict_cParams;
+        params.cParams.windowLog = windowLog;   /* the windowLog requested by the caller is preserved */
+        params.useRowMatchFinder = cdict->useRowMatchFinder;
+        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+                                                 /* loadedDictSize */ 0,
+                                                 ZSTDcrp_leaveDirty, zbuff), "");
+        assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+        assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
+        assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog);
+    }
+
+    ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);   /* marked clean again once the copies below are done */
+    assert(params.useRowMatchFinder != ZSTD_ps_auto);
+
+    /* copy tables */
+    {   size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */)
+                                                            ? ((size_t)1 << cdict_cParams->chainLog)
+                                                            : 0;
+        size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
+
+        ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
+                                cdict->matchState.hashTable,
+                                hSize, cdict_cParams);
+
+        /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
+        if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
+            ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
+                                    cdict->matchState.chainTable,
+                                    chainSize, cdict_cParams);
+        }
+        /* copy tag table, together with the hashSalt stored in the cdict */
+        if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
+            size_t const tagTableSize = hSize;
+            ZSTD_memcpy(cctx->blockState.matchState.tagTable,
+                        cdict->matchState.tagTable,
+                        tagTableSize);
+            cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt;
+        }
+    }
+
+    /* Zero the hashTable3, since the cdict never fills it */
+    {   int const h3log = cctx->blockState.matchState.hashLog3;
+        size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
+        assert(cdict->matchState.hashLog3 == 0);
+        ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
+    }
+
+    ZSTD_cwksp_mark_tables_clean(&cctx->workspace);
+
+    /* copy dictionary offsets */
+    {   ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+        ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
+        dstMatchState->window       = srcMatchState->window;
+        dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+        dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+    }
+
+    cctx->dictID = cdict->dictID;
+    cctx->dictContentSize = cdict->dictContentSize;
+
+    /* copy block state : cdict's cBlockState becomes the context's prevCBlock */
+    ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+    return 0;
+}
+
+/* We have a choice between copying the dictionary context into the working
+ * context, or referencing the dictionary context from the working context
+ * in-place. We decide here which strategy to use (see ZSTD_shouldAttachDict()).
+ * @return : 0, or the error code reported by the selected reset routine. */
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            const ZSTD_CCtx_params* params,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%llu)",
+                (unsigned long long)pledgedSrcSize);   /* full 64-bit value, as logged by both reset routines */
+
+    if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) {
+        return ZSTD_resetCCtx_byAttachingCDict(
+            cctx, cdict, *params, pledgedSrcSize, zbuff);
+    } else {
+        return ZSTD_resetCCtx_byCopyingCDict(
+            cctx, cdict, *params, pledgedSrcSize, zbuff);
+    }
+}
+
+/*! ZSTD_copyCCtx_internal() :
+ *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ *  The "context", in this case, refers to the hash and chain tables,
+ *  entropy tables, and dictionary references.
+ *  Table-related cParams (windowLog included) are always taken from srcCCtx; `fParams` become the frame parameters.
+ * @return : 0, or an error code (stage_wrong, or whatever resetting dstCCtx reports) */
+static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
+                            const ZSTD_CCtx* srcCCtx,
+                            ZSTD_frameParameters fParams,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong,
+                    "Can't copy a ctx that's not in init stage.");
+    DEBUGLOG(5, "ZSTD_copyCCtx_internal");
+    ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
+    {   ZSTD_CCtx_params params = dstCCtx->requestedParams;
+        /* Copy only compression parameters related to tables. */
+        params.cParams = srcCCtx->appliedParams.cParams;
+        assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto);
+        assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto);
+        assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto);
+        params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder;
+        params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
+        params.ldmParams = srcCCtx->appliedParams.ldmParams;
+        params.fParams = fParams;
+        params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
+        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
+                                                 /* loadedDictSize */ 0,
+                                                 ZSTDcrp_leaveDirty, zbuff), "");   /* a failed reset leaves no tables to copy into */
+        assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog);
+        assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy);
+        assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog);
+        assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog);
+        assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3);
+    }
+
+    ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace);   /* marked clean again once the copies below are done */
+
+    /* copy tables : hash table, chain table (size 0 when the strategy does not allocate one), 3-byte hash table */
+    {   size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy,
+                                                         srcCCtx->appliedParams.useRowMatchFinder,
+                                                         0 /* forDDSDict */)
+                                    ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog)
+                                    : 0;
+        size_t const hSize =  (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
+        int const h3log = srcCCtx->blockState.matchState.hashLog3;
+        size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
+
+        ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable,
+               srcCCtx->blockState.matchState.hashTable,
+               hSize * sizeof(U32));
+        ZSTD_memcpy(dstCCtx->blockState.matchState.chainTable,
+               srcCCtx->blockState.matchState.chainTable,
+               chainSize * sizeof(U32));
+        ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable3,
+               srcCCtx->blockState.matchState.hashTable3,
+               h3Size * sizeof(U32));
+    }
+
+    ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace);
+
+    /* copy dictionary offsets */
+    {
+        const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState;
+        ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState;
+        dstMatchState->window       = srcMatchState->window;
+        dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+        dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+    }
+    dstCCtx->dictID = srcCCtx->dictID;
+    dstCCtx->dictContentSize = srcCCtx->dictContentSize;
+
+    /* copy block state */
+    ZSTD_memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock));
+
+    return 0;
+}
+
+/*! ZSTD_copyCCtx() :
+ *  Clone the existing context `srcCCtx` into `dstCCtx`.
+ *  `srcCCtx` must still be in stage ZSTDcs_init (i.e. created, but ZSTD_compressContinue() not called yet).
+ *  A `pledgedSrcSize` of 0 is interpreted as "unknown".
+ *  @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+{
+    ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    unsigned long long const contentSize =
+        (pledgedSrcSize == 0) ? ZSTD_CONTENTSIZE_UNKNOWN : pledgedSrcSize;
+    ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1);
+
+    /* set contentSizeFlag only when the size is actually known */
+    fParams.contentSizeFlag = (contentSize != ZSTD_CONTENTSIZE_UNKNOWN);
+
+    return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, fParams, contentSize, srcCCtx->bufferedPolicy);
+}
+
+
+#define ZSTD_ROWSIZE 16   /* cells handled per inner-loop pass in ZSTD_reduceTable_internal() */
+/*! ZSTD_reduceTable_internal() :
+ *  reduce table indexes by `reducerValue`, or squash to zero (when below `reducerValue + ZSTD_WINDOW_START_INDEX`).
+ *  PreserveMark preserves "unsorted mark" (ZSTD_DUBT_UNSORTED_MARK) for btlazy2 strategy.
+ *  It must be set to a clear 0/1 value, to remove branch during inlining.
+ *  Presume table size (`size`, counted in U32 cells, and < 2^31) is a multiple of ZSTD_ROWSIZE
+ *  to help auto-vectorization */
+FORCE_INLINE_TEMPLATE void
+ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
+{
+    int const nbRows = (int)size / ZSTD_ROWSIZE;   /* exact division : size is asserted to be a multiple of ZSTD_ROWSIZE */
+    int cellNb = 0;   /* running index into table, advanced once per cell */
+    int rowNb;
+    /* Protect special index values < ZSTD_WINDOW_START_INDEX. */
+    U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX;
+    assert((size & (ZSTD_ROWSIZE-1)) == 0);  /* multiple of ZSTD_ROWSIZE */
+    assert(size < (1U<<31));   /* can be cast to int */
+
+#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+    /* To validate that the table reuse logic is sound, and that we don't
+     * access table space that we haven't cleaned, we re-"poison" the table
+     * space every time we mark it dirty.
+     *
+     * This function however is intended to operate on those dirty tables and
+     * re-clean them. So when this function is used correctly, we can unpoison
+     * the memory it operated on. This introduces a blind spot though, since
+     * if we now try to operate on __actually__ poisoned memory, we will not
+     * detect that. */
+    __msan_unpoison(table, size * sizeof(U32));
+#endif
+
+    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
+        int column;
+        for (column=0; column<ZSTD_ROWSIZE; column++) {
+            U32 newVal;
+            if (preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) {
+                /* This write is pointless, but is required(?) for the compiler
+                 * to auto-vectorize the loop. */
+                newVal = ZSTD_DUBT_UNSORTED_MARK;
+            } else if (table[cellNb] < reducerThreshold) {
+                newVal = 0;   /* index would fall below the protected range : squash it */
+            } else {
+                newVal = table[cellNb] - reducerValue;
+            }
+            table[cellNb] = newVal;   /* every cell is rewritten, whichever branch was taken */
+            cellNb++;
+    }   }
+}
+
+static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)   /* plain variant : no mark is preserved */
+{
+    ZSTD_reduceTable_internal(table, size, reducerValue, 0);   /* preserveMark = 0 */
+}
+
+static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue)   /* variant leaving ZSTD_DUBT_UNSORTED_MARK cells untouched */
+{
+    ZSTD_reduceTable_internal(table, size, reducerValue, 1);   /* preserveMark = 1 */
+}
+
+/*! ZSTD_reduceIndex() :
+ *  shift down every table index by `reducerValue`, so that indexes (which are U32) can't overflow later on */
+static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue)
+{
+    /* hash table */
+    ZSTD_reduceTable(ms->hashTable, (U32)1 << params->cParams.hashLog, reducerValue);
+
+    /* chain table, when there is one; btlazy2 goes through the mark-preserving variant */
+    if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) {
+        U32 const nbChainCells = (U32)1 << params->cParams.chainLog;
+        if (params->cParams.strategy != ZSTD_btlazy2) {
+            ZSTD_reduceTable(ms->chainTable, nbChainCells, reducerValue);
+        } else {
+            ZSTD_reduceTable_btlazy2(ms->chainTable, nbChainCells, reducerValue);
+        }
+    }
+
+    if (ms->hashLog3 != 0) {   /* hashTable3 is only processed when hashLog3 is non-zero */
+        ZSTD_reduceTable(ms->hashTable3, (U32)1 << ms->hashLog3, reducerValue);
+    }
+}
+
+
+/*-*******************************************************
+*  Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+{   /* fills llCode/ofCode/mlCode, one entry per stored sequence; @return : 1 if an offset code >= STREAM_ACCUMULATOR_MIN was met on a 32-bit target, else 0 */
+    const seqDef* const seqs = seqStorePtr->sequencesStart;
+    BYTE* const llCodes = seqStorePtr->llCode;
+    BYTE* const ofCodes = seqStorePtr->ofCode;
+    BYTE* const mlCodes = seqStorePtr->mlCode;
+    U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    int hasLongOffsets = 0;
+    U32 n;
+    assert(nbSeq <= seqStorePtr->maxNbSeq);
+    for (n = 0; n < nbSeq; n++) {
+        U32 const llValue = seqs[n].litLength;
+        U32 const ofCode = ZSTD_highbit32(seqs[n].offBase);
+        U32 const mlValue = seqs[n].mlBase;
+        llCodes[n] = (BYTE)ZSTD_LLcode(llValue);
+        ofCodes[n] = (BYTE)ofCode;
+        mlCodes[n] = (BYTE)ZSTD_MLcode(mlValue);
+        assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN));
+        if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) hasLongOffsets = 1;
+    }
+    if (seqStorePtr->longLengthType == ZSTD_llt_literalLength) {
+        llCodes[seqStorePtr->longLengthPos] = MaxLL;   /* long literal length : forced to the maximum code */
+    } else if (seqStorePtr->longLengthType == ZSTD_llt_matchLength) {
+        mlCodes[seqStorePtr->longLengthPos] = MaxML;   /* long match length : forced to the maximum code */
+    }
+    return hasLongOffsets;
+}
+
+/* ZSTD_useTargetCBlockSize():
+ * Tells whether the target compressed block size parameter is in use, i.e. whether cctxParams->targetCBlockSize is non-zero.
+ * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize.
+ * Returns 1 if true, 0 otherwise. */
+static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
+{
+    DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize);
+    return (cctxParams->targetCBlockSize != 0);   /* 0 means the parameter is not set */
+}
+
+/* ZSTD_blockSplitterEnabled():
+ * Tells whether the block splitting parameter is enabled (useBlockSplitter == ZSTD_ps_enable).
+ * If used, compression will do best effort to split a block in order to improve compression ratio.
+ * At the time this function is called, the parameter must be finalized (it must no longer be ZSTD_ps_auto).
+ * Returns 1 if true, 0 otherwise. */
+static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams)
+{
+    DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter);
+    assert(cctxParams->useBlockSplitter != ZSTD_ps_auto);   /* "auto" must have been resolved by the caller */
+    return (cctxParams->useBlockSplitter == ZSTD_ps_enable);
+}
+
+/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types
+ * and size of the sequences statistics
+ */
+typedef struct {
+    U32 LLtype;    /* encoding type selected for the literal lengths table */
+    U32 Offtype;   /* encoding type selected for the offsets table */
+    U32 MLtype;    /* encoding type selected for the match lengths table */
+    size_t size;   /* total bytes written for the three table descriptions, or a zstd error code */
+    size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+    int longOffsets;   /* value returned by ZSTD_seqToCodes() */
+} ZSTD_symbolEncodingTypeStats_t;
+
+/* ZSTD_buildSequencesStatistics():
+ * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field.
+ * Modifies `nextEntropy` to have the appropriate values as a side effect.
+ * nbSeq must be greater than 0.
+ * Table descriptions are written into [dst, dstEnd), in the order : literal lengths, offsets, match lengths.
+ * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
+ */
+static ZSTD_symbolEncodingTypeStats_t
+ZSTD_buildSequencesStatistics(
+                const seqStore_t* seqStorePtr, size_t nbSeq,
+                const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
+                      BYTE* dst, const BYTE* const dstEnd,
+                      ZSTD_strategy strategy, unsigned* countWorkspace,
+                      void* entropyWorkspace, size_t entropyWkspSize)
+{
+    BYTE* const ostart = dst;
+    const BYTE* const oend = dstEnd;
+    BYTE* op = ostart;   /* write cursor, advanced after each table description */
+    FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    ZSTD_symbolEncodingTypeStats_t stats;
+
+    stats.lastCountSize = 0;   /* stays 0 unless some table ends up set_compressed */
+    /* convert length/distances into codes */
+    stats.longOffsets = ZSTD_seqToCodes(seqStorePtr);
+    assert(op <= oend);
+    assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
+    /* build CTable for Literal Lengths */
+    {   unsigned max = MaxLL;
+        size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);   /* can't fail */
+        DEBUGLOG(5, "Building LL table");
+        nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;   /* start from previous repeat mode; the selection below may update it */
+        stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode,
+                                        countWorkspace, max, mostFrequent, nbSeq,
+                                        LLFSELog, prevEntropy->litlengthCTable,
+                                        LL_defaultNorm, LL_defaultNormLog,
+                                        ZSTD_defaultAllowed, strategy);
+        assert(set_basic < set_compressed && set_rle < set_compressed);
+        assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(
+                op, (size_t)(oend - op),
+                CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype,
+                countWorkspace, max, llCodeTable, nbSeq,
+                LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                prevEntropy->litlengthCTable,
+                sizeof(prevEntropy->litlengthCTable),
+                entropyWorkspace, entropyWkspSize);
+            if (ZSTD_isError(countSize)) {
+                DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed");
+                stats.size = countSize;   /* the error travels in the `size` field */
+                return stats;
+            }
+            if (stats.LLtype == set_compressed)
+                stats.lastCountSize = countSize;
+            op += countSize;
+            assert(op <= oend);
+    }   }
+    /* build CTable for Offsets */
+    {   unsigned max = MaxOff;
+        size_t const mostFrequent = HIST_countFast_wksp(
+            countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);  /* can't fail */
+        /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+        ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+        DEBUGLOG(5, "Building OF table");
+        nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;   /* same pattern as for literal lengths */
+        stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode,
+                                        countWorkspace, max, mostFrequent, nbSeq,
+                                        OffFSELog, prevEntropy->offcodeCTable,
+                                        OF_defaultNorm, OF_defaultNormLog,
+                                        defaultPolicy, strategy);
+        assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(
+                op, (size_t)(oend - op),
+                CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype,
+                countWorkspace, max, ofCodeTable, nbSeq,
+                OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                prevEntropy->offcodeCTable,
+                sizeof(prevEntropy->offcodeCTable),
+                entropyWorkspace, entropyWkspSize);
+            if (ZSTD_isError(countSize)) {
+                DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed");
+                stats.size = countSize;   /* the error travels in the `size` field */
+                return stats;
+            }
+            if (stats.Offtype == set_compressed)
+                stats.lastCountSize = countSize;   /* overrides the literal lengths value, if any */
+            op += countSize;
+            assert(op <= oend);
+    }   }
+    /* build CTable for MatchLengths */
+    {   unsigned max = MaxML;
+        size_t const mostFrequent = HIST_countFast_wksp(
+            countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);   /* can't fail */
+        DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
+        nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;   /* same pattern as for literal lengths */
+        stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode,
+                                        countWorkspace, max, mostFrequent, nbSeq,
+                                        MLFSELog, prevEntropy->matchlengthCTable,
+                                        ML_defaultNorm, ML_defaultNormLog,
+                                        ZSTD_defaultAllowed, strategy);
+        assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(
+                op, (size_t)(oend - op),
+                CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype,
+                countWorkspace, max, mlCodeTable, nbSeq,
+                ML_defaultNorm, ML_defaultNormLog, MaxML,
+                prevEntropy->matchlengthCTable,
+                sizeof(prevEntropy->matchlengthCTable),
+                entropyWorkspace, entropyWkspSize);
+            if (ZSTD_isError(countSize)) {
+                DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed");
+                stats.size = countSize;   /* the error travels in the `size` field */
+                return stats;
+            }
+            if (stats.MLtype == set_compressed)
+                stats.lastCountSize = countSize;   /* overrides any earlier value : this is the last table written */
+            op += countSize;
+            assert(op <= oend);
+    }   }
+    stats.size = (size_t)(op-ostart);   /* total bytes written for the three table descriptions */
+    return stats;
+}
+
+/* ZSTD_entropyCompressSeqStore_internal():
+ * compresses both literals and sequences
+ * Returns compressed size of block, or a zstd error, or 0 to request an uncompressed block (1.3.4 workaround at the end).
+ */
+#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20   /* literals-per-sequence ratio from which literals are suspected to be uncompressible */
+MEM_STATIC size_t
+ZSTD_entropyCompressSeqStore_internal(
+                        const seqStore_t* seqStorePtr,
+                        const ZSTD_entropyCTables_t* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                        const ZSTD_CCtx_params* cctxParams,
+                              void* dst, size_t dstCapacity,
+                              void* entropyWorkspace, size_t entropyWkspSize,
+                        const int bmi2)
+{
+    ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+    unsigned* count = (unsigned*)entropyWorkspace;   /* the first (MaxSeq + 1) counters of the workspace hold the histogram */
+    FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;   /* write cursor into dst */
+    size_t lastCountSize;
+    int longOffsets = 0;
+
+    entropyWorkspace = count + (MaxSeq + 1);   /* remaining workspace starts right after the histogram area */
+    entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
+
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+    assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
+
+    /* Compress literals */
+    {   const BYTE* const literals = seqStorePtr->litStart;
+        size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+        size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
+        /* Base suspicion of uncompressibility on ratio of literals to sequences */
+        unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
+        size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+
+        size_t const cSize = ZSTD_compressLiterals(
+                                    op, dstCapacity,
+                                    literals, litSize,
+                                    entropyWorkspace, entropyWkspSize,
+                                    &prevEntropy->huf, &nextEntropy->huf,
+                                    cctxParams->cParams.strategy,
+                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
+                                    suspectUncompressible, bmi2);
+        FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
+        assert(cSize <= dstCapacity);
+        op += cSize;
+    }
+
+    /* Sequences Header */
+    RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+                    dstSize_tooSmall, "Can't fit seq hdr in output buf!");
+    if (nbSeq < 128) {
+        *op++ = (BYTE)nbSeq;   /* 1-byte form */
+    } else if (nbSeq < LONGNBSEQ) {
+        op[0] = (BYTE)((nbSeq>>8) + 0x80);   /* 2-byte form : high bits of nbSeq, offset by 0x80 */
+        op[1] = (BYTE)nbSeq;
+        op+=2;
+    } else {
+        op[0]=0xFF;   /* 3-byte form : 0xFF marker, then (nbSeq - LONGNBSEQ) as little-endian 16-bit */
+        MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ));
+        op+=3;
+    }
+    assert(op <= oend);
+    if (nbSeq==0) {
+        /* Copy the old tables over as if we repeated them */
+        ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
+        return (size_t)(op - ostart);
+    }
+    {   BYTE* const seqHead = op++;   /* reserve the byte holding the three encoding types; filled in below */
+        /* build stats for sequences */
+        const ZSTD_symbolEncodingTypeStats_t stats =
+                ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+                                             &prevEntropy->fse, &nextEntropy->fse,
+                                              op, oend,
+                                              strategy, count,
+                                              entropyWorkspace, entropyWkspSize);
+        FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!");
+        *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
+        lastCountSize = stats.lastCountSize;
+        op += stats.size;
+        longOffsets = stats.longOffsets;
+    }
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, (size_t)(oend - op),
+                                        CTable_MatchLength, mlCodeTable,
+                                        CTable_OffsetBits, ofCodeTable,
+                                        CTable_LitLength, llCodeTable,
+                                        sequences, nbSeq,
+                                        longOffsets, bmi2);
+        FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+        op += bitstreamSize;
+        assert(op <= oend);
+        /* zstd versions <= 1.3.4 mistakenly report corruption when
+         * FSE_readNCount() receives a buffer < 4 bytes.
+         * Fixed by https://github.com/facebook/zstd/pull/1146.
+         * This can happen when the last set_compressed table present is 2
+         * bytes and the bitstream is only one byte.
+         * In this exceedingly rare case, we will simply emit an uncompressed
+         * block, since it isn't worth optimizing.
+         */
+        if (lastCountSize && (lastCountSize + bitstreamSize) < 4) {
+            /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize + bitstreamSize == 3 */
+            assert(lastCountSize + bitstreamSize == 3);
+            DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+                        "emitting an uncompressed block.");
+            return 0;   /* 0 tells the caller to emit the block uncompressed */
+        }
+    }
+
+    DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart));
+    return (size_t)(op - ostart);
+}
+
+MEM_STATIC size_t
+ZSTD_entropyCompressSeqStore(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          void* dst, size_t dstCapacity,
+                          size_t srcSize,
+                          void* entropyWorkspace, size_t entropyWkspSize,
+                          int bmi2)
+{
+    size_t const compressedSize = ZSTD_entropyCompressSeqStore_internal(
+                            seqStorePtr, prevEntropy, nextEntropy, cctxParams,
+                            dst, dstCapacity,
+                            entropyWorkspace, entropyWkspSize, bmi2);
+    /* 0 already stands for "block not compressed" : hand it back unchanged */
+    if (compressedSize == 0) return 0;
+    /* Running out of space while a raw uncompressed block would still fit (srcSize <= dstCapacity)
+     * means the block is not compressible : fall back to a raw block instead of failing. */
+    if ((srcSize <= dstCapacity) && (compressedSize == ERROR(dstSize_tooSmall))) {
+        DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity);
+        return 0;  /* block not compressed */
+    }
+    FORWARD_IF_ERROR(compressedSize, "ZSTD_entropyCompressSeqStore_internal failed");
+
+    /* Compressibility check : the result must beat srcSize by at least ZSTD_minGain() */
+    if (compressedSize >= srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy)) {
+        return 0;  /* block not compressed */
+    }
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", compressedSize);
+    /* Some older libzstd decoders (v1.5.4 is the version cited upstream) are not compatible
+     * with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly.
+     * That restriction is indirectly fulfilled already, by respecting the ZSTD_minGain() condition above. */
+    assert(compressedSize < ZSTD_BLOCKSIZE_MAX);
+    return compressedSize;
+}
+
+/* ZSTD_selectBlockCompressor() :
+ * Not static, but internal use only (used by long distance matcher)
+ * assumption : strat is a valid strategy; the selected table entry must be non-NULL (asserted) */
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode)
+{
+    static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = {   /* indexed as [dictMode][strat] */
+        { ZSTD_compressBlock_fast  /* default for 0 */,
+          ZSTD_compressBlock_fast,
+          ZSTD_COMPRESSBLOCK_DOUBLEFAST,
+          ZSTD_COMPRESSBLOCK_GREEDY,
+          ZSTD_COMPRESSBLOCK_LAZY,
+          ZSTD_COMPRESSBLOCK_LAZY2,
+          ZSTD_COMPRESSBLOCK_BTLAZY2,
+          ZSTD_COMPRESSBLOCK_BTOPT,
+          ZSTD_COMPRESSBLOCK_BTULTRA,
+          ZSTD_COMPRESSBLOCK_BTULTRA2
+        },
+        { ZSTD_compressBlock_fast_extDict  /* default for 0 */,
+          ZSTD_compressBlock_fast_extDict,
+          ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT,
+          ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT,
+          ZSTD_COMPRESSBLOCK_LAZY_EXTDICT,
+          ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT   /* last slot reuses the BTULTRA_EXTDICT entry */
+        },
+        { ZSTD_compressBlock_fast_dictMatchState  /* default for 0 */,
+          ZSTD_compressBlock_fast_dictMatchState,
+          ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE   /* last slot reuses the BTULTRA_DICTMATCHSTATE entry */
+        },
+        { NULL  /* default for 0 */,
+          NULL,
+          NULL,
+          ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH,
+          ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH,
+          ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH,
+          NULL,
+          NULL,
+          NULL,
+          NULL }   /* this row only has entries for the greedy, lazy and lazy2 slots */
+    };
+    ZSTD_blockCompressor selectedCompressor;
+    ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);   /* the tables rely on ZSTD_fast being strategy value 1 */
+
+    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+    DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder);
+    if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) {
+        static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = {   /* indexed as [dictMode][strat - ZSTD_greedy] */
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_ROW
+            },
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW
+            },
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW
+            },
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW
+            }
+        };
+        DEBUGLOG(4, "Selecting a row-based matchfinder");
+        assert(useRowMatchFinder != ZSTD_ps_auto);
+        selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy];
+    } else {
+        selectedCompressor = blockCompressor[(int)dictMode][(int)strat];
+    }
+    assert(selectedCompressor != NULL);   /* NULL slots mark combinations with no compressor */
+    return selectedCompressor;
+}
+
+static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,   /* appends `lastLLSize` bytes, read from `anchor`, to the literals buffer */
+                                   const BYTE* anchor, size_t lastLLSize)
+{
+    ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize);
+    seqStorePtr->lit += lastLLSize;   /* advance the literals write position past the copied bytes */
+}
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr)   /* empties the store : rewinds both write positions, clears the long-length flag */
+{
+    ssPtr->lit = ssPtr->litStart;   /* no literals stored */
+    ssPtr->sequences = ssPtr->sequencesStart;   /* no sequences stored */
+    ssPtr->longLengthType = ZSTD_llt_none;
+}
+
+/* ZSTD_postProcessSequenceProducerResult() :
+ * Sanity-checks and finalizes the sequences handed back through the external matchfinder API:
+ *   - Rejects a result that signals an error condition, or that is empty although src is not.
+ *   - Makes sure outSeqs ends with a block delimiter, appending one when it is missing.
+ *     See zstd.h for context regarding block delimiters.
+ * Returns the number of sequences after post-processing, or an error code. */
+static size_t ZSTD_postProcessSequenceProducerResult(
+    ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize
+) {
+    const ZSTD_Sequence* lastSeq;
+
+    RETURN_ERROR_IF(
+        nbExternalSeqs > outSeqsCapacity,
+        sequenceProducer_failed,
+        "External sequence producer returned error code %lu",
+        (unsigned long)nbExternalSeqs
+    );
+
+    RETURN_ERROR_IF(
+        nbExternalSeqs == 0 && srcSize > 0,
+        sequenceProducer_failed,
+        "Got zero sequences from external sequence producer for a non-empty src buffer!"
+    );
+
+    /* Empty source : the result is exactly one block delimiter. */
+    if (srcSize == 0) {
+        ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
+        return 1;
+    }
+
+    /* Nothing to add when the output already ends with a block delimiter. */
+    lastSeq = outSeqs + (nbExternalSeqs - 1);
+    if (lastSeq->offset == 0 && lastSeq->matchLength == 0) {
+        return nbExternalSeqs;
+    }
+
+    /* A delimiter must be appended, which requires one spare slot.
+     * Having none is only possible if the external matchfinder
+     * produced an invalid parse, by definition of ZSTD_sequenceBound(). */
+    RETURN_ERROR_IF(
+        nbExternalSeqs == outSeqsCapacity,
+        sequenceProducer_failed,
+        "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!"
+    );
+
+    ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
+    return nbExternalSeqs + 1;
+}
+
+/* ZSTD_fastSequenceLengthSum() :
+ * Adds up litLength and matchLength over all `seqBufSize` entries of `seqBuf`,
+ * i.e. returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*.
+ * Comparable to determine_blockSize (also in zstd_compress.c), but summation never stops early on a block delimiter.
+ * Having no early exit lets the compiler auto-vectorize the loop (https://godbolt.org/z/cY1cajz9P).
+ * Once issue #3456 is resolved, this function can be deleted and replaced by determine_blockSize. */
+static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) {
+    size_t totalLitLen = 0;
+    size_t totalMatchLen = 0;
+    size_t n;
+    for (n = 0; n < seqBufSize; n++) {
+        totalLitLen += seqBuf[n].litLength;
+        totalMatchLen += seqBuf[n].matchLength;
+    }
+    return totalLitLen + totalMatchLen;
+}
+
+typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;   /* outcome codes returned by ZSTD_buildSeqStore() */
+
+static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+{   /* Selects the sequences for block [src, src+srcSize) and stores them in zc->seqStore. @return : ZSTDbss_compress, ZSTDbss_noCompress (block too small to attempt), or an error code */
+    ZSTD_matchState_t* const ms = &zc->blockState.matchState;
+    DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize);
+    assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+    /* Assert that we have correctly flushed the ctx params into the ms's copy */
+    ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
+    /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+     * additional 1. We need to revisit and change this logic to be more consistent */
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
+        if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) {   /* the block is skipped, so externSeqStore is told to skip srcSize as well */
+            ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
+        } else {
+            ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch);
+        }
+        return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */
+    }
+    ZSTD_resetSeqStore(&(zc->seqStore));
+    /* required for optimal parser to read stats from dictionary */
+    ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy;
+    /* tell the optimal parser how we expect to compress literals */
+    ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode;
+    /* a gap between an attached dict and the current window is not safe,
+     * they must remain adjacent,
+     * and when that stops being the case, the dict must be unset */
+    assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit);
+
+    /* limited update after a very long match : when the block starts more than 384 positions past nextToUpdate, nextToUpdate is moved forward to at most 192 positions before the block start */
+    {   const BYTE* const base = ms->window.base;
+        const BYTE* const istart = (const BYTE*)src;
+        const U32 curr = (U32)(istart-base);
+        if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1));   /* ensure no overflow */
+        if (curr > ms->nextToUpdate + 384)
+            ms->nextToUpdate = curr - MIN(192, (U32)(curr - ms->nextToUpdate - 384));
+    }
+
+    /* select and store sequences : exactly one of the four cases below runs */
+    {   ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms);
+        size_t lastLLSize;
+        {   int i;   /* the next block's repcodes start as a copy of the previous block's */
+            for (i = 0; i < ZSTD_REP_NUM; ++i)
+                zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
+        }
+        if (zc->externSeqStore.pos < zc->externSeqStore.size) {   /* case 1 : externSeqStore still holds unconsumed entries */
+            assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
+
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                ZSTD_hasExtSeqProd(&zc->appliedParams),
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
+            /* Updates ldmSeqStore.pos */
+            lastLLSize =
+                ZSTD_ldm_blockCompress(&zc->externSeqStore,
+                                       ms, &zc->seqStore,
+                                       zc->blockState.nextCBlock->rep,
+                                       zc->appliedParams.useRowMatchFinder,
+                                       src, srcSize);
+            assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
+        } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {   /* case 2 : long-distance matching is enabled */
+            rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
+
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                ZSTD_hasExtSeqProd(&zc->appliedParams),
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
+            ldmSeqStore.seq = zc->ldmSequences;
+            ldmSeqStore.capacity = zc->maxNbLdmSequences;
+            /* Updates ldmSeqStore.size */
+            FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore,
+                                               &zc->appliedParams.ldmParams,
+                                               src, srcSize), "");
+            /* Updates ldmSeqStore.pos */
+            lastLLSize =
+                ZSTD_ldm_blockCompress(&ldmSeqStore,
+                                       ms, &zc->seqStore,
+                                       zc->blockState.nextCBlock->rep,
+                                       zc->appliedParams.useRowMatchFinder,
+                                       src, srcSize);
+            assert(ldmSeqStore.pos == ldmSeqStore.size);
+        } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) {   /* case 3 : an external sequence producer is registered */
+            assert(
+                zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize)
+            );
+            assert(zc->appliedParams.extSeqProdFunc != NULL);
+
+            {   U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
+
+                size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)(
+                    zc->appliedParams.extSeqProdState,
+                    zc->extSeqBuf,
+                    zc->extSeqBufCapacity,
+                    src, srcSize,
+                    NULL, 0,  /* dict and dictSize, currently not supported */
+                    zc->appliedParams.compressionLevel,
+                    windowSize
+                );
+
+                size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
+                    zc->extSeqBuf,
+                    nbExternalSeqs,
+                    zc->extSeqBufCapacity,
+                    srcSize
+                );
+
+                /* Return early if there is no error, since we don't need to worry about last literals */
+                if (!ZSTD_isError(nbPostProcessedSeqs)) {
+                    ZSTD_sequencePosition seqPos = {0,0,0};
+                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs);
+                    RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
+                    FORWARD_IF_ERROR(
+                        ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
+                            zc, &seqPos,
+                            zc->extSeqBuf, nbPostProcessedSeqs,
+                            src, srcSize,
+                            zc->appliedParams.searchForExternalRepcodes
+                        ),
+                        "Failed to copy external sequences to seqStore!"
+                    );
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs);   /* NOTE(review): logs nbExternalSeqs, not nbPostProcessedSeqs */
+                    return ZSTDbss_compress;
+                }
+
+                /* Propagate the error if fallback is disabled */
+                if (!zc->appliedParams.enableMatchFinderFallback) {
+                    return nbPostProcessedSeqs;
+                }
+
+                /* Fallback to software matchfinder */
+                {   ZSTD_blockCompressor const blockCompressor =
+                        ZSTD_selectBlockCompressor(
+                            zc->appliedParams.cParams.strategy,
+                            zc->appliedParams.useRowMatchFinder,
+                            dictMode);
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(
+                        5,
+                        "External sequence producer returned error code %lu. Falling back to internal parser.",
+                        (unsigned long)nbExternalSeqs
+                    );
+                    lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+            }   }
+        } else {   /* case 4 : not long range mode and no external matchfinder */
+            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(
+                    zc->appliedParams.cParams.strategy,
+                    zc->appliedParams.useRowMatchFinder,
+                    dictMode);
+            ms->ldmSeqStore = NULL;
+            lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+        }
+        {   const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;   /* the final lastLLSize bytes of the block */
+            ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
+    }   }
+    return ZSTDbss_compress;
+}
+
+static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM])
+{   /* Appends seqStore's sequences to seqCollector as ZSTD_Sequence entries (repcodes resolved to raw offsets), followed by one terminating entry. @return : 0, or dstSize_tooSmall when they do not fit */
+    const seqDef* inSeqs = seqStore->sequencesStart;
+    const size_t nbInSequences = seqStore->sequences - inSeqs;
+    const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart);
+
+    ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex;   /* write position : seqStart + seqIndex (both arms agree when seqIndex == 0) */
+    const size_t nbOutSequences = nbInSequences + 1;   /* +1 for the terminating entry */
+    size_t nbOutLiterals = 0;   /* literal bytes accounted for by the sequences copied so far */
+    repcodes_t repcodes;   /* running repcode history, seeded from prevRepcodes below */
+    size_t i;
+
+    /* Bounds check that we have enough space for every input sequence
+     * and the block delimiter
+     */
+    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
+    RETURN_ERROR_IF(
+        nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex),
+        dstSize_tooSmall,
+        "Not enough space to copy sequences");
+
+    ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes));
+    for (i = 0; i < nbInSequences; ++i) {
+        U32 rawOffset;
+        outSeqs[i].litLength = inSeqs[i].litLength;
+        outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH;
+        outSeqs[i].rep = 0;   /* overwritten below when the offset turns out to be a repcode */
+
+        /* Handle the possible single length >= 64K
+         * There can only be one because we add MINMATCH to every match length,
+         * and blocks are at most 128K.
+         */
+        if (i == seqStore->longLengthPos) {
+            if (seqStore->longLengthType == ZSTD_llt_literalLength) {
+                outSeqs[i].litLength += 0x10000;
+            } else if (seqStore->longLengthType == ZSTD_llt_matchLength) {
+                outSeqs[i].matchLength += 0x10000;
+            }
+        }
+
+        /* Determine the raw offset given the offBase, which may be a repcode. */
+        if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) {
+            const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase);
+            assert(repcode > 0);
+            outSeqs[i].rep = repcode;
+            if (outSeqs[i].litLength != 0) {
+                rawOffset = repcodes.rep[repcode - 1];
+            } else {   /* litLength == 0 : repcode 1 -> rep[1], 2 -> rep[2], 3 -> rep[0] - 1 */
+                if (repcode == 3) {
+                    assert(repcodes.rep[0] > 1);
+                    rawOffset = repcodes.rep[0] - 1;
+                } else {
+                    rawOffset = repcodes.rep[repcode];
+                }
+            }
+        } else {
+            rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase);
+        }
+        outSeqs[i].offset = rawOffset;
+
+        /* Update repcode history for the sequence (done after rawOffset was read from the pre-update history) */
+        ZSTD_updateRep(repcodes.rep,
+                       inSeqs[i].offBase,
+                       inSeqs[i].litLength == 0);
+
+        nbOutLiterals += outSeqs[i].litLength;
+    }
+    /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
+     * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
+     * for the block boundary, according to the API.
+     */
+    assert(nbInLiterals >= nbOutLiterals);
+    {
+        const size_t lastLLSize = nbInLiterals - nbOutLiterals;   /* literals of the seqStore not claimed by any sequence */
+        outSeqs[nbInSequences].litLength = (U32)lastLLSize;
+        outSeqs[nbInSequences].matchLength = 0;
+        outSeqs[nbInSequences].offset = 0;
+        assert(nbOutSequences == nbInSequences + 1);
+    }
+    seqCollector->seqIndex += nbOutSequences;   /* the next call appends after this block's entries */
+    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
+
+    return 0;
+}
+
+size_t ZSTD_sequenceBound(size_t srcSize) {
+    /* (srcSize / ZSTD_MINMATCH_MIN) + 1 sequence slots, plus (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1 delimiter slots */
+    size_t const seqSlots = 1 + (srcSize / ZSTD_MINMATCH_MIN);
+    return seqSlots + 1 + (srcSize / ZSTD_BLOCKSIZE_MAX_MIN);
+}
+
+/* ZSTD_generateSequences() : compresses @src into a temporary buffer with the sequence collector enabled,
+ * targeting @outSeqs (capacity : @outSeqsSize entries). Requires targetCBlockSize == 0 and nbWorkers == 0.
+ * @return : zc->seqCollector.seqIndex after compression (entries collected), or an error code. */
+size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+                              size_t outSeqsSize, const void* src, size_t srcSize)
+{
+    const size_t dstCapacity = ZSTD_compressBound(srcSize);
+    void* dst;   /* temporary output buffer, freed right after ZSTD_compress2() */
+    SeqCollector seqCollector;
+    {   int targetCBlockSize;
+        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), "");
+        RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0");
+    }
+    {   int nbWorkers;
+        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), "");
+        RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0");
+    }
+    dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);   /* allocated only after the checks above : their early returns would otherwise leak it */
+    RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
+
+    seqCollector.collectSequences = 1;
+    seqCollector.seqStart = outSeqs;
+    seqCollector.seqIndex = 0;
+    seqCollector.maxSequences = outSeqsSize;
+    zc->seqCollector = seqCollector;
+
+    {   const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
+        ZSTD_customFree(dst, ZSTD_defaultCMem);   /* release before forwarding any error, so no path leaks */
+        FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed");
+    }
+    assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize));
+    return zc->seqCollector.seqIndex;
+}
+
+size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) {
+    size_t rd, wr = 0;   /* read cursor and write cursor; entries are compacted in place */
+    for (rd = 0; rd < seqsSize; ++rd) {
+        int const isDelimiter = (sequences[rd].matchLength == 0) && (sequences[rd].offset == 0);
+        if (!isDelimiter) {
+            sequences[wr++] = sequences[rd];
+            continue;
+        }
+        /* delimiter : its literals are added to the following entry; a delimiter in last position is simply dropped */
+        if (rd + 1 < seqsSize) {
+            sequences[rd+1].litLength += sequences[rd].litLength;
+        }
+    }
+    return wr;   /* number of entries kept */
+}
+
+/* Returns 1 if the input is RLE (judged against its first byte), 0 if not. The bulk is compared four size_t words per iteration. */
+static int ZSTD_isRLE(const BYTE* src, size_t length) {
+    BYTE const first = src[0];
+    size_t const pattern = (size_t)((U64)first * 0x0101010101010101ULL);   /* first byte copied into every byte of a word */
+    size_t const stride = 4 * sizeof(size_t);
+    size_t const headLen = length & (stride - 1);   /* what is left once whole strides are set aside */
+    size_t pos = headLen;
+    if (length == 1) return 1;
+    /* The head is checked with ZSTD_count before the word-wise loop takes over */
+    if (headLen != 0) {
+        if (ZSTD_count(src+1, src, src+headLen) != headLen-1) return 0;
+    }
+    /* length - headLen is a multiple of stride, so pos lands exactly on length */
+    while (pos != length) {
+        size_t off;
+        for (off = 0; off < stride; off += sizeof(size_t)) {
+            if (MEM_readST(src + pos + off) != pattern) return 0;
+        }
+        pos += stride;
+    }
+    return 1;
+}
+
+/* Cheap guess at whether a block might be RLE, based only on how little the
+ * seqStore holds : fewer than 4 sequences and fewer than 10 literal bytes.
+ * Being a heuristic, it can be wrong in either direction.
+ */
+static int ZSTD_maybeRLE(seqStore_t const* seqStore)
+{
+    size_t const seqCount = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+    size_t const litCount = (size_t)(seqStore->lit - seqStore->litStart);
+    if (seqCount >= 4) return 0;
+    return litCount < 10;
+}
+
+static void
+ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
+{   /* Swaps the two block states : the one built as "next" becomes "prev", the former "prev" becomes "next" */
+    ZSTD_compressedBlockState_t* const confirmed = bs->nextCBlock;
+    bs->nextCBlock = bs->prevCBlock;
+    bs->prevCBlock = confirmed;
+}
+
+/* Writes the block header at @op : lastBlock + (type << 1) + (size << 3), stored with MEM_writeLE24(). cSize == 1 selects bt_rle with blockSize as size; anything else selects bt_compressed with cSize. */
+static void
+writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock)
+{
+    int const isRle = (cSize == 1);
+    U32 const typeBits = (U32)(isRle ? bt_rle : bt_compressed) << 1;
+    U32 const sizeBits = (U32)((isRle ? blockSize : cSize) << 3);
+    MEM_writeLE24(op, lastBlock + typeBits + sizeBits);
+    DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock);
+}
+
+/** ZSTD_buildBlockEntropyStats_literals() :
+ *  Builds entropy for the literals : selects hufMetadata->hType (set_basic, set_rle,
+ *  set_compressed or set_repeat) and, once a new table is built, writes its huffman
+ *  description to hufMetadata->hufDesBuffer. nextHuf starts as a copy of prevHuf.
+ *  Requires ENTROPY_WORKSPACE_SIZE workspace
+ * @return : size of huffman description table (0 unless hType == set_compressed), or an error code
+ */
+static size_t
+ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
+                               const ZSTD_hufCTables_t* prevHuf,
+                                     ZSTD_hufCTables_t* nextHuf,
+                                     ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               const int literalsCompressionIsDisabled,
+                                     void* workspace, size_t wkspSize,
+                                     int hufFlags)
+{
+    BYTE* const wkspStart = (BYTE*)workspace;
+    BYTE* const wkspEnd = wkspStart + wkspSize;
+    BYTE* const countWkspStart = wkspStart;
+    unsigned* const countWksp = (unsigned*)workspace;   /* symbol counters sit at the very start of the workspace */
+    const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
+    BYTE* const nodeWksp = countWkspStart + countWkspSize;   /* everything after the counters */
+    const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp);
+    unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+    unsigned huffLog = LitHufLog;
+    HUF_repeat repeat = prevHuf->repeatMode;
+    DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);
+
+    /* Prepare nextEntropy assuming reusing the existing table */
+    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+    if (literalsCompressionIsDisabled) {
+        DEBUGLOG(5, "set_basic - disabled");
+        hufMetadata->hType = set_basic;
+        return 0;
+    }
+
+    /* small ? don't even attempt compression (speed opt) : threshold is 6 when the previous table is marked valid, COMPRESS_LITERALS_SIZE_MIN otherwise */
+#ifndef COMPRESS_LITERALS_SIZE_MIN
+# define COMPRESS_LITERALS_SIZE_MIN 63  /* heuristic */
+#endif
+    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+        if (srcSize <= minLitSize) {
+            DEBUGLOG(5, "set_basic - too small");
+            hufMetadata->hType = set_basic;
+            return 0;
+    }   }
+
+    /* Scan input and build symbol stats */
+    {   size_t const largest =
+            HIST_count_wksp (countWksp, &maxSymbolValue,
+                            (const BYTE*)src, srcSize,
+                            workspace, wkspSize);
+        FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
+        if (largest == srcSize) {
+            /* only one literal symbol */
+            DEBUGLOG(5, "set_rle");
+            hufMetadata->hType = set_rle;
+            return 0;
+        }
+        if (largest <= (srcSize >> 7)+4) {
+            /* heuristic: likely not compressible */
+            DEBUGLOG(5, "set_basic - no gain");
+            hufMetadata->hType = set_basic;
+            return 0;
+    }   }
+
+    /* Validate the previous Huffman table : if it fails validation against the new counts, it is no longer a reuse candidate */
+    if (repeat == HUF_repeat_check
+      && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+        repeat = HUF_repeat_none;
+    }
+
+    /* Build Huffman Tree */
+    ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags);
+    assert(huffLog <= LitHufLog);
+    {   size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
+                                                    maxSymbolValue, huffLog,
+                                                    nodeWksp, nodeWkspSize);
+        FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
+        huffLog = (U32)maxBits;
+    }
+    {   /* Build and write the CTable. NOTE(review): hSize is not tested with ZSTD_isError() in this function - confirm the caller's check is sufficient */
+        size_t const newCSize = HUF_estimateCompressedSize(
+                (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+        size_t const hSize = HUF_writeCTable_wksp(
+                hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+                (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
+                nodeWksp, nodeWkspSize);
+        /* Check against repeating the previous CTable : kept when its estimate beats srcSize and either is no larger than new table + header, or hSize + 12 reaches srcSize */
+        if (repeat != HUF_repeat_none) {
+            size_t const oldCSize = HUF_estimateCompressedSize(
+                    (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+            if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+                DEBUGLOG(5, "set_repeat - smaller");
+                ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+                hufMetadata->hType = set_repeat;
+                return 0;
+        }   }
+        if (newCSize + hSize >= srcSize) {
+            DEBUGLOG(5, "set_basic - no gains");
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));   /* discard the new table : nextHuf goes back to a copy of prevHuf */
+            hufMetadata->hType = set_basic;
+            return 0;
+        }
+        DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+        hufMetadata->hType = set_compressed;
+        nextHuf->repeatMode = HUF_repeat_check;
+        return hSize;
+    }
+}
+
+
+/* ZSTD_buildDummySequencesStatistics():
+ * Produces a ZSTD_symbolEncodingTypeStats_t whose three encoding types are set_basic
+ * and whose remaining fields are 0; every repeatMode of nextEntropy is set to FSE_repeat_none.
+ */
+static ZSTD_symbolEncodingTypeStats_t
+ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy)
+{
+    ZSTD_symbolEncodingTypeStats_t dummy = {set_basic, set_basic, set_basic, 0, 0, 0};
+    nextEntropy->matchlength_repeatMode = FSE_repeat_none;
+    nextEntropy->offcode_repeatMode = FSE_repeat_none;
+    nextEntropy->litlength_repeatMode = FSE_repeat_none;
+    return dummy;
+}
+
+/** ZSTD_buildBlockEntropyStats_sequences() :
+ *  Builds entropy for the sequences : records the encoding type selected for the LL, Off
+ *  and ML symbols in fseMetadata, whose fseTablesBuffer serves as output for the fse tables.
+ *  Requires ENTROPY_WORKSPACE_SIZE wksp.
+ * @return : size of fse tables or error code */
+static size_t
+ZSTD_buildBlockEntropyStats_sequences(
+                const seqStore_t* seqStorePtr,
+                const ZSTD_fseCTables_t* prevEntropy,
+                      ZSTD_fseCTables_t* nextEntropy,
+                const ZSTD_CCtx_params* cctxParams,
+                      ZSTD_fseCTablesMetadata_t* fseMetadata,
+                      void* workspace, size_t wkspSize)
+{
+    size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    BYTE* const tablesBegin = fseMetadata->fseTablesBuffer;
+    BYTE* const tablesEnd = tablesBegin + sizeof(fseMetadata->fseTablesBuffer);
+    unsigned* const counters = (unsigned*)workspace;   /* first (MaxSeq + 1) slots of the workspace */
+    unsigned* const scratch = counters + (MaxSeq + 1);   /* whatever follows the counters */
+    size_t const scratchSize = wkspSize - (MaxSeq + 1) * sizeof(*counters);
+    ZSTD_symbolEncodingTypeStats_t stats;
+
+    DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq);
+    if (nbSeq == 0) {
+        stats = ZSTD_buildDummySequencesStatistics(nextEntropy);
+    } else {
+        stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, prevEntropy, nextEntropy,
+                                              tablesBegin, tablesEnd, cctxParams->cParams.strategy,
+                                              counters, scratch, scratchSize);
+    }
+    FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!");
+    fseMetadata->lastCountSize = stats.lastCountSize;
+    fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype;
+    fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype;
+    fseMetadata->llType = (symbolEncodingType_e) stats.LLtype;
+    return stats.size;
+}
+
+
+/** ZSTD_buildBlockEntropyStats() :
+ *  Builds entropy for the block : literals first, then sequences, each result being
+ *  stored in the matching part of entropyMetadata.
+ *  Requires workspace size ENTROPY_WORKSPACE_SIZE
+ * @return : 0 on success, or an error code
+ *  Note : also employed in superblock */
+size_t ZSTD_buildBlockEntropyStats(
+            const seqStore_t* seqStorePtr,
+            const ZSTD_entropyCTables_t* prevEntropy,
+                  ZSTD_entropyCTables_t* nextEntropy,
+            const ZSTD_CCtx_params* cctxParams,
+                  ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                  void* workspace, size_t wkspSize)
+{
+    int hufFlags = 0;
+    if (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD) {
+        hufFlags = HUF_flags_optimalDepth;
+    }
+
+    /* Step 1 : literals; hufDesSize receives the description size or an error code */
+    {   size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
+        entropyMetadata->hufMetadata.hufDesSize = ZSTD_buildBlockEntropyStats_literals(
+                seqStorePtr->litStart, litSize, &prevEntropy->huf, &nextEntropy->huf,
+                &entropyMetadata->hufMetadata, ZSTD_literalsCompressionIsDisabled(cctxParams),
+                workspace, wkspSize, hufFlags);
+    }
+    FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
+
+    /* Step 2 : sequences; fseTablesSize receives the tables size or an error code */
+    entropyMetadata->fseMetadata.fseTablesSize = ZSTD_buildBlockEntropyStats_sequences(
+                seqStorePtr, &prevEntropy->fse, &nextEntropy->fse, cctxParams,
+                &entropyMetadata->fseMetadata, workspace, wkspSize);
+    FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed");
+    return 0;
+}
+
+/* Estimates the size of a block's literals section : litSize for set_basic, 1 for set_rle, otherwise header + huffman estimate (litSize again if the histogram fails) */
+static size_t
+ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
+                               const ZSTD_hufCTables_t* huf,
+                               const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               void* workspace, size_t wkspSize,
+                               int writeEntropy)
+{
+    unsigned* const histogram = (unsigned*)workspace;
+    unsigned maxSymbol = HUF_SYMBOLVALUE_MAX;
+    size_t estimate;
+    if (hufMetadata->hType == set_basic) return litSize;
+    if (hufMetadata->hType == set_rle) return 1;
+    if (hufMetadata->hType != set_compressed && hufMetadata->hType != set_repeat) {
+        assert(0); /* impossible */
+        return 0;
+    }
+    {   size_t const largest = HIST_count_wksp (histogram, &maxSymbol, (const BYTE*)literals, litSize, workspace, wkspSize);
+        if (ZSTD_isError(largest)) return litSize;
+    }
+    estimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, histogram, maxSymbol);
+    if (writeEntropy) estimate += hufMetadata->hufDesSize;
+    if (litSize >= 256) estimate += 6; /* multi-stream huffman uses 6-byte jump table */
+    estimate += 3 + (litSize >= 1 KB) + (litSize >= 16 KB);   /* literals section header */
+    return estimate;
+}
+
+/* Estimates, in bytes, the cost of one symbol stream (of, ml or ll) of a block : table-based bit cost + additional bits, divided by 8 */
+static size_t
+ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+                    const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
+                    const FSE_CTable* fseCTable,
+                    const U8* additionalBits,
+                    short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                    void* workspace, size_t wkspSize)
+{
+    unsigned* const histogram = (unsigned*)workspace;
+    unsigned max = maxCode;
+    size_t bits = 0;
+    size_t n;
+
+    HIST_countFast_wksp(histogram, &max, codeTable, nbSeq, workspace, wkspSize);  /* can't fail */
+    switch (type) {
+    case set_basic:
+        /* We selected this encoding type, so it must be valid. */
+        assert(max <= defaultMax);
+        (void)defaultMax;
+        bits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, histogram, max);
+        break;
+    case set_compressed:
+    case set_repeat:
+        bits = ZSTD_fseBitCost(fseCTable, histogram, max);
+        break;
+    default:   /* set_rle : the table cost stays at 0 */
+        break;
+    }
+
+    if (ZSTD_isError(bits)) return nbSeq * 10;
+    for (n = 0; n < nbSeq; n++) {
+        /* for offset, offset code is also the number of additional bits */
+        bits += additionalBits ? additionalBits[codeTable[n]] : codeTable[n];
+    }
+    return bits >> 3;
+}
+
+/* Estimates the size of a block's sequences section : header + the three symbol streams (+ fse tables when writeEntropy is set) */
+static size_t
+ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
+                                 const BYTE* llCodeTable,
+                                 const BYTE* mlCodeTable,
+                                 size_t nbSeq,
+                                 const ZSTD_fseCTables_t* fseTables,
+                                 const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                 void* workspace, size_t wkspSize,
+                                 int writeEntropy)
+{
+    size_t const ofSize = ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff,
+                                    fseTables->offcodeCTable, NULL,
+                                    OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                    workspace, wkspSize);
+    size_t const llSize = ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL,
+                                    fseTables->litlengthCTable, LL_bits,
+                                    LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                    workspace, wkspSize);
+    size_t const mlSize = ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML,
+                                    fseTables->matchlengthCTable, ML_bits,
+                                    ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                    workspace, wkspSize);
+    size_t const tablesSize = writeEntropy ? fseMetadata->fseTablesSize : 0;
+    /* the header grows by one byte at nbSeq >= 128 and by one more at nbSeq >= LONGNBSEQ */
+    size_t const headerSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ);
+    return ofSize + llSize + mlSize + tablesSize + headerSize;
+}
+
+/* Estimates a whole block : ZSTD_blockHeaderSize + literals section estimate + sequences section estimate */
+static size_t
+ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
+                       const BYTE* ofCodeTable,
+                       const BYTE* llCodeTable,
+                       const BYTE* mlCodeTable,
+                       size_t nbSeq,
+                       const ZSTD_entropyCTables_t* entropy,
+                       const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                       void* workspace, size_t wkspSize,
+                       int writeLitEntropy, int writeSeqEntropy)
+{
+    size_t estimate = ZSTD_blockHeaderSize;
+    estimate += ZSTD_estimateBlockSize_literal(literals, litSize, &entropy->huf, &entropyMetadata->hufMetadata,
+                                               workspace, wkspSize, writeLitEntropy);
+    estimate += ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, nbSeq,
+                                                 &entropy->fse, &entropyMetadata->fseMetadata,
+                                                 workspace, wkspSize, writeSeqEntropy);
+    return estimate;
+}
+
+/* Builds the entropy statistics of seqStore (into zc->blockSplitCtx.entropyMetadata and the
+ * next block's entropy tables), then uses them to estimate the block size.
+ * @return: estimated compressed size of the seqStore, or a zstd error.
+ */
+static size_t
+ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc)
+{
+    ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
+    DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
+    FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
+                    &zc->blockState.prevCBlock->entropy,
+                    &zc->blockState.nextCBlock->entropy,
+                    &zc->appliedParams,
+                    entropyMetadata,
+                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), "");
+    {   size_t const litSize = (size_t)(seqStore->lit - seqStore->litStart);
+        size_t const nbSeq = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+        int const writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed);
+        return ZSTD_estimateBlockSize(seqStore->litStart, litSize,
+                                      seqStore->ofCode, seqStore->llCode, seqStore->mlCode, nbSeq,
+                                      &zc->blockState.nextCBlock->entropy, entropyMetadata,
+                                      zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, writeLitEntropy, 1);
+    }
+}
+
+/* Sums the literal lengths of all sequences held in a seqStore */
+static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore)
+{
+    size_t const seqCount = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+    int const hasLongLitLength = (seqStore->longLengthType == ZSTD_llt_literalLength);
+    size_t total = 0;
+    size_t n;
+    for (n = 0; n < seqCount; ++n) {
+        total += seqStore->sequencesStart[n].litLength;
+        /* the sequence at longLengthPos counts for 0x10000 additional literal bytes */
+        if (hasLongLitLength && n == seqStore->longLengthPos) total += 0x10000;
+    }
+    return total;
+}
+
+/* Sums the match lengths of all sequences held in a seqStore */
+static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore)
+{
+    size_t const seqCount = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+    int const hasLongMatchLength = (seqStore->longLengthType == ZSTD_llt_matchLength);
+    size_t total = 0;
+    size_t n;
+    for (n = 0; n < seqCount; ++n) {
+        total += seqStore->sequencesStart[n].mlBase + MINMATCH;
+        /* the sequence at longLengthPos counts for 0x10000 additional match bytes */
+        if (hasLongMatchLength && n == seqStore->longLengthPos) total += 0x10000;
+    }
+    return total;
+}
+
+/* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx).
+ * Stores the result in resultSeqStore : a shallow copy whose sequence, literal and code pointers are re-based.
+ */
+static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
+                               const seqStore_t* originalSeqStore,
+                                     size_t startIdx, size_t endIdx)
+{
+    *resultSeqStore = *originalSeqStore;   /* struct copy; each range is narrowed below */
+    if (startIdx > 0) {
+        resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx;   /* temporary end, so that the count below covers [0, startIdx) */
+        resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);   /* skip the literals of the preceding sequences */
+    }
+
+    /* Move longLengthPos into the correct position if necessary */
+    if (originalSeqStore->longLengthType != ZSTD_llt_none) {
+        if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) {   /* NOTE(review): pos == endIdx is retained although the chunk excludes endIdx - confirm intent */
+            resultSeqStore->longLengthType = ZSTD_llt_none;
+        } else {
+            resultSeqStore->longLengthPos -= (U32)startIdx;   /* position becomes relative to the chunk */
+        }
+    }
+    resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
+    resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
+    if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) {
+        /* This accounts for possible last literals if the derived chunk reaches the end of the block */
+        assert(resultSeqStore->lit == originalSeqStore->lit);
+    } else {
+        size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);   /* literals of [startIdx, endIdx) only */
+        resultSeqStore->lit = resultSeqStore->litStart + literalsBytes;
+    }
+    resultSeqStore->llCode += startIdx;   /* code tables hold one entry per sequence */
+    resultSeqStore->mlCode += startIdx;
+    resultSeqStore->ofCode += startIdx;
+}
+
+/**
+ * Translates a repcode (offBase, in the numeric representation of ZSTD_storeSeq())
+ * into the raw offset it designates, given the repcode history and the ll0 flag.
+ */
+static U32
+ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0)
+{
+    U32 const repIdx = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;  /* [ 0 - 3 ] */
+    assert(OFFBASE_IS_REPCODE(offBase));
+    if (repIdx != ZSTD_REP_NUM) {
+        /* usual case : direct lookup in the repcode history */
+        return rep[repIdx];
+    }
+    /* repIdx == ZSTD_REP_NUM selects `rep[0] - 1`, and requires ll0 to be set.
+     * The result is only a valid offset when it is > 0.
+     * Note : it may happen that `rep[0]==1` in exceptional circumstances,
+     * in which case this function returns 0, which is an invalid offset.
+     * It's not an issue though, since this value will be
+     * compared and discarded within ZSTD_seqStore_resolveOffCodes(). */
+    assert(ll0);
+    return rep[0] - 1;
+}
+
+/**
+ * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise
+ * due to emission of RLE/raw blocks that disturb the offset history,
+ * and replaces any repcodes within the seqStore that may be invalid (sequences are rewritten in place).
+ *
+ * dRepcodes are updated as would be on the decompression side.
+ * cRepcodes are updated exactly in accordance with the seqStore.
+ *
+ * Note : this function assumes seq->offBase respects the following numbering scheme :
+ *        0 : invalid
+ *        1-3 : repcode 1-3
+ *        4+ : real_offset+3
+ */
+static void
+ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
+                        const seqStore_t* const seqStore, U32 const nbSeq)
+{
+    U32 idx = 0;
+    U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq;   /* nbSeq means "no such sequence" */
+    for (; idx < nbSeq; ++idx) {
+        seqDef* const seq = seqStore->sequencesStart + idx;
+        U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);   /* litLength field is 0 and this is not the long-literal-length sequence */
+        U32 const offBase = seq->offBase;   /* original value, saved before any replacement below */
+        assert(offBase > 0);
+        if (OFFBASE_IS_REPCODE(offBase)) {
+            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
+            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
+            /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
+             * the repcode with the offset it actually references, determined by the compression
+             * repcode history.
+             */
+            if (dRawOffset != cRawOffset) {
+                seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
+            }
+        }
+        /* Compression repcode history is always updated with values directly from the unmodified seqStore.
+         * Decompression repcode history may use modified seq->offBase value taken from compression repcode history.
+         */
+        ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
+        ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
+    }
+}
+
+/* ZSTD_compressSeqStore_singleBlock():
+ * Compresses a seqStore into a block with a block header, into the buffer dst.
+ * When isPartition is set, repcodes of the seqStore are first reconciled against dRep / cRep.
+ * Returns the total size of that block (including header) or a ZSTD error code.
+ */
+static size_t
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
+                            const seqStore_t* const seqStore,
+                                  repcodes_t* const dRep, repcodes_t* const cRep,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  U32 lastBlock, U32 isPartition)
+{
+    const U32 rleMaxLength = 25;   /* RLE is only checked for blocks whose compressed size is below this value */
+    BYTE* op = (BYTE*)dst;
+    const BYTE* ip = (const BYTE*)src;
+    size_t cSize;
+    size_t cSeqsSize;
+
+    /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */
+    repcodes_t const dRepOriginal = *dRep;
+    DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock");
+    if (isPartition)
+        ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
+
+    RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit");
+    cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore,
+                &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+                &zc->appliedParams,
+                op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize,
+                srcSize,
+                zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+                zc->bmi2);
+    FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!");
+
+    if (!zc->isFirstBlock &&
+        cSeqsSize < rleMaxLength &&
+        ZSTD_isRLE((BYTE const*)src, srcSize)) {
+        /* We don't want to emit our first block as a RLE even if it qualifies because
+        * doing so will cause the decoder (cli only) to throw a "should consume all input error."
+        * This is only an issue for zstd <= v1.4.3
+        */
+        cSeqsSize = 1;   /* 1 is the marker tested below to emit an RLE block */
+    }
+
+    /* Sequence collection not supported when block splitting */
+    if (zc->seqCollector.collectSequences) {
+        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed");
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+        return 0;   /* sequences were copied out; no block size is reported */
+    }
+
+    if (cSeqsSize == 0) {   /* 0 : emit a raw (uncompressed) block */
+        cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
+        FORWARD_IF_ERROR(cSize, "Nocompress block failed");
+        DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize);
+        *dRep = dRepOriginal; /* reset simulated decompression repcode history */
+    } else if (cSeqsSize == 1) {   /* 1 : emit an RLE block repeating the first source byte */
+        cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock);
+        FORWARD_IF_ERROR(cSize, "RLE compress block failed");
+        DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize);
+        *dRep = dRepOriginal; /* reset simulated decompression repcode history */
+    } else {   /* payload already sits at op + ZSTD_blockHeaderSize; only the header remains to be written */
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+        writeBlockHeader(op, cSeqsSize, srcSize, lastBlock);
+        cSize = ZSTD_blockHeaderSize + cSeqsSize;
+        DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize);
+    }
+
+    if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)   /* downgrade valid -> check for the following block */
+        zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+    return cSize;
+}
+
+/* Cursor into the table of split positions, shared by all levels of the recursive split search. */
+typedef struct {
+    U32* splitLocations;    /* Array of split indices (sequence indexes), filled in ascending order */
+    size_t idx;             /* Next free slot within splitLocations == number of splits recorded so far */
+} seqStoreSplits;
+
+#define MIN_SEQUENCES_BLOCK_SPLITTING 300   /* a range holding fewer sequences than this is never split */
+
+/* Helper function to perform the recursive search for block splits.
+ * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
+ * If advantageous to split, then we recurse down the two sub-blocks.
+ * If not, or if an error occurred in estimation, then we do not recurse.
+ *
+ * Note: The recursion depth is capped by a heuristic minimum number of sequences,
+ * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
+ * In practice, recursion depth usually doesn't go beyond 4.
+ *
+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
+ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
+ * maximum of 128 KB, this value is actually impossible to reach.
+ */
+static void
+ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
+                             ZSTD_CCtx* zc, const seqStore_t* origSeqStore)
+{
+    seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;   /* the three scratch seqStores live in zc and are overwritten by every call */
+    seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
+    seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
+    size_t estimatedOriginalSize;
+    size_t estimatedFirstHalfSize;
+    size_t estimatedSecondHalfSize;
+    size_t midIdx = (startIdx + endIdx)/2;
+
+    DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
+    assert(endIdx >= startIdx);
+    if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
+        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);   /* also logged when the split cap is what stopped us */
+        return;
+    }
+    ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
+    ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
+    ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
+    estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
+    estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
+    estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
+    DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
+             estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
+    if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
+        return;   /* estimation failed : keep this range unsplit */
+    }
+    if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) {
+        DEBUGLOG(5, "split decided at seqNb:%zu", midIdx);
+        ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);   /* left half first, so that split locations are recorded in ascending order */
+        splits->splitLocations[splits->idx] = (U32)midIdx;   /* NOTE(review): the cap is only tested on entry, not re-tested after the left recursion - confirm the table cannot overflow */
+        splits->idx++;
+        ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore);
+    }
+}
+
+/* Base recursive function.
+ * Populates a table with intra-block partition indices that can improve compression ratio.
+ * The table is terminated by nbSeq, written after the last split (nothing is written when nbSeq <= 4).
+ * @return: number of splits made (which equals the size of the partition table - 1).
+ */
+static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
+{
+    seqStoreSplits splits;
+    splits.splitLocations = partitions;
+    splits.idx = 0;
+    if (nbSeq <= 4) {
+        DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq);
+        /* Refuse to try and split anything with 4 sequences or fewer */
+        return 0;
+    }
+    ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore);
+    splits.splitLocations[splits.idx] = nbSeq;   /* end marker : the last partition stops at nbSeq */
+    DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1);
+    return splits.idx;
+}
+
+/* ZSTD_compressBlock_splitBlock_internal():
+ * Attempts to split a given block into multiple blocks to improve compression ratio.
+ * Expects zc->seqStore to already hold the nbSeq sequences of the block.
+ * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
+ */
+static size_t
+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t blockSize,
+                                    U32 lastBlock, U32 nbSeq)
+{
+    size_t cSize = 0;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    size_t i = 0;
+    size_t srcBytesTotal = 0;
+    U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+    seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+    seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore;
+    size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
+
+    /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
+     * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
+     * separate repcode histories that simulate repcode history on compression and decompression side,
+     * and use the histories to determine whether we must replace a particular repcode with its raw offset.
+     *
+     * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed
+     *    or RLE. This allows us to retrieve the offset value that an invalid repcode references within
+     *    a nocompress/RLE block.
+     * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use
+     *    the replacement offset value rather than the original repcode to update the repcode history.
+     *    dRep also will be the final repcode history sent to the next block.
+     *
+     * See ZSTD_seqStore_resolveOffCodes() for more details.
+     */
+    repcodes_t dRep;
+    repcodes_t cRep;
+    ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
+    ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
+    ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
+
+    DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+                (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
+                (unsigned)zc->blockState.matchState.nextToUpdate);
+
+    if (numSplits == 0) {   /* no beneficial split : compress the whole seqStore as one block */
+        size_t cSizeSingleBlock =
+            ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+                                            &dRep, &cRep,
+                                            op, dstCapacity,
+                                            ip, blockSize,
+                                            lastBlock, 0 /* isPartition */);
+        FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
+        DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
+        assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX);
+        assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize);
+        return cSizeSingleBlock;
+    }
+
+    ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
+    for (i = 0; i <= numSplits; ++i) {   /* numSplits splits produce numSplits+1 partitions */
+        size_t cSizeChunk;
+        U32 const lastPartition = (i == numSplits);
+        U32 lastBlockEntireSrc = 0;   /* only the final partition may carry the last-block flag */
+
+        size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
+        srcBytesTotal += srcBytes;
+        if (lastPartition) {
+            /* This is the final partition, need to account for possible last literals */
+            srcBytes += blockSize - srcBytesTotal;
+            lastBlockEntireSrc = lastBlock;
+        } else {
+            ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]);   /* prepared before compressing the current partition */
+        }
+
+        cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore,
+                                                      &dRep, &cRep,
+                                                       op, dstCapacity,
+                                                       ip, srcBytes,
+                                                       lastBlockEntireSrc, 1 /* isPartition */);
+        DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size",
+                    ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);   /* NOTE(review): estimator call is a log argument; presumably only evaluated in debug builds */
+        FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
+
+        ip += srcBytes;
+        op += cSizeChunk;
+        dstCapacity -= cSizeChunk;
+        cSize += cSizeChunk;
+        *currSeqStore = *nextSeqStore;
+        assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize);
+    }
+    /* cRep and dRep may have diverged during the compression.
+     * If so, we use the dRep repcodes for the next block.
+     */
+    ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
+    return cSize;
+}
+
+static size_t
+ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
+                              void* dst, size_t dstCapacity,
+                              const void* src, size_t srcSize, U32 lastBlock)   /* @return : total size written to dst (headers included), or an error code */
+{
+    U32 nbSeq;
+    size_t cSize;
+    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
+    assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable);
+
+    {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+        FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+        if (bss == ZSTDbss_noCompress) {   /* no sequences to work with : emit one raw block, no splitting */
+            if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+                zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
+            cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
+            FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+            DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
+            return cSize;
+        }
+        nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart);   /* number of sequences just stored in zc->seqStore */
+    }
+
+    cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq);
+    FORWARD_IF_ERROR(cSize, "Splitting blocks failed!");
+    return cSize;
+}
+
+static size_t
+ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+                            void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize, U32 frame)   /* @return : 0 = not compressed, 1 = RLE (byte stored in dst[0]), else compressed size, or an error code */
+{
+    /* This is an estimated upper bound for the length of an rle block.
+     * This isn't the actual upper bound.
+     * Finding the real threshold needs further investigation.
+     */
+    const U32 rleMaxLength = 25;
+    size_t cSize;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+                (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
+                (unsigned)zc->blockState.matchState.nextToUpdate);
+
+    {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+        FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+        if (bss == ZSTDbss_noCompress) {
+            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
+            cSize = 0;   /* reported to the caller as "not compressed" */
+            goto out;
+        }
+    }
+
+    if (zc->seqCollector.collectSequences) {   /* sequence collection : copy the sequences out, write nothing to dst */
+        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed");
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+        return 0;
+    }
+
+    /* encode sequences and literals */
+    cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore,
+            &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+            &zc->appliedParams,
+            dst, dstCapacity,
+            srcSize,
+            zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+            zc->bmi2);
+
+    if (frame &&
+        /* We don't want to emit our first block as a RLE even if it qualifies because
+         * doing so will cause the decoder (cli only) to throw a "should consume all input error."
+         * This is only an issue for zstd <= v1.4.3
+         */
+        !zc->isFirstBlock &&
+        cSize < rleMaxLength &&
+        ZSTD_isRLE(ip, srcSize))
+    {
+        cSize = 1;   /* RLE : the payload is the single repeated byte */
+        op[0] = ip[0];
+    }
+
+out:
+    if (!ZSTD_isError(cSize) && cSize > 1) {   /* 0 (not compressed) and 1 (RLE) do not confirm the new repcodes and entropy tables */
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+    }
+    /* We check that dictionaries have offset codes available for the first
+     * block. After the first block, the offcode table might not have large
+     * enough codes to represent the offsets in the data.
+     */
+    if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+        zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+    return cSize;   /* may still be an error code from ZSTD_entropyCompressSeqStore() */
+}
+
+static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               const size_t bss, U32 lastBlock)   /* bss : result of ZSTD_buildSeqStore() for this block */
+{
+    DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()");
+    if (bss == ZSTDbss_compress) {
+        if (/* We don't want to emit our first block as a RLE even if it qualifies because
+            * doing so will cause the decoder (cli only) to throw a "should consume all input error."
+            * This is only an issue for zstd <= v1.4.3
+            */
+            !zc->isFirstBlock &&
+            ZSTD_maybeRLE(&zc->seqStore) &&
+            ZSTD_isRLE((BYTE const*)src, srcSize))
+        {
+            return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock);   /* whole block is one repeated byte */
+        }
+        /* Attempt superblock compression.
+         *
+         * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the
+         * standard ZSTD_compressBound(). This is a problem, because even if we have
+         * space now, taking an extra byte now could cause us to run out of space later
+         * and violate ZSTD_compressBound().
+         *
+         * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize.
+         *
+         * In order to respect ZSTD_compressBound() we must attempt to emit a raw
+         * uncompressed block in these cases:
+         *   * cSize == 0: Return code for an uncompressed block.
+         *   * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize).
+         *     ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of
+         *     output space.
+         *   * cSize >= blockBound(srcSize): We have expanded the block too much so
+         *     emit an uncompressed block.
+         */
+        {   size_t const cSize =
+                ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+            if (cSize != ERROR(dstSize_tooSmall)) {   /* dstSize_tooSmall falls through to the raw-block attempt below */
+                size_t const maxCSize =
+                    srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+                FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");   /* any other error is returned to the caller */
+                if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {   /* keep the result only when it is small enough */
+                    ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+                    return cSize;
+                }
+            }
+        }
+    } /* if (bss == ZSTDbss_compress)*/
+
+    DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
+    /* Superblock compression failed, attempt to emit a single no compress block.
+     * The decoder will be able to stream this block since it is uncompressed.
+     */
+    return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
+}
+
+static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               U32 lastBlock)   /* @return : size written to dst, or an error code */
+{
+    size_t cSize = 0;
+    const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);   /* may be an error code : checked right after the log line */
+    DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)",
+                (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize);
+    FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+
+    cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock);
+    FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed");
+
+    if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)   /* downgrade valid -> check for the following block */
+        zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+    return cSize;
+}
+
+static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
+                                         ZSTD_cwksp* ws,
+                                         ZSTD_CCtx_params const* params,
+                                         void const* ip,
+                                         void const* iend)
+{
+    U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy);
+    U32 const maxDist = (U32)1 << params->cParams.windowLog;
+    if (!ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) return;
+    {   U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);
+        ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
+        ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
+        ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+        ZSTD_cwksp_mark_tables_dirty(ws);
+        ZSTD_reduceIndex(ms, params, correction);
+        ZSTD_cwksp_mark_tables_clean(ws);
+        /* shift nextToUpdate by the same correction, clamping at 0 */
+        ms->nextToUpdate = (ms->nextToUpdate < correction) ? 0 : ms->nextToUpdate - correction;
+        /* dictionaries are invalidated whenever an overflow correction happens */
+        ms->loadedDictEnd = 0;
+        ms->dictMatchState = NULL;
+    }
+}
+
+/*! ZSTD_compress_frameChunk() :
+*   Compress a chunk of data into one or multiple blocks.
+*   All blocks will be terminated, all input will be consumed.
+*   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+*   Frame is supposed already started (header already produced)
+*  @return : compressed size, or an error code
+*/
+static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     U32 lastFrameChunk)
+{
+    size_t blockSize = cctx->blockSize;
+    size_t remaining = srcSize;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog;
+
+    assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX);
+
+    DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize);
+    if (cctx->appliedParams.fParams.checksumFlag && srcSize)   /* checksum covers the uncompressed input */
+        XXH64_update(&cctx->xxhState, src, srcSize);
+
+    while (remaining) {
+        ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+        U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);   /* set only for the final block of the final chunk */
+
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
+                        dstSize_tooSmall,
+                        "not enough space to store compressed block");
+        if (remaining < blockSize) blockSize = remaining;   /* the final block may be shorter */
+
+        ZSTD_overflowCorrectIfNeeded(
+            ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize);
+        ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
+        ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
+
+        /* Ensure hash/chain table insertion resumes no sooner than lowlimit */
+        if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;
+
+        {   size_t cSize;
+            if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) {   /* this path and the splitter path write their own block headers */
+                cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock);
+                FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed");
+                assert(cSize > 0);
+                assert(cSize <= blockSize + ZSTD_blockHeaderSize);
+            } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) {
+                cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock);
+                FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed");
+                assert(cSize > 0 || cctx->seqCollector.collectSequences == 1);
+            } else {   /* default path : payload goes after the header slot, header is written afterwards */
+                cSize = ZSTD_compressBlock_internal(cctx,
+                                        op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
+                                        ip, blockSize, 1 /* frame */);
+                FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed");
+
+                if (cSize == 0) {  /* block is not compressible */
+                    cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+                    FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+                } else {   /* header : bit 0 = last-block flag, bits 1-2 = block type, remaining bits = size */
+                    U32 const cBlockHeader = cSize == 1 ?
+                        lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :   /* RLE : the size field holds blockSize, not cSize */
+                        lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+                    MEM_writeLE24(op, cBlockHeader);
+                    cSize += ZSTD_blockHeaderSize;
+                }
+            }  /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/
+
+
+            ip += blockSize;
+            assert(remaining >= blockSize);
+            remaining -= blockSize;
+            op += cSize;
+            assert(dstCapacity >= cSize);
+            dstCapacity -= cSize;
+            cctx->isFirstBlock = 0;
+            DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u",
+                        (unsigned)cSize);
+    }   }
+
+    if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending;   /* only when at least one byte was produced */
+    return (size_t)(op-ostart);
+}
+
+/* ZSTD_writeFrameHeader() : writes magic number (zstd1 format only), descriptor byte, window byte (unless single segment), dictID and content size fields. @return : nb of bytes written, or an error code */
+static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
+                                    const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID)
+{   BYTE* const op = (BYTE*)dst;
+    U32   const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536);   /* 0-3 */
+    U32   const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength;   /* 0-3 */
+    U32   const checksumFlag = params->fParams.checksumFlag>0;
+    U32   const windowSize = (U32)1 << params->cParams.windowLog;
+    U32   const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);   /* when set, the window byte is not written */
+    BYTE  const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+    U32   const fcsCode = params->fParams.contentSizeFlag ?
+                     (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0;  /* 0-3 */
+    BYTE  const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) );
+    size_t pos=0;   /* nb of bytes written into op so far */
+
+    assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN));
+    RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall,
+                    "dst buf is too small to fit worst-case frame header size.");   /* checked against the worst case, not the actual header size */
+    DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u",
+                !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode);
+    if (params->format == ZSTD_f_zstd1) {   /* any other format : no magic number is written */
+        MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
+        pos = 4;
+    }
+    op[pos++] = frameHeaderDescriptionByte;
+    if (!singleSegment) op[pos++] = windowLogByte;
+    switch(dictIDSizeCode)   /* dictID field : 0, 1, 2 or 4 bytes */
+    {
+        default:
+            assert(0); /* impossible */
+            ZSTD_FALLTHROUGH;
+        case 0 : break;
+        case 1 : op[pos] = (BYTE)(dictID); pos++; break;
+        case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break;
+        case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break;
+    }
+    switch(fcsCode)   /* content size field : 0 or 1 (single segment), 2, 4 or 8 bytes */
+    {
+        default:
+            assert(0); /* impossible */
+            ZSTD_FALLTHROUGH;
+        case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break;
+        case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break;   /* the 2-byte field stores size-256 */
+        case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break;
+        case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break;
+    }
+    return pos;
+}
+
+/* ZSTD_writeSkippableFrame() :
+ * Writes out a skippable frame with the specified magic number variant (16 are supported),
+ * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data.
+ * The capacity test is written as a subtraction so that a huge srcSize cannot wrap it around.
+ * Returns the total number of bytes written, or a ZSTD error code.
+ */
+size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize, unsigned magicVariant) {
+    BYTE* op = (BYTE*)dst;
+    RETURN_ERROR_IF(dstCapacity < ZSTD_SKIPPABLEHEADERSIZE /* Skippable frame overhead */
+                 || dstCapacity - ZSTD_SKIPPABLEHEADERSIZE < srcSize,
+                    dstSize_tooSmall, "Not enough room for skippable frame");
+    RETURN_ERROR_IF(srcSize > (unsigned)0xFFFFFFFF, srcSize_wrong, "Src size too large for skippable frame");
+    RETURN_ERROR_IF(magicVariant > 15, parameter_outOfBound, "Skippable frame magic number variant not supported");
+    MEM_writeLE32(op, (U32)(ZSTD_MAGIC_SKIPPABLE_START + magicVariant));
+    MEM_writeLE32(op+4, (U32)srcSize);
+    if (srcSize > 0) ZSTD_memcpy(op+8, src, srcSize);   /* never hand a possibly-NULL src to memcpy for an empty payload */
+    return srcSize + ZSTD_SKIPPABLEHEADERSIZE;
+}
+
+/* ZSTD_writeLastEmptyBlock() :
+ * completes a frame by emitting one empty raw Block carrying the end-of-frame mark
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
+{
+    U32 const emptyLastRawBlock = 1 /*lastBlock*/ + (((U32)bt_raw)<<1);  /* size bits stay 0 */
+
+    RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall,
+                    "dst buf is too small to write frame trailer empty block.");
+    MEM_writeLE24(dst, emptyLastRawBlock);
+    return ZSTD_blockHeaderSize;
+}
+
+void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+{   /* The caller's array is referenced, not copied : size and capacity both equal nbSeq. */
+    assert(cctx->stage == ZSTDcs_init);
+    assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable);
+    cctx->externSeqStore.pos = 0;   /* both cursor fields are reset */
+    cctx->externSeqStore.posInSequence = 0;
+    cctx->externSeqStore.seq = seq;
+    cctx->externSeqStore.capacity = nbSeq;
+    cctx->externSeqStore.size = nbSeq;
+}
+
+/* ZSTD_compressContinue_internal() : compresses src as a frame chunk (frame!=0) or as a bare block (frame==0). @return : nb of bytes written into dst, frame header included, or an error code */
+static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                               U32 frame, U32 lastFrameChunk)
+{
+    ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+    size_t fhSize = 0;   /* stays 0 unless a frame header is written by this call */
+
+    DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
+                cctx->stage, (unsigned)srcSize);
+    RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong,
+                    "missing init (ZSTD_compressBegin)");
+
+    if (frame && (cctx->stage==ZSTDcs_init)) {   /* frame mode, nothing written yet : the header goes first */
+        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams,
+                                       cctx->pledgedSrcSizePlusOne-1, cctx->dictID);
+        FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+        assert(fhSize <= dstCapacity);
+        dstCapacity -= fhSize;
+        dst = (char*)dst + fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    if (!srcSize) return fhSize;  /* do not generate an empty block if no input */
+
+    if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) {   /* returned 0 : presumably src does not extend the previous segment (TODO confirm) */
+        ms->forceNonContiguous = 0;
+        ms->nextToUpdate = ms->window.dictLimit;
+    }
+    if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {   /* the LDM state keeps its own window */
+        ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0);
+    }
+
+    if (!frame) {
+        /* overflow check and correction for block mode */
+        ZSTD_overflowCorrectIfNeeded(
+            ms, &cctx->workspace, &cctx->appliedParams,
+            src, (BYTE const*)src + srcSize);
+    }
+
+    DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize);
+    {   size_t const cSize = frame ?
+                             ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
+                             ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */);
+        FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed");
+        cctx->consumedSrcSize += srcSize;   /* running totals, updated only after a successful compression */
+        cctx->producedCSize += (cSize + fhSize);
+        assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+        if (cctx->pledgedSrcSizePlusOne != 0) {  /* control src size */
+            ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+            RETURN_ERROR_IF(
+                cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne,
+                srcSize_wrong,
+                "error : pledgedSrcSize = %u, while realSrcSize >= %u",
+                (unsigned)cctx->pledgedSrcSizePlusOne-1,
+                (unsigned)cctx->consumedSrcSize);
+        }
+        return cSize + fhSize;
+    }
+}
+
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize)
+{   U32 const frameMode = 1, isLastChunk = 0;   /* always a frame chunk, never flagged as the last one */
+    DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, frameMode, isLastChunk);
+}
+
+/* NOTE: Must just wrap ZSTD_compressContinue_public() : keep this body a single forwarding call, with the same arguments in the same order */
+size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
+                             void* dst, size_t dstCapacity,
+                       const void* src, size_t srcSize)
+{
+    return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
+static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx)
+{   ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
+    size_t const windowSize = (size_t)1 << cParams.windowLog;
+    assert(!ZSTD_checkCParams(cParams));
+    return MIN(cctx->appliedParams.maxBlockSize, windowSize);   /* smaller of maxBlockSize and window size */
+}
+
+/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() : keep this body a single forwarding call */
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    return ZSTD_getBlockSize_deprecated(cctx);
+}
+
+/* ZSTD_compressBlock_deprecated() : rejects any input larger than ZSTD_getBlockSize_deprecated(), then compresses it as one block (frame==0, so no frame header is written) */
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
+    { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx);
+      RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }
+
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode : off */, 0 /* last chunk */);
+}
+
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() : keep this body a single forwarding call */
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/*! ZSTD_loadDictionaryContent() :
+ *  Registers `src` in ms->window (and in ls->window when LDM loading applies), keeping only a suffix of over-long dictionaries, then indexes it for the selected strategy.
+ *  @return : 0, or an error code */
+static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+                                         ldmState_t* ls,
+                                         ZSTD_cwksp* ws,
+                                         ZSTD_CCtx_params const* params,
+                                         const void* src, size_t srcSize,
+                                         ZSTD_dictTableLoadMethod_e dtlm,
+                                         ZSTD_tableFillPurpose_e tfp)
+{
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;   /* a NULL ls disables LDM loading even when LDM is enabled */
+
+    /* Assert that the ms params match the params we're being given */
+    ZSTD_assertEqualCParams(params->cParams, ms->cParams);
+
+    {   /* Ensure large dictionaries can't cause index overflow */
+
+        /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
+         * Dictionaries right at the edge will immediately trigger overflow
+         * correction, but I don't want to insert extra constraints here.
+         */
+        U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
+
+        int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
+        if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
+            /* Some dictionary matchfinders in zstd use "short cache",
+             * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
+             * CDict hashtable entry as a tag rather than as part of an index.
+             * When short cache is used, we need to truncate the dictionary
+             * so that its indices don't overlap with the tag. */
+            U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
+            maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
+            assert(!loadLdmDict);
+        }
+
+        /* If the dictionary is too large, only load the suffix of the dictionary. */
+        if (srcSize > maxDictSize) {
+            ip = iend - maxDictSize;
+            src = ip;
+            srcSize = maxDictSize;
+        }
+    }
+
+    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+        /* We must have cleared our windows when our source is this large. */
+        assert(ZSTD_window_isEmpty(ms->window));
+        if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
+    }
+    ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
+
+    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+
+    if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */
+        ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
+        ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
+        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
+    }
+
+    /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */
+    if (params->cParams.strategy < ZSTD_btultra) {
+        U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28);
+        if (srcSize > maxDictSize) {
+            ip = iend - maxDictSize;
+            src = ip;
+            srcSize = maxDictSize;
+        }
+    }
+
+    ms->nextToUpdate = (U32)(ip - ms->window.base);
+    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);   /* left at 0 under forceWindow */
+    ms->forceNonContiguous = params->deterministicRefPrefix;
+
+    if (srcSize <= HASH_READ_SIZE) return 0;   /* returns before any table is filled */
+
+    ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend);
+
+    switch(params->cParams.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, iend, dtlm, tfp);
+        break;
+    case ZSTD_dfast:
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR)
+        assert(srcSize >= HASH_READ_SIZE);
+        if (ms->dedicatedDictSearch) {
+            assert(ms->chainTable != NULL);
+            ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE);
+        } else {
+            assert(params->useRowMatchFinder != ZSTD_ps_auto);
+            if (params->useRowMatchFinder == ZSTD_ps_enable) {
+                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog);
+                ZSTD_memset(ms->tagTable, 0, tagTableSize);
+                ZSTD_row_update(ms, iend-HASH_READ_SIZE);
+                DEBUGLOG(4, "Using row-based hash table for lazy dict");
+            } else {
+                ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE);
+                DEBUGLOG(4, "Using chain-based hash table for lazy dict");
+            }
+        }
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
+        break;
+
+    case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+    case ZSTD_btultra2:
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+        assert(srcSize >= HASH_READ_SIZE);
+        ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend);
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
+        break;
+
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+    }
+
+    ms->nextToUpdate = (U32)(iend - ms->window.base);   /* next update index now points at the end of the dictionary */
+    return 0;
+}
+
+
+/* A dictionary table giving zero probability to a symbol that later shows up causes problems
+ * when FSE encoding, so such a table is only reported as FSE_repeat_check. It is reported as
+ * FSE_repeat_valid only when every symbol up to maxSymbolValue is covered with a non-zero count.
+ */
+static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue)
+{
+    FSE_repeat mode = FSE_repeat_valid;
+    U32 symbol = 0;
+    if (dictMaxSymbolValue < maxSymbolValue) return FSE_repeat_check;   /* table does not reach maxSymbolValue */
+    while (symbol <= maxSymbolValue) {
+        if (normalizedCounter[symbol++] == 0) {
+            mode = FSE_repeat_check;
+            break;
+        }
+    }
+    return mode;
+}
+
+/* ZSTD_loadCEntropy() :
+ * Reads the entropy section of a dictionary into `bs` : huffman table, then offcode / matchlength / litlength FSE tables, then the 3 repcodes.
+ * @return : nb of bytes consumed from `dict` (magic number and dictID included), or dictionary_corrupted */
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+                         const void* const dict, size_t dictSize)
+{
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff;
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+    RETURN_ERROR_IF(dictSize < 8, dictionary_corrupted, "");   /* never step past the end of a truncated dictionary */
+    dictPtr += 8;   /* skip magic num and dict ID */
+    bs->entropy.huf.repeatMode = HUF_repeat_check;
+
+    {   unsigned maxSymbolValue = 255;
+        unsigned hasZeroWeights = 1;
+        size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr,
+            dictEnd-dictPtr, &hasZeroWeights);
+        /* We only set the loaded table as valid if it contains all non-zero
+         * weights. Otherwise, we set it to check */
+        if (!hasZeroWeights && maxSymbolValue == 255)
+            bs->entropy.huf.repeatMode = HUF_repeat_valid;
+        RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, "");
+        dictPtr += hufHeaderSize;
+    }
+
+    {   unsigned offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
+        RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
+        /* fill all offset symbols to avoid garbage at end of table */
+        RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+                bs->entropy.fse.offcodeCTable,
+                offcodeNCount, MaxOff, offcodeLog,
+                workspace, HUF_WORKSPACE_SIZE)),
+            dictionary_corrupted, "");
+        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
+        RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
+        RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+                bs->entropy.fse.matchlengthCTable,
+                matchlengthNCount, matchlengthMaxValue, matchlengthLog,
+                workspace, HUF_WORKSPACE_SIZE)),
+            dictionary_corrupted, "");
+        bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
+        RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
+        RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+                bs->entropy.fse.litlengthCTable,
+                litlengthNCount, litlengthMaxValue, litlengthLog,
+                workspace, HUF_WORKSPACE_SIZE)),
+            dictionary_corrupted, "");
+        bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    RETURN_ERROR_IF(dictPtr > dictEnd || (size_t)(dictEnd - dictPtr) < 12, dictionary_corrupted, "");   /* compare sizes : dictPtr+12 could point past the buffer */
+    bs->rep[0] = MEM_readLE32(dictPtr+0);
+    bs->rep[1] = MEM_readLE32(dictPtr+4);
+    bs->rep[2] = MEM_readLE32(dictPtr+8);
+    dictPtr += 12;
+
+    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+        U32 offcodeMax = MaxOff;
+        if (dictContentSize <= ((U32)-1) - 128 KB) {
+            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+        }
+        /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */
+        bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff));
+        /* All repCodes must be <= dictContentSize and != 0 */
+        {   U32 u;
+            for (u=0; u<3; u++) {
+                RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
+                RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
+    }   }   }
+    return (size_t)(dictPtr - (const BYTE*)dict);
+}
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ *  Loads a full zstd dictionary : the entropy section first, then the content that follows it.
+ * @return : dictID, or an error code
+ *  assumptions : magic number supposed already checked
+ *                dictSize supposed >= 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+                                      ZSTD_matchState_t* ms,
+                                      ZSTD_cwksp* ws,
+                                      ZSTD_CCtx_params const* params,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictTableLoadMethod_e dtlm,
+                                      ZSTD_tableFillPurpose_e tfp,
+                                      void* workspace)
+{
+    const BYTE* const dictStart = (const BYTE*)dict;
+    size_t dictID = 0;   /* remains 0 when noDictIDFlag is set */
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+    assert(dictSize >= 8);
+    assert(MEM_readLE32(dictStart) == ZSTD_MAGIC_DICTIONARY);
+
+    if (!params->fParams.noDictIDFlag)
+        dictID = MEM_readLE32(dictStart + 4 /* skip magic number */ );
+
+    {   size_t const eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize);
+        FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
+        /* everything after the first eSize bytes is loaded as dictionary content */
+        {   const BYTE* const content = dictStart + eSize;
+            size_t const contentSize = dictSize - eSize;
+            FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
+                ms, NULL, ws, params, content, contentSize, dtlm, tfp), "");
+    }   }
+    return dictID;
+}
+
+/** ZSTD_compress_insertDictionary() :
+*   @return : dictID, or an error code ; 0 when dict is NULL or shorter than 8 bytes, and the value of ZSTD_loadDictionaryContent() for raw content */
+static size_t
+ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+                               ZSTD_matchState_t* ms,
+                               ldmState_t* ls,
+                               ZSTD_cwksp* ws,
+                         const ZSTD_CCtx_params* params,
+                         const void* dict, size_t dictSize,
+                               ZSTD_dictContentType_e dictContentType,
+                               ZSTD_dictTableLoadMethod_e dtlm,
+                               ZSTD_tableFillPurpose_e tfp,
+                               void* workspace)
+{
+    DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
+    if ((dict==NULL) || (dictSize<8)) {   /* such a dictionary is ignored, unless a full dictionary was required */
+        RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+        return 0;
+    }
+
+    ZSTD_reset_compressedBlockState(bs);
+
+    /* dict restricted modes */
+    if (dictContentType == ZSTD_dct_rawContent)
+        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
+
+    if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {   /* no dictionary magic number : either raw content (auto) or an error (fullDict) */
+        if (dictContentType == ZSTD_dct_auto) {
+            DEBUGLOG(4, "raw content dictionary detected");
+            return ZSTD_loadDictionaryContent(
+                ms, ls, ws, params, dict, dictSize, dtlm, tfp);
+        }
+        RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+        assert(0);   /* impossible */
+    }
+
+    /* dict as full zstd dictionary */
+    return ZSTD_loadZstdDictionary(
+        bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);   /* note : ls is not forwarded on this path */
+}
+
+#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)   /* pledged sizes below this take the cdict shortcut in ZSTD_compressBegin_internal() */
+#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)   /* so do pledged sizes below dictContentSize times this factor */
+
+/*! ZSTD_compressBegin_internal() :
+ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
+ * @return : 0, or an error code (the value of ZSTD_resetCCtx_usingCDict() when the cdict shortcut is taken) */
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    const ZSTD_CCtx_params* params, U64 pledgedSrcSize,
+                                    ZSTD_buffered_policy_e zbuff)
+{
+    size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize;
+#if ZSTD_TRACE
+    cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0;
+#endif
+    DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog);
+    /* params are supposed to be fully validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+    if ( (cdict)   /* cdict shortcut : non-empty cdict, small or unknown source (or cdict level 0), and loading not forced */
+      && (cdict->dictContentSize > 0)
+      && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF
+        || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER
+        || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+        || cdict->compressionLevel == 0)
+      && (params->attachDictPref != ZSTD_dictForceLoad) ) {
+        return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff);
+    }
+    /* otherwise : reset the cctx, then load the dictionary content (from the cdict's own content when a cdict is given) */
+    FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+                                     dictContentSize,
+                                     ZSTDcrp_makeClean, zbuff) , "");
+    {   size_t const dictID = cdict ?
+                ZSTD_compress_insertDictionary(
+                        cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+                        &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
+                        cdict->dictContentSize, cdict->dictContentType, dtlm,
+                        ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
+              : ZSTD_compress_insertDictionary(
+                        cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+                        &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
+                        dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
+        FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+        assert(dictID <= UINT_MAX);
+        cctx->dictID = (U32)dictID;   /* kept in the cctx ; ZSTD_compressContinue_internal() passes it to the frame header writer */
+        cctx->dictContentSize = dictContentSize;
+    }
+    return 0;
+}
+
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    const ZSTD_CCtx_params* params,
+                                    unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog);
+    {   /* compression parameters verification : any error is returned as is */
+        size_t const cParamsStatus = ZSTD_checkCParams(params->cParams);
+        FORWARD_IF_ERROR(cParamsStatus, "");
+    }
+    return ZSTD_compressBegin_internal(cctx, dict, dictSize, dictContentType, dtlm,
+                                       cdict, params, pledgedSrcSize,
+                                       ZSTDb_not_buffered);
+}
+
+/*! ZSTD_compressBegin_advanced() :
+*   Converts `params` into cctx params (ZSTD_NO_CLEVEL), then begins with `dict` in auto-detect mode and no cdict.
+*   @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+                             const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{   ZSTD_CCtx_params cctxParams;
+    const ZSTD_CDict* const noCDict = NULL;
+    ZSTD_CCtxParams_init_internal(&cctxParams, &params, ZSTD_NO_CLEVEL);
+    return ZSTD_compressBegin_advanced_internal(cctx, dict, dictSize,
+                                                ZSTD_dct_auto, ZSTD_dtlm_fast,
+                                                noCDict, &cctxParams, pledgedSrcSize);
+}
+
+static size_t
+ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    int const effectiveLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel;   /* level 0 is recorded as the default level */
+    ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
+    ZSTD_CCtx_params cctxParams;
+    ZSTD_CCtxParams_init_internal(&cctxParams, &params, effectiveLevel);
+    DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
+    return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+                                       &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
+}
+
+/* ZSTD_compressBegin_usingDict() : forwards its arguments unchanged to ZSTD_compressBegin_usingDict_deprecated() */
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel);
+}
+
+size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
+{   const void* const noDict = NULL;   /* begins without any dictionary : NULL pointer, size 0 */
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, noDict, 0, compressionLevel);
+}
+
+
+/*! ZSTD_writeEpilogue() :
+*   Ends a frame : writes the frame header if nothing was written yet, a last empty block unless the stage is already ZSTDcs_ending, then the checksum when enabled.
+*   @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+
+    DEBUGLOG(4, "ZSTD_writeEpilogue");
+    RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing");
+
+    /* special case : empty frame */
+    if (cctx->stage == ZSTDcs_init) {
+        size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);   /* pledgedSrcSize = 0, dictID = 0 */
+        FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+        dstCapacity -= fhSize;
+        op += fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    if (cctx->stage != ZSTDcs_ending) {
+        /* write one last empty block, make it the "last" block */
+        U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
+        ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3);   /* keeps the literal 3 below in sync with the block header size */
+        RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue");
+        MEM_writeLE24(op, cBlockHeader24);
+        op += ZSTD_blockHeaderSize;
+        dstCapacity -= ZSTD_blockHeaderSize;
+    }
+
+    if (cctx->appliedParams.fParams.checksumFlag) {
+        U32 const checksum = (U32) XXH64_digest(&cctx->xxhState);   /* only the low 32 bits of the digest are stored */
+        RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum");
+        DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum);
+        MEM_writeLE32(op, checksum);
+        op += 4;
+    }
+
+    cctx->stage = ZSTDcs_created;  /* return to "created but no init" status */
+    return op-ostart;
+}
+
+void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
+{   /* reports the finished compression to the trace hook when ZSTD_TRACE is enabled; does nothing otherwise */
+#if ZSTD_TRACE
+    if (cctx->traceCtx && ZSTD_trace_compress_end != NULL) {   /* needs both a trace context and a non-NULL hook */
+        int const streaming = cctx->inBuffSize > 0 || cctx->outBuffSize > 0 || cctx->appliedParams.nbWorkers > 0;
+        ZSTD_Trace trace;
+        ZSTD_memset(&trace, 0, sizeof(trace));   /* fields not set below stay zero */
+        trace.version = ZSTD_VERSION_NUMBER;
+        trace.streaming = streaming;
+        trace.dictionaryID = cctx->dictID;
+        trace.dictionarySize = cctx->dictContentSize;
+        trace.uncompressedSize = cctx->consumedSrcSize;
+        trace.compressedSize = cctx->producedCSize + extraCSize;   /* extraCSize : bytes not yet counted in producedCSize */
+        trace.params = &cctx->appliedParams;
+        trace.cctx = cctx;
+        ZSTD_trace_compress_end(cctx->traceCtx, &trace);
+    }
+    cctx->traceCtx = 0;   /* cleared whether or not the hook was called */
+#else
+    (void)cctx;   /* tracing compiled out : mark parameters as intentionally unused */
+    (void)extraCSize;
+#endif
+}
+
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize)
+{   /* compresses src as the last chunk, appends the epilogue, then checks the pledged source size; returns total bytes written or an error code */
+    size_t endResult;
+    size_t const cSize = ZSTD_compressContinue_internal(cctx,
+                                dst, dstCapacity, src, srcSize,
+                                1 /* frame mode */, 1 /* last chunk */);
+    FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed");
+    endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);   /* epilogue goes right after the compressed data */
+    FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed");
+    assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+    if (cctx->pledgedSrcSizePlusOne != 0) {  /* control src size ; 0 == ZSTD_CONTENTSIZE_UNKNOWN + 1, see static assert below */
+        ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+        DEBUGLOG(4, "end of frame : controlling src size");
+        RETURN_ERROR_IF(
+            cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1,
+            srcSize_wrong,
+             "error : pledgedSrcSize = %u, while realSrcSize = %u",
+            (unsigned)cctx->pledgedSrcSizePlusOne-1,
+            (unsigned)cctx->consumedSrcSize);
+    }
+    ZSTD_CCtx_trace(cctx, endResult);   /* epilogue bytes are passed as the extra compressed size */
+    return cSize + endResult;
+}
+
+/* NOTE: Must just wrap ZSTD_compressEnd_public() : arguments and return value are passed through unchanged */
+size_t ZSTD_compressEnd(ZSTD_CCtx* cctx,
+                        void* dst, size_t dstCapacity,
+                  const void* src, size_t srcSize)
+{
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict,size_t dictSize,
+                               ZSTD_parameters params)
+{   /* one-shot compression with explicit parameters : validates params.cParams, then delegates to the internal variant */
+    DEBUGLOG(4, "ZSTD_compress_advanced");
+    FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");   /* invalid cParams are reported before anything else is done */
+    ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, ZSTD_NO_CLEVEL);   /* converted params are stored inside the context */
+    return ZSTD_compress_advanced_internal(cctx,
+                                           dst, dstCapacity,
+                                           src, srcSize,
+                                           dict, dictSize,
+                                           &cctx->simpleApiParams);
+}
+
+/* Internal : begins a frame with `params` (srcSize is passed as the pledged size), then hands src to ZSTD_compressEnd_public() */
+size_t ZSTD_compress_advanced_internal(
+        ZSTD_CCtx* cctx,
+        void* dst, size_t dstCapacity,
+        const void* src, size_t srcSize,
+        const void* dict,size_t dictSize,
+        const ZSTD_CCtx_params* params)
+{
+    DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize);
+    FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
+                         dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+                         params, srcSize, ZSTDb_not_buffered) , "");   /* NULL : no cdict on this path */
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict, size_t dictSize,
+                               int compressionLevel)
+{
+    size_t const usedDictSize = (dict != NULL) ? dictSize : 0;   /* a NULL dict counts as size 0 for parameter selection */
+    ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, usedDictSize, ZSTD_cpm_noAttachDict);
+    assert(params.fParams.contentSizeFlag == 1);
+    ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel);
+    /* parameters are kept inside the context (simpleApiParams) and handed to the internal one-shot path */
+    DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize);
+    return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams);
+}
+
+size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+                         void* dst, size_t dstCapacity,
+                   const void* src, size_t srcSize,
+                         int compressionLevel)
+{   /* one-shot compression without dictionary, reusing a caller-provided context */
+    DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize);
+    assert(cctx != NULL);
+    return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel);   /* dict=NULL, dictSize=0 */
+}
+
+size_t ZSTD_compress(void* dst, size_t dstCapacity,
+               const void* src, size_t srcSize,
+                     int compressionLevel)
+{   /* one-shot compression with a temporary context that is created and released within this call */
+    size_t result;
+#if ZSTD_COMPRESS_HEAPMODE   /* context is allocated with ZSTD_createCCtx() */
+    ZSTD_CCtx* cctx = ZSTD_createCCtx();
+    RETURN_ERROR_IF(!cctx, memory_allocation, "ZSTD_createCCtx failed");
+    result = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel);
+    ZSTD_freeCCtx(cctx);
+#else   /* context struct is a local variable */
+    ZSTD_CCtx ctxBody;
+    ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem);
+    result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel);
+    ZSTD_freeCCtxContent(&ctxBody);   /* can't free ctxBody itself, as it's on stack; free only heap content */
+#endif
+    return result;   /* result of ZSTD_compressCCtx(), returned after the context has been released */
+}
+
+
+/* =====  Dictionary API  ===== */
+
+/*! ZSTD_estimateCDictSize_advanced() :
+ *  Estimates the amount of memory needed to create a CDict with the given arguments : struct + entropy workspace + match state (+ dictionary copy unless by reference) */
+size_t ZSTD_estimateCDictSize_advanced(
+        size_t dictSize, ZSTD_compressionParameters cParams,
+        ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+    DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict));
+    return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
+         + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
+         /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small
+          * in case we are using DDS with row-hash. */
+         + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams),
+                                  /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0)
+         + (dictLoadMethod == ZSTD_dlm_byRef ? 0
+            : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *))));   /* copy size is rounded up to pointer size */
+}
+
+size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel)
+{   /* level-based estimate : cParams are derived for an unknown source size, and the dictionary is assumed copied (ZSTD_dlm_byCopy) */
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
+    return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy);
+}
+
+size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
+{
+    if (cdict == NULL) return 0;   /* a NULL cdict is accepted and reports a size of 0 */
+    DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict));
+    {   int const inWorkspace = (cdict->workspace.workspace == cdict);   /* the struct may sit at the start of its own workspace */
+        return (inWorkspace ? 0 : sizeof(*cdict)) + ZSTD_cwksp_sizeof(&cdict->workspace);
+    }
+}
+
+static size_t ZSTD_initCDict_internal(
+                    ZSTD_CDict* cdict,
+              const void* dictBuffer, size_t dictSize,
+                    ZSTD_dictLoadMethod_e dictLoadMethod,
+                    ZSTD_dictContentType_e dictContentType,
+                    ZSTD_CCtx_params params)
+{   /* fills an existing cdict : dictionary content, entropy workspace, match state, then dictionary loading; returns 0 or an error code */
+    DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType);
+    assert(!ZSTD_checkCParams(params.cParams));
+    cdict->matchState.cParams = params.cParams;
+    cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch;
+    if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {   /* by reference, or nothing to copy */
+        cdict->dictContent = dictBuffer;
+    } else {
+         void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*)));
+        RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!");
+        cdict->dictContent = internalBuffer;
+        ZSTD_memcpy(internalBuffer, dictBuffer, dictSize);   /* copies dictSize bytes; the reservation itself was rounded up */
+    }
+    cdict->dictContentSize = dictSize;
+    cdict->dictContentType = dictContentType;
+
+    cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE);   /* NOTE(review): not NULL-checked; presumably covered by how callers size the workspace - confirm */
+
+
+    /* Reset the state to no dictionary */
+    ZSTD_reset_compressedBlockState(&cdict->cBlockState);
+    FORWARD_IF_ERROR(ZSTD_reset_matchState(
+        &cdict->matchState,
+        &cdict->workspace,
+        &params.cParams,
+        params.useRowMatchFinder,
+        ZSTDcrp_makeClean,
+        ZSTDirp_reset,
+        ZSTD_resetTarget_CDict), "");
+    /* (Maybe) load the dictionary
+     * Skips loading the dictionary if it is < 8 bytes.
+     */
+    {   params.compressionLevel = ZSTD_CLEVEL_DEFAULT;   /* params is a by-value copy : the caller's struct is not modified */
+        params.fParams.contentSizeFlag = 1;
+        {   size_t const dictID = ZSTD_compress_insertDictionary(
+                    &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
+                    &params, cdict->dictContent, cdict->dictContentSize,
+                    dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
+            FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+            assert(dictID <= (size_t)(U32)-1);
+            cdict->dictID = (U32)dictID;   /* the non-error return value is the dictionary ID */
+        }
+    }
+
+    return 0;
+}
+
+static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_compressionParameters cParams,
+                                      ZSTD_paramSwitch_e useRowMatchFinder,
+                                      U32 enableDedicatedDictSearch,
+                                      ZSTD_customMem customMem)
+{   /* allocates one workspace and places the ZSTD_CDict struct inside it; dictionary content is not loaded here. @return : NULL on error */
+    if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;   /* allocator and deallocator must be set together, or both left unset */
+
+    {   size_t const workspaceSize =
+            ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +
+            ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) +
+            ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) +
+            (dictLoadMethod == ZSTD_dlm_byRef ? 0
+             : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))));   /* room for the dictionary copy, unless by reference */
+        void* const workspace = ZSTD_customMalloc(workspaceSize, customMem);
+        ZSTD_cwksp ws;
+        ZSTD_CDict* cdict;
+
+        if (!workspace) {
+            /* allocation failed : no memory is held, so there is nothing to release */
+            return NULL;
+        }
+
+        ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_dynamic_alloc);
+
+        cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));   /* first object reserved in the workspace */
+        assert(cdict != NULL);
+        ZSTD_cwksp_move(&cdict->workspace, &ws);   /* the workspace descriptor now lives inside the cdict */
+        cdict->customMem = customMem;
+        cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */
+        cdict->useRowMatchFinder = useRowMatchFinder;
+        return cdict;
+    }
+}
+
+ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType,
+                                      ZSTD_compressionParameters cParams,
+                                      ZSTD_customMem customMem)
+{   /* wraps cParams and customMem into a default-initialized ZSTD_CCtx_params, then defers to ZSTD_createCDict_advanced2() */
+    ZSTD_CCtx_params cctxParams;
+    ZSTD_memset(&cctxParams, 0, sizeof(cctxParams));   /* NOTE(review): ZSTD_CCtxParams_init() follows immediately; this memset may be redundant - confirm */
+    ZSTD_CCtxParams_init(&cctxParams, 0);
+    cctxParams.cParams = cParams;
+    cctxParams.customMem = customMem;
+    return ZSTD_createCDict_advanced2(
+        dictBuffer, dictSize,
+        dictLoadMethod, dictContentType,
+        &cctxParams, customMem);
+}
+
+ZSTD_CDict* ZSTD_createCDict_advanced2(
+        const void* dict, size_t dictSize,
+        ZSTD_dictLoadMethod_e dictLoadMethod,
+        ZSTD_dictContentType_e dictContentType,
+        const ZSTD_CCtx_params* originalCctxParams,
+        ZSTD_customMem customMem)
+{   /* resolves the final cParams (with or without dedicated dict search), allocates the CDict, then initializes it. @return : NULL on error */
+    ZSTD_CCtx_params cctxParams = *originalCctxParams;   /* private copy : *originalCctxParams is never modified */
+    ZSTD_compressionParameters cParams;
+    ZSTD_CDict* cdict;
+
+    DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType);
+    if (!customMem.customAlloc ^ !customMem.customFree) return NULL;   /* allocator and deallocator must be set together, or both left unset */
+
+    if (cctxParams.enableDedicatedDictSearch) {
+        cParams = ZSTD_dedicatedDictSearch_getCParams(
+            cctxParams.compressionLevel, dictSize);
+        ZSTD_overrideCParams(&cParams, &cctxParams.cParams);
+    } else {
+        cParams = ZSTD_getCParamsFromCCtxParams(
+            &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
+    }
+
+    if (!ZSTD_dedicatedDictSearch_isSupported(&cParams)) {
+        /* Fall back to non-DDSS params */
+        cctxParams.enableDedicatedDictSearch = 0;
+        cParams = ZSTD_getCParamsFromCCtxParams(
+            &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
+    }
+
+    DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch);
+    cctxParams.cParams = cParams;
+    cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
+
+    cdict = ZSTD_createCDict_advanced_internal(dictSize,
+                        dictLoadMethod, cctxParams.cParams,
+                        cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch,
+                        customMem);
+
+    if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict,
+                                    dict, dictSize,
+                                    dictLoadMethod, dictContentType,
+                                    cctxParams) )) {
+        ZSTD_freeCDict(cdict);   /* reached with cdict == NULL too (allocation failure) */
+        return NULL;
+    }
+
+    return cdict;
+}
+
+ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
+    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, cParams, ZSTD_defaultCMem);
+    if (cdict == NULL) return NULL;
+    /* record the requested level on the new CDict; level 0 stands for ZSTD_CLEVEL_DEFAULT */
+    if (compressionLevel == 0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
+    cdict->compressionLevel = compressionLevel;
+    return cdict;
+}
+
+ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
+    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem);
+    if (cdict == NULL) return NULL;
+    /* record the requested level on the new CDict; level 0 stands for ZSTD_CLEVEL_DEFAULT */
+    if (compressionLevel == 0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
+    cdict->compressionLevel = compressionLevel;
+    return cdict;
+}
+
+size_t ZSTD_freeCDict(ZSTD_CDict* cdict)
+{
+    ZSTD_customMem allocator;
+    int ownedByWorkspace;
+    if (cdict == NULL) return 0;   /* freeing NULL is a supported no-op */
+    /* read what is needed from *cdict before the workspace is released */
+    allocator = cdict->customMem;
+    ownedByWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict);
+    ZSTD_cwksp_free(&cdict->workspace, allocator);
+    if (!ownedByWorkspace) ZSTD_customFree(cdict, allocator);   /* struct was not part of the workspace buffer */
+    return 0;
+}
+
+/*! ZSTD_initStaticCDict() :
+ *  Generate a digested dictionary in provided memory area.
+ *  workspace: The memory area to emplace the dictionary into.
+ *             Provided pointer must be 8-bytes aligned.
+ *             It must outlive dictionary usage.
+ *  workspaceSize: Use ZSTD_estimateCDictSize()
+ *                 to determine how large workspace must be.
+ *  cParams : use ZSTD_getCParams() to transform a compression level
+ *            into its relevant cParams.
+ * @return : pointer to ZSTD_CDict, or NULL if error (workspace misaligned or too small, or dictionary initialization failed)
+ *  Note : there is no corresponding "free" function.
+ *         Since workspace was allocated externally, it must be freed externally.
+ */
+const ZSTD_CDict* ZSTD_initStaticCDict(
+                                 void* workspace, size_t workspaceSize,
+                           const void* dict, size_t dictSize,
+                                 ZSTD_dictLoadMethod_e dictLoadMethod,
+                                 ZSTD_dictContentType_e dictContentType,
+                                 ZSTD_compressionParameters cParams)
+{
+    ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams);
+    /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */
+    size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0);
+    size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
+                            + (dictLoadMethod == ZSTD_dlm_byRef ? 0
+                               : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))))
+                            + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
+                            + matchStateSize;
+    ZSTD_CDict* cdict;
+    ZSTD_CCtx_params params;
+
+    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
+
+    {
+        ZSTD_cwksp ws;
+        ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc);
+        cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));   /* the struct is the first object placed in the caller's memory */
+        if (cdict == NULL) return NULL;
+        ZSTD_cwksp_move(&cdict->workspace, &ws);
+    }
+
+    DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u",
+        (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize));
+    if (workspaceSize < neededSize) return NULL;   /* checked only after the struct has been placed in the workspace */
+
+    ZSTD_CCtxParams_init(&params, 0);
+    params.cParams = cParams;
+    params.useRowMatchFinder = useRowMatchFinder;
+    cdict->useRowMatchFinder = useRowMatchFinder;
+    cdict->compressionLevel = ZSTD_NO_CLEVEL;
+
+    if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+                                              dict, dictSize,
+                                              dictLoadMethod, dictContentType,
+                                              params) ))
+        return NULL;
+
+    return cdict;
+}
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict)
+{   /* returns, by value, the cParams stored in the cdict's match state; cdict must not be NULL */
+    assert(cdict != NULL);
+    return cdict->matchState.cParams;
+}
+
+/*! ZSTD_getDictID_fromCDict() :
+ *  Reports the dictID recorded in `cdict`.
+ *  A result of 0 means the dictionary is empty or not conformant to the Zstandard specification.
+ *  Such dictionaries can still be loaded, but only as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict)
+{
+    /* a NULL cdict is answered with 0 as well */
+    return (cdict == NULL) ? 0 : cdict->dictID;
+}
+
+/* ZSTD_compressBegin_usingCDict_internal() :
+ * Implementation of various ZSTD_compressBegin_usingCDict* functions : selects cParams, then begins a frame with the cdict. A NULL cdict is an error.
+ */
+static size_t ZSTD_compressBegin_usingCDict_internal(
+    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
+    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)
+{
+    ZSTD_CCtx_params cctxParams;
+    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal");
+    RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!");
+    /* Initialize the cctxParams from the cdict */
+    {
+        ZSTD_parameters params;
+        params.fParams = fParams;
+        params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF
+                        || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER
+                        || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+                        || cdict->compressionLevel == 0 ) ?
+                ZSTD_getCParamsFromCDict(cdict)   /* small source, unknown size, or level 0 : reuse the cParams stored in the cdict */
+              : ZSTD_getCParams(cdict->compressionLevel,   /* otherwise : derive cParams from the cdict's level for this source size */
+                                pledgedSrcSize,
+                                cdict->dictContentSize);
+        ZSTD_CCtxParams_init_internal(&cctxParams, &params, cdict->compressionLevel);
+    }
+    /* Increase window log to fit the entire dictionary and source if the
+     * source size is known. Limit the increase to 19, which is the
+     * window log for compression level 1 with the largest source size.
+     */
+    if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+        U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19);
+        U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1;   /* presumably ceil(log2(limitedSrcSize)), floored at 1 - confirm against ZSTD_highbit32() */
+        cctxParams.cParams.windowLog = MAX(cctxParams.cParams.windowLog, limitedSrcLog);   /* windowLog is only ever raised here */
+    }
+    return ZSTD_compressBegin_internal(cctx,
+                                        NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast,
+                                        cdict,
+                                        &cctxParams, pledgedSrcSize,
+                                        ZSTDb_not_buffered);
+}
+
+
+/* ZSTD_compressBegin_usingCDict_advanced() :
+ * This function is DEPRECATED. It only forwards to ZSTD_compressBegin_usingCDict_internal().
+ * cdict must be != NULL */
+size_t ZSTD_compressBegin_usingCDict_advanced(
+    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
+    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)
+{
+    return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize);
+}
+
+/* ZSTD_compressBegin_usingCDict_deprecated() :
+ * begins a frame from a cdict with all frame flags at 0 and an unknown source size. cdict must be != NULL */
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{   /* public name : forwards unchanged to ZSTD_compressBegin_usingCDict_deprecated() */
+    return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict);
+}
+
+/*! ZSTD_compress_usingCDict_internal():
+ * Implementation of various ZSTD_compress_usingCDict* functions : begins a frame pledging srcSize, then compresses src and ends the frame.
+ */
+static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+{
+    FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/*! ZSTD_compress_usingCDict_advanced():
+ * This function is DEPRECATED. It only forwards to ZSTD_compress_usingCDict_internal().
+ */
+size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+{
+    return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
+}
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression parameters are decided at CDict creation time
+ *  while frame parameters are hardcoded (content flag = 1, checksum flag = 0, noDictID flag = 0) */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
+}
+
+
+
+/* ******************************************************************
+*  Streaming
+********************************************************************/
+
+ZSTD_CStream* ZSTD_createCStream(void)
+{   /* creates a CStream using ZSTD_defaultCMem */
+    DEBUGLOG(3, "ZSTD_createCStream");
+    return ZSTD_createCStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize)
+{   /* forwards to the CCtx static initializer; its result is returned as the CStream */
+    return ZSTD_initStaticCCtx(workspace, workspaceSize);
+}
+
+ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem)
+{   /* CStream and CCtx are now same object : creation is delegated to the CCtx constructor */
+    return ZSTD_createCCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
+{   /* returns whatever ZSTD_freeCCtx() returns */
+    return ZSTD_freeCCtx(zcs);   /* same object */
+}
+
+
+
+/*======   Initialization   ======*/
+
+size_t ZSTD_CStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX; }   /* input size : one maximum-size block */
+
+size_t ZSTD_CStreamOutSize(void)
+{   /* output size : compress bound of one maximum-size block, plus one block header, plus 4 bytes of hash */
+    return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ;
+}
+
+static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize)
+{
+    /* attach mode requires a cdict that ZSTD_shouldAttachDict() accepts; every other case is noAttach */
+    if (cdict == NULL) return ZSTD_cpm_noAttachDict;
+    return ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize) ? ZSTD_cpm_attachDict
+                                                                : ZSTD_cpm_noAttachDict;
+}
+
+/* ZSTD_resetCStream():
+ * resets the session only, then sets the pledged source size. pss == 0 means "unknown" */
+size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss)
+{
+    /* temporary : 0 interpreted as "unknown" during transition period.
+     * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
+     * 0 will be interpreted as "empty" in the future.
+     */
+    U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+    DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize);
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+    return 0;
+}
+
+/*! ZSTD_initCStream_internal() :
+ *  Note : for lib/compress only. Used by zstdmt_compress.c.
+ *  Assumption 1 : params are valid (only checked by an assert)
+ *  Assumption 2 : either dict, or cdict, is defined, not both (only checked by an assert) */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                    const void* dict, size_t dictSize, const ZSTD_CDict* cdict,
+                    const ZSTD_CCtx_params* params,
+                    unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_internal");
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+    assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+    zcs->requestedParams = *params;   /* the whole requested parameter set is replaced by a copy of *params */
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+    if (dict) {
+        FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+    } else {
+        /* Dictionary is cleared if !cdict */
+        FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+    }
+    return 0;
+}
+
+/* ZSTD_initCStream_usingCDict_advanced() :
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters and pledged source size */
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+                                            const ZSTD_CDict* cdict,
+                                            ZSTD_frameParameters fParams,
+                                            unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced");
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+    zcs->requestedParams.fParams = fParams;   /* only the frame parameters are overwritten */
+    FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+    return 0;
+}
+
+/* resets the session, then references cdict. note : cdict must outlive compression session */
+size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_usingCDict");
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+    return 0;
+}
+
+
+/* ZSTD_initCStream_advanced() :
+ * pss (the pledged source size) must be exact.
+ * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                                 const void* dict, size_t dictSize,
+                                 ZSTD_parameters params, unsigned long long pss)
+{
+    /* for compatibility with older programs relying on this behavior.
+     * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN.
+     * This line will be removed in the future.
+     */
+    U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;   /* 0 is kept as a real size when contentSizeFlag is set */
+    DEBUGLOG(4, "ZSTD_initCStream_advanced");
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+    FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , "");   /* validated only after the session reset above */
+    ZSTD_CCtxParams_setZstdParams(&zcs->requestedParams, &params);
+    FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+    return 0;
+}
+
+size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
+{   /* resets the session, sets the compression level, then loads the dictionary; returns 0 or the first error met */
+    DEBUGLOG(4, "ZSTD_initCStream_usingDict");
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+    return 0;
+}
+
+size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
+{   /* resets the session, sets cdict reference to NULL, then sets the level and the pledged source size */
+    /* temporary : 0 interpreted as "unknown" during transition period.
+     * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
+     * 0 will be interpreted as "empty" in the future.
+     */
+    U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+    DEBUGLOG(4, "ZSTD_initCStream_srcSize");
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+    return 0;
+}
+
+size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+{   /* Begins a new session on @zcs at @compressionLevel : same steps as ZSTD_initCStream_srcSize(), minus the pledged size. */
+    DEBUGLOG(4, "ZSTD_initCStream");
+    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , "");
+    FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+    return 0;   /* reached only when no step reported an error */
+}
+
+/*======   Compression   ======*/
+
+static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
+{   /* @return : suggested size for the next input, depending on the applied input buffer mode */
+    if (cctx->appliedParams.inBufferMode != ZSTD_bm_stable) {
+        /* buffered input : what is still missing to reach the current load target */
+        size_t const missing = cctx->inBuffTarget - cctx->inBuffPos;
+        assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered);
+        return (missing != 0) ? missing : cctx->blockSize;   /* target already reached : hint a whole block */
+    }
+    /* stable input : one block, minus the bytes presented earlier but not consumed yet */
+    return cctx->blockSize - cctx->stableIn_notConsumed;
+}
+
+/** ZSTD_compressStream_generic():
+ *  internal function for all *compressStream*() variants : loads input, compresses blocks, flushes output
+ * @return : 0 once the frame is ended, else a hint size for next input to complete ongoing block, or an error code */
+static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+                                          ZSTD_outBuffer* output,
+                                          ZSTD_inBuffer* input,
+                                          ZSTD_EndDirective const flushMode)
+{
+    const char* const istart = (assert(input != NULL), (const char*)input->src);
+    const char* const iend = (istart != NULL) ? istart + input->size : istart;   /* NULL src : no pointer arithmetic */
+    const char* ip = (istart != NULL) ? istart + input->pos : istart;
+    char* const ostart = (assert(output != NULL), (char*)output->dst);
+    char* const oend = (ostart != NULL) ? ostart + output->size : ostart;   /* NULL dst : no pointer arithmetic */
+    char* op = (ostart != NULL) ? ostart + output->pos : ostart;
+    U32 someMoreWork = 1;   /* loop control : cleared whenever this call has to stop */
+
+    /* check expectations */
+    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos);
+    assert(zcs != NULL);
+    if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        assert(input->pos >= zcs->stableIn_notConsumed);
+        input->pos -= zcs->stableIn_notConsumed;   /* re-expose input that an earlier call reported as consumed without compressing it */
+        if (ip) ip -= zcs->stableIn_notConsumed;
+        zcs->stableIn_notConsumed = 0;
+    }
+    if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
+        assert(zcs->inBuff != NULL);
+        assert(zcs->inBuffSize > 0);
+    }
+    if (zcs->appliedParams.outBufferMode == ZSTD_bm_buffered) {
+        assert(zcs->outBuff !=  NULL);
+        assert(zcs->outBuffSize > 0);
+    }
+    if (input->src == NULL) assert(input->size == 0);
+    assert(input->pos <= input->size);
+    if (output->dst == NULL) assert(output->size == 0);
+    assert(output->pos <= output->size);
+    assert((U32)flushMode <= (U32)ZSTD_e_end);
+
+    while (someMoreWork) {
+        switch(zcs->streamStage)
+        {
+        case zcss_init:
+            RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!");
+
+        case zcss_load:
+            if ( (flushMode == ZSTD_e_end)
+              && ( (size_t)(oend-op) >= ZSTD_compressBound(iend-ip)     /* Enough output space */
+                || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)  /* OR we are allowed to return dstSizeTooSmall */
+              && (zcs->inBuffPos == 0) ) {
+                /* shortcut to compression pass directly into output buffer */
+                size_t const cSize = ZSTD_compressEnd_public(zcs,
+                                                op, oend-op, ip, iend-ip);
+                DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
+                FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
+                ip = iend;
+                op += cSize;
+                zcs->frameEnded = 1;
+                ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+                someMoreWork = 0; break;
+            }
+            /* complete loading into inBuffer in buffered mode */
+            if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
+                size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
+                size_t const loaded = ZSTD_limitCopy(
+                                        zcs->inBuff + zcs->inBuffPos, toLoad,
+                                        ip, iend-ip);
+                zcs->inBuffPos += loaded;
+                if (ip) ip += loaded;
+                if ( (flushMode == ZSTD_e_continue)
+                  && (zcs->inBuffPos < zcs->inBuffTarget) ) {
+                    /* not enough input to fill full block : stop here */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (zcs->inBuffPos == zcs->inToCompress) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
+            } else {
+                assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+                if ( (flushMode == ZSTD_e_continue)
+                  && ( (size_t)(iend - ip) < zcs->blockSize) ) {
+                    /* can't compress a full block : stop here */
+                    zcs->stableIn_notConsumed = (size_t)(iend - ip);
+                    ip = iend;  /* pretend to have consumed input */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (ip == iend) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
+            }
+            /* compress current block (note : this stage cannot be stopped in the middle) */
+            DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
+            {   int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered);
+                void* cDst;
+                size_t cSize;
+                size_t oSize = oend-op;
+                size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress
+                                                   : MIN((size_t)(iend - ip), zcs->blockSize);
+                if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
+                    cDst = op;   /* compress into output buffer, to skip flush stage */
+                else
+                    cDst = zcs->outBuff, oSize = zcs->outBuffSize;   /* compress into the internal buffer, flushed afterwards */
+                if (inputBuffered) {
+                    unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
+                    cSize = lastBlock ?
+                            ZSTD_compressEnd_public(zcs, cDst, oSize,
+                                        zcs->inBuff + zcs->inToCompress, iSize) :
+                            ZSTD_compressContinue_public(zcs, cDst, oSize,
+                                        zcs->inBuff + zcs->inToCompress, iSize);
+                    FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
+                    zcs->frameEnded = lastBlock;
+                    /* prepare next block */
+                    zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
+                    if (zcs->inBuffTarget > zcs->inBuffSize)
+                        zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize;   /* next block would not fit : restart from the buffer's beginning */
+                    DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u",
+                            (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize);
+                    if (!lastBlock)
+                        assert(zcs->inBuffTarget <= zcs->inBuffSize);
+                    zcs->inToCompress = zcs->inBuffPos;
+                } else { /* !inputBuffered, hence ZSTD_bm_stable */
+                    unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend);
+                    cSize = lastBlock ?
+                            ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) :
+                            ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize);
+                    /* Consume the input prior to error checking to mirror buffered mode. */
+                    if (ip) ip += iSize;
+                    FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
+                    zcs->frameEnded = lastBlock;
+                    if (lastBlock) assert(ip == iend);
+                }
+                if (cDst == op) {  /* no need to flush */
+                    op += cSize;
+                    if (zcs->frameEnded) {
+                        DEBUGLOG(5, "Frame completed directly in outBuffer");
+                        someMoreWork = 0;
+                        ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+                    }
+                    break;
+                }
+                zcs->outBuffContentSize = cSize;
+                zcs->outBuffFlushedSize = 0;
+                zcs->streamStage = zcss_flush; /* pass-through to flush stage */
+            }
+	    ZSTD_FALLTHROUGH;
+        case zcss_flush:   /* copy pending compressed data from outBuff into the caller's output */
+            DEBUGLOG(5, "flush stage");
+            assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered);
+            {   size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+                size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op),
+                            zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+                DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u",
+                            (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed);
+                if (flushed)
+                    op += flushed;
+                zcs->outBuffFlushedSize += flushed;
+                if (toFlush!=flushed) {
+                    /* flush not fully completed, presumably because dst is too small */
+                    assert(op==oend);
+                    someMoreWork = 0;
+                    break;
+                }
+                zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+                if (zcs->frameEnded) {
+                    DEBUGLOG(5, "Frame completed on flush");
+                    someMoreWork = 0;
+                    ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+                    break;
+                }
+                zcs->streamStage = zcss_load;
+                break;
+            }
+
+        default: /* impossible */
+            assert(0);
+        }
+    }
+
+    input->pos = ip - istart;   /* report progress back to the caller */
+    output->pos = op - ostart;
+    if (zcs->frameEnded) return 0;
+    return ZSTD_nextInputSizeHint(zcs);
+}
+
+static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx)
+{   /* Input size hint : asked to the multi-threaded context when workers are in use, computed locally otherwise. */
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers >= 1) {
+        assert(cctx->mtctx != NULL);
+        return ZSTDMT_nextInputSizeHint(cctx->mtctx);
+    }
+#endif
+    return ZSTD_nextInputSizeHint(cctx);
+
+}
+
+size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{   /* ZSTD_compressStream2() with ZSTD_e_continue, except that the value returned on success is the next input size hint. */
+    FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , "");
+    return ZSTD_nextInputSizeHint_MTorST(zcs);
+}
+
+/* Records the state of the caller's buffers at the end of a compression call,
+ * so that the next call can verify them (only done for stable buffer modes).
+ */
+static void
+ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input)
+{
+    int const stableIn = (cctx->appliedParams.inBufferMode == ZSTD_bm_stable);
+    int const stableOut = (cctx->appliedParams.outBufferMode == ZSTD_bm_stable);
+    DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)");
+    /* stable output : only the remaining capacity is tracked */
+    if (stableOut) cctx->expectedOutBufferSize = output->size - output->pos;
+    /* stable input : the whole descriptor (src, size, pos) is saved */
+    if (stableIn) cctx->expectedInBuffer = *input;
+}
+
+/* Verifies the caller's buffers against the expectations recorded by
+ * ZSTD_setBufferExpectations(); only stable buffer modes are checked. */
+static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
+                                        ZSTD_outBuffer const* output,
+                                        ZSTD_inBuffer const* input,
+                                        ZSTD_EndDirective endOp)
+{
+    (void)endOp;   /* not needed by the current checks */
+    if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        /* stable input : same source pointer and same position as recorded */
+        ZSTD_inBuffer const* const recorded = &cctx->expectedInBuffer;
+        if (!(recorded->src == input->src && recorded->pos == input->pos))
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!");
+    }
+    if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
+        /* stable output : same remaining capacity as recorded */
+        if (cctx->expectedOutBufferSize != output->size - output->pos)
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!");
+    }
+    return 0;
+}
+
+static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+                                             ZSTD_EndDirective endOp,
+                                             size_t inSize)
+{   /* Transparent init : derives final parameters from requestedParams, then begins compression (multi- or single-threaded). */
+    ZSTD_CCtx_params params = cctx->requestedParams;   /* work on a local copy */
+    ZSTD_prefixDict const prefixDict = cctx->prefixDict;   /* saved here, since cctx->prefixDict is cleared just below */
+    FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
+    ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));   /* single usage */
+    assert(prefixDict.dict==NULL || cctx->cdict==NULL);    /* only one can be set */
+    if (cctx->cdict && !cctx->localDict.cdict) {
+        /* Let the cdict's compression level take priority over the requested params.
+         * But do not take the cdict's compression level if the "cdict" is actually a localDict
+         * generated from ZSTD_initLocalDict().
+         */
+        params.compressionLevel = cctx->cdict->compressionLevel;
+    }
+    DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
+    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-determine pledgedSrcSize */
+
+    {   size_t const dictSize = prefixDict.dict
+                ? prefixDict.dictSize
+                : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
+        ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
+        params.cParams = ZSTD_getCParamsFromCCtxParams(
+                &params, cctx->pledgedSrcSizePlusOne-1,
+                dictSize, mode);
+    }
+
+    params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
+    params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
+    params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
+    params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
+    params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
+    params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);
+
+#ifdef ZSTD_MULTITHREAD
+    /* If external matchfinder is enabled, make sure to fail before checking job size (for consistency) */
+    RETURN_ERROR_IF(
+        ZSTD_hasExtSeqProd(&params) && params.nbWorkers >= 1,
+        parameter_combination_unsupported,
+        "External sequence producer isn't supported with nbWorkers >= 1"
+    );
+
+    if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
+        params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
+    }
+    if (params.nbWorkers > 0) {
+#if ZSTD_TRACE
+        cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0;
+#endif
+        /* mt context creation */
+        if (cctx->mtctx == NULL) {
+            DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u",
+                        params.nbWorkers);
+            cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem, cctx->pool);
+            RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!");
+        }
+        /* mt compression */
+        DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers);
+        FORWARD_IF_ERROR( ZSTDMT_initCStream_internal(
+                    cctx->mtctx,
+                    prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
+                    cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , "");
+        cctx->dictID = cctx->cdict ? cctx->cdict->dictID : 0;
+        cctx->dictContentSize = cctx->cdict ? cctx->cdict->dictContentSize : prefixDict.dictSize;
+        cctx->consumedSrcSize = 0;
+        cctx->producedCSize = 0;
+        cctx->streamStage = zcss_load;
+        cctx->appliedParams = params;
+    } else
+#endif  /* ZSTD_MULTITHREAD */
+    {   U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;   /* single-threaded path */
+        assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+        FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
+                prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, ZSTD_dtlm_fast,
+                cctx->cdict,
+                &params, pledgedSrcSize,
+                ZSTDb_buffered) , "");
+        assert(cctx->appliedParams.nbWorkers == 0);
+        cctx->inToCompress = 0;
+        cctx->inBuffPos = 0;
+        if (cctx->appliedParams.inBufferMode == ZSTD_bm_buffered) {
+            /* for small input: avoid automatic flush on reaching end of block, since
+            * it would require to add a 3-bytes null block to end frame
+            */
+            cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize);
+        } else {
+            cctx->inBuffTarget = 0;
+        }
+        cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0;
+        cctx->streamStage = zcss_load;
+        cctx->frameEnded = 0;
+    }
+    return 0;
+}
+
+/* Streaming entry point : performs the transparent initialization on first use, then compresses according to @endOp.
+ * @return : a minimum amount of data remaining to be flushed from internal buffers, or an error code */
+size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+                             ZSTD_outBuffer* output,
+                             ZSTD_inBuffer* input,
+                             ZSTD_EndDirective endOp)
+{
+    DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp);
+    /* check conditions */
+    RETURN_ERROR_IF(output->pos > output->size, dstSize_tooSmall, "invalid output buffer");
+    RETURN_ERROR_IF(input->pos  > input->size, srcSize_wrong, "invalid input buffer");
+    RETURN_ERROR_IF((U32)endOp > (U32)ZSTD_e_end, parameter_outOfBound, "invalid endDirective");
+    assert(cctx != NULL);
+
+    /* transparent initialization stage */
+    if (cctx->streamStage == zcss_init) {
+        size_t const inputSize = input->size - input->pos;  /* no obligation to start from pos==0 */
+        size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed;
+        if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */
+          && (endOp == ZSTD_e_continue)                             /* no flush requested, more input to come */
+          && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) {              /* not even reached one block yet */
+            if (cctx->stableIn_notConsumed) {  /* not the first time */
+                /* check stable source guarantees */
+                RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer");
+                RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos");
+            }
+            /* pretend input was consumed, to give a sense forward progress */
+            input->pos = input->size;
+            /* save stable inBuffer, for later control, and flush/end */
+            cctx->expectedInBuffer = *input;
+            /* but actually input wasn't consumed, so keep track of position from where compression shall resume */
+            cctx->stableIn_notConsumed += inputSize;
+            /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */
+            return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format);  /* at least some header to produce */
+        }
+        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed");
+        ZSTD_setBufferExpectations(cctx, output, input);   /* Set initial buffer expectations now that we've initialized */
+    }
+    /* end of transparent initialization stage */
+
+    FORWARD_IF_ERROR(ZSTD_checkBufferStability(cctx, output, input, endOp), "invalid buffers");
+    /* compression stage */
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        size_t flushMin;
+        if (cctx->cParamsChanged) {
+            ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams);
+            cctx->cParamsChanged = 0;
+        }
+        if (cctx->stableIn_notConsumed) {
+            assert(cctx->appliedParams.inBufferMode == ZSTD_bm_stable);
+            /* some early data was skipped - make it available for consumption */
+            assert(input->pos >= cctx->stableIn_notConsumed);
+            input->pos -= cctx->stableIn_notConsumed;
+            cctx->stableIn_notConsumed = 0;
+        }
+        for (;;) {
+            size_t const ipos = input->pos;
+            size_t const opos = output->pos;
+            flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
+            cctx->consumedSrcSize += (U64)(input->pos - ipos);
+            cctx->producedCSize += (U64)(output->pos - opos);
+            if ( ZSTD_isError(flushMin)
+              || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
+                if (flushMin == 0)
+                    ZSTD_CCtx_trace(cctx, 0);
+                ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+            }
+            FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed");
+
+            if (endOp == ZSTD_e_continue) {
+                /* We only require some progress with ZSTD_e_continue, not maximal progress.
+                 * We're done if we've consumed or produced any bytes, or either buffer is
+                 * full.
+                 */
+                if (input->pos != ipos || output->pos != opos || input->pos == input->size || output->pos == output->size)
+                    break;
+            } else {
+                assert(endOp == ZSTD_e_flush || endOp == ZSTD_e_end);
+                /* We require maximal progress. We're done when the flush is complete or the
+                 * output buffer is full.
+                 */
+                if (flushMin == 0 || output->pos == output->size)
+                    break;
+            }
+        }
+        DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic");
+        /* Either we don't require maximum forward progress, we've finished the
+         * flush, or we are out of output space.
+         */
+        assert(endOp == ZSTD_e_continue || flushMin == 0 || output->pos == output->size);
+        ZSTD_setBufferExpectations(cctx, output, input);
+        return flushMin;
+    }
+#endif /* ZSTD_MULTITHREAD */
+    FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , "");   /* no worker in use : compress within this context */
+    DEBUGLOG(5, "completed ZSTD_compressStream2");
+    ZSTD_setBufferExpectations(cctx, output, input);
+    return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
+}
+
+size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp)
+{   /* Plain-arguments wrapper around ZSTD_compressStream2() : positions are read on entry and updated on exit. */
+    ZSTD_inBuffer  inDesc;
+    ZSTD_outBuffer outDesc;
+    size_t status;
+    inDesc.src = src;
+    inDesc.size = srcSize;
+    inDesc.pos = *srcPos;
+    outDesc.dst = dst;
+    outDesc.size = dstCapacity;
+    outDesc.pos = *dstPos;
+    /* *srcPos and *dstPos are not validated here : ZSTD_compressStream2() takes care of it */
+    status = ZSTD_compressStream2(cctx, &outDesc, &inDesc, endOp);
+    *dstPos = outDesc.pos;   /* positions are written back even when status is an error */
+    *srcPos = inDesc.pos;
+    return status;
+}
+
+size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+                      void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* One-shot compression, implemented as a single ZSTD_e_end streaming call over stable buffers. */
+    ZSTD_bufferMode_e const originalInBufferMode = cctx->requestedParams.inBufferMode;
+    ZSTD_bufferMode_e const originalOutBufferMode = cctx->requestedParams.outBufferMode;
+    size_t oPos = 0;   /* bytes written into dst */
+    size_t iPos = 0;   /* bytes read from src */
+    size_t result;
+    DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize);
+    ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+    /* Both buffers are handed over in full : switch to stable input/output modes for this call. */
+    cctx->requestedParams.inBufferMode = ZSTD_bm_stable;
+    cctx->requestedParams.outBufferMode = ZSTD_bm_stable;
+    result = ZSTD_compressStream2_simpleArgs(cctx,
+                                    dst, dstCapacity, &oPos,
+                                    src, srcSize, &iPos,
+                                    ZSTD_e_end);
+    /* Put the caller's buffer modes back before looking at the outcome. */
+    cctx->requestedParams.inBufferMode = originalInBufferMode;
+    cctx->requestedParams.outBufferMode = originalOutBufferMode;
+
+    FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
+    if (result != 0) {  /* frame not finished : the destination buffer was too small */
+        assert(oPos == dstCapacity);
+        RETURN_ERROR(dstSize_tooSmall, "");
+    }
+    assert(iPos == srcSize);   /* every input byte must have been consumed */
+    return oPos;
+}
+
+/* ZSTD_validateSequence() :
+ * checks one sequence's offset code and match length against the bounds computed below.
+ * @offCode : is presumed to follow format required by ZSTD_storeSeq()
+ * @returns a ZSTD error code if sequence is not valid, 0 otherwise */
+static size_t
+ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch,
+                      size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer)
+{
+    U32 const maxWindow = 1u << windowLog;
+    /* posInSrc is the amount of data the decoder would have decoded up to this point.
+     * While it does not exceed the window size, an offset may reach beyond the decoded data
+     * and into the dictionary, hence the bound posInSrc + dictSize (possibly above the window size).
+     * Once the decoded data surpasses the window size, offsets are capped by the window size.
+     */
+    size_t const offsetBound = (posInSrc <= maxWindow) ? posInSrc + (size_t)dictSize : (size_t)maxWindow;
+    /* shortest acceptable match : 3 with minMatch == 3 or a sequence producer, 4 in every other case */
+    size_t const matchLenLowerBound = (minMatch != 3 && !useSequenceProducer) ? 4 : 3;
+    RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!");
+    RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch");
+    return 0;
+}
+
+/* Converts a raw offset into an offBase value : a repcode when @rawOffset matches the repcode
+ * history @rep (tested differently when @ll0, i.e. litLength == 0), a plain offset otherwise. */
+static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
+{
+    U32 const plainOffBase = OFFSET_TO_OFFBASE(rawOffset);   /* used when no repcode applies */
+    /* candidates are tested in priority order : first match wins */
+    if (!ll0 && rawOffset == rep[0])
+        return REPCODE1_TO_OFFBASE;
+    if (rawOffset == rep[1])
+        return REPCODE_TO_OFFBASE(2 - ll0);
+    if (rawOffset == rep[2])
+        return REPCODE_TO_OFFBASE(3 - ll0);
+    if (ll0 && rawOffset == rep[0] - 1)
+        return REPCODE3_TO_OFFBASE;
+    return plainOffBase;
+}
+
+/* Transfers into cctx->seqStore the sequences of one block : from inSeqs[seqPos->idx] up to its block
+ * delimiter (entry with matchLength == 0 and offset == 0, whose litLength counts the block's last literals).
+ * @return : 0 on success (seqPos->idx then points past the delimiter), or an error code. */
+size_t
+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+                                              ZSTD_sequencePosition* seqPos,
+                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                        const void* src, size_t blockSize,
+                                        ZSTD_paramSwitch_e externalRepSearch)
+{
+    U32 idx = seqPos->idx;
+    U32 const startIdx = idx;
+    BYTE const* ip = (BYTE const*)(src);
+    const BYTE* const iend = ip + blockSize;
+    repcodes_t updatedRepcodes;
+    U32 dictSize;
+
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize);
+
+    dictSize = cctx->cdict ? (U32)cctx->cdict->dictContentSize   /* only used for sequence validation */
+             : (cctx->prefixDict.dict ? (U32)cctx->prefixDict.dictSize : 0);
+    ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
+    for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
+        U32 const litLength = inSeqs[idx].litLength;
+        U32 const matchLength = inSeqs[idx].matchLength;
+        U32 offBase;
+
+        if (externalRepSearch == ZSTD_ps_disable) {
+            offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
+        } else {
+            U32 const ll0 = (litLength == 0);
+            offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
+        }
+
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
+        if (cctx->appliedParams.validateSequences) {
+            seqPos->posInSrc += litLength + matchLength;
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
+                                                "Sequence validation failed");
+        }
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
+                        "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
+        ip += matchLength + litLength;
+    }
+    /* The loop must have stopped on a block delimiter : otherwise idx == inSeqsSize and inSeqs[idx] is out of bounds. */
+    RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found.");
+
+    /* If we skipped repcode search while parsing, we need to update repcodes now */
+    assert(externalRepSearch != ZSTD_ps_auto);
+    assert(idx >= startIdx);
+    if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
+        U32* const rep = updatedRepcodes.rep;
+        U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */
+
+        if (lastSeqIdx >= startIdx + 2) {
+            rep[2] = inSeqs[lastSeqIdx - 2].offset;
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else if (lastSeqIdx == startIdx + 1) {
+            rep[2] = rep[0];
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else {
+            assert(lastSeqIdx == startIdx);
+            rep[2] = rep[1];
+            rep[1] = rep[0];
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        }
+    }
+
+    ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
+
+    if (inSeqs[idx].litLength) {   /* inSeqs[idx] is the block delimiter : its litLength is the size of the last literals */
+        DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength);
+        ZSTD_storeLastLiterals(&cctx->seqStore, ip, inSeqs[idx].litLength);
+        ip += inSeqs[idx].litLength;
+        seqPos->posInSrc += inSeqs[idx].litLength;
+    }
+    RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!");
+    seqPos->idx = idx+1;   /* next block starts right after the delimiter */
+    return 0;
+}
+/* Fills cctx->seqStore with the part of inSeqs covering the next blockSize bytes of src (format without block delimiters), resuming from and updating seqPos; @return : bytesAdjustment (bytes by which the block was shortened), or an error code */
+size_t
+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch)
+{
+    U32 idx = seqPos->idx;
+    U32 startPosInSequence = seqPos->posInSequence;   /* position inside inSeqs[idx] saved in seqPos by the previous call */
+    U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize;   /* end of this block, counted from the start of inSeqs[idx] */
+    size_t dictSize;
+    BYTE const* ip = (BYTE const*)(src);
+    BYTE const* iend = ip + blockSize;  /* May be adjusted if we decide to process fewer than blockSize bytes */
+    repcodes_t updatedRepcodes;
+    U32 bytesAdjustment = 0;   /* returned to the caller; iend is moved back by this amount before the last literals are stored */
+    U32 finalMatchSplit = 0;   /* set to 1 once a match has been cut at the block boundary */
+
+    /* TODO(embg) support fast parsing mode in noBlockDelim mode */
+    (void)externalRepSearch;
+
+    if (cctx->cdict) {   /* dictSize is only consumed by ZSTD_validateSequence() below */
+        dictSize = cctx->cdict->dictContentSize;
+    } else if (cctx->prefixDict.dict) {
+        dictSize = cctx->prefixDict.dictSize;
+    } else {
+        dictSize = 0;
+    }
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
+    DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
+    ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));   /* work on a copy of the previous block's repcodes */
+    while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
+        const ZSTD_Sequence currSeq = inSeqs[idx];
+        U32 litLength = currSeq.litLength;
+        U32 matchLength = currSeq.matchLength;
+        U32 const rawOffset = currSeq.offset;
+        U32 offBase;
+
+        /* Modify the sequence depending on where endPosInSequence lies */
+        if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {   /* the block covers the rest of this sequence */
+            if (startPosInSequence >= litLength) {
+                startPosInSequence -= litLength;
+                litLength = 0;
+                matchLength -= startPosInSequence;   /* startPosInSequence now counts bytes inside the match */
+            } else {
+                litLength -= startPosInSequence;
+            }
+            /* Move to the next sequence */
+            endPosInSequence -= currSeq.litLength + currSeq.matchLength;
+            startPosInSequence = 0;
+        } else {
+            /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
+               does not reach the end of the match. So, we have to split the sequence */
+            DEBUGLOG(6, "Require a split: diff: %u, idx: %u PIS: %u",
+                     currSeq.litLength + currSeq.matchLength - endPosInSequence, idx, endPosInSequence);
+            if (endPosInSequence > litLength) {
+                U32 firstHalfMatchLength;
+                litLength = startPosInSequence >= litLength ? 0 : litLength - startPosInSequence;
+                firstHalfMatchLength = endPosInSequence - startPosInSequence - litLength;
+                if (matchLength > blockSize && firstHalfMatchLength >= cctx->appliedParams.cParams.minMatch) {
+                    /* Only ever split the match if it is larger than the block size */
+                    U32 secondHalfMatchLength = currSeq.matchLength + currSeq.litLength - endPosInSequence;
+                    if (secondHalfMatchLength < cctx->appliedParams.cParams.minMatch) {
+                        /* Move the endPosInSequence backward so that it creates match of minMatch length */
+                        endPosInSequence -= cctx->appliedParams.cParams.minMatch - secondHalfMatchLength;
+                        bytesAdjustment = cctx->appliedParams.cParams.minMatch - secondHalfMatchLength;
+                        firstHalfMatchLength -= bytesAdjustment;
+                    }
+                    matchLength = firstHalfMatchLength;
+                    /* Flag that we split the last match - after storing the sequence, exit the loop,
+                       but keep the value of endPosInSequence */
+                    finalMatchSplit = 1;
+                } else {
+                    /* Move the position in sequence backwards so that we don't split match, and break to store
+                     * the last literals. We use the original currSeq.litLength as a marker for where endPosInSequence
+                     * should go. We prefer to do this whenever it is not necessary to split the match, or if doing so
+                     * would cause the first half of the match to be too small
+                     */
+                    bytesAdjustment = endPosInSequence - currSeq.litLength;
+                    endPosInSequence = currSeq.litLength;
+                    break;
+                }
+            } else {
+                /* This sequence ends inside the literals, break to store the last literals */
+                break;
+            }
+        }
+        /* Check if this offset can be represented with a repcode */
+        {   U32 const ll0 = (litLength == 0);
+            offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
+        }
+
+        if (cctx->appliedParams.validateSequences) {   /* NOTE(review): posInSrc advances here only when validation is on, while the last-literals path below always advances it - confirm intended */
+            seqPos->posInSrc += litLength + matchLength;
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                   cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
+                                                   "Sequence validation failed");
+        }
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,   /* seqPos->idx is only updated after the loop, so this difference counts the sequences stored by this call */
+                        "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
+        ip += matchLength + litLength;
+        if (!finalMatchSplit)   /* a split sequence stays current : idx and endPosInSequence are saved into seqPos below */
+            idx++; /* Next Sequence */
+    }
+    DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);   /* NOTE(review): idx may equal inSeqsSize here (see the assert below); confirm DEBUGLOG does not evaluate inSeqs[idx] in that case */
+    assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
+    seqPos->idx = idx;
+    seqPos->posInSequence = endPosInSequence;
+    ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
+
+    iend -= bytesAdjustment;   /* the block now ends bytesAdjustment bytes before src + blockSize */
+    if (ip != iend) {
+        /* Store any last literals */
+        U32 lastLLSize = (U32)(iend - ip);
+        assert(ip <= iend);
+        DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize);
+        ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize);
+        seqPos->posInSrc += lastLLSize;
+    }
+
+    return bytesAdjustment;
+}
+
+typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+                                       const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                       const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+/* Maps the sequence format onto the matching copier; an unrecognized format yields NULL (asserted against). */
+static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
+{
+    ZSTD_sequenceCopier sequenceCopier = NULL;
+    assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode));
+    if (mode == ZSTD_sf_explicitBlockDelimiters)
+        sequenceCopier = ZSTD_copySequencesToSeqStoreExplicitBlockDelim;
+    else if (mode == ZSTD_sf_noBlockDelimiters)
+        sequenceCopier = ZSTD_copySequencesToSeqStoreNoBlockDelim;
+    assert(sequenceCopier != NULL);
+    return sequenceCopier;
+}
+
+/* Measures the next block by walking the sequences from seqPos.idx up to the block delimiter
+ * (a sequence whose offset is 0). The delimiter is mandatory in this mode, and it must have
+ * matchLength == 0 : any violation is reported as externalSequences_invalid.
+ * The size returned is bounds-checked later on, by the caller. */
+static size_t
+blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    size_t spos = seqPos.idx;
+    size_t blockSize = 0;
+    int delimiterFound = 0;
+    DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize);
+    assert(spos <= inSeqsSize);
+    for ( ; spos < inSeqsSize && !delimiterFound; spos++) {
+        ZSTD_Sequence const* const seq = inSeqs + spos;
+        blockSize += seq->litLength + seq->matchLength;
+        if (seq->offset != 0) continue;   /* ordinary sequence : keep accumulating */
+        /* offset == 0 : this entry is the delimiter, it may only carry literals */
+        if (seq->matchLength != 0)
+            RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0");
+        delimiterFound = 1;
+    }
+
+    if (!delimiterFound)
+        RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter");
+    return blockSize;
+}
+
+/* Target size for the next block : a full blockSize, or whatever is left once no more than that remains. */
+static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining)
+{
+    if (remaining <= blockSize) return remaining;   /* final block */
+    return blockSize;
+}
+
+static size_t determine_blockSize(ZSTD_sequenceFormat_e mode,
+                           size_t blockSize, size_t remaining,
+                     const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    size_t explicitBlockSize;
+    DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining);
+    if (mode == ZSTD_sf_noBlockDelimiters) return blockSize_noDelimiter(blockSize, remaining);
+    /* explicit delimiters : the sequences dictate the size, which must fit within both limits */
+    explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos);
+    FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters");
+    if (explicitBlockSize > blockSize)
+        RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block");
+    if (explicitBlockSize > remaining)
+        RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source");
+    return explicitBlockSize;
+}
+
+/* Compress, block-by-block, all of the sequences given, writing the resulting blocks into dst.
+ * Each block is emitted as raw, RLE or compressed; no frame header or checksum is written here.
+ * Returns the cumulative size of all compressed blocks (including their headers),
+ * otherwise a ZSTD error.
+ */
+static size_t
+ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                          const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                          const void* src, size_t srcSize)
+{
+    size_t cSize = 0;
+    size_t remaining = srcSize;
+    ZSTD_sequencePosition seqPos = {0, 0, 0};   /* cursor into inSeqs, advanced by the sequence copier */
+
+    BYTE const* ip = (BYTE const*)src;
+    BYTE* op = (BYTE*)dst;
+    ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters);
+
+    DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize);
+    /* Special case: empty frame */
+    if (remaining == 0) {
+        U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1);
+        RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "No room for empty frame block header");   /* 4 bytes required : the header goes through MEM_writeLE32, though only ZSTD_blockHeaderSize bytes are accounted for */
+        MEM_writeLE32(op, cBlockHeader24);
+        op += ZSTD_blockHeaderSize;
+        dstCapacity -= ZSTD_blockHeaderSize;
+        cSize += ZSTD_blockHeaderSize;
+    }
+
+    while (remaining) {
+        size_t compressedSeqsSize;
+        size_t cBlockSize;
+        size_t additionalByteAdjustment;
+        size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters,
+                                        cctx->blockSize, remaining,
+                                        inSeqs, inSeqsSize, seqPos);
+        U32 const lastBlock = (blockSize == remaining);   /* NOTE: evaluated before the error check, and before blockSize is reduced by additionalByteAdjustment */
+        FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size");
+        assert(blockSize <= remaining);
+        ZSTD_resetSeqStore(&cctx->seqStore);
+        DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);
+
+        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
+        FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
+        blockSize -= additionalByteAdjustment;   /* the copier may have decided to cover fewer bytes than requested */
+
+        /* If blocks are too small, emit as a nocompress block */
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
+            cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+            FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
+            DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
+            cSize += cBlockSize;
+            ip += blockSize;
+            op += cBlockSize;
+            remaining -= blockSize;
+            dstCapacity -= cBlockSize;
+            continue;   /* NOTE(review): unlike the regular path below, this path leaves cctx->isFirstBlock untouched - confirm intended */
+        }
+
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block");
+        compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
+                                &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
+                                &cctx->appliedParams,
+                                op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize,
+                                blockSize,
+                                cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+                                cctx->bmi2);
+        FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
+        DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize);
+
+        if (!cctx->isFirstBlock &&
+            ZSTD_maybeRLE(&cctx->seqStore) &&
+            ZSTD_isRLE(ip, blockSize)) {
+            /* We don't want to emit our first block as a RLE even if it qualifies because
+            * doing so will cause the decoder (cli only) to throw a "should consume all input error."
+            * This is only an issue for zstd <= v1.4.3
+            */
+            compressedSeqsSize = 1;   /* value 1 selects the RLE branch below */
+        }
+
+        if (compressedSeqsSize == 0) {   /* value 0 selects a raw (uncompressed) block */
+            /* ZSTD_noCompressBlock writes the block header as well */
+            cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed");
+            DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize);
+        } else if (compressedSeqsSize == 1) {
+            cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed");
+            DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize);
+        } else {
+            U32 cBlockHeader;
+            /* Error checking and repcodes update */
+            ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState);
+            if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+                cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+            /* Write block header into beginning of block*/
+            cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3);   /* bit 0 : last-block flag, from bit 1 : block type, from bit 3 : payload size */
+            MEM_writeLE24(op, cBlockHeader);
+            cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
+            DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize);
+        }
+
+        cSize += cBlockSize;
+
+        if (lastBlock) {   /* ip, op, remaining and dstCapacity are not advanced for the last block : only cSize is returned */
+            break;
+        } else {
+            ip += blockSize;
+            op += cBlockSize;
+            remaining -= blockSize;
+            dstCapacity -= cBlockSize;
+            cctx->isFirstBlock = 0;
+        }
+        DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity);
+    }
+
+    DEBUGLOG(4, "cSize final total: %zu", cSize);
+    return cSize;
+}
+
+/* ZSTD_compressSequences() : writes a complete frame into dst : frame header, blocks built from inSeqs,
+ * then the optional 4-byte checksum. @return : total size written, or an error code from any stage. */
+size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                              const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                              const void* src, size_t srcSize)
+{
+    BYTE* op = (BYTE*)dst;
+    size_t cSize = 0;
+    size_t compressedBlocksSize = 0;
+    size_t frameHeaderSize = 0;
+
+    /* Transparent initialization stage, same as compressStream2() */
+    DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity);
+    assert(cctx != NULL);
+    FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
+    /* Begin writing output, starting with frame header; a failure must not reach the pointer/capacity arithmetic below */
+    frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID);
+    FORWARD_IF_ERROR(frameHeaderSize, "Frame header could not be written");
+    op += frameHeaderSize;
+    dstCapacity -= frameHeaderSize;
+    cSize += frameHeaderSize;
+    if (cctx->appliedParams.fParams.checksumFlag && srcSize) {
+        XXH64_update(&cctx->xxhState, src, srcSize);
+    }
+    /* cSize includes block header size and compressed sequences size */
+    compressedBlocksSize = ZSTD_compressSequences_internal(cctx, op, dstCapacity,
+                                                           inSeqs, inSeqsSize, src, srcSize);
+    FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!");
+    cSize += compressedBlocksSize;
+    dstCapacity -= compressedBlocksSize;
+
+    if (cctx->appliedParams.fParams.checksumFlag) {
+        U32 const checksum = (U32) XXH64_digest(&cctx->xxhState);
+        RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum");
+        DEBUGLOG(4, "Write checksum : %08X", (unsigned)checksum);
+        MEM_writeLE32((char*)dst + cSize, checksum);
+        cSize += 4;
+    }
+    DEBUGLOG(4, "Final compressed size: %zu", cSize);
+    return cSize;
+}
+
+/*======   Finalize   ======*/
+
+static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs)
+{
+    const ZSTD_inBuffer emptyInput = { NULL, 0, 0 };
+    if (zcs->appliedParams.inBufferMode != ZSTD_bm_stable) return emptyInput;   /* no input to present */
+    return zcs->expectedInBuffer;   /* stable input mode : present the buffer recorded in the stream */
+}
+
+/*! ZSTD_flushStream() :
+ * @return : amount of data remaining to flush (value of ZSTD_compressStream2() called with ZSTD_e_flush) */
+size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+    ZSTD_inBuffer flushInput = inBuffer_forEndFlush(zcs);
+    flushInput.size = flushInput.pos;   /* cap the input at its current position : a flush ingests nothing new */
+    return ZSTD_compressStream2(zcs, output, &flushInput, ZSTD_e_flush);
+}
+
+
+/*! ZSTD_endStream() :
+ * @return : estimate of the amount left to flush, or the error code of ZSTD_compressStream2(,,ZSTD_e_end) */
+size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
+    size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
+    size_t toFlush = remainingToFlush;
+    FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed");
+    if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush;   /* minimal estimation */
+    /* single thread mode : while the frame has not ended, also count the last block header and the checksum */
+    if (!zcs->frameEnded) toFlush += ZSTD_BLOCKHEADERSIZE + (size_t)(zcs->appliedParams.fParams.checksumFlag * 4);
+    DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush);
+    return toFlush;
+}
+
+
+/*-=====  Pre-defined compression levels  =====-*/
+#include "clevels.h"
+
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }   /* ZSTD_getCParams_internal() clamps any higher level to this value */
+int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }   /* negative levels are clamped to this, then negated into targetLength */
+int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; }   /* the level that compressionLevel == 0 maps to */
+
+/* Compression parameters for a dictionary that uses dedicated dict search.
+ * Starts from the regular ZSTD_cpm_createCDict parameters (source size hint of 0),
+ * then enlarges hashLog by ZSTD_LAZY_DDSS_BUCKET_LOG when the selected strategy is
+ * greedy, lazy or lazy2.
+ * Any other strategy gets the regular parameters back, unchanged. */
+static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize)
+{
+    ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict);
+    /* the three strategies that receive the larger hash table */
+    int const growsHashLog = (cParams.strategy == ZSTD_greedy)
+                          || (cParams.strategy == ZSTD_lazy)
+                          || (cParams.strategy == ZSTD_lazy2);
+
+    if (growsHashLog) {
+        /* ZSTD_dedicatedDictSearch_revertCParams() subtracts this same amount */
+        cParams.hashLog += ZSTD_LAZY_DDSS_BUCKET_LOG;
+    }
+
+    return cParams;
+}
+
+static int ZSTD_dedicatedDictSearch_isSupported(
+        ZSTD_compressionParameters const* cParams)
+{
+    /* requires a strategy within [ZSTD_greedy, ZSTD_lazy2], hashLog > chainLog, and chainLog <= 24 */
+    if (cParams->strategy < ZSTD_greedy || cParams->strategy > ZSTD_lazy2) return 0;
+    if (cParams->hashLog <= cParams->chainLog) return 0;
+    return cParams->chainLog <= 24;
+}
+
+/**
+ * Undoes the hashLog enlargement applied to cparams when dedicated dict search
+ * is enabled. This is used to recover the params set to be used in the working
+ * context. (Otherwise, those tables would also grow.)
+ * Only the greedy, lazy and lazy2 strategies are touched, and the resulting
+ * hashLog is raised to ZSTD_HASHLOG_MIN if it ends up below it.
+ * cParams is modified in place.
+ */
+static void ZSTD_dedicatedDictSearch_revertCParams(
+        ZSTD_compressionParameters* cParams) {
+    /* strategies whose hashLog is enlarged for dedicated dict search */
+    int const wasEnlarged = (cParams->strategy == ZSTD_greedy)
+                         || (cParams->strategy == ZSTD_lazy)
+                         || (cParams->strategy == ZSTD_lazy2);
+
+    /* every other strategy was left alone : nothing to undo */
+    if (!wasEnlarged) {
+        return;
+    }
+
+    cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG;
+    /* do not hand back a hashLog smaller than the minimum */
+    if (cParams->hashLog < ZSTD_HASHLOG_MIN) {
+        cParams->hashLog = ZSTD_HASHLOG_MIN;
+    }
+}
+
+/* Size used to pick the parameter table : srcSizeHint + dictSize, where dictSize counts as 0 in
+ * ZSTD_cpm_attachDict mode. With an unknown srcSizeHint : returns ZSTD_CONTENTSIZE_UNKNOWN when
+ * dictSize is 0, otherwise srcSizeHint+dictSize+500 (computed in unsigned arithmetic). */
+static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode)
+{
+    switch (mode) {
+    case ZSTD_cpm_attachDict: dictSize = 0; break;
+    case ZSTD_cpm_unknown:
+    case ZSTD_cpm_noAttachDict:
+    case ZSTD_cpm_createCDict:
+        break;
+    default:
+        assert(0);
+        break;
+    }
+    if (srcSizeHint != ZSTD_CONTENTSIZE_UNKNOWN) return srcSizeHint + dictSize;
+    if (dictSize == 0) return ZSTD_CONTENTSIZE_UNKNOWN;
+    return srcSizeHint + dictSize + 500;
+}
+
+/*! ZSTD_getCParams_internal() :
+ * @return the ZSTD_compressionParameters selected for a compression level, srcSize and dictSize,
+ *         after adjustment by ZSTD_adjustCParams_internal().
+ *  Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown; dictSize == 0 means unknown or unused.
+ *  Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode)
+{
+    U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode);
+    U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);
+    ZSTD_compressionParameters cp;
+    int row = compressionLevel;
+    DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel);
+
+    /* row : negative levels share entry 0 (baseline for fast mode), 0 means default, too large is capped */
+    if (compressionLevel < 0) row = 0;
+    else if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT;
+    else if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
+
+    cp = ZSTD_defaultCParameters[tableID][row];
+    DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy);
+    if (compressionLevel < 0) {
+        /* acceleration factor : the negated level, once clamped to ZSTD_minCLevel() */
+        int const lowestLevel = ZSTD_minCLevel();
+        int const clampedCompressionLevel = (lowestLevel > compressionLevel) ? lowestLevel : compressionLevel;
+        cp.targetLength = (unsigned)(-clampedCompressionLevel);
+    }
+    /* refine parameters based on srcSize & dictSize */
+    return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto);
+}
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ *  A srcSizeHint of 0 is converted to ZSTD_CONTENTSIZE_UNKNOWN; dictSize may be 0 when not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+    unsigned long long const sizeHint = (srcSizeHint == 0) ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint;
+    return ZSTD_getCParams_internal(compressionLevel, sizeHint, dictSize, ZSTD_cpm_unknown);
+}
+
+/*! ZSTD_getParams_internal() :
+ *  same idea as ZSTD_getCParams_internal(), `mode` included
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ *  Frame parameters are zeroed, except contentSizeFlag which is set to 1 */
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) {
+    ZSTD_compressionParameters const levelParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode);
+    ZSTD_parameters result;
+    DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+    ZSTD_memset(&result, 0, sizeof(result));
+    result.fParams.contentSizeFlag = 1;
+    result.cParams = levelParams;
+    return result;
+}
+
+/*! ZSTD_getParams() :
+ *  same idea as ZSTD_getCParams() : a srcSizeHint of 0 is converted to ZSTD_CONTENTSIZE_UNKNOWN
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`),
+ *  as built by ZSTD_getParams_internal() in ZSTD_cpm_unknown mode */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+    unsigned long long const sizeHint = (srcSizeHint == 0) ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint;
+    return ZSTD_getParams_internal(compressionLevel, sizeHint, dictSize, ZSTD_cpm_unknown);
+}
+
+void ZSTD_registerSequenceProducer(
+    ZSTD_CCtx* zc,
+    void* extSeqProdState,
+    ZSTD_sequenceProducer_F extSeqProdFunc
+) {
+    /* Thin wrapper : applies the registration to zc's requested parameters.
+     * Handling of a NULL extSeqProdFunc is left to ZSTD_CCtxParams_registerSequenceProducer(). */
+    assert(zc != NULL);
+    ZSTD_CCtxParams_registerSequenceProducer(&zc->requestedParams, extSeqProdState, extSeqProdFunc);
+}
+
+/* Stores the external sequence producer callback and its opaque state into params.
+ * Passing extSeqProdFunc == NULL clears both fields. */
+void ZSTD_CCtxParams_registerSequenceProducer(
+  ZSTD_CCtx_params* params,
+  void* extSeqProdState,
+  ZSTD_sequenceProducer_F extSeqProdFunc
+) {
+    int const clearing = (extSeqProdFunc == NULL);
+    assert(params != NULL);
+    /* when clearing, the state pointer is dropped as well,
+     * whatever value the caller passed for it */
+    params->extSeqProdFunc = extSeqProdFunc;
+    params->extSeqProdState = clearing ? NULL : extSeqProdState;
+}
diff --git a/src/zstd/compress/zstd_compress_internal.h b/src/zstd/compress/zstd_compress_internal.h
new file mode 100644
index 000000000..e41d7b78e
--- /dev/null
+++ b/src/zstd/compress/zstd_compress_internal.h
@@ -0,0 +1,1534 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This header contains definitions
+ * that shall **only** be used by modules within lib/compress.
+ */
+
+#ifndef ZSTD_COMPRESS_H
+#define ZSTD_COMPRESS_H
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "../common/zstd_internal.h"
+#include "zstd_cwksp.h"
+#ifdef ZSTD_MULTITHREAD
+#  include "zstdmt_compress.h"
+#endif
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Constants
+***************************************/
+#define kSearchStrength      8   /* NOTE(review): not used in the visible part of this header - meaning to be confirmed at its use sites */
+#define HASH_READ_SIZE       8   /* NOTE(review): presumably the nb of bytes a hash function may read at one position - confirm */
+#define ZSTD_DUBT_UNSORTED_MARK 1   /* For btlazy2 strategy, index ZSTD_DUBT_UNSORTED_MARK==1 means "unsorted".
+                                       It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+                                       It's not a big deal though : the candidate will just be sorted again.
+                                       Additionally, candidate position 1 will be lost.
+                                       But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy.
+                                       This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;   /* type of ZSTD_CCtx_s.stage */
+typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage;   /* type of ZSTD_CCtx_s.streamStage */
+
+typedef struct ZSTD_prefixDict_s {   /* type of ZSTD_CCtx_s.prefixDict (the "single-usage dictionary") */
+    const void* dict;                /* dictionary content, read-only through this pointer */
+    size_t dictSize;                 /* NOTE(review): presumably the size of `dict` in bytes - confirm */
+    ZSTD_dictContentType_e dictContentType;
+} ZSTD_prefixDict;
+
+typedef struct {        /* type of ZSTD_CCtx_s.localDict */
+    void* dictBuffer;   /* NOTE(review): presumably an internal copy of the dictionary, when one was made - confirm */
+    void const* dict;   /* dictionary content, read-only through this pointer */
+    size_t dictSize;
+    ZSTD_dictContentType_e dictContentType;
+    ZSTD_CDict* cdict;  /* NOTE(review): presumably the CDict built from `dict` - confirm */
+} ZSTD_localDict;
+
+typedef struct {   /* Huffman half of ZSTD_entropyCTables_t */
+    HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)];
+    HUF_repeat repeatMode;   /* repeat status stored alongside CTable */
+} ZSTD_hufCTables_t;
+
+typedef struct {   /* FSE half of ZSTD_entropyCTables_t : one table and one repeat mode for each of offcode, matchlength, litlength */
+    FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+    FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+    FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+    FSE_repeat offcode_repeatMode;
+    FSE_repeat matchlength_repeatMode;
+    FSE_repeat litlength_repeatMode;
+} ZSTD_fseCTables_t;
+
+typedef struct {   /* embedded in ZSTD_compressedBlockState_t, and referenced by optState_t.symbolCosts */
+    ZSTD_hufCTables_t huf;   /* Huffman table + repeat mode */
+    ZSTD_fseCTables_t fse;   /* FSE tables + repeat modes */
+} ZSTD_entropyCTables_t;
+
+/***********************************************
+*  Entropy buffer statistics structs and funcs *
+***********************************************/
+/** ZSTD_hufCTablesMetadata_t :
+ *  Stores Literals Block Type for a super-block in hType, and
+ *  the Huffman tree description in hufDesBuffer.
+ *  hufDesSize is the size of that Huffman tree description, in bytes.
+ *  This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */
+typedef struct {
+    symbolEncodingType_e hType;
+    BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE];
+    size_t hufDesSize;
+} ZSTD_hufCTablesMetadata_t;
+
+/** ZSTD_fseCTablesMetadata_t :
+ *  Stores symbol compression modes for a super-block in {ll, of, ml}Type, and
+ *  the FSE tables in fseTablesBuffer.
+ *  fseTablesSize is the size of those FSE tables, in bytes.
+ *  This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */
+typedef struct {
+    symbolEncodingType_e llType;
+    symbolEncodingType_e ofType;
+    symbolEncodingType_e mlType;
+    BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE];
+    size_t fseTablesSize;
+    size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+} ZSTD_fseCTablesMetadata_t;
+
+typedef struct {   /* type of the entropyMetadata argument of ZSTD_buildBlockEntropyStats(), and of ZSTD_blockSplitCtx.entropyMetadata */
+    ZSTD_hufCTablesMetadata_t hufMetadata;
+    ZSTD_fseCTablesMetadata_t fseMetadata;
+} ZSTD_entropyCTablesMetadata_t;
+
+/** ZSTD_buildBlockEntropyStats() :
+ *  Builds entropy for the block. seqStorePtr, prevEntropy and cctxParams are read-only; nextEntropy, entropyMetadata and workspace are writable.
+ *  @return : 0 on success or error code */
+size_t ZSTD_buildBlockEntropyStats(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                          void* workspace, size_t wkspSize);
+
+/*********************************
+*  Compression internals structs *
+*********************************/
+
+typedef struct {        /* element type of optState_t.matchTable */
+    U32 off;            /* Offset sumtype code for the match, using ZSTD_storeSeq() format */
+    U32 len;            /* Raw length of match */
+} ZSTD_match_t;
+
+typedef struct {        /* element type of rawSeqStore_t.seq and ZSTD_CCtx_s.ldmSequences */
+    U32 offset;         /* Offset of sequence */
+    U32 litLength;      /* Length of literals prior to match */
+    U32 matchLength;    /* Raw length of match */
+} rawSeq;
+
+typedef struct {        /* array of rawSeq plus a read cursor (pos, posInSequence) */
+  rawSeq* seq;          /* The start of the sequences */
+  size_t pos;           /* The index in seq where reading stopped. pos <= size. */
+  size_t posInSequence; /* The position within the sequence at seq[pos] where reading
+                           stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */
+  size_t size;          /* The number of sequences. <= capacity. */
+  size_t capacity;      /* The capacity starting from `seq` pointer */
+} rawSeqStore_t;
+
+typedef struct {        /* cursor into an array of ZSTD_Sequence */
+    U32 idx;            /* Index in array of ZSTD_Sequence */
+    U32 posInSequence;  /* Position within sequence at idx */
+    size_t posInSrc;    /* Number of bytes given by sequences provided so far */
+} ZSTD_sequencePosition;
+
+UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};   /* empty store : seq==NULL; pos, posInSequence, size and capacity all 0 */
+
+typedef struct {   /* element type of optState_t.priceTable */
+    int price;  /* price from beginning of segment to this position */
+    U32 off;    /* offset of previous match */
+    U32 mlen;   /* length of previous match */
+    U32 litlen; /* nb of literals since previous match */
+    U32 rep[ZSTD_REP_NUM];  /* offset history after previous match */
+} ZSTD_optimal_t;
+
+typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;   /* type of optState_t.priceType */
+
+#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3)   /* nb of entries of optState_t.matchTable and optState_t.priceTable */
+typedef struct {   /* optimal parser state; type of ZSTD_matchState_t.opt */
+    /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+    unsigned* litFreq;           /* table of literals statistics, of size 256 */
+    unsigned* litLengthFreq;     /* table of litLength statistics, of size (MaxLL+1) */
+    unsigned* matchLengthFreq;   /* table of matchLength statistics, of size (MaxML+1) */
+    unsigned* offCodeFreq;       /* table of offCode statistics, of size (MaxOff+1) */
+    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_SIZE */
+    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */
+
+    U32  litSum;                 /* nb of literals */
+    U32  litLengthSum;           /* nb of litLength codes */
+    U32  matchLengthSum;         /* nb of matchLength codes */
+    U32  offCodeSum;             /* nb of offset codes */
+    U32  litSumBasePrice;        /* to compare to log2(litfreq) */
+    U32  litLengthSumBasePrice;  /* to compare to log2(llfreq)  */
+    U32  matchLengthSumBasePrice;/* to compare to log2(mlfreq)  */
+    U32  offCodeSumBasePrice;    /* to compare to log2(offreq)  */
+    ZSTD_OptPrice_e priceType;   /* prices can be determined dynamically, or follow a pre-defined cost structure */
+    const ZSTD_entropyCTables_t* symbolCosts;  /* pre-calculated dictionary statistics */
+    ZSTD_paramSwitch_e literalCompressionMode;   /* same type as ZSTD_CCtx_params_s.literalCompressionMode */
+} optState_t;
+
+typedef struct {   /* entropy tables + repeat offsets; ZSTD_blockState_t points to a "prev" and a "next" instance */
+  ZSTD_entropyCTables_t entropy;
+  U32 rep[ZSTD_REP_NUM];   /* repeat offsets, same layout as the `rep` argument of ZSTD_updateRep() */
+} ZSTD_compressedBlockState_t;
+
+typedef struct {   /* type of the `window` field of both ZSTD_matchState_t and ldmState_t */
+    BYTE const* nextSrc;       /* next block here to continue on current prefix */
+    BYTE const* base;          /* All regular indexes relative to this position */
+    BYTE const* dictBase;      /* extDict indexes relative to this position */
+    U32 dictLimit;             /* below that point, need extDict */
+    U32 lowLimit;              /* below that point, no more valid data */
+    U32 nbOverflowCorrections; /* Number of times overflow correction has run since
+                                * ZSTD_window_init(). Useful for debugging coredumps
+                                * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY.
+                                */
+} ZSTD_window_t;
+
+#define ZSTD_WINDOW_START_INDEX 2   /* NOTE(review): presumably the first index used by a freshly initialized ZSTD_window_t - confirm */
+
+typedef struct ZSTD_matchState_t ZSTD_matchState_t;   /* forward declaration : the struct contains a pointer to its own type (dictMatchState) */
+
+#define ZSTD_ROW_HASH_CACHE_SIZE 8       /* Size of prefetching hash cache for row-based matchfinder (nb of entries of hashCache[]) */
+
+struct ZSTD_matchState_t {   /* match finder state; embedded in ZSTD_blockState_t.matchState */
+    ZSTD_window_t window;   /* State for window round buffer management */
+    U32 loadedDictEnd;      /* index of end of dictionary, within context's referential.
+                             * When loadedDictEnd != 0, a dictionary is in use, and still valid.
+                             * This relies on a mechanism to set loadedDictEnd=0 when dictionary is no longer within distance.
+                             * Such mechanism is provided within ZSTD_window_enforceMaxDist() and ZSTD_checkDictValidity().
+                             * When dict referential is copied into active context (i.e. not attached),
+                             * loadedDictEnd == dictSize, since referential starts from zero.
+                             */
+    U32 nextToUpdate;       /* index from which to continue table update */
+    U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */
+
+    U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
+    BYTE* tagTable;                          /* For row-based matchFinder: A row-based table containing the hashes and head index. */
+    U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
+    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for reuse of tag table */
+    U32 hashSaltEntropy;                     /* For row-based matchFinder: collects entropy for salt generation */
+
+    U32* hashTable;
+    U32* hashTable3;
+    U32* chainTable;
+
+    U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */
+
+    int dedicatedDictSearch;  /* Indicates whether this matchState is using the
+                               * dedicated dictionary search structure.
+                               */
+    optState_t opt;         /* optimal parser state */
+    const ZSTD_matchState_t* dictMatchState;   /* another match state, read-only from here */
+    ZSTD_compressionParameters cParams;
+    const rawSeqStore_t* ldmSeqStore;          /* raw sequences, read-only from here */
+
+    /* Controls prefetching in some dictMatchState matchfinders.
+     * This behavior is controlled from the cctx ms.
+     * This parameter has no effect in the cdict ms. */
+    int prefetchCDictTables;
+
+    /* When == 0, lazy match finders insert every position.
+     * When != 0, lazy match finders only insert positions they search.
+     * This allows them to skip much faster over incompressible data,
+     * at a small cost to compression ratio.
+     */
+    int lazySkipping;
+};
+
+typedef struct {   /* type of ZSTD_CCtx_s.blockState */
+    ZSTD_compressedBlockState_t* prevCBlock;
+    ZSTD_compressedBlockState_t* nextCBlock;
+    ZSTD_matchState_t matchState;   /* embedded by value, not a pointer */
+} ZSTD_blockState_t;
+
+typedef struct {   /* element type of ldmState_t.hashTable; also pointed to by ldmMatchCandidate_t.bucket */
+    U32 offset;
+    U32 checksum;
+} ldmEntry_t;
+
+typedef struct {   /* element type of ldmState_t.matchCandidates[] */
+    BYTE const* split;
+    U32 hash;
+    U32 checksum;
+    ldmEntry_t* bucket;
+} ldmMatchCandidate_t;
+
+#define LDM_BATCH_SIZE 64   /* nb of entries of ldmState_t.splitIndices[] and ldmState_t.matchCandidates[] */
+
+typedef struct {            /* long distance matching state; type of ZSTD_CCtx_s.ldmState */
+    ZSTD_window_t window;   /* State for the window round buffer management */
+    ldmEntry_t* hashTable;
+    U32 loadedDictEnd;
+    BYTE* bucketOffsets;    /* Next position in bucket to insert entry */
+    size_t splitIndices[LDM_BATCH_SIZE];
+    ldmMatchCandidate_t matchCandidates[LDM_BATCH_SIZE];
+} ldmState_t;
+
+typedef struct {            /* long distance matching parameters; type of ZSTD_CCtx_params_s.ldmParams */
+    ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */
+    U32 hashLog;            /* Log size of hashTable */
+    U32 bucketSizeLog;      /* Log bucket size for collision resolution, at most 8 */
+    U32 minMatchLength;     /* Minimum match length */
+    U32 hashRateLog;        /* Log number of entries to skip */
+    U32 windowLog;          /* Window log for the LDM */
+} ldmParams_t;
+
+typedef struct {   /* type of ZSTD_CCtx_s.seqCollector */
+    int collectSequences;      /* NOTE(review): presumably non-zero while sequences are being collected - confirm */
+    ZSTD_Sequence* seqStart;
+    size_t seqIndex;
+    size_t maxSequences;
+} SeqCollector;
+
+struct ZSTD_CCtx_params_s {   /* ZSTD_CCtx_s keeps three instances : requestedParams, appliedParams, simpleApiParams */
+    ZSTD_format_e format;
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
+
+    int compressionLevel;
+    int forceWindow;           /* force back-references to respect limit of
+                                * 1<<wLog, even for dictionary */
+    size_t targetCBlockSize;   /* Tries to fit compressed block size to be around targetCBlockSize.
+                                * No target when targetCBlockSize == 0.
+                                * There is no guarantee on compressed block size */
+    int srcSizeHint;           /* User's best guess of source size (stored as an int).
+                                * Hint is not valid when srcSizeHint == 0.
+                                * There is no guarantee that hint is close to actual source size */
+
+    ZSTD_dictAttachPref_e attachDictPref;
+    ZSTD_paramSwitch_e literalCompressionMode;   /* read by ZSTD_literalsCompressionIsDisabled() */
+
+    /* Multithreading: used to pass parameters to mtctx */
+    int nbWorkers;
+    size_t jobSize;
+    int overlapLog;
+    int rsyncable;
+
+    /* Long distance matching parameters */
+    ldmParams_t ldmParams;
+
+    /* Dedicated dict search algorithm trigger */
+    int enableDedicatedDictSearch;
+
+    /* Input/output buffer modes */
+    ZSTD_bufferMode_e inBufferMode;
+    ZSTD_bufferMode_e outBufferMode;
+
+    /* Sequence compression API */
+    ZSTD_sequenceFormat_e blockDelimiters;
+    int validateSequences;
+
+    /* Block splitting */
+    ZSTD_paramSwitch_e useBlockSplitter;
+
+    /* Param for deciding whether to use row-based matchfinder */
+    ZSTD_paramSwitch_e useRowMatchFinder;
+
+    /* Always load a dictionary in ext-dict mode (not prefix mode)? */
+    int deterministicRefPrefix;
+
+    /* Internal use, for createCCtxParams() and freeCCtxParams() only */
+    ZSTD_customMem customMem;
+
+    /* Controls prefetching in some dictMatchState matchfinders */
+    ZSTD_paramSwitch_e prefetchCDictTables;
+
+    /* Controls whether zstd will fall back to an internal matchfinder
+     * if the external matchfinder returns an error code. */
+    int enableMatchFinderFallback;
+
+    /* Parameters for the external sequence producer API.
+     * Users set these parameters through ZSTD_registerSequenceProducer().
+     * It is not possible to set these parameters individually through the public API. */
+    void* extSeqProdState;
+    ZSTD_sequenceProducer_F extSeqProdFunc;
+
+    /* Adjust the max block size */
+    size_t maxBlockSize;
+
+    /* Controls repcode search in external sequence parsing */
+    ZSTD_paramSwitch_e searchForExternalRepcodes;
+};  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
+
+#define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))   /* in bytes : (MaxSeq + 2) unsigned values */
+#define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE)   /* in bytes : size of ZSTD_CCtx_s.entropyWorkspace */
+
+/**
+ * Indicates whether this compression proceeds directly from user-provided
+ * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or
+ * whether the context needs to buffer the input/output (ZSTDb_buffered).
+ */
+typedef enum {
+    ZSTDb_not_buffered,   /* first enumerator, hence value 0 */
+    ZSTDb_buffered
+} ZSTD_buffered_policy_e;   /* type of ZSTD_CCtx_s.bufferedPolicy */
+
+/**
+ * Struct that contains all elements of block splitter that should be allocated
+ * in a wksp. ZSTD_MAX_NB_BLOCK_SPLITS is the nb of entries of its partitions[] array.
+ */
+#define ZSTD_MAX_NB_BLOCK_SPLITS 196
+typedef struct {
+    seqStore_t fullSeqStoreChunk;
+    seqStore_t firstHalfSeqStore;
+    seqStore_t secondHalfSeqStore;
+    seqStore_t currSeqStore;
+    seqStore_t nextSeqStore;
+
+    U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS];
+    ZSTD_entropyCTablesMetadata_t entropyMetadata;
+} ZSTD_blockSplitCtx;   /* type of ZSTD_CCtx_s.blockSplitCtx */
+
+struct ZSTD_CCtx_s {   /* compression context : parameters, workspace, streaming buffers and dictionaries in one object */
+    ZSTD_compressionStage_e stage;
+    int cParamsChanged;                  /* == 1 if cParams (except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
+    int bmi2;                            /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+    ZSTD_CCtx_params requestedParams;    /* written by ZSTD_registerSequenceProducer(), among others */
+    ZSTD_CCtx_params appliedParams;
+    ZSTD_CCtx_params simpleApiParams;    /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. */
+    U32   dictID;
+    size_t dictContentSize;
+
+    ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */
+    size_t blockSize;
+    unsigned long long pledgedSrcSizePlusOne;  /* this way, 0 (default) == unknown */
+    unsigned long long consumedSrcSize;
+    unsigned long long producedCSize;
+    XXH64_state_t xxhState;
+    ZSTD_customMem customMem;
+    ZSTD_threadPool* pool;
+    size_t staticSize;
+    SeqCollector seqCollector;
+    int isFirstBlock;
+    int initialized;
+
+    seqStore_t seqStore;      /* sequences storage ptrs */
+    ldmState_t ldmState;      /* long distance matching state */
+    rawSeq* ldmSequences;     /* Storage for the ldm output sequences */
+    size_t maxNbLdmSequences;
+    rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */
+    ZSTD_blockState_t blockState;
+    U32* entropyWorkspace;  /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */
+
+    /* Whether we are streaming or not */
+    ZSTD_buffered_policy_e bufferedPolicy;
+
+    /* streaming */
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inToCompress;
+    size_t inBuffPos;
+    size_t inBuffTarget;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outBuffContentSize;
+    size_t outBuffFlushedSize;
+    ZSTD_cStreamStage streamStage;
+    U32    frameEnded;
+
+    /* Stable in/out buffer verification */
+    ZSTD_inBuffer expectedInBuffer;
+    size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */
+    size_t expectedOutBufferSize;
+
+    /* Dictionary */
+    ZSTD_localDict localDict;
+    const ZSTD_CDict* cdict;
+    ZSTD_prefixDict prefixDict;   /* single-usage dictionary */
+
+    /* Multi-threading (member only exists when ZSTD_MULTITHREAD is defined) */
+#ifdef ZSTD_MULTITHREAD
+    ZSTDMT_CCtx* mtctx;
+#endif
+
+    /* Tracing (member only exists when ZSTD_TRACE is non-zero) */
+#if ZSTD_TRACE
+    ZSTD_TraceCtx traceCtx;
+#endif
+
+    /* Workspace for block splitter */
+    ZSTD_blockSplitCtx blockSplitCtx;
+
+    /* Buffer for output from external sequence producer */
+    ZSTD_Sequence* extSeqBuf;
+    size_t extSeqBufCapacity;
+};
+
+typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;   /* NOTE(review): presumably selects how thoroughly dictionary content is indexed - confirm at use sites */
+typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;   /* NOTE(review): presumably tells table-filling code whether it serves a CCtx or a CDict - confirm at use sites */
+
+typedef enum {   /* type of the dictMode argument of ZSTD_selectBlockCompressor() */
+    ZSTD_noDict = 0,
+    ZSTD_extDict = 1,
+    ZSTD_dictMatchState = 2,
+    ZSTD_dedicatedDictSearch = 3
+} ZSTD_dictMode_e;
+
+typedef enum {   /* type of the `mode` argument of ZSTD_getParams_internal() */
+    ZSTD_cpm_noAttachDict = 0,  /* Compression with ZSTD_noDict or ZSTD_extDict.
+                                 * In this mode we use both the srcSize and the dictSize
+                                 * when selecting and adjusting parameters.
+                                 */
+    ZSTD_cpm_attachDict = 1,    /* Compression with ZSTD_dictMatchState or ZSTD_dedicatedDictSearch.
+                                 * In this mode we only take the srcSize into account when selecting
+                                 * and adjusting parameters.
+                                 */
+    ZSTD_cpm_createCDict = 2,   /* Creating a CDict.
+                                 * In this mode we take both the source size and the dictionary size
+                                 * into account when selecting and adjusting the parameters.
+                                 */
+    ZSTD_cpm_unknown = 3        /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
+                                 * We don't know what these parameters are for. We default to the legacy
+                                 * behavior of taking both the source size and the dict size into account
+                                 * when selecting and adjusting parameters.
+                                 */
+} ZSTD_cParamMode_e;
+
+typedef size_t (*ZSTD_blockCompressor) (   /* function pointer type returned by ZSTD_selectBlockCompressor() */
+        ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode);   /* selection depends on strategy, row matchfinder mode and dictionary mode */
+
+
+MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
+{   /* lengths 0..63 are coded through a lookup table, larger ones from their highest set bit */
+    static const BYTE smallLengthCodes[64] = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                                8,  9, 10, 11, 12, 13, 14, 15,
+                                               16, 16, 17, 17, 18, 18, 19, 19,
+                                               20, 20, 20, 20, 21, 21, 21, 21,
+                                               22, 22, 22, 22, 22, 22, 22, 22,
+                                               23, 23, 23, 23, 23, 23, 23, 23,
+                                               24, 24, 24, 24, 24, 24, 24, 24,
+                                               24, 24, 24, 24, 24, 24, 24, 24 };
+    static const U32 highbitBias = 19;   /* added to the highest-bit position of lengths above 63 */
+    return (litLength <= 63) ? smallLengthCodes[litLength] : ZSTD_highbit32(litLength) + highbitBias;
+}
+
+/* ZSTD_MLcode() :
+ * note : the argument is mlBase = matchLength - MINMATCH,
+ *        because that is the format stored in seqStore->sequences */
+MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)
+{   /* values 0..127 are coded through a lookup table, larger ones from their highest set bit */
+    static const BYTE smallBaseCodes[128] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+                                             16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                             32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
+                                             38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
+                                             40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+                                             41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
+                                             42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+                                             42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 };
+    static const U32 highbitBias = 36;   /* added to the highest-bit position of values above 127 */
+    return (mlBase <= 127) ? smallBaseCodes[mlBase] : ZSTD_highbit32(mlBase) + highbitBias;
+}
+
+/* ZSTD_cParam_withinBounds() :
+ * @return 1 when the bounds of @cParam can be queried and contain @value
+ *         (both ends inclusive), 0 otherwise (including when the query fails) */
+MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
+{
+    ZSTD_bounds const limits = ZSTD_cParam_getBounds(cParam);
+    int const queryOk = !ZSTD_isError(limits.error);
+    int const aboveLow = (value >= limits.lowerBound);
+    int const belowHigh = (value <= limits.upperBound);
+    return queryOk && aboveLow && belowHigh;
+}
+
+/* ZSTD_noCompressBlock() :
+ * Emits a raw block : a 24-bit header (lastBlock flag, bt_raw type, srcSize) followed by a copy of src.
+ * @return : ZSTD_blockHeaderSize + srcSize, or a dstSize_tooSmall error when dst cannot hold that much */
+MEM_STATIC size_t
+ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+{   BYTE* const op = (BYTE*)dst;
+    U32 const header24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+    DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity);
+    RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
+                    dstSize_tooSmall, "dst buf too small for uncompressed block");
+    MEM_writeLE24(op, header24);                            /* header first */
+    ZSTD_memcpy(op + ZSTD_blockHeaderSize, src, srcSize);   /* then the untouched payload */
+    return srcSize + ZSTD_blockHeaderSize;
+}
+
+MEM_STATIC size_t
+ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+{
+    BYTE* const op = (BYTE*)dst;
+    U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
+    RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "");
+    MEM_writeLE24(op, cBlockHeader);
+    op[3] = src;
+    return 4;
+}
+
+
+/* ZSTD_minGain() :
+ * smallest saving required before a compressed block,
+ * or a compressed literals section, is generated.
+ * note : the same formula serves both situations */
+MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+{
+    U32 const shift = (strat < ZSTD_btultra) ? 6 : (U32)(strat) - 1;   /* larger shift (btultra and above) == smaller required gain */
+    ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
+    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat));
+    return 2 + (srcSize >> shift);
+}
+
+MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams)
+{
+    ZSTD_paramSwitch_e const mode = cctxParams->literalCompressionMode;
+    if (mode == ZSTD_ps_enable) return 0;    /* explicitly enabled : not disabled */
+    if (mode == ZSTD_ps_disable) return 1;   /* explicitly disabled */
+    /* Only ZSTD_ps_auto is expected beyond this point : any other value fails
+     * the assert in debug builds, and is otherwise treated like ZSTD_ps_auto. */
+    if (mode != ZSTD_ps_auto) {
+        assert(0 /* impossible: pre-validated */);
+    }
+    /* auto : literals compression is disabled only when
+     * the strategy is ZSTD_fast and targetLength is non-zero */
+    return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
+}
+
+/*! ZSTD_safecopyLiterals() :
+ *  memcpy() function that won't read more than WILDCOPY_OVERLENGTH bytes beyond ilimit_w.
+ *  Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single
+ *  large copies.
+ */
+static void
+ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w)
+{
+    assert(iend > ilimit_w);
+    if (ip <= ilimit_w) {   /* bulk part : wildcopy everything located before ilimit_w */
+        ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap);
+        op += ilimit_w - ip;
+        ip = ilimit_w;
+    }
+    while (ip < iend) *op++ = *ip++;   /* tail : byte by byte, never reads at or past iend */
+}
+
+
+#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
+#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
+#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
+#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
+#define OFFSET_TO_OFFBASE(o)  (assert((o)>0), (o) + ZSTD_REP_NUM)   /* argument parenthesized : safe with any expression as argument */
+#define OFFBASE_IS_OFFSET(o)  ((o) > ZSTD_REP_NUM)                  /* offBase values above ZSTD_REP_NUM encode real offsets */
+#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM)    /* offBase values 1..ZSTD_REP_NUM encode repcode IDs */
+#define OFFBASE_TO_OFFSET(o)  (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM)
+#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o))  /* returns ID 1,2,3 */
+
+/*! ZSTD_storeSeq() :
+ *  Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
+ *  @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
+ *  @matchLength : must be >= MINMATCH
+ *  Allowed to over-read literals up to litLimit.
+*/
+HINT_INLINE UNUSED_ATTR void
+ZSTD_storeSeq(seqStore_t* seqStorePtr,
+              size_t litLength, const BYTE* literals, const BYTE* litLimit,
+              U32 offBase,
+              size_t matchLength)
+{
+    BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;   /* wildcopy is only used when the literals end at or before this point */
+    BYTE const* const litEnd = literals + litLength;
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6)
+    static const BYTE* g_start = NULL;
+    if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
+    {   U32 const pos = (U32)((const BYTE*)literals - g_start);
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
+               pos, (U32)litLength, (U32)matchLength, (U32)offBase);
+    }
+#endif
+    assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);   /* one free sequence slot must remain */
+    /* copy Literals */
+    assert(seqStorePtr->maxNbLit <= 128 KB);
+    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
+    assert(literals + litLength <= litLimit);
+    if (litEnd <= litLimit_w) {
+        /* Common case we can use wildcopy.
+         * First copy 16 bytes, because literals are likely short.
+         */
+        ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
+        ZSTD_copy16(seqStorePtr->lit, literals);
+        if (litLength > 16) {
+            ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
+        }
+    } else {
+        ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w);   /* literals end too close to litLimit for a wildcopy */
+    }
+    seqStorePtr->lit += litLength;
+
+    /* literal Length */
+    if (litLength>0xFFFF) {
+        assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */
+        seqStorePtr->longLengthType = ZSTD_llt_literalLength;
+        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    }
+    seqStorePtr->sequences[0].litLength = (U16)litLength;   /* truncated to 16 bits; a longer length is flagged just above */
+
+    /* match offset */
+    seqStorePtr->sequences[0].offBase = offBase;
+
+    /* match Length */
+    assert(matchLength >= MINMATCH);
+    {   size_t const mlBase = matchLength - MINMATCH;
+        if (mlBase>0xFFFF) {
+            assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */
+            seqStorePtr->longLengthType = ZSTD_llt_matchLength;
+            seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+        }
+        seqStorePtr->sequences[0].mlBase = (U16)mlBase;   /* truncated to 16 bits; a longer value is flagged just above */
+    }
+
+    seqStorePtr->sequences++;   /* the slot just filled is now part of the store */
+}
+
+/* ZSTD_updateRep() :
+ * updates in-place @rep (array of repeat offsets)
+ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
+ * @ll0 : added to the repcode index before the lookup (see below) */
+MEM_STATIC void
+ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
+{
+    if (OFFBASE_IS_OFFSET(offBase)) {  /* full offset */
+        rep[2] = rep[1];   /* shift history down by one, dropping the oldest entry */
+        rep[1] = rep[0];
+        rep[0] = OFFBASE_TO_OFFSET(offBase);
+    } else {   /* repcode */
+        U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;   /* 0-based index into rep[], shifted by ll0 */
+        if (repCode > 0) {  /* note : if repCode==0, no change */
+            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];   /* index ZSTD_REP_NUM stands for rep[0] - 1 */
+            rep[2] = (repCode >= 2) ? rep[1] : rep[2];   /* rep[2] is only overwritten when the selected entry was not rep[1] */
+            rep[1] = rep[0];
+            rep[0] = currentOffset;   /* the selected offset moves to the front */
+        } else {   /* repCode == 0 */
+            /* nothing to do */
+        }
+    }
+}
+
+typedef struct repcodes_s {   /* wraps the repeat-offset array so that ZSTD_newRep() can return it by value */
+    U32 rep[3];
+} repcodes_t;
+
+MEM_STATIC repcodes_t   /* @return : updated copy of @rep; the caller's array is left untouched */
+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
+{
+    repcodes_t updated;
+    ZSTD_memcpy(&updated, rep, sizeof(updated));   /* start from the caller's history */
+    ZSTD_updateRep(updated.rep, offBase, ll0);     /* then apply the update on the private copy */
+    return updated;
+}
+
+
+/*-*************************************
+*  Match length counter
+***************************************/
+MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)   /* @return : nb of leading bytes equal at pIn and pMatch, pIn never going past pInLimit */
+{
+    const BYTE* const pStart = pIn;
+    const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);   /* below this point, a full size_t can be read at pIn without passing pInLimit */
+
+    if (pIn < pInLoopLimit) {
+        { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);   /* first word handled apart : pIn == pStart here, so the count is returned directly */
+          if (diff) return ZSTD_NbCommonBytes(diff); }
+        pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+        while (pIn < pInLoopLimit) {   /* compare one size_t per iteration */
+            size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+            if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+            pIn += ZSTD_NbCommonBytes(diff);
+            return (size_t)(pIn - pStart);
+    }   }
+    if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }   /* tail : 4 bytes at once, only attempted when MEM_64bits() */
+    if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }   /* tail : 2 bytes at once */
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;   /* tail : last single byte */
+    return (size_t)(pIn - pStart);
+}
+
+/** ZSTD_count_2segments() :
+ *  counts a match length when `ip` and `match` live in 2 different segments.
+ *  convention : on reaching mEnd, the match count continues from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+                     const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+    const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);   /* stop at whichever ends first : match segment or input */
+    size_t const matchLength = ZSTD_count(ip, match, vEnd);
+    if (match + matchLength != mEnd) return matchLength;   /* ended before mEnd : nothing to continue */
+    DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength);
+    DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
+    DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
+    DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart);
+    DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd));
+    return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd);   /* resume the comparison at iStart */
+}
+
+
+/*-*************************************
+ *  Hashes : ZSTD_hashN() keeps the N low-order bytes of u, multiplies them by primeNbytes, xors in the salt s, and returns the top h bits
+ ***************************************/
+static const U32 prime3bytes = 506832829U;   /* NOTE(review): in each ZSTD_hashN() below, h==0 would shift right by the full type width; callers presumably pass h >= 1 -- confirm */
+static U32    ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */
+MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); }
+
+static const U32 prime4bytes = 2654435761U;
+static U32    ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); }
+static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes)  ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); }
+
+
+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+    /* Some of the underlying hashes cannot produce more than 32 bits,
+     * so this entry point rejects any wider request. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+    case 8: return ZSTD_hash8Ptr(p, hBits);
+    case 7: return ZSTD_hash7Ptr(p, hBits);
+    case 6: return ZSTD_hash6Ptr(p, hBits);
+    case 5: return ZSTD_hash5Ptr(p, hBits);
+    case 4:
+    default: return ZSTD_hash4Ptr(p, hBits);   /* any other mls falls back to the 4-byte hash */
+    }
+}
+
+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) {
+    /* Some of the underlying hashes cannot produce more than 32 bits,
+     * so this entry point rejects any wider request. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+        case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt);
+        case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt);
+        case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt);
+        case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt);
+        case 4:
+        default: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt);   /* 32-bit hash : salt truncated to U32 */
+    }
+}
+
+
+/** ZSTD_ipow() :
+ * Return base^exponent, by repeated squaring
+ * (all products are computed in U64 arithmetic).
+ */
+static U64 ZSTD_ipow(U64 base, U64 exponent)
+{
+    U64 result = 1;
+    for ( ; exponent != 0; exponent >>= 1) {
+        if (exponent & 1) result *= base;
+        base *= base;
+    }
+    return result;
+}
+
+#define ZSTD_ROLL_HASH_CHAR_OFFSET 10
+
+/** ZSTD_rollingHash_append() :
+ * Fold the @size bytes found at @buf into @hash, and return the result.
+ */
+static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size)
+{
+    BYTE const* cursor = (BYTE const*)buf;
+    size_t remaining;
+    for (remaining = size; remaining > 0; --remaining, ++cursor) {
+        hash = (hash * prime8bytes)
+             + (*cursor + ZSTD_ROLL_HASH_CHAR_OFFSET);
+    }
+    return hash;
+}
+
+/** ZSTD_rollingHash_compute() :
+ * Hash of the whole buffer : same as appending it to a hash value of 0. */
+MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size)
+{
+    U64 const emptyHash = 0;
+    return ZSTD_rollingHash_append(emptyHash, buf, size);
+}
+
+/** ZSTD_rollingHash_primePower() :
+ * Returns prime8bytes^(length-1), the factor applied to the oldest byte of a
+ * window of length bytes; pass it to ZSTD_rollingHash_rotate(). */
+MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length)
+{
+    U32 const exponent = length - 1;
+    return ZSTD_ipow(prime8bytes, exponent);
+}
+
+/** ZSTD_rollingHash_rotate() :
+ * Slide the hashed window forward by one byte :
+ * @toRemove leaves at the old end, @toAdd enters at the new end.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower)
+{
+    U64 const outgoing = (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower;
+    U64 const incoming = toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET;
+    return (hash - outgoing) * prime8bytes + incoming;
+}
+
+/*-*************************************
+*  Round buffer management
+***************************************/
+#if (ZSTD_WINDOWLOG_MAX_64 > 31)
+# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX"
+#endif
+/* Max current allowed */
+#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
+/* Maximum chunk size before overflow correction needs to be called again */
+#define ZSTD_CHUNKSIZE_MAX                                                     \
+    ( ((U32)-1)                  /* Maximum ending current index */            \
+    - ZSTD_CURRENT_MAX)          /* Maximum beginning lowLimit */
+
+/**
+ * ZSTD_window_clear():
+ * Empties the history window : both lowLimit and dictLimit are moved up
+ * to the index of nextSrc, measured from base.
+ */
+MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
+{
+    size_t const usedBytes = (size_t)(window->nextSrc - window->base);
+    U32 const endIdx = (U32)usedBytes;
+    window->dictLimit = endIdx;
+    window->lowLimit = endIdx;
+}
+
+MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window)
+{   /* empty == both limits and the write position still sit at the start index */
+    if (window.dictLimit != ZSTD_WINDOW_START_INDEX) return 0;
+    if (window.lowLimit != ZSTD_WINDOW_START_INDEX) return 0;
+    return (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX;
+}
+
+/**
+ * ZSTD_window_hasExtDict():
+ * Non-zero when the extDict is not empty, i.e. when lowLimit is below dictLimit.
+ */
+MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
+{
+    return window.dictLimit > window.lowLimit;
+}
+
+/**
+ * ZSTD_matchState_dictMode():
+ * Derives, from the provided matchState, the dictMode to hand to the compressor :
+ * an ext-dict in the window wins, then an attached dictMatchState, else no dictionary.
+ */
+MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms)
+{
+    if (ZSTD_window_hasExtDict(ms->window)) return ZSTD_extDict;
+    if (ms->dictMatchState == NULL) return ZSTD_noDict;
+    /* attached dictionary : the mode depends on its dedicatedDictSearch flag */
+    if (ms->dictMatchState->dedicatedDictSearch) return ZSTD_dedicatedDictSearch;
+    return ZSTD_dictMatchState;
+}
+
+/* Defining this macro to non-zero tells zstd to run the overflow correction
+ * code much more frequently. This is very inefficient, and should only be
+ * used for tests and fuzzers.
+ */
+#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY
+#  ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#    define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1
+#  else
+#    define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0
+#  endif
+#endif
+
+/**
+ * ZSTD_window_canOverflowCorrect():
+ * Returns non-zero if the indices are large enough for overflow correction
+ * to work correctly without impacting compression ratio.
+ */
+MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window,
+                                              U32 cycleLog,
+                                              U32 maxDist,
+                                              U32 loadedDictEnd,
+                                              void const* src)
+{
+    U32 const cycleSize = 1u << cycleLog;
+    U32 const curr = (U32)((BYTE const*)src - window.base);   /* index of src, counted from window.base */
+    U32 const minIndexToOverflowCorrect = cycleSize
+                                        + MAX(maxDist, cycleSize)
+                                        + ZSTD_WINDOW_START_INDEX;
+
+    /* Adjust the min index to backoff the overflow correction frequency,
+     * so we don't waste too much CPU in overflow correction. If this
+     * computation overflows we don't really care, we just need to make
+     * sure it is at least minIndexToOverflowCorrect.
+     */
+    U32 const adjustment = window.nbOverflowCorrections + 1;   /* grows with every correction already performed */
+    U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment,
+                                  minIndexToOverflowCorrect);
+    U32 const indexLargeEnough = curr > adjustedIndex;
+
+    /* Only overflow correct early if the dictionary is invalidated already,
+     * so we don't hurt compression ratio.
+     */
+    U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd;
+
+    return indexLargeEnough && dictionaryInvalidated;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+                                                  U32 cycleLog,
+                                                  U32 maxDist,
+                                                  U32 loadedDictEnd,
+                                                  void const* src,
+                                                  void const* srcEnd)
+{
+    U32 const curr = (U32)((BYTE const*)srcEnd - window.base);   /* index of srcEnd : the hard limit is checked against the end of the input */
+    if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) {   /* when enabled, also request a correction as soon as one is possible at src */
+        if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) {
+            return 1;
+        }
+    }
+    return curr > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                           U32 maxDist, void const* src)
+{
+    /* preemptive overflow correction:
+     * 1. correction is large enough:
+     *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+     *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+     *
+     *    current - newCurrent
+     *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+     *    > (3<<29) - (1<<chainLog)
+     *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
+     *    > 1<<29
+     *
+     * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+     *    After correction, current is less than (1<<chainLog + 1<<windowLog).
+     *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+     *    In 32-bit mode we are safe, because (chainLog <= 29), so
+     *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+     * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+     *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+     */
+    U32 const cycleSize = 1u << cycleLog;
+    U32 const cycleMask = cycleSize - 1;
+    U32 const curr = (U32)((BYTE const*)src - window->base);
+    U32 const currentCycle = curr & cycleMask;   /* low cycleLog bits of curr : preserved by the correction */
+    /* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. */
+    U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX
+                                     ? MAX(cycleSize, ZSTD_WINDOW_START_INDEX)
+                                     : 0;
+    U32 const newCurrent = currentCycle
+                         + currentCycleCorrection
+                         + MAX(maxDist, cycleSize);
+    U32 const correction = curr - newCurrent;   /* amount removed from every index */
+    /* maxDist must be a power of two so that:
+     *   (newCurrent & cycleMask) == (curr & cycleMask)
+     * This is required to not corrupt the chains / binary tree.
+     */
+    assert((maxDist & (maxDist - 1)) == 0);
+    assert((curr & cycleMask) == (newCurrent & cycleMask));
+    assert(curr > newCurrent);
+    if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) {
+        /* Loose bound, should be around 1<<29 (see above) */
+        assert(correction > 1<<28);
+    }
+
+    window->base += correction;   /* raising base lowers every index computed as (ptr - base) */
+    window->dictBase += correction;
+    if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) {   /* clamp instead of wrapping below the start index */
+        window->lowLimit = ZSTD_WINDOW_START_INDEX;
+    } else {
+        window->lowLimit -= correction;
+    }
+    if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) {
+        window->dictLimit = ZSTD_WINDOW_START_INDEX;
+    } else {
+        window->dictLimit -= correction;
+    }
+
+    /* Ensure we can still reference the full window. */
+    assert(newCurrent >= maxDist);
+    assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX);
+    /* Ensure that lowLimit and dictLimit didn't underflow. */
+    assert(window->lowLimit <= newCurrent);
+    assert(window->dictLimit <= newCurrent);
+
+    ++window->nbOverflowCorrections;
+
+    DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,
+             window->lowLimit);
+    return correction;
+}
+
+/**
+ * ZSTD_window_enforceMaxDist():
+ * Once (blockEnd - base) > maxDist + loadedDictEnd, raises lowLimit so that:
+ *    (blockEnd - base) - lowLimit <= maxDist     (dictLimit is raised to lowLimit if it fell below)
+ *
+ * It ensures index is valid as long as index >= lowLimit.
+ * This must be called before a block compression call.
+ *
+ * loadedDictEnd is only defined if a dictionary is in use for current compression.
+ * As the name implies, loadedDictEnd represents the index at end of dictionary.
+ * The value lies within context's referential, it can be directly compared to blockEndIdx.
+ *
+ * If loadedDictEndPtr is NULL, no dictionary is in use, and we use loadedDictEnd == 0.
+ * If loadedDictEndPtr is not NULL, we set it to zero after updating lowLimit.
+ * This is because dictionaries are allowed to be referenced fully
+ * as long as the last byte of the dictionary is in the window.
+ * Once input has progressed beyond window size, dictionary cannot be referenced anymore.
+ *
+ * In normal dict mode, the dictionary lies between lowLimit and dictLimit.
+ * In dictMatchState mode, lowLimit and dictLimit are the same,
+ * and the dictionary is below them.
+ * forceWindow and dictMatchState are therefore incompatible.
+ */
+MEM_STATIC void
+ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
+                     const void* blockEnd,
+                           U32   maxDist,
+                           U32*  loadedDictEndPtr,
+                     const ZSTD_matchState_t** dictMatchStatePtr)
+{
+    U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base);
+    U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0;
+    DEBUGLOG(5, "ZSTD_window_enforceMaxDist: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
+                (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
+
+    /* - When there is no dictionary : loadedDictEnd == 0.
+         In which case, the test (blockEndIdx > maxDist) is merely to avoid
+         wrapping around in the next operation `newLowLimit = blockEndIdx - maxDist`.
+       - When there is a standard dictionary :
+         Index referential is copied from the dictionary,
+         which means it starts from 0.
+         In which case, loadedDictEnd == dictSize,
+         and it makes sense to compare `blockEndIdx > maxDist + dictSize`
+         since `blockEndIdx` also starts from zero.
+       - When there is an attached dictionary :
+         loadedDictEnd is expressed within the referential of the context,
+         so it can be directly compared against blockEndIdx.
+    */
+    if (blockEndIdx > maxDist + loadedDictEnd) {
+        U32 const newLowLimit = blockEndIdx - maxDist;
+        if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit;   /* lowLimit only ever moves up */
+        if (window->dictLimit < window->lowLimit) {
+            DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u",
+                        (unsigned)window->dictLimit, (unsigned)window->lowLimit);
+            window->dictLimit = window->lowLimit;
+        }
+        /* On reaching window size, dictionaries are invalidated */
+        if (loadedDictEndPtr) *loadedDictEndPtr = 0;
+        if (dictMatchStatePtr) *dictMatchStatePtr = NULL;
+    }
+}
+
+/* Similar to ZSTD_window_enforceMaxDist(),
+ * but only invalidates the dictionary (the window itself is const here, its limits are not touched)
+ * when input progresses beyond window size.
+ * assumption : loadedDictEndPtr and dictMatchStatePtr are valid (non NULL)
+ *              loadedDictEnd uses same referential as window->base
+ *              maxDist is the window size */
+MEM_STATIC void
+ZSTD_checkDictValidity(const ZSTD_window_t* window,
+                       const void* blockEnd,
+                             U32   maxDist,
+                             U32*  loadedDictEndPtr,
+                       const ZSTD_matchState_t** dictMatchStatePtr)
+{
+    assert(loadedDictEndPtr != NULL);
+    assert(dictMatchStatePtr != NULL);
+    {   U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base);
+        U32 const loadedDictEnd = *loadedDictEndPtr;
+        DEBUGLOG(5, "ZSTD_checkDictValidity: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
+                    (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
+        assert(blockEndIdx >= loadedDictEnd);
+
+        if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) {
+            /* On reaching window size, dictionaries are invalidated.
+             * For simplification, if window size is reached anywhere within next block,
+             * the dictionary is invalidated for the full block.
+             *
+             * We also have to invalidate the dictionary if ZSTD_window_update() has detected
+             * non-contiguous segments, which means that loadedDictEnd != window->dictLimit.
+             * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use
+             * dictMatchState, so setting it to NULL is not a problem.
+             */
+            DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
+            *loadedDictEndPtr = 0;
+            *dictMatchStatePtr = NULL;
+        } else {
+            if (*loadedDictEndPtr != 0) {   /* this branch only emits a debug trace */
+                DEBUGLOG(6, "dictionary considered valid for current block");
+    }   }   }
+}
+
+MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
+    ZSTD_memset(window, 0, sizeof(*window));
+    window->base = (BYTE const*)" ";   /* non-NULL placeholder : ZSTD_window_update() asserts base != NULL */
+    window->dictBase = (BYTE const*)" ";   /* same for dictBase */
+    ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */
+    window->dictLimit = ZSTD_WINDOW_START_INDEX;    /* start from >0, so that 1st position is valid */
+    window->lowLimit = ZSTD_WINDOW_START_INDEX;     /* it ensures first and later CCtx usages compress the same */
+    window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX;   /* see issue #1241 */
+    window->nbOverflowCorrections = 0;   /* restated explicitly, after the memset above */
+}
+
+/**
+ * ZSTD_window_update():
+ * Updates the window by appending [src, src + srcSize) to the window.
+ * If it is not contiguous, the current prefix becomes the extDict, and we
+ * forget about the extDict. Handles overlap of the prefix and extDict.
+ * Returns non-zero if the segment is contiguous.
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_window_update(ZSTD_window_t* window,
+                                  void const* src, size_t srcSize,
+                                  int forceNonContiguous)
+{
+    BYTE const* const ip = (BYTE const*)src;
+    U32 contiguous = 1;
+    DEBUGLOG(5, "ZSTD_window_update");
+    if (srcSize == 0)
+        return contiguous;   /* empty input : window left untouched, reported as contiguous */
+    assert(window->base != NULL);
+    assert(window->dictBase != NULL);
+    /* Check if blocks follow each other */
+    if (src != window->nextSrc || forceNonContiguous) {
+        /* not contiguous */
+        size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
+        DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
+        window->lowLimit = window->dictLimit;   /* previous extDict range is dropped */
+        assert(distanceFromBase == (size_t)(U32)distanceFromBase);  /* should never overflow */
+        window->dictLimit = (U32)distanceFromBase;
+        window->dictBase = window->base;   /* former prefix becomes the extDict */
+        window->base = ip - distanceFromBase;   /* indices keep growing : ip gets index distanceFromBase */
+        /* ms->nextToUpdate = window->dictLimit; */
+        if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit;   /* too small extDict */
+        contiguous = 0;
+    }
+    window->nextSrc = ip + srcSize;
+    /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+    if ( (ip+srcSize > window->dictBase + window->lowLimit)
+       & (ip < window->dictBase + window->dictLimit)) {   /* bitwise & : both comparisons are always evaluated */
+        ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
+        U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;
+        window->lowLimit = lowLimitMax;
+        DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit);
+    }
+    return contiguous;
+}
+
+/**
+ * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix.
+ */
+MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog)
+{
+    U32 const windowSize = 1U << windowLog;
+    U32 const floorIdx = ms->window.lowLimit;
+    /* With a dictionary loaded, everything from lowLimit upward stays usable :
+     * the whole dictionary is valid as long as one of its bytes is in the window,
+     * and loadedDictEnd is reset to 0 once it stops being valid for the entire block.
+     */
+    if (ms->loadedDictEnd != 0) return floorIdx;
+    /* No dictionary : do not reach further back than one window from curr. */
+    if (curr - floorIdx > windowSize) return curr - windowSize;
+    return floorIdx;
+}
+
+/**
+ * Returns the lowest allowed match index in the prefix.
+ */
+MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog)
+{
+    U32 const windowSize = 1U << windowLog;
+    U32 const prefixStart = ms->window.dictLimit;
+    /* The dictionary has to be taken into account here, to cover the edge case
+     * where dictionary and source are contiguous in memory :
+     * while one is loaded, the prefix is usable from dictLimit onward. */
+    if (ms->loadedDictEnd != 0) return prefixStart;
+    /* No dictionary : do not reach further back than one window from curr. */
+    if (curr - prefixStart > windowSize) return curr - windowSize;
+    return prefixStart;
+}
+
+
+
+/* debug functions */
+#if (DEBUGLEVEL>=2)
+
+MEM_STATIC double ZSTD_fWeight(U32 rawStat)
+{   /* debug helper : fixed-point weight of (rawStat+1), converted to a double on return */
+    U32 const fp_accuracy = 8;   /* number of fractional bits */
+    U32 const fp_multiplier = (1 << fp_accuracy);
+    U32 const newStat = rawStat + 1;
+    U32 const hb = ZSTD_highbit32(newStat);   /* presumably the position of the highest set bit */
+    U32 const BWeight = hb * fp_multiplier;   /* hb, scaled by fp_multiplier */
+    U32 const FWeight = (newStat << fp_accuracy) >> hb;   /* newStat / 2^hb, scaled by fp_multiplier */
+    U32 const weight = BWeight + FWeight;
+    assert(hb + fp_accuracy < 31);
+    return (double)weight / fp_multiplier;
+}
+
+/* display a table content,
+ * listing each element, its frequency, and its predicted bit cost */
+MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+{
+    unsigned u, sum;
+    for (u=0, sum=0; u<=max; u++) sum += table[u];   /* max is inclusive : table[0..max] is read */
+    DEBUGLOG(2, "total nb elts: %u", sum);
+    for (u=0; u<=max; u++) {
+        DEBUGLOG(2, "%2u: %5u  (%.2f)",
+                u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) );   /* cost shown as weight(total) - weight(element) */
+    }
+}
+
+#endif
+
+/* Short Cache */
+
+/* Normally, zstd matchfinders follow this flow:
+ *     1. Compute hash at ip
+ *     2. Load index from hashTable[hash]
+ *     3. Check if *ip == *(base + index)
+ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
+ *
+ * Short cache is an optimization which allows us to avoid step 3 most of the time
+ * when the data doesn't actually match. With short cache, the flow becomes:
+ *     1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
+ *     2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
+ *     3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
+ *
+ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
+ * dictMatchState matchfinders.
+ */
+#define ZSTD_SHORT_CACHE_TAG_BITS 8
+#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
+
+/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
+ * Splits hashAndTag into (hash, tag), then stores index and tag, packed in one U32, at hashTable[hash]. */
+MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
+    U32 const tagBits = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
+    size_t const slot = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+    assert((index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS)) == 0);   /* index must leave room for the tag */
+    hashTable[slot] = tagBits | (index << ZSTD_SHORT_CACHE_TAG_BITS);
+}
+
+/* Helper function for short cache matchfinders.
+ * Two packed values match when their low ZSTD_SHORT_CACHE_TAG_BITS bits are equal : returns 1 then, 0 otherwise. */
+MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
+    size_t const tagMask = ZSTD_SHORT_CACHE_TAG_MASK;
+    size_t const differingBits = (packedTag1 ^ packedTag2) & tagMask;
+    return differingBits == 0;
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+/* ===============================================================
+ * Shared internal declarations
+ * These prototypes may be called from sources not in lib/compress
+ * =============================================================== */
+
+/* ZSTD_loadCEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * return : size of dictionary header (size of magic number + dict ID + entropy tables)
+ * assumptions : magic number supposed already checked
+ *               and dictSize >= 8 */
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+                         const void* const dict, size_t dictSize);
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
+
+/* ==============================================================
+ * Private declarations
+ * These prototypes shall only be called from within lib/compress
+ * ============================================================== */
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints,
+ * LDM and manually set compression parameters.
+ * Note: srcSizeHint == 0 means 0!
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
+
+/*! ZSTD_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  @return : 0, or an error code */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                     const void* dict, size_t dictSize,
+                     const ZSTD_CDict* cdict,
+                     const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize);
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr);
+
+/*! ZSTD_getCParamsFromCDict() :
+ *  as the name implies */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    const ZSTD_CCtx_params* params,
+                                    unsigned long long pledgedSrcSize);
+
+/* ZSTD_compress_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict,size_t dictSize,
+                                 const ZSTD_CCtx_params* params);
+
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
+
+/** ZSTD_CCtx_trace() :
+ *  Trace the end of a compression call.
+ */
+void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
+
+/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
+ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
+ * Note that the block delimiter must include the last literals of the block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+                                              ZSTD_sequencePosition* seqPos,
+                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                        const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+/* Returns the number of bytes to move the current read position back by.
+ * Only non-zero if we ended up splitting a sequence.
+ * Otherwise, it may return a ZSTD error if something went wrong.
+ *
+ * This function will attempt to scan through blockSize bytes
+ * represented by the sequences in @inSeqs,
+ * storing any (partial) sequences.
+ *
+ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
+ * avoid splitting a match, or to avoid splitting a match such that it would produce a match
+ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+/* Tells whether params carries an external sequence producer : 1 if extSeqProdFunc is set, 0 if it is NULL. */
+MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) {
+    return (params->extSeqProdFunc == NULL) ? 0 : 1;
+}
+
+/* ===============================================================
+ * Deprecated definitions that are still used internally to avoid
+ * deprecation warnings. These functions are exactly equivalent to
+ * their public variants, but avoid the deprecation warnings.
+ * =============================================================== */
+
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t srcSize);
+
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+#endif /* ZSTD_COMPRESS_H */
diff --git a/src/zstd/compress/zstd_compress_literals.c b/src/zstd/compress/zstd_compress_literals.c
new file mode 100644
index 000000000..bfd4f11ab
--- /dev/null
+++ b/src/zstd/compress/zstd_compress_literals.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "zstd_compress_literals.h"
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)   /* debug helper : logs each byte of @src as 2 hex digits through RAWLOG (level 5) */
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;   /* (void)ip : presumably avoids an unused-variable warning if RAWLOG expands to nothing -- TODO confirm */
+    }
+    RAWLOG(5, " \n");
+    return srcSize;   /* returns @srcSize unchanged, so the call can itself be passed as a log argument (see ZSTD_compressLiterals) */
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)   /* writes a set_basic header followed by a verbatim copy of @src; @return : bytes written into @dst, or an error code */
+{
+    BYTE* const ostart = (BYTE*)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);   /* header size in bytes : 1 if srcSize<=31, 2 if srcSize<=4095, 3 otherwise */
+
+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
+    RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");   /* header + copied literals must fit into @dst */
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 : single byte, srcSize stored from bit 3 */
+            ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 : field at bits 2-3 set to 1, srcSize stored from bit 4 */
+            MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 : field at bits 2-3 set to 3, srcSize stored from bit 4 */
+            MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));   /* NOTE(review): 32-bit store for a 3-byte header; ostart[3] is then overwritten by the copy below -- confirm */
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ZSTD_memcpy(ostart + flSize, src, srcSize);   /* literals follow the header, unmodified */
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    return srcSize + flSize;
+}
+
+static int allBytesIdentical(const void* src, size_t srcSize)   /* @return : 1 if every byte of @src equals the first one, 0 otherwise */
+{
+    assert(srcSize >= 1);   /* the first byte is read unconditionally below */
+    assert(src != NULL);
+    {   const BYTE b = ((const BYTE*)src)[0];   /* reference value */
+        size_t p;
+        for (p=1; p<srcSize; p++) {
+            if (((const BYTE*)src)[p] != b) return 0;   /* first mismatch ends the scan */
+        }
+        return 1;
+    }
+}
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)   /* writes a set_rle header followed by the single repeated byte; @return : flSize+1 */
+{
+    BYTE* const ostart = (BYTE*)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);   /* header size in bytes : 1 if srcSize<=31, 2 if srcSize<=4095, 3 otherwise */
+
+    assert(dstCapacity >= 4); (void)dstCapacity;   /* capacity is only checked by this assert : largest output is 3 header bytes + 1 value byte */
+    assert(allBytesIdentical(src, srcSize));   /* precondition : caller passes a run of identical bytes */
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 : single byte, srcSize stored from bit 3 */
+            ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 : field at bits 2-3 set to 1, srcSize stored from bit 4 */
+            MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 : field at bits 2-3 set to 3, srcSize stored from bit 4 */
+            MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ostart[flSize] = *(const BYTE*)src;   /* the repeated byte value, stored right after the header */
+    DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
+    return flSize+1;
+}
+
+/* ZSTD_minLiteralsToCompress() :
+ * @return : minimal amount of literals (in bytes)
+ * for literal compression to even be attempted.
+ * The minimum gets smaller as @strategy increases; it is 6 whenever @huf_repeat == HUF_repeat_valid.
+ */
+static size_t
+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
+{
+    assert((int)strategy >= 0);
+    assert((int)strategy <= 9);
+    /* strategy 9 (btultra2) : min 8 bytes;
+     * then 2x larger for each step down in strategy (8 -> 16 bytes, 7 -> 32 bytes),
+     * max threshold 64 bytes (strategy <= 6) */
+    {   int const shift = MIN(9-(int)strategy, 3);   /* capped at 3, so the threshold never exceeds 8<<3 */
+        size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;   /* the shift applies only to the 8 : parsed as cond ? 6 : (8 << shift) */
+        DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
+        return mintc;
+    }
+}
+
+size_t ZSTD_compressLiterals (   /* @return : size of the literals section written into @dst (raw, RLE or huffman-compressed), or an error code */
+                  void* dst, size_t dstCapacity,
+            const void* src, size_t srcSize,
+                  void* entropyWorkspace, size_t entropyWorkspaceSize,   /* forwarded as-is to the huffman encoder */
+            const ZSTD_hufCTables_t* prevHuf,   /* tables inherited from the previous block (read-only) */
+                  ZSTD_hufCTables_t* nextHuf,   /* receives the tables to carry over to the next block */
+                  ZSTD_strategy strategy,
+                  int disableLiteralCompression,   /* non-zero : always emit raw literals */
+                  int suspectUncompressible,
+                  int bmi2)
+{
+    size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);   /* header size in bytes : 3 if srcSize < 1 KB, 4 if < 16 KB, 5 otherwise */
+    BYTE*  const ostart = (BYTE*)dst;
+    U32 singleStream = srcSize < 256;   /* inputs below 256 bytes use the single-stream encoder */
+    symbolEncodingType_e hType = set_compressed;
+    size_t cLitSize;
+
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
+                disableLiteralCompression, (U32)srcSize, dstCapacity);
+
+    DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
+
+    /* Prepare nextEntropy assuming reusing the existing table */
+    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+    if (disableLiteralCompression)
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+    /* if too small, don't even attempt compression (speed opt) */
+    if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+    RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
+    {   HUF_repeat repeat = prevHuf->repeatMode;   /* local copy : passed by pointer to the encoder, inspected afterwards */
+        int const flags = 0
+            | (bmi2 ? HUF_flags_bmi2 : 0)
+            | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0)
+            | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0)
+            | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
+
+        typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
+        huf_compress_f huf_compress;
+        if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;   /* a valid previous table with a 3-byte header also selects the single-stream encoder */
+        huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
+        cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize,   /* payload is written after the space reserved for the header */
+                                src, srcSize,
+                                HUF_SYMBOLVALUE_MAX, LitHufLog,
+                                entropyWorkspace, entropyWorkspaceSize,
+                                (HUF_CElt*)nextHuf->CTable,
+                                &repeat, flags);
+        DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
+        if (repeat != HUF_repeat_none) {
+            /* reused the existing table */
+            DEBUGLOG(5, "reusing statistics from previous huffman block");
+            hType = set_repeat;
+        }
+    }
+
+    {   size_t const minGain = ZSTD_minGain(srcSize, strategy);
+        if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {   /* no output, insufficient gain, or encoder error : restore tables and store raw */
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }   }
+    if (cLitSize==1) {
+        /* A return value of 1 signals that the alphabet consists of a single symbol.
+         * However, in some rare circumstances, it could be the compressed size (a single byte).
+         * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
+         * (it's also necessary to not generate statistics).
+         * Therefore, in such a case, actively check that all bytes are identical. */
+        if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));   /* RLE block does not use a table : keep the previous ones */
+            return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }   }
+
+    if (hType == set_compressed) {
+        /* using a newly constructed table */
+        nextHuf->repeatMode = HUF_repeat_check;
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 : hType at bits 0-1, stream flag at bit 2, srcSize from bit 4, cLitSize from bit 14 */
+        if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 : field at bits 2-3 set to 2, srcSize from bit 4, cLitSize from bit 18 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    case 5: /* 2 - 2 - 18 - 18 : field at bits 2-3 set to 3, srcSize from bit 4, cLitSize from bit 22 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);   /* only the low 10 bits of cLitSize fit into these 32 bits */
+            ostart[4] = (BYTE)(cLitSize >> 10);   /* remaining high bits of cLitSize go into the 5th header byte */
+            break;
+        }
+    default:  /* not possible : lhSize is {3,4,5} */
+        assert(0);
+    }
+    DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize));
+    return lhSize+cLitSize;
+}
diff --git a/src/zstd/zstd_compress_literals.h b/src/zstd/compress/zstd_compress_literals.h
similarity index 56%
rename from src/zstd/zstd_compress_literals.h
rename to src/zstd/compress/zstd_compress_literals.h
index 2c4d289a7..b060c8ad2 100644
--- a/src/zstd/zstd_compress_literals.h
+++ b/src/zstd/compress/zstd_compress_literals.h
@@ -13,26 +13,27 @@
 
 #include "zstd_compress_internal.h" /* ZSTD_hufCTables_t, ZSTD_minGain() */
 
-size_t ZSTD_noCompressLiterals(void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* ZSTD_compressRleLiteralsBlock() :
  * Conditions :
  * - All bytes in @src are identical
  * - dstCapacity >= 4 */
-size_t ZSTD_compressRleLiteralsBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* ZSTD_compressLiterals():
  * @entropyWorkspace: must be aligned on 4-bytes boundaries
  * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
  * @suspectUncompressible: sampling checks, to potentially skip huffman coding
  */
-size_t ZSTD_compressLiterals(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize,
-                             void* entropyWorkspace, size_t entropyWorkspaceSize,
-                             const ZSTD_hufCTables_t* prevHuf,
-                             ZSTD_hufCTables_t* nextHuf,
-                             ZSTD_strategy strategy, int disableLiteralCompression,
-                             int suspectUncompressible,
-                             int bmi2);
+size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                              void* entropyWorkspace, size_t entropyWorkspaceSize,
+                        const ZSTD_hufCTables_t* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              int suspectUncompressible,
+                              int bmi2);
 
 #endif /* ZSTD_COMPRESS_LITERALS_H */
diff --git a/src/zstd/compress/zstd_compress_sequences.c b/src/zstd/compress/zstd_compress_sequences.c
new file mode 100644
index 000000000..8872d4d35
--- /dev/null
+++ b/src/zstd/compress/zstd_compress_sequences.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "zstd_compress_sequences.h"
+
+/**
+ * -log2(x / 256) lookup table for x in [0, 256).
+ * If x == 0: Return 0
+ * Else: Return floor(-log2(x / 256) * 256), i.e. a bit cost with 8 fractional bits
+ */
+static unsigned const kInverseProbabilityLog256[256] = {   /* index : probability scaled to [0, 256) */
+    0,    2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162,
+    1130, 1100, 1073, 1047, 1024, 1001, 980,  960,  941,  923,  906,  889,
+    874,  859,  844,  830,  817,  804,  791,  779,  768,  756,  745,  734,
+    724,  714,  704,  694,  685,  676,  667,  658,  650,  642,  633,  626,
+    618,  610,  603,  595,  588,  581,  574,  567,  561,  554,  548,  542,
+    535,  529,  523,  517,  512,  506,  500,  495,  489,  484,  478,  473,
+    468,  463,  458,  453,  448,  443,  438,  434,  429,  424,  420,  415,
+    411,  407,  402,  398,  394,  390,  386,  382,  377,  373,  370,  366,
+    362,  358,  354,  350,  347,  343,  339,  336,  332,  329,  325,  322,
+    318,  315,  311,  308,  305,  302,  298,  295,  292,  289,  286,  282,
+    279,  276,  273,  270,  267,  264,  261,  258,  256,  253,  250,  247,
+    244,  241,  239,  236,  233,  230,  228,  225,  222,  220,  217,  215,
+    212,  209,  207,  204,  202,  199,  197,  194,  192,  190,  187,  185,
+    182,  180,  178,  175,  173,  171,  168,  166,  164,  162,  159,  157,
+    155,  153,  151,  149,  146,  144,  142,  140,  138,  136,  134,  132,
+    130,  128,  126,  123,  121,  119,  117,  115,  114,  112,  110,  108,
+    106,  104,  102,  100,  98,   96,   94,   93,   91,   89,   87,   85,
+    83,   82,   80,   78,   76,   74,   73,   71,   69,   67,   66,   64,
+    62,   61,   59,   57,   55,   54,   52,   50,   49,   47,   46,   44,
+    42,   41,   39,   37,   36,   34,   33,   31,   30,   28,   26,   25,
+    23,   22,   20,   19,   17,   16,   14,   13,   11,   10,   8,    7,
+    5,    4,    2,    1,
+};
+
+static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) {   /* @return : the 16-bit value stored in the 2nd U16 slot of @ctable */
+  void const* ptr = ctable;   /* goes through void* to reinterpret the table as an array of U16 */
+  U16 const* u16ptr = (U16 const*)ptr;
+  U32 const maxSymbolValue = MEM_read16(u16ptr + 1);   /* presumably the FSE_CTable header keeps maxSymbolValue there -- layout is defined outside this file */
+  return maxSymbolValue;
+}
+
+/**
+ * Returns true (1) if we should use ncount=-1 for low probability symbols,
+ * else (0) we should use ncount=1 instead. The decision depends only on nbSeq.
+ */
+static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
+{
+    /* Heuristic: This should cover most blocks <= 16K and
+     * start to fade out after 16K to about 32K depending on
+     * compressibility.
+     */
+    return nbSeq >= 2048;   /* the result is forwarded as the last argument of FSE_normalizeCount() by the callers in this file */
+}
+
+/**
+ * Returns the cost in bytes of encoding the normalized count header.
+ * Returns an error if any of the helper functions return an error.
+ */
+static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max,
+                              size_t const nbSeq, unsigned const FSELog)
+{
+    BYTE wksp[FSE_NCOUNTBOUND];   /* throwaway output buffer : the header is written only to measure its size */
+    S16 norm[MaxSeq + 1];   /* receives the normalized counts */
+    const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+    FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), "");
+    return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);   /* result of FSE_writeNCount is returned as-is (size or error) */
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution described by count
+ * using the entropy bound. Symbols 0..max (inclusive) are read from count.
+ */
+static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total)
+{
+    unsigned cost = 0;   /* accumulated in units of 1/256 bit */
+    unsigned s;
+
+    assert(total > 0);   /* total is used as a divisor below */
+    for (s = 0; s <= max; ++s) {
+        unsigned norm = (unsigned)((256 * count[s]) / total);   /* probability of s scaled to [0, 256) */
+        if (count[s] != 0 && norm == 0)
+            norm = 1;   /* a present symbol must not use table entry 0, which holds a cost of 0 */
+        assert(count[s] < total);   /* keeps norm < 256, i.e. inside the 256-entry table */
+        cost += count[s] * kInverseProbabilityLog256[norm];
+    }
+    return cost >> 8;   /* drop the 8 fractional bits */
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using ctable.
+ * Returns an error if ctable cannot represent all the symbols in count.
+ */
+size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max)
+{
+    unsigned const kAccuracyLog = 8;   /* per-symbol costs carry 8 fractional bits, removed by the final shift */
+    size_t cost = 0;
+    unsigned s;
+    FSE_CState_t cstate;
+    FSE_initCState(&cstate, ctable);   /* used here only to reach stateLog and symbolTT of the table */
+    if (ZSTD_getFSEMaxSymbolValue(ctable) < max) {   /* table does not cover symbols up to max */
+        DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
+                    ZSTD_getFSEMaxSymbolValue(ctable), max);
+        return ERROR(GENERIC);
+    }
+    for (s = 0; s <= max; ++s) {
+        unsigned const tableLog = cstate.stateLog;
+        unsigned const badCost = (tableLog + 1) << kAccuracyLog;   /* a cost at or above this is treated as : symbol absent from the table */
+        unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog);   /* evaluated for every s, including those skipped just below */
+        if (count[s] == 0)
+            continue;
+        if (bitCost >= badCost) {
+            DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s);
+            return ERROR(GENERIC);
+        }
+        cost += (size_t)count[s] * bitCost;
+    }
+    return cost >> kAccuracyLog;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using the
+ * table described by norm. The max symbol supported by norm is assumed >= max.
+ * norm must be valid for every symbol with non-zero probability in count.
+ */
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                             unsigned const* count, unsigned const max)
+{
+    unsigned const shift = 8 - accuracyLog;   /* rescales norm[] from accuracyLog bits to the 8-bit index of the cost table */
+    size_t cost = 0;   /* accumulated in units of 1/256 bit */
+    unsigned s;
+    assert(accuracyLog <= 8);   /* otherwise the unsigned subtraction above wraps around */
+    for (s = 0; s <= max; ++s) {
+        unsigned const normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1;   /* a norm of -1 is counted as 1 */
+        unsigned const norm256 = normAcc << shift;
+        assert(norm256 > 0);
+        assert(norm256 < 256);   /* must index inside the 256-entry table */
+        cost += count[s] * kInverseProbabilityLog256[norm256];
+    }
+    return cost >> 8;   /* drop the 8 fractional bits */
+}
+
+symbolEncodingType_e   /* @return : one of set_basic, set_rle, set_repeat, set_compressed; *repeatMode is updated on most paths */
+ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+        size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+        FSE_CTable const* prevCTable,
+        short const* defaultNorm, U32 defaultNormLog,
+        ZSTD_defaultPolicy_e const isDefaultAllowed,
+        ZSTD_strategy const strategy)
+{
+    ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);   /* allows isDefaultAllowed to be tested as a boolean below */
+    if (mostFrequent == nbSeq) {   /* the most frequent symbol accounts for every sequence */
+        *repeatMode = FSE_repeat_none;
+        if (isDefaultAllowed && nbSeq <= 2) {
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
+             * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+             * If basic encoding isn't possible, always choose RLE.
+             */
+            DEBUGLOG(5, "Selected set_basic");
+            return set_basic;
+        }
+        DEBUGLOG(5, "Selected set_rle");
+        return set_rle;
+    }
+    if (strategy < ZSTD_lazy) {   /* strategies below ZSTD_lazy : threshold heuristics only, no cost estimation */
+        if (isDefaultAllowed) {
+            size_t const staticFse_nbSeq_max = 1000;
+            size_t const mult = 10 - strategy;
+            size_t const baseLog = 3;
+            size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog;  /* 28-36 for offset, 56-72 for lengths */
+            assert(defaultNormLog >= 5 && defaultNormLog <= 6);  /* xx_DEFAULTNORMLOG */
+            assert(mult <= 9 && mult >= 7);   /* i.e. strategy is within 1..3 on this path */
+            if ( (*repeatMode == FSE_repeat_valid)
+              && (nbSeq < staticFse_nbSeq_max) ) {
+                DEBUGLOG(5, "Selected set_repeat");
+                return set_repeat;   /* *repeatMode is left unchanged on this path */
+            }
+            if ( (nbSeq < dynamicFse_nbSeq_min)
+              || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) {
+                DEBUGLOG(5, "Selected set_basic");
+                /* The format allows default tables to be repeated, but it isn't useful.
+                 * When using simple heuristics to select encoding type, we don't want
+                 * to confuse these tables with dictionaries. When running more careful
+                 * analysis, we don't need to waste time checking both repeating tables
+                 * and default tables.
+                 */
+                *repeatMode = FSE_repeat_none;
+                return set_basic;
+            }
+        }
+    } else {   /* ZSTD_lazy and above : estimate the cost of each option and keep the cheapest */
+        size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);   /* unavailable options get an error value; presumably these compare larger than any real cost -- TODO confirm */
+        size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
+        size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
+        size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);   /* table header in bytes converted to bits, plus the symbol cost in bits */
+
+        if (isDefaultAllowed) {
+            assert(!ZSTD_isError(basicCost));
+            assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost)));
+        }
+        assert(!ZSTD_isError(NCountCost));
+        assert(compressedCost < ERROR(maxCode));
+        DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u",
+                    (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost);
+        if (basicCost <= repeatCost && basicCost <= compressedCost) {   /* ties go to set_basic */
+            DEBUGLOG(5, "Selected set_basic");
+            assert(isDefaultAllowed);
+            *repeatMode = FSE_repeat_none;
+            return set_basic;
+        }
+        if (repeatCost <= compressedCost) {   /* ties go to set_repeat over set_compressed */
+            DEBUGLOG(5, "Selected set_repeat");
+            assert(!ZSTD_isError(repeatCost));
+            return set_repeat;
+        }
+        assert(compressedCost < basicCost && compressedCost < repeatCost);
+    }
+    DEBUGLOG(5, "Selected set_compressed");   /* reached when no branch above returned */
+    *repeatMode = FSE_repeat_check;
+    return set_compressed;
+}
+
+typedef struct {   /* layout that ZSTD_buildCTable() overlays on its entropyWorkspace for the set_compressed case */
+    S16 norm[MaxSeq + 1];   /* filled by FSE_normalizeCount() */
+    U32 wksp[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(MaxSeq, MaxFSELog)];   /* scratch area handed to FSE_buildCTable_wksp() */
+} ZSTD_BuildCTableWksp;
+
+size_t   /* @return : number of bytes written into @dst to describe the table (0 for set_repeat and set_basic), or an error code */
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+                FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+                unsigned* count, U32 max,
+                const BYTE* codeTable, size_t nbSeq,
+                const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                const FSE_CTable* prevCTable, size_t prevCTableSize,
+                void* entropyWorkspace, size_t entropyWorkspaceSize)
+{
+    BYTE* op = (BYTE*)dst;
+    const BYTE* const oend = op + dstCapacity;
+    DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity);
+
+    switch (type) {
+    case set_rle:   /* single-symbol table : the description is 1 byte, taken from codeTable[0] */
+        FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), "");
+        RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space");
+        *op = codeTable[0];
+        return 1;
+    case set_repeat:   /* previous table copied as-is, nothing written into dst */
+        ZSTD_memcpy(nextCTable, prevCTable, prevCTableSize);
+        return 0;
+    case set_basic:   /* table built from the default distribution, nothing written into dst */
+        FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), "");  /* note : could be pre-calculated */
+        return 0;
+    case set_compressed: {   /* table built from the block statistics, normalized counts written into dst */
+        ZSTD_BuildCTableWksp* wksp = (ZSTD_BuildCTableWksp*)entropyWorkspace;
+        size_t nbSeq_1 = nbSeq;
+        const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+        if (count[codeTable[nbSeq-1]] > 1) {   /* modifies the count[] array supplied by the caller */
+            count[codeTable[nbSeq-1]]--;   /* NOTE(review): presumably because the last symbol is carried by the initial FSE state (FSE_initCState2) -- confirm */
+            nbSeq_1--;
+        }
+        assert(nbSeq_1 > 1);
+        assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp));   /* workspace size is only checked by this assert */
+        (void)entropyWorkspaceSize;
+        FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed");
+        assert(oend >= op);
+        {   size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog);   /* overflow protected */
+            FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed");
+            FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed");
+            return NCountSize;
+        }
+    }
+    default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach");
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t   /* @return : size of the bitstream written into @dst, or dstSize_tooSmall error */
+ZSTD_encodeSequences_body(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    BIT_CStream_t blockStream;
+    FSE_CState_t  stateMatchLength;
+    FSE_CState_t  stateOffsetBits;
+    FSE_CState_t  stateLitLength;
+
+    RETURN_ERROR_IF(
+        ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)),
+        dstSize_tooSmall, "not enough space remaining");
+    DEBUGLOG(6, "available space for bitstream : %i  (dstCapacity=%u)",
+                (int)(blockStream.endPtr - blockStream.startPtr),
+                (unsigned)dstCapacity);
+
+    /* first symbols : the last sequence (index nbSeq-1) is written first, so nbSeq must be >= 1 */
+    FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    if (longOffsets) {   /* offsets wider than STREAM_ACCUMULATOR_MIN-1 bits are added in two parts, with a flush in between */
+        U32 const ofBits = ofCodeTable[nbSeq-1];
+        unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);   /* number of bits beyond STREAM_ACCUMULATOR_MIN-1, 0 if none */
+        if (extraBits) {
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits);
+            BIT_flushBits(&blockStream);
+        }
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits,
+                    ofBits - extraBits);
+    } else {
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]);
+    }
+    BIT_flushBits(&blockStream);
+
+    {   size_t n;
+        for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow : n wraps past 0 to a value >= nbSeq, which ends the loop (never entered when nbSeq==1) */
+            BYTE const llCode = llCodeTable[n];
+            BYTE const ofCode = ofCodeTable[n];
+            BYTE const mlCode = mlCodeTable[n];
+            U32  const llBits = LL_bits[llCode];
+            U32  const ofBits = ofCode;   /* the offset code is used directly as its number of extra bits */
+            U32  const mlBits = ML_bits[mlCode];
+            DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+                        (unsigned)sequences[n].litLength,
+                        (unsigned)sequences[n].mlBase + MINMATCH,
+                        (unsigned)sequences[n].offBase);
+                                                                            /* 32b*/  /* 64b*/
+                                                                            /* (7)*/  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
+            if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_flushBits(&blockStream);                                /* (7)*/
+            BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+            if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+            BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);
+            if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+            if (longOffsets) {   /* same two-part offset write as for the first sequence above */
+                unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+                if (extraBits) {
+                    BIT_addBits(&blockStream, sequences[n].offBase, extraBits);
+                    BIT_flushBits(&blockStream);                            /* (7)*/
+                }
+                BIT_addBits(&blockStream, sequences[n].offBase >> extraBits,
+                            ofBits - extraBits);                            /* 31 */
+            } else {
+                BIT_addBits(&blockStream, sequences[n].offBase, ofBits);     /* 31 */
+            }
+            BIT_flushBits(&blockStream);                                    /* (7)*/
+            DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
+    }   }
+
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+    FSE_flushCState(&blockStream, &stateMatchLength);   /* final states are flushed in the order : match length, offset, literal length */
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+    FSE_flushCState(&blockStream, &stateOffsetBits);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+    FSE_flushCState(&blockStream, &stateLitLength);
+
+    {   size_t const streamSize = BIT_closeCStream(&blockStream);
+        RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space");   /* a size of 0 from BIT_closeCStream is reported as dstSize_tooSmall */
+        return streamSize;
+    }
+}
+
+static size_t   /* plain instantiation of ZSTD_encodeSequences_body : arguments and return value are passed through unchanged */
+ZSTD_encodeSequences_default(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+
+static BMI2_TARGET_ATTRIBUTE size_t   /* same body as the default variant, compiled with BMI2_TARGET_ATTRIBUTE; only built when DYNAMIC_BMI2 is set */
+ZSTD_encodeSequences_bmi2(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+#endif
+
+size_t ZSTD_encodeSequences(   /* dispatcher : forwards to the bmi2 or the default variant; @return : the value returned by that variant */
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2)
+{
+    DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity);
+#if DYNAMIC_BMI2
+    if (bmi2) {   /* non-zero @bmi2 selects the BMI2 build of the encoder */
+        return ZSTD_encodeSequences_bmi2(dst, dstCapacity,
+                                         CTable_MatchLength, mlCodeTable,
+                                         CTable_OffsetBits, ofCodeTable,
+                                         CTable_LitLength, llCodeTable,
+                                         sequences, nbSeq, longOffsets);
+    }
+#endif
+    (void)bmi2;   /* @bmi2 is otherwise unreferenced when DYNAMIC_BMI2 is off */
+    return ZSTD_encodeSequences_default(dst, dstCapacity,
+                                        CTable_MatchLength, mlCodeTable,
+                                        CTable_OffsetBits, ofCodeTable,
+                                        CTable_LitLength, llCodeTable,
+                                        sequences, nbSeq, longOffsets);
+}
diff --git a/src/zstd/compress/zstd_compress_sequences.h b/src/zstd/compress/zstd_compress_sequences.h
new file mode 100644
index 000000000..4a3a05da9
--- /dev/null
+++ b/src/zstd/compress/zstd_compress_sequences.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_SEQUENCES_H
+#define ZSTD_COMPRESS_SEQUENCES_H
+
+#include "../common/fse.h" /* FSE_repeat, FSE_CTable */
+#include "../common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */
+
+typedef enum {
+    ZSTD_defaultDisallowed = 0,
+    ZSTD_defaultAllowed = 1
+} ZSTD_defaultPolicy_e;   /* type of the isDefaultAllowed parameter of ZSTD_selectEncodingType() */
+
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+        size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+        FSE_CTable const* prevCTable,
+        short const* defaultNorm, U32 defaultNormLog,
+        ZSTD_defaultPolicy_e const isDefaultAllowed,
+        ZSTD_strategy const strategy);
+
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+                FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+                unsigned* count, U32 max,
+                const BYTE* codeTable, size_t nbSeq,
+                const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                const FSE_CTable* prevCTable, size_t prevCTableSize,
+                void* entropyWorkspace, size_t entropyWorkspaceSize);
+
+size_t ZSTD_encodeSequences(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2);
+
+size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max);
+
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                             unsigned const* count, unsigned const max);
+#endif /* ZSTD_COMPRESS_SEQUENCES_H */
diff --git a/src/zstd/compress/zstd_compress_superblock.c b/src/zstd/compress/zstd_compress_superblock.c
new file mode 100644
index 000000000..628a2dccd
--- /dev/null
+++ b/src/zstd/compress/zstd_compress_superblock.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "zstd_compress_superblock.h"
+
+#include "../common/zstd_internal.h"  /* ZSTD_getSequenceLength */
+#include "hist.h"                     /* HIST_countFast_wksp */
+#include "zstd_compress_internal.h"   /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */
+#include "zstd_compress_sequences.h"
+#include "zstd_compress_literals.h"
+
+/** ZSTD_compressSubBlock_literal() :
+ *  Compresses literals section for a sub-block.
+ *  When we have to write the Huffman table we will sometimes choose a header
+ *  size larger than necessary. This is because we have to pick the header size
+ *  before we know the table size + compressed size, so we have a bound on the
+ *  table size. If we guessed incorrectly, we fall back to uncompressed literals.
+ *
+ *  We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded
+ *  in writing the header, otherwise it is set to 0.
+ *
+ *  hufMetadata->hType has literals block type info.
+ *      If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block.
+ *      If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block.
+ *      If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block
+ *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
+ *      If it is set_repeat, all sub-blocks literals section will be Treeless_Literals_Block.
+ *  @return : compressed size of literals section of a sub-block
+ *            Or 0 if unable to compress.
+ *            Or error code */
+static size_t
+ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                              const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                              const BYTE* literals, size_t litSize,
+                              void* dst, size_t dstSize,
+                              const int bmi2, int writeEntropy, int* entropyWritten)
+{
+    size_t const header = writeEntropy ? 200 : 0;   /* allowance subtracted from the size thresholds when a table may be written */
+    size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));   /* literals header size: 3, 4 or 5 bytes */
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart + lhSize;   /* payload is written first; the header at ostart is filled in last */
+    U32 const singleStream = lhSize == 3;   /* 3-byte header => HUF 1X, otherwise HUF 4X */
+    symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;   /* type value stored in the header */
+    size_t cLitSize = 0;   /* table description (if written) + compressed stream */
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
+
+    *entropyWritten = 0;   /* stays 0 on every early return below */
+    if (litSize == 0 || hufMetadata->hType == set_basic) {
+      DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
+      return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+    } else if (hufMetadata->hType == set_rle) {
+      DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal");
+      return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize);
+    }
+
+    assert(litSize > 0);
+    assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat);
+
+    if (writeEntropy && hufMetadata->hType == set_compressed) {
+        ZSTD_memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize);   /* copy the table description held in hufMetadata */
+        op += hufMetadata->hufDesSize;
+        cLitSize += hufMetadata->hufDesSize;
+        DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
+    }
+
+    {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
+        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags)
+                                          : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags);
+        op += cSize;   /* cSize is validated just below, before op is used again */
+        cLitSize += cSize;
+        if (cSize == 0 || ERR_isError(cSize)) {
+            DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize));
+            return 0;   /* reported to the caller as "unable to compress" */
+        }
+        /* If we expand and we aren't writing a header then emit uncompressed */
+        if (!writeEntropy && cLitSize >= litSize) {
+            DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible");
+            return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+        }
+        /* If we are writing headers then allow expansion that doesn't change our header size. */
+        if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) {
+            assert(cLitSize > litSize);
+            DEBUGLOG(5, "Literals expanded beyond allowed header size");
+            return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+        }
+        DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize);
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 */
+        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 */
+        {   U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    case 5: /* 2 - 2 - 18 - 18 */
+        {   U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);
+            ostart[4] = (BYTE)(cLitSize >> 10);   /* bits of cLitSize above the 10 that fit in lhc */
+            break;
+        }
+    default:  /* not possible : lhSize is {3,4,5} */
+        assert(0);
+    }
+    *entropyWritten = 1;   /* only reached on the Huffman-compressed path */
+    DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
+    return (size_t)(op-ostart);   /* header + optional table description + compressed stream */
+}
+
+static size_t
+ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
+                   const seqDef* sequences, size_t nbSeqs,
+                         size_t litSize, int lastSubBlock)
+{   /* returns litSize plus the summed match lengths of the nbSeqs sequences */
+    size_t matchLengthSum = 0;
+    size_t litLengthSum = 0;   /* only used for the debug log and the asserts below */
+    size_t n;
+    for (n=0; n<nbSeqs; n++) {
+        const ZSTD_sequenceLength seqLen = ZSTD_getSequenceLength(seqStore, sequences+n);
+        litLengthSum += seqLen.litLength;
+        matchLengthSum += seqLen.matchLength;
+    }
+    DEBUGLOG(5, "ZSTD_seqDecompressedSize: %u sequences from %p: %u literals + %u matchlength",
+                (unsigned)nbSeqs, (const void*)sequences,
+                (unsigned)litLengthSum, (unsigned)matchLengthSum);
+    if (!lastSubBlock)
+        assert(litLengthSum == litSize);   /* non-last sub-block: litSize must equal the sequences' literal lengths */
+    else
+        assert(litLengthSum <= litSize);   /* last sub-block: litSize may exceed the sequences' literal lengths */
+    (void)litLengthSum;   /* keeps the variable referenced when the asserts and log compile to nothing */
+    return matchLengthSum + litSize;
+}
+
+/** ZSTD_compressSubBlock_sequences() :
+ *  Compresses sequences section for a sub-block.
+ *  fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
+ *  symbol compression modes for the super-block.
+ *  The first successfully compressed block will have these in its header.
+ *  We set entropyWritten=1 when we succeed in compressing the sequences.
+ *  The following sub-blocks will always have repeat mode.
+ *  @return : compressed size of sequences section of a sub-block
+ *            Or 0 if it is unable to compress
+ *            Or error code. */
+static size_t
+ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                const seqDef* sequences, size_t nbSeq,
+                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                const ZSTD_CCtx_params* cctxParams,
+                                void* dst, size_t dstCapacity,
+                                const int bmi2, int writeEntropy, int* entropyWritten)
+{
+    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    BYTE* seqHead;   /* points at the encoding-type flags byte once it has been reserved */
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
+
+    *entropyWritten = 0;   /* stays 0 on every early return below */
+    /* Sequences Header */
+    RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+                    dstSize_tooSmall, "");
+    if (nbSeq < 128)
+        *op++ = (BYTE)nbSeq;   /* 1-byte count */
+    else if (nbSeq < LONGNBSEQ)
+        op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;   /* 2-byte count, high bit of first byte set */
+    else
+        op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;   /* 0xFF marker + 16-bit (nbSeq - LONGNBSEQ) */
+    if (nbSeq==0) {
+        return (size_t)(op - ostart);   /* only the count byte is emitted */
+    }
+
+    /* seqHead : flags for FSE encoding type */
+    seqHead = op++;   /* reserve the byte; its value is written just below */
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart));
+
+    if (writeEntropy) {
+        const U32 LLtype = fseMetadata->llType;
+        const U32 Offtype = fseMetadata->ofType;
+        const U32 MLtype = fseMetadata->mlType;
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize);
+        *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));   /* LL in bits 7-6, OF in bits 5-4, ML in bits 3-2 */
+        ZSTD_memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize);   /* copy the table descriptions held in fseMetadata */
+        op += fseMetadata->fseTablesSize;
+    } else {
+        const U32 repeat = set_repeat;
+        *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2));   /* all three fields set to set_repeat */
+    }
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, (size_t)(oend - op),
+                                        fseTables->matchlengthCTable, mlCode,
+                                        fseTables->offcodeCTable, ofCode,
+                                        fseTables->litlengthCTable, llCode,
+                                        sequences, nbSeq,
+                                        longOffsets, bmi2);
+        FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+        op += bitstreamSize;
+        /* zstd versions <= 1.3.4 mistakenly report corruption when
+         * FSE_readNCount() receives a buffer < 4 bytes.
+         * Fixed by https://github.com/facebook/zstd/pull/1146.
+         * This can happen when the last set_compressed table present is 2
+         * bytes and the bitstream is only one byte.
+         * In this exceedingly rare case, we will simply emit an uncompressed
+         * block, since it isn't worth optimizing.
+         */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
+            /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+            assert(fseMetadata->lastCountSize + bitstreamSize == 3);
+            DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+                        "emitting an uncompressed block.");
+            return 0;   /* reported to the caller as "unable to compress" */
+        }
+#endif
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
+    }
+
+    /* zstd versions <= 1.4.0 mistakenly report error when
+     * sequences section body size is less than 3 bytes.
+     * Fixed by https://github.com/facebook/zstd/pull/1664.
+     * This can happen when the previous sequences section block is compressed
+     * with rle mode and the current block's sequences section is compressed
+     * with repeat mode where sequences section body size can be 1 byte.
+     */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (op-seqHead < 4) {   /* measured from the flags byte, so the flags byte itself is included */
+        DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
+                    "an uncompressed block when sequences are < 4 bytes");
+        return 0;
+    }
+#endif
+
+    *entropyWritten = 1;   /* set whenever sequences were encoded, in both the writeEntropy and repeat cases */
+    return (size_t)(op - ostart);
+}
+
+/** ZSTD_compressSubBlock() :
+ *  Compresses a single sub-block.
+ *  @return : compressed size of the sub-block
+ *            Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+                                    const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                    const seqDef* sequences, size_t nbSeq,
+                                    const BYTE* literals, size_t litSize,
+                                    const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                    const ZSTD_CCtx_params* cctxParams,
+                                    void* dst, size_t dstCapacity,
+                                    const int bmi2,
+                                    int writeLitEntropy, int writeSeqEntropy,
+                                    int* litEntropyWritten, int* seqEntropyWritten,
+                                    U32 lastBlock)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart + ZSTD_blockHeaderSize;   /* skip the block header; it is written last, once the size is known */
+    DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
+                litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
+    {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
+                                                        &entropyMetadata->hufMetadata, literals, litSize,
+                                                        op, (size_t)(oend-op),
+                                                        bmi2, writeLitEntropy, litEntropyWritten);
+        FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
+        if (cLitSize == 0) return 0;   /* literals section could not be compressed */
+        op += cLitSize;
+    }
+    {   size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse,
+                                                  &entropyMetadata->fseMetadata,
+                                                  sequences, nbSeq,
+                                                  llCode, mlCode, ofCode,
+                                                  cctxParams,
+                                                  op, (size_t)(oend-op),
+                                                  bmi2, writeSeqEntropy, seqEntropyWritten);
+        FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+        if (cSeqSize == 0) return 0;   /* sequences section could not be compressed */
+        op += cSeqSize;
+    }
+    /* Write block header */
+    {   size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;   /* payload size, excluding the block header */
+        U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);   /* bit 0: lastBlock, bits 1-2: block type, bits 3+: size */
+        MEM_writeLE24(ostart, cBlockHeader24);
+    }
+    return (size_t)(op-ostart);   /* block header + literals section + sequences section */
+}
+
+static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+                                                const ZSTD_hufCTables_t* huf,
+                                                const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                                                void* workspace, size_t wkspSize,
+                                                int writeEntropy)
+{   /* returns an estimate, in bytes, of the literals section size for the given hType */
+    unsigned* const countWksp = (unsigned*)workspace;   /* symbol counts are written at the start of workspace */
+    unsigned maxSymbolValue = 255;
+    size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+
+    if (hufMetadata->hType == set_basic) return litSize;   /* estimate equals the literal count */
+    else if (hufMetadata->hType == set_rle) return 1;   /* fixed 1-byte estimate */
+    else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) {
+        size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
+        if (ZSTD_isError(largest)) return litSize;   /* histogram failed: fall back to the literal count */
+        {   size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
+            if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize;   /* add the table description when it will be written */
+            return cLitSizeEstimate + literalSectionHeaderSize;
+    }   }
+    assert(0); /* impossible */
+    return 0;
+}
+
+static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,
+                        const BYTE* codeTable, unsigned maxCode,
+                        size_t nbSeq, const FSE_CTable* fseCTable,
+                        const U8* additionalBits,
+                        short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                        void* workspace, size_t wkspSize)
+{   /* returns an estimate, in bytes, of one symbol stream (codes + extra bits) over nbSeq sequences */
+    unsigned* const countWksp = (unsigned*)workspace;   /* code histogram is written at the start of workspace */
+    const BYTE* ctp = codeTable;
+    const BYTE* const ctStart = ctp;
+    const BYTE* const ctEnd = ctStart + nbSeq;
+    size_t cSymbolTypeSizeEstimateInBits = 0;
+    unsigned max = maxCode;   /* in/out argument of HIST_countFast_wksp */
+
+    HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize);  /* can't fail */
+    if (type == set_basic) {
+        /* We selected this encoding type, so it must be valid. */
+        assert(max <= defaultMax);
+        cSymbolTypeSizeEstimateInBits = max <= defaultMax
+                ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max)
+                : ERROR(GENERIC);
+    } else if (type == set_rle) {
+        cSymbolTypeSizeEstimateInBits = 0;   /* no per-symbol cost is counted for RLE */
+    } else if (type == set_compressed || type == set_repeat) {
+        cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max);
+    }
+    if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10;   /* cost unavailable: fixed fallback of 10 per sequence */
+    while (ctp < ctEnd) {   /* add the extra bits carried by each code */
+        if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp];
+        else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */
+        ctp++;
+    }
+    return cSymbolTypeSizeEstimateInBits / 8;   /* bits -> bytes, rounded down */
+}
+
+static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+                                                  const BYTE* llCodeTable,
+                                                  const BYTE* mlCodeTable,
+                                                  size_t nbSeq,
+                                                  const ZSTD_fseCTables_t* fseTables,
+                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                                  void* workspace, size_t wkspSize,
+                                                  int writeEntropy)
+{   /* returns an estimate, in bytes, of the sequences section: header + OF + LL + ML streams (+ tables) */
+    size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+    size_t cSeqSizeEstimate = 0;
+    if (nbSeq == 0) return sequencesSectionHeaderSize;   /* nothing but the header */
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff,
+                                         nbSeq, fseTables->offcodeCTable, NULL,
+                                         OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                         workspace, wkspSize);   /* NULL additionalBits: the offset code itself is added as extra bits */
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL,
+                                         nbSeq, fseTables->litlengthCTable, LL_bits,
+                                         LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                         workspace, wkspSize);
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML,
+                                         nbSeq, fseTables->matchlengthCTable, ML_bits,
+                                         ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                         workspace, wkspSize);
+    if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;   /* add the table descriptions when they will be written */
+    return cSeqSizeEstimate + sequencesSectionHeaderSize;
+}
+
+typedef struct {
+    size_t estLitSize;   /* estimated literals section size, in bytes */
+    size_t estBlockSize;   /* estimated literals + sequences + block header, in bytes */
+} EstimatedBlockSize;
+static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+                                        const BYTE* ofCodeTable,
+                                        const BYTE* llCodeTable,
+                                        const BYTE* mlCodeTable,
+                                        size_t nbSeq,
+                                        const ZSTD_entropyCTables_t* entropy,
+                                        const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                        void* workspace, size_t wkspSize,
+                                        int writeLitEntropy, int writeSeqEntropy)
+{   /* combines the literals and sequences estimates into one EstimatedBlockSize */
+    EstimatedBlockSize ebs;
+    ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize,
+                                                        &entropy->huf, &entropyMetadata->hufMetadata,
+                                                        workspace, wkspSize, writeLitEntropy);
+    ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+                                                         nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+                                                         workspace, wkspSize, writeSeqEntropy);   /* workspace is reused by both estimators */
+    ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize;   /* sequences estimate + literals estimate + block header */
+    return ebs;
+}
+
+static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+{   /* returns 1 if any of llType / mlType / ofType is set_compressed or set_rle, else 0 */
+    if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle)
+        return 1;
+    if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle)
+        return 1;
+    if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle)
+        return 1;
+    return 0;   /* all three types are something other than set_compressed / set_rle */
+}
+
+static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount)
+{   /* returns the sum of litLength over the seqCount sequences starting at sp */
+    size_t n, total = 0;
+    assert(sp != NULL);
+    for (n=0; n<seqCount; n++) {
+        total += ZSTD_getSequenceLength(seqStore, sp+n).litLength;   /* length resolved through the seqStore helper rather than read from sp[n] directly */
+    }
+    DEBUGLOG(6, "countLiterals for %zu sequences from %p => %zu bytes", seqCount, (const void*)sp, total);
+    return total;
+}
+
+#define BYTESCALE 256   /* fixed-point scale: costs and budgets below are byte counts multiplied by BYTESCALE */
+
+static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs,
+                size_t targetBudget, size_t avgLitCost, size_t avgSeqCost,
+                int firstSubBlock)
+{   /* returns how many sequences, starting at sp, to put in the next sub-block; reads sp[0], so nbSeqs must be >= 1 */
+    size_t n, budget = 0, inSize=0;   /* budget: scaled estimated cost; inSize: literal + match bytes covered */
+    /* entropy headers */
+    size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */
+    assert(firstSubBlock==0 || firstSubBlock==1);
+    budget += headerSize;
+
+    /* first sequence => at least one sequence*/
+    budget += sp[0].litLength * avgLitCost + avgSeqCost;
+    if (budget > targetBudget) return 1;   /* the first sequence alone exceeds the budget */
+    inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH);
+
+    /* loop over sequences */
+    for (n=1; n<nbSeqs; n++) {
+        size_t currentCost = sp[n].litLength * avgLitCost + avgSeqCost;
+        budget += currentCost;
+        inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH);
+        /* stop when sub-block budget is reached */
+        if ( (budget > targetBudget)
+            /* though continue to expand until the sub-block is deemed compressible */
+          && (budget < inSize * BYTESCALE) )
+            break;   /* sequence n is left for the next sub-block */
+    }
+
+    return n;   /* equals nbSeqs when the loop ran to completion */
+}
+
+/** ZSTD_compressSubBlock_multi() :
+ *  Breaks super-block into multiple sub-blocks and compresses them.
+ *  Entropy will be written into the first block.
+ *  The following blocks use repeat_mode to compress.
+ *  Sub-blocks are all compressed, except the last one when beneficial.
+ *  @return : compressed size of the super block (which features multiple ZSTD blocks)
+ *            or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+                            const ZSTD_compressedBlockState_t* prevCBlock,
+                            ZSTD_compressedBlockState_t* nextCBlock,
+                            const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                            const ZSTD_CCtx_params* cctxParams,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const int bmi2, U32 lastBlock,
+                            void* workspace, size_t wkspSize)
+{
+    const seqDef* const sstart = seqStorePtr->sequencesStart;
+    const seqDef* const send = seqStorePtr->sequences;
+    const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */
+    size_t const nbSeqs = (size_t)(send - sstart);
+    const BYTE* const lstart = seqStorePtr->litStart;
+    const BYTE* const lend = seqStorePtr->lit;
+    const BYTE* lp = lstart;
+    size_t const nbLiterals = (size_t)(lend - lstart);
+    BYTE const* ip = (BYTE const*)src;
+    BYTE const* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    const BYTE* llCodePtr = seqStorePtr->llCode;
+    const BYTE* mlCodePtr = seqStorePtr->mlCode;
+    const BYTE* ofCodePtr = seqStorePtr->ofCode;
+    size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */
+    size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize);
+    int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed);
+    int writeSeqEntropy = 1;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)",
+               (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart));
+
+        /* let's start by a general estimation for the full block */
+    if (nbSeqs > 0) {
+        EstimatedBlockSize const ebs =
+                ZSTD_estimateSubBlockSize(lp, nbLiterals,
+                                        ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
+                                        &nextCBlock->entropy, entropyMetadata,
+                                        workspace, wkspSize,
+                                        writeLitEntropy, writeSeqEntropy);
+        /* quick estimation */
+        size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE;
+        size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs;
+        const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1);
+        size_t n, avgBlockBudget, blockBudgetSupp=0;
+        avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks;
+        DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes",
+                    (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE,
+                    (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE);
+        /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately
+         * this will result in the production of a single uncompressed block covering @srcSize.*/
+        if (ebs.estBlockSize > srcSize) return 0;
+
+        /* compress and write sub-blocks */
+        assert(nbSubBlocks>0);
+        for (n=0; n < nbSubBlocks-1; n++) {
+            /* determine nb of sequences for current sub-block + nbLiterals from next sequence */
+            size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp),
+                                        avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0);
+            /* if reached last sequence : break to last sub-block (simplification) */
+            assert(seqCount <= (size_t)(send-sp));
+            if (sp + seqCount == send) break;
+            assert(seqCount > 0);
+            /* compress sub-block */
+            {   int litEntropyWritten = 0;
+                int seqEntropyWritten = 0;
+                size_t litSize = countLiterals(seqStorePtr, sp, seqCount);
+                const size_t decompressedSize =
+                        ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0);
+                size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+                                                sp, seqCount,
+                                                lp, litSize,
+                                                llCodePtr, mlCodePtr, ofCodePtr,
+                                                cctxParams,
+                                                op, (size_t)(oend-op),
+                                                bmi2, writeLitEntropy, writeSeqEntropy,
+                                                &litEntropyWritten, &seqEntropyWritten,
+                                                0);
+                FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+
+                /* check compressibility, update state components */
+                if (cSize > 0 && cSize < decompressedSize) {
+                    DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes",
+                                (unsigned)decompressedSize, (unsigned)cSize);
+                    assert(ip + decompressedSize <= iend);
+                    ip += decompressedSize;
+                    lp += litSize;
+                    op += cSize;
+                    llCodePtr += seqCount;
+                    mlCodePtr += seqCount;
+                    ofCodePtr += seqCount;
+                    /* Entropy only needs to be written once */
+                    if (litEntropyWritten) {
+                        writeLitEntropy = 0;
+                    }
+                    if (seqEntropyWritten) {
+                        writeSeqEntropy = 0;
+                    }
+                    sp += seqCount;
+                    blockBudgetSupp = 0;
+            }   }
+            /* otherwise : do not compress yet, coalesce current sub-block with following one */
+        }
+    } /* if (nbSeqs > 0) */
+
+    /* write last block */
+    DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp));
+    {   int litEntropyWritten = 0;
+        int seqEntropyWritten = 0;
+        size_t litSize = (size_t)(lend - lp);
+        size_t seqCount = (size_t)(send - sp);
+        const size_t decompressedSize =
+                ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1);
+        size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+                                            sp, seqCount,
+                                            lp, litSize,
+                                            llCodePtr, mlCodePtr, ofCodePtr,
+                                            cctxParams,
+                                            op, (size_t)(oend-op),
+                                            bmi2, writeLitEntropy, writeSeqEntropy,
+                                            &litEntropyWritten, &seqEntropyWritten,
+                                            lastBlock);
+        FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+
+        /* update pointers, the nb of literals borrowed from next sequence must be preserved */
+        if (cSize > 0 && cSize < decompressedSize) {
+            DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes",
+                        (unsigned)decompressedSize, (unsigned)cSize);
+            assert(ip + decompressedSize <= iend);
+            ip += decompressedSize;
+            lp += litSize;
+            op += cSize;
+            llCodePtr += seqCount;
+            mlCodePtr += seqCount;
+            ofCodePtr += seqCount;
+            /* Entropy only needs to be written once */
+            if (litEntropyWritten) {
+                writeLitEntropy = 0;
+            }
+            if (seqEntropyWritten) {
+                writeSeqEntropy = 0;
+            }
+            sp += seqCount;
+        }
+    }
+
+
+    if (writeLitEntropy) {
+        DEBUGLOG(5, "Literal entropy tables were never written");
+        ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
+    }
+    if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
+        /* If we haven't written our entropy tables, then we've violated our contract and
+         * must emit an uncompressed block.
+         */
+        DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block");
+        return 0;
+    }
+
+    if (ip < iend) {
+        /* some data left : last part of the block sent uncompressed */
+        size_t const rSize = (size_t)((iend - ip));
+        size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock);
+        DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize));
+        FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+        assert(cSize != 0);
+        op += cSize;
+        /* We have to regenerate the repcodes because we've skipped some sequences */
+        if (sp < send) {
+            const seqDef* seq;
+            repcodes_t rep;
+            ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
+            for (seq = sstart; seq < sp; ++seq) {
+                ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+            }
+            ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
+        }
+    }
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u",
+                (unsigned)(op-ostart));
+    return (size_t)(op-ostart);
+}
+
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               unsigned lastBlock)
+{
+    ZSTD_entropyCTablesMetadata_t entropyMetadata;
+    /* Step 1: fill entropyMetadata from the sequence store and the prev/next entropy tables. */
+    FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore,
+          &zc->blockState.prevCBlock->entropy,
+          &zc->blockState.nextCBlock->entropy,
+          &zc->appliedParams,
+          &entropyMetadata,
+          zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
+    /* Step 2: emit the block as sub-blocks; the same entropy workspace is passed along again. */
+    return ZSTD_compressSubBlock_multi(&zc->seqStore,
+            zc->blockState.prevCBlock,
+            zc->blockState.nextCBlock,
+            &entropyMetadata,
+            &zc->appliedParams,
+            dst, dstCapacity,
+            src, srcSize,
+            zc->bmi2, lastBlock,
+            zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */);
+}
diff --git a/src/zstd/zstd_compress_superblock.h b/src/zstd/compress/zstd_compress_superblock.h
similarity index 80%
rename from src/zstd/zstd_compress_superblock.h
rename to src/zstd/compress/zstd_compress_superblock.h
index c174a72d5..8e494f0d5 100644
--- a/src/zstd/zstd_compress_superblock.h
+++ b/src/zstd/compress/zstd_compress_superblock.h
@@ -12,21 +12,21 @@
 #define ZSTD_COMPRESS_ADVANCED_H
 
 /*-*************************************
- *  Dependencies
- ***************************************/
+*  Dependencies
+***************************************/
 
-#include "zstd.h" /* ZSTD_CCtx */
+#include "../zstd.h" /* ZSTD_CCtx */
 
 /*-*************************************
- *  Target Compressed Block Size
- ***************************************/
+*  Target Compressed Block Size
+***************************************/
 
 /* ZSTD_compressSuperBlock() :
  * Used to compress a super block when targetCBlockSize is being used.
  * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */
 size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
                                void* dst, size_t dstCapacity,
-                               void const * src, size_t srcSize,
+                               void const* src, size_t srcSize,
                                unsigned lastBlock);
 
 #endif /* ZSTD_COMPRESS_ADVANCED_H */
diff --git a/src/zstd/compress/zstd_cwksp.h b/src/zstd/compress/zstd_cwksp.h
new file mode 100644
index 000000000..3eddbd334
--- /dev/null
+++ b/src/zstd/compress/zstd_cwksp.h
@@ -0,0 +1,748 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CWKSP_H
+#define ZSTD_CWKSP_H
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
+#include "../common/zstd_internal.h"
+#include "../common/portability_macros.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Constants
+***************************************/
+
+/* Since the workspace is effectively its own little malloc implementation /
+ * arena, when we run under ASAN, we should similarly insert redzones between
+ * each internal element of the workspace, so ASAN will catch overruns that
+ * reach outside an object but that stay inside the workspace.
+ *
+ * This defines the size of that redzone.
+ */
+#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE
+#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128
+#endif
+
+
+/* Tables and "aligned" buffers are aligned on 64-byte boundaries */
+#define ZSTD_CWKSP_ALIGNMENT_BYTES 64
+
+/*-*************************************
+*  Structures
+***************************************/
+typedef enum {
+    ZSTD_cwksp_alloc_objects,
+    ZSTD_cwksp_alloc_aligned_init_once,
+    ZSTD_cwksp_alloc_aligned,
+    ZSTD_cwksp_alloc_buffers
+} ZSTD_cwksp_alloc_phase_e;
+
+/**
+ * Used to describe whether the workspace is statically allocated (and will not
+ * necessarily ever be freed), or if it's dynamically allocated and we can
+ * expect a well-formed caller to free this.
+ */
+typedef enum {
+    ZSTD_cwksp_dynamic_alloc,
+    ZSTD_cwksp_static_alloc
+} ZSTD_cwksp_static_alloc_e;
+
+/**
+ * Zstd fits all its internal datastructures into a single contiguous buffer,
+ * so that it only needs to perform a single OS allocation (or so that a buffer
+ * can be provided to it and it can perform no allocations at all). This buffer
+ * is called the workspace.
+ *
+ * Several optimizations complicate that process of allocating memory ranges
+ * from this workspace for each internal datastructure:
+ *
+ * - These different internal datastructures have different setup requirements:
+ *
+ *   - The static objects need to be cleared once and can then be trivially
+ *     reused for each compression.
+ *
+ *   - Various buffers don't need to be initialized at all--they are always
+ *     written into before they're read.
+ *
+ *   - The matchstate tables have a unique requirement that they don't need
+ *     their memory to be totally cleared, but they do need the memory to have
+ *     some bound, i.e., a guarantee that all values in the memory they've been
+ *     allocated is less than some maximum value (which is the starting value
+ *     for the indices that they will then use for compression). When this
+ *     guarantee is provided to them, they can use the memory without any setup
+ *     work. When it can't, they have to clear the area.
+ *
+ * - These buffers also have different alignment requirements.
+ *
+ * - We would like to reuse the objects in the workspace for multiple
+ *   compressions without having to perform any expensive reallocation or
+ *   reinitialization work.
+ *
+ * - We would like to be able to efficiently reuse the workspace across
+ *   multiple compressions **even when the compression parameters change** and
+ *   we need to resize some of the objects (where possible).
+ *
+ * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp
+ * abstraction was created. It works as follows:
+ *
+ * Workspace Layout:
+ *
+ * [                        ... workspace ...                           ]
+ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once]
+ *
+ * The various objects that live in the workspace are divided into the
+ * following categories, and are allocated separately:
+ *
+ * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict,
+ *   so that literally everything fits in a single buffer. Note: if present,
+ *   this must be the first object in the workspace, since ZSTD_customFree{CCtx,
+ *   CDict}() rely on a pointer comparison to see whether one or two frees are
+ *   required.
+ *
+ * - Fixed size objects: these are fixed-size, fixed-count objects that are
+ *   nonetheless "dynamically" allocated in the workspace so that we can
+ *   control how they're initialized separately from the broader ZSTD_CCtx.
+ *   Examples:
+ *   - Entropy Workspace
+ *   - 2 x ZSTD_compressedBlockState_t
+ *   - CDict dictionary contents
+ *
+ * - Tables: these are any of several different datastructures (hash tables,
+ *   chain tables, binary trees) that all respect a common format: they are
+ *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
+ *   Their sizes depend on the cparams. These tables are 64-byte aligned.
+ *
+ * - Init once: these buffers must be initialized at least once before
+ *   use. They should be used when we want to skip memory initialization
+ *   while not triggering memory checkers (like Valgrind) when reading
+ *   from this memory without writing to it first.
+ *   These buffers should be used carefully as they might contain data
+ *   from previous compressions.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Aligned: these buffers don't require any initialization before they're
+ *   used. The user of the buffer should make sure they write into a buffer
+ *   location before reading from it.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Buffers: these buffers are used for various purposes that don't require
+ *   any alignment or initialization before they're used. This means they can
+ *   be moved around at no cost for a new compression.
+ *
+ * Allocating Memory:
+ *
+ * The various types of objects must be allocated in order, so they can be
+ * correctly packed into the workspace buffer. That order is:
+ *
+ * 1. Objects
+ * 2. Init once / Tables
+ * 3. Aligned / Tables
+ * 4. Buffers / Tables
+ *
+ * Attempts to reserve objects of different types out of order will fail.
+ */
+typedef struct {
+    void* workspace;      /* first byte of the managed buffer */
+    void* workspaceEnd;   /* one past the last byte of the managed buffer */
+
+    void* objectEnd;      /* end of the object area, which is where the tables begin */
+    void* tableEnd;       /* end of the tables reserved so far */
+    void* tableValidEnd;  /* tables in [objectEnd, tableValidEnd) are currently marked clean */
+    void* allocStart;     /* lowest address handed out from the end of the workspace */
+    void* initOnceStart;  /* start of the end region that init-once reservations have already zeroed */
+
+    BYTE allocFailed;     /* set to 1 by a failed reservation, reset by ZSTD_cwksp_clear() */
+    int workspaceOversizedDuration;  /* maintained by ZSTD_cwksp_bump_oversized_duration() */
+    ZSTD_cwksp_alloc_phase_e phase;  /* kind of allocation currently permitted */
+    ZSTD_cwksp_static_alloc_e isStatic;  /* static vs dynamic buffer; gates ASAN (un)poisoning */
+} ZSTD_cwksp;
+
+/*-*************************************
+*  Functions
+***************************************/
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);
+
+MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
+    (void)ws;
+    assert(ws->workspace <= ws->objectEnd);
+    assert(ws->objectEnd <= ws->tableEnd);
+    assert(ws->objectEnd <= ws->tableValidEnd);
+    assert(ws->tableEnd <= ws->allocStart);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    assert(ws->allocStart <= ws->workspaceEnd);
+    assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
+    assert(ws->workspace <= ws->initOnceStart);
+#if ZSTD_MEMORY_SANITIZER
+    {
+        intptr_t const offset = __msan_test_shadow(ws->initOnceStart,
+            (U8*)ZSTD_cwksp_initialAllocStart(ws) - (U8*)ws->initOnceStart);
+        (void)offset;
+#if defined(ZSTD_MSAN_PRINT)
+        if(offset!=-1) {
+            __msan_print_shadow((U8*)ws->initOnceStart + offset - 8, 32);
+        }
+#endif
+        assert(offset==-1);
+    };
+#endif
+}
+
+/**
+ * Align must be a power of 2.
+ */
+MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {
+    size_t const mask = align - 1;
+    assert((align & mask) == 0);
+    return (size + mask) & ~mask;
+}
+
+/**
+ * Use this to determine how much space in the workspace we will consume to
+ * allocate this object. (Normally it should be exactly the size of the object,
+ * but under special conditions, like ASAN, where we pad each object, it might
+ * be larger.)
+ *
+ * Since tables aren't currently redzoned, you don't need to call through this
+ * to figure out how much space you need for the matchState tables. Everything
+ * else is though.
+ *
+ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size().
+ */
+MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
+    if (size == 0)
+        return 0;
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#else
+    return size;
+#endif
+}
+
+/**
+ * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes.
+ * Used to determine the number of bytes required for a given "aligned".
+ */
+MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
+    return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES));
+}
+
+/**
+ * Returns the amount of additional space the cwksp must allocate
+ * for internal purposes (currently only alignment).
+ */
+MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
+    /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
+     * bytes to align the beginning of the tables section and the end of the buffers.
+     */
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
+    return slackSpace;
+}
+
+
+/**
+ * Return the number of additional bytes required to align a pointer to the given number of bytes.
+ * alignBytes must be a power of two.
+ */
+MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) {
+    size_t const alignBytesMask = alignBytes - 1;
+    size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
+    assert((alignBytes & alignBytesMask) == 0);
+    assert(bytes < alignBytes);
+    return bytes;
+}
+
+/**
+ * Returns the initial value for allocStart which is used to determine the position from
+ * which we can allocate from the end of the workspace.
+ */
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) {
+    return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1));
+}
+
+/**
+ * Internal function. Do not use directly.
+ * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
+ * which counts from the end of the wksp (as opposed to the object/table segment).
+ *
+ * Returns a pointer to the beginning of that space.
+ */
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes)
+{
+    void* const alloc = (BYTE*)ws->allocStart - bytes;
+    void* const bottom = ws->tableEnd;
+    DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
+        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    assert(alloc >= bottom);
+    if (alloc < bottom) {
+        DEBUGLOG(4, "cwksp: alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    /* the area is reserved from the end of wksp.
+     * If it overlaps with tableValidEnd, it voids guarantees on values' range */
+    if (alloc < ws->tableValidEnd) {
+        ws->tableValidEnd = alloc;
+    }
+    ws->allocStart = alloc;
+    return alloc;
+}
+
+/**
+ * Moves the cwksp to the next phase, and does any necessary allocations.
+ * cwksp initialization must necessarily go through each phase in order.
+ * Returns 0 on success, or a zstd error code (memory_allocation) if aligning the table start overruns the workspace.
+ */
+MEM_STATIC size_t
+ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase)
+{
+    assert(phase >= ws->phase);
+    if (phase > ws->phase) {
+        /* Going from allocating objects to allocating initOnce / tables */
+        if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once &&
+            phase >= ZSTD_cwksp_alloc_aligned_init_once) {
+            ws->tableValidEnd = ws->objectEnd;
+            ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
+
+            {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
+                void *const alloc = ws->objectEnd;
+                size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
+                void *const objectEnd = (BYTE *) alloc + bytesToAlign;
+                DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
+                RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
+                                "table phase - alignment initial allocation failed!");
+                ws->objectEnd = objectEnd;
+                ws->tableEnd = objectEnd;  /* table area starts being empty */
+                if (ws->tableValidEnd < ws->tableEnd) {
+                    ws->tableValidEnd = ws->tableEnd;
+                }
+            }
+        }
+        ws->phase = phase;
+        ZSTD_cwksp_assert_internal_consistency(ws);
+    }
+    return 0;
+}
+
+/**
+ * Returns whether this object/buffer/etc was allocated in this workspace.
+ */
+MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
+{
+    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd);
+}
+
+/**
+ * Internal: advance to `phase`, then reserve `bytes` from the end-of-workspace segment. NULL on failure or bytes == 0.
+ */
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase)
+{
+    void* alloc;
+    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {
+        return NULL;
+    }
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* over-reserve space */
+    bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+    alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes);
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+     * either side. */
+    if (alloc) {
+        alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+        if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+            /* We need to keep the redzone poisoned while unpoisoning the bytes that
+             * are actually allocated. */
+            __asan_unpoison_memory_region(alloc, bytes - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE);
+        }
+    }
+#endif
+
+    return alloc;
+}
+
+/**
+ * Reserves and returns unaligned memory.
+ */
+MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
+{
+    return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
+}
+
+/**
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ * This memory has been initialized at least once in the past.
+ * This doesn't mean it has been initialized this time, and it might contain data from previous
+ * operations. Returns NULL if the reservation fails (or if bytes == 0).
+ * The main usage is for algorithms that might need read access into uninitialized memory.
+ * The algorithm must maintain safety under these conditions and must make sure it doesn't
+ * leak any of the past data (directly or in side channels).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES);
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    if(ptr && ptr < ws->initOnceStart) {
+        /* We assume the memory following the current allocation is either:
+         * 1. Not usable as initOnce memory (end of workspace)
+         * 2. Another initOnce buffer that has been allocated before (and so was previously memset)
+         * 3. An ASAN redzone, in which case we don't want to write on it
+         * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart.
+         * Note that we assume here that MSAN and ASAN cannot run at the same time. */
+        ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes));
+        ws->initOnceStart = ptr;  /* the already-initialized region now begins at this allocation */
+    }
+#if ZSTD_MEMORY_SANITIZER
+    assert(__msan_test_shadow(ptr, bytes) == -1);
+#endif
+    return ptr;
+}
+
+/**
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
+{
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES),
+                                            ZSTD_cwksp_alloc_aligned);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    return ptr;
+}
+
+/**
+ * Aligned on 64 bytes. These buffers have the special property that
+ * their values remain constrained, allowing us to reuse them without
+ * memset()-ing them.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
+{
+    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once;
+    void* alloc;
+    void* end;
+    void* top;
+
+    /* We can only start allocating tables after we are done reserving space for objects at the
+     * start of the workspace */
+    if(ws->phase < phase) {
+        if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+            return NULL;
+        }
+    }
+    alloc = ws->tableEnd;
+    end = (BYTE *)alloc + bytes;
+    top = ws->allocStart;
+
+    DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
+        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+    assert((bytes & (sizeof(U32)-1)) == 0);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    assert(end <= top);
+    if (end > top) {
+        DEBUGLOG(4, "cwksp: table alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    ws->tableEnd = end;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        __asan_unpoison_memory_region(alloc, bytes);
+    }
+#endif
+
+    assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0);
+    assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    return alloc;
+}
+
+/**
+ * Aligned on sizeof(void*). Returns NULL (and sets allocFailed) outside the objects phase or when out of space.
+ * Note : should happen only once, at workspace first initialization
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
+    void* alloc = ws->objectEnd;
+    void* end = (BYTE*)alloc + roundedBytes;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* over-reserve space */
+    end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+    DEBUGLOG(4,
+        "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",
+        alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes);
+    assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0);
+    assert(bytes % ZSTD_ALIGNOF(void*) == 0);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    /* we must be in the first phase, no advance is possible */
+    if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) {
+        DEBUGLOG(3, "cwksp: object alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    ws->objectEnd = end;
+    ws->tableEnd = end;
+    ws->tableValidEnd = end;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+     * either side. */
+    alloc = (BYTE*)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        __asan_unpoison_memory_region(alloc, bytes);
+    }
+#endif
+
+    return alloc;
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws)
+{
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
+
+#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+    /* To validate that the table reuse logic is sound, and that we don't
+     * access table space that we haven't cleaned, we re-"poison" the table
+     * space every time we mark it dirty.
+     * Since tableValidEnd space and initOnce space may overlap, we don't poison
+     * the initOnce portion, as that would break its promise. This means that this
+     * poisoning check isn't always applied fully. */
+    {
+        size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+        assert(__msan_test_shadow(ws->objectEnd, size) == -1);
+        if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) {
+            __msan_poison(ws->objectEnd, size);
+        } else {
+            assert(ws->initOnceStart >= ws->objectEnd);
+            __msan_poison(ws->objectEnd, (BYTE*)ws->initOnceStart - (BYTE*)ws->objectEnd);
+        }
+    }
+#endif
+
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    ws->tableValidEnd = ws->objectEnd;  /* nothing in the table area counts as clean any more */
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean");
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    if (ws->tableValidEnd < ws->tableEnd) {
+        ws->tableValidEnd = ws->tableEnd;
+    }
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Zero the part of the allocated tables not already marked clean.
+ */
+MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables");
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    if (ws->tableValidEnd < ws->tableEnd) {
+        ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
+    }
+    ZSTD_cwksp_mark_tables_clean(ws);
+}
+
+/**
+ * Invalidates table allocations.
+ * All other allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: clearing tables!");
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* We don't do this when the workspace is statically allocated, because
+     * when that is the case, we have no capability to hook into the end of the
+     * workspace's lifecycle to unpoison the memory.
+     */
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+        __asan_poison_memory_region(ws->objectEnd, size);
+    }
+#endif
+
+    ws->tableEnd = ws->objectEnd;
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Invalidates all buffer, aligned, and table allocations.
+ * Object allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: clearing!");
+
+#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+    /* To validate that the context reuse logic is sound, and that we don't
+     * access stuff that this compression hasn't initialized, we re-"poison"
+     * the workspace except for the areas in which we expect memory reuse
+     * without initialization (objects, valid tables area and init once
+     * memory). */
+    {
+        if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) {
+            size_t size = (BYTE*)ws->initOnceStart - (BYTE*)ws->tableValidEnd;
+            __msan_poison(ws->tableValidEnd, size);
+        }
+    }
+#endif
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* We don't do this when the workspace is statically allocated, because
+     * when that is the case, we have no capability to hook into the end of the
+     * workspace's lifecycle to unpoison the memory.
+     */
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd;
+        __asan_poison_memory_region(ws->objectEnd, size);
+    }
+#endif
+
+    ws->tableEnd = ws->objectEnd;
+    ws->allocStart = ZSTD_cwksp_initialAllocStart(ws);
+    ws->allocFailed = 0;
+    if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) {
+        ws->phase = ZSTD_cwksp_alloc_aligned_init_once;
+    }
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
+         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
+}
+
+/**
+ * The provided workspace takes ownership of the buffer [start, start+size).
+ * Any existing values in the workspace are ignored (the previously managed
+ * buffer, if present, must be separately freed).
+ */
+MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_cwksp_static_alloc_e isStatic) {
+    DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size);
+    assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */
+    ws->workspace = start;
+    ws->workspaceEnd = (BYTE*)start + size;
+    ws->objectEnd = ws->workspace;  /* no objects reserved yet */
+    ws->tableValidEnd = ws->objectEnd;
+    ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
+    ws->phase = ZSTD_cwksp_alloc_objects;
+    ws->isStatic = isStatic;
+    ZSTD_cwksp_clear(ws);  /* sets the remaining fields: tableEnd, allocStart and allocFailed */
+    ws->workspaceOversizedDuration = 0;
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) {
+    void* workspace = ZSTD_customMalloc(size, customMem);
+    DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size);
+    RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!");
+    ZSTD_cwksp_init(ws, workspace, size, ZSTD_cwksp_dynamic_alloc);
+    return 0;
+}
+
+MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) {
+    void *ptr = ws->workspace;
+    DEBUGLOG(4, "cwksp: freeing workspace");
+#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
+    if (ptr != NULL && customMem.customFree != NULL) {
+        __msan_unpoison(ptr, ZSTD_cwksp_sizeof(ws));
+    }
+#endif
+    ZSTD_memset(ws, 0, sizeof(ZSTD_cwksp));
+    ZSTD_customFree(ptr, customMem);
+}
+
+/**
+ * Moves the management of a workspace from one cwksp to another. The src cwksp
+ * is left in an invalid state (src must be re-init()'ed before it's used again).
+ */
+MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
+    *dst = *src;
+    ZSTD_memset(src, 0, sizeof(ZSTD_cwksp));
+}
+
+MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+    return ws->allocFailed;
+}
+
+/*-*************************************
+*  Functions Checking Free Space
+***************************************/
+
+/* ZSTD_cwksp_estimated_space_within_bounds() :
+ * Returns non-zero when the workspace's actual usage is at most `estimatedSpace` and falls short of it
+ * by no more than the alignment slack. The lower bound is tested as an addition so that it cannot wrap.
+ */
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) {
+    /* We have an alignment space between objects and tables and between tables and buffers, so we can have
+     * up to twice the alignment bytes difference between estimation and actual usage */
+    return estimatedSpace <= ZSTD_cwksp_used(ws) + ZSTD_cwksp_slack_space_required() &&
+           ZSTD_cwksp_used(ws) <= estimatedSpace;
+}
+
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace;
+}
+
+MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_available(
+        ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)
+        && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+MEM_STATIC void ZSTD_cwksp_bump_oversized_duration(
+        ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    if (ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) {
+        ws->workspaceOversizedDuration++;
+    } else {
+        ws->workspaceOversizedDuration = 0;
+    }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CWKSP_H */
diff --git a/src/zstd/compress/zstd_double_fast.c b/src/zstd/compress/zstd_double_fast.c
new file mode 100644
index 000000000..a4e9c50d3
--- /dev/null
+++ b/src/zstd/compress/zstd_double_fast.c
@@ -0,0 +1,770 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_double_fast.h"
+
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;   /* hash widened by the tag bits; table slot = value >> tag bits */
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;   /* positions up to end - HASH_READ_SIZE get inserted */
+    const U32 fastHashFillStep = 3;
+
+    /* Walk from ms->nextToUpdate in strides of fastHashFillStep, writing tagged indexes.
+     * The first position of each stride always goes into both tables. Unless dtlm is
+     * ZSTD_dtlm_fast, the other positions of the stride are also inserted, into the
+     * large table only, and only when their slot is still 0. */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0) {
+                ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
+            }
+            if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
+                ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
+            }
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+    }   }
+}
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;   /* plain hash widths here : entries are stored as raw indexes, without tag */
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;   /* positions up to end - HASH_READ_SIZE get inserted */
+    const U32 fastHashFillStep = 3;
+
+    /* Walk from ms->nextToUpdate in strides of fastHashFillStep, writing raw indexes.
+     * The first position of each stride always goes into both tables. Unless dtlm is
+     * ZSTD_dtlm_fast, the other positions of the stride are also inserted, into the
+     * large table only, and only when their slot is still 0. */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0)
+                hashSmall[smHash] = curr + i;
+            if (i == 0 || hashLarge[lgHash] == 0)
+                hashLarge[lgHash] = curr + i;
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+        }   }
+}
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{   /* Dispatches on tfp : the CDict variant writes tagged indexes, the CCtx variant writes raw indexes */
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
+    } else {   /* any other purpose value is handled as a CCtx fill */
+        ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize, U32 const mls /* template */)
+{   /* Searches src using a long (8-byte) and a short (mls-byte) hash table, stores sequences, returns the last literals size */
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
+    U32* const hashLong = ms->hashTable;
+    const U32 hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    const U32 hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* anchor = istart;
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+    /* presumes that, if there is a dictionary, it must be using Attach mode */
+    const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;   /* keep repcodes that init zeroes out, so _cleanup can restore them */
+
+    size_t mLength;
+    U32 offset;
+    U32 curr;
+
+    /* how many positions to search before increasing step size */
+    const size_t kStepIncr = 1 << kSearchStrength;
+    /* the position at which to increment the step size if no match is found */
+    const BYTE* nextStep;
+    size_t step; /* the current step size */
+
+    size_t hl0; /* the long hash at ip */
+    size_t hl1; /* the long hash at ip1 */
+
+    U32 idxl0; /* the long match index for ip */
+    U32 idxl1; /* the long match index for ip1 */
+
+    const BYTE* matchl0; /* the long match for ip */
+    const BYTE* matchs0; /* the short match for ip */
+    const BYTE* matchl1; /* the long match for ip1 */
+
+    const BYTE* ip = istart; /* the current position */
+    const BYTE* ip1; /* the next position */
+
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic");
+
+    /* init */
+    ip += ((ip - prefixLowest) == 0);   /* start one byte later when ip sits exactly at prefixLowest */
+    {
+        U32 const current = (U32)(ip - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+        U32 const maxRep = current - windowLow;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;   /* repcode reaches below windowLow : set aside and disable */
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+
+    /* Outer Loop: one iteration per match found and stored */
+    while (1) {
+        step = 1;
+        nextStep = ip + kStepIncr;
+        ip1 = ip + step;
+
+        if (ip1 > ilimit) {
+            goto _cleanup;
+        }
+
+        hl0 = ZSTD_hashPtr(ip, hBitsL, 8);
+        idxl0 = hashLong[hl0];
+        matchl0 = base + idxl0;
+
+        /* Inner Loop: one iteration per search / position */
+        do {
+            const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls);
+            const U32 idxs0 = hashSmall[hs0];
+            curr = (U32)(ip-base);
+            matchs0 = base + idxs0;
+
+            hashLong[hl0] = hashSmall[hs0] = curr;   /* update hash tables */
+
+            /* check noDict repcode at ip+1 (offset_1 == 0 means the repcode is disabled) */
+            if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+                mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+                ip++;
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+                goto _match_stored;
+            }
+
+            hl1 = ZSTD_hashPtr(ip1, hBitsL, 8);
+
+            if (idxl0 > prefixLowestIndex) {
+                /* check prefix long match */
+                if (MEM_read64(matchl0) == MEM_read64(ip)) {
+                    mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8;
+                    offset = (U32)(ip-matchl0);
+                    while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */
+                    goto _match_found;
+                }
+            }
+
+            idxl1 = hashLong[hl1];
+            matchl1 = base + idxl1;
+
+            if (idxs0 > prefixLowestIndex) {
+                /* check prefix short match */
+                if (MEM_read32(matchs0) == MEM_read32(ip)) {
+                    goto _search_next_long;   /* try a long match at ip1 before settling for the short one */
+                }
+            }
+
+            if (ip1 >= nextStep) {   /* kStepIncr more positions passed without a match : widen the stride */
+                PREFETCH_L1(ip1 + 64);
+                PREFETCH_L1(ip1 + 128);
+                step++;
+                nextStep += kStepIncr;
+            }
+            ip = ip1;
+            ip1 += step;
+
+            hl0 = hl1;   /* values already computed for ip1 become the values for the new ip */
+            idxl0 = idxl1;
+            matchl0 = matchl1;
+    #if defined(__aarch64__)
+            PREFETCH_L1(ip+256);
+    #endif
+        } while (ip1 <= ilimit);
+
+_cleanup:
+        /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+         * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+        offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+        /* save reps for next block */
+        rep[0] = offset_1 ? offset_1 : offsetSaved1;
+        rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+        /* Return the last literals size */
+        return (size_t)(iend - anchor);
+
+_search_next_long:
+
+        /* check prefix long +1 match */
+        if (idxl1 > prefixLowestIndex) {
+            if (MEM_read64(matchl1) == MEM_read64(ip1)) {
+                ip = ip1;
+                mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8;
+                offset = (U32)(ip-matchl1);
+                while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        }
+
+        /* if no long +1 match, explore the short match we found */
+        mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4;
+        offset = (U32)(ip - matchs0);
+        while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */
+
+        /* fall-through */
+
+_match_found: /* requires ip, offset, mLength */
+        offset_2 = offset_1;
+        offset_1 = offset;
+
+        if (step < 4) {
+            /* It is unsafe to write this value back to the hashtable when ip1 is
+             * greater than or equal to the new ip we will have after we're done
+             * processing this match. Rather than perform that test directly
+             * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler
+             * more predictable test. The minmatch even if we take a short match is
+             * 4 bytes, so as long as step, the distance between ip and ip1
+             * (initially) is less than 4, we know ip1 < new ip. */
+            hashLong[hl1] = (U32)(ip1 - base);
+        }
+
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+_match_stored:
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Complementary insertion */
+            /* done after iLimit test, as candidates could be > iend-8 */
+            {   U32 const indexToInsert = curr+2;
+                hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+                hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+                hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+                hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            }
+
+            /* check immediate repcode */
+            while ( (ip <= ilimit)
+                 && ( (offset_2>0)
+                    & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
+                hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+                hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
+                ip += rLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+        }
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const mls /* template */)
+{   /* Same double-hash search, additionally probing the tagged tables of ms->dictMatchState; returns the last literals size */
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
+    U32* const hashLong = ms->hashTable;
+    const U32 hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    const U32 hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+    /* presumes that, if there is a dictionary, it must be using Attach mode */
+    const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
+    const U32* const dictHashLong  = dms->hashTable;
+    const U32* const dictHashSmall = dms->chainTable;
+    const U32 dictStartIndex       = dms->window.dictLimit;
+    const BYTE* const dictBase     = dms->window.base;
+    const BYTE* const dictStart    = dictBase + dictStartIndex;
+    const BYTE* const dictEnd      = dms->window.nextSrc;
+    const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase);   /* dict index + delta = index in ms space; dictEnd lands on prefixLowestIndex */
+    const U32 dictHBitsL           = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;   /* dictionary hashes carry extra tag bits */
+    const U32 dictHBitsS           = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
+
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
+
+    /* if a dictionary is attached, it must be within window range */
+    assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
+
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashLong, hashTableBytes);
+        PREFETCH_AREA(dictHashSmall, chainTableBytes);
+    }
+
+    /* init */
+    ip += (dictAndPrefixLength == 0);   /* skip the first byte when neither dictionary nor prefix content precedes it */
+
+    /* dictMatchState repCode checks don't currently handle repCode == 0
+     * disabling. */
+    assert(offset_1 <= dictAndPrefixLength);
+    assert(offset_2 <= dictAndPrefixLength);
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        U32 offset;
+        size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+        size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+        size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
+        U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
+        int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
+        U32 const curr = (U32)(ip-base);
+        U32 const matchIndexL = hashLong[h2];
+        U32 matchIndexS = hashSmall[h];
+        const BYTE* matchLong = base + matchIndexL;
+        const BYTE* match = base + matchIndexS;
+        const U32 repIndex = curr + 1 - offset_1;
+        const BYTE* repMatch = (repIndex < prefixLowestIndex) ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+        hashLong[h2] = hashSmall[h] = curr;   /* update hash tables */
+
+        /* check repcode */
+        if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+            && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+            goto _match_stored;
+        }
+
+        if (matchIndexL > prefixLowestIndex) {
+            /* check prefix long match */
+            if (MEM_read64(matchLong) == MEM_read64(ip)) {
+                mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+                offset = (U32)(ip-matchLong);
+                while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        } else if (dictTagsMatchL) {
+            /* check dictMatchState long match */
+            U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
+            const BYTE* dictMatchL = dictBase + dictMatchIndexL;
+            assert(dictMatchL < dictEnd);
+
+            if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) {
+                mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8;
+                offset = (U32)(curr - dictMatchIndexL - dictIndexDelta);
+                while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */
+                goto _match_found;
+        }   }
+
+        if (matchIndexS > prefixLowestIndex) {
+            /* check prefix short match */
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+            }
+        } else if (dictTagsMatchS) {
+            /* check dictMatchState short match */
+            U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
+            match = dictBase + dictMatchIndexS;
+            matchIndexS = dictMatchIndexS + dictIndexDelta;   /* express the dictionary index in ms index space */
+
+            if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+        }   }
+
+        ip += ((ip-anchor) >> kSearchStrength) + 1;   /* no match : the skip grows with the distance from anchor */
+#if defined(__aarch64__)
+        PREFETCH_L1(ip+256);
+#endif
+        continue;
+
+_search_next_long:
+        {   size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+            size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            U32 const matchIndexL3 = hashLong[hl3];
+            U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
+            const BYTE* matchL3 = base + matchIndexL3;
+            hashLong[hl3] = curr + 1;
+
+            /* check prefix long +1 match */
+            if (matchIndexL3 > prefixLowestIndex) {
+                if (MEM_read64(matchL3) == MEM_read64(ip+1)) {
+                    mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
+                    ip++;
+                    offset = (U32)(ip-matchL3);
+                    while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+                    goto _match_found;
+                }
+            } else if (dictTagsMatchL3) {
+                /* check dict long +1 match */
+                U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
+                const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
+                assert(dictMatchL3 < dictEnd);
+                if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
+                    mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8;
+                    ip++;
+                    offset = (U32)(curr + 1 - dictMatchIndexL3 - dictIndexDelta);
+                    while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */
+                    goto _match_found;
+        }   }   }
+
+        /* if no long +1 match, explore the short match we found */
+        if (matchIndexS < prefixLowestIndex) {   /* the short match lies in the dictionary segment */
+            mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;
+            offset = (U32)(curr - matchIndexS);
+            while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        } else {
+            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+            offset = (U32)(ip - match);
+            while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        }
+
+_match_found:
+        offset_2 = offset_1;
+        offset_1 = offset;
+
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+_match_stored:
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Complementary insertion */
+            /* done after iLimit test, as candidates could be > iend-8 */
+            {   U32 const indexToInsert = curr+2;
+                hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+                hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+                hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+                hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            }
+
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ?
+                        dictBase + repIndex2 - dictIndexDelta :
+                        base + repIndex2;
+                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional underflow */)
+                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+            }
+        }
+    }   /* while (ip < ilimit) */
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+
+#define ZSTD_GEN_DFAST_FN(dictMode, mls)                                                                 \
+    static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls(                                      \
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                          \
+            void const* src, size_t srcSize)                                                             \
+    {                                                                                                    \
+        return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
+    }   /* ^ defines a static wrapper that forwards to the <dictMode>_generic function with mls passed as a literal */
+
+ZSTD_GEN_DFAST_FN(noDict, 4)   /* defines ZSTD_compressBlock_doubleFast_noDict_4() */
+ZSTD_GEN_DFAST_FN(noDict, 5)
+ZSTD_GEN_DFAST_FN(noDict, 6)
+ZSTD_GEN_DFAST_FN(noDict, 7)
+
+ZSTD_GEN_DFAST_FN(dictMatchState, 4)   /* defines ZSTD_compressBlock_doubleFast_dictMatchState_4() */
+ZSTD_GEN_DFAST_FN(dictMatchState, 5)
+ZSTD_GEN_DFAST_FN(dictMatchState, 6)
+ZSTD_GEN_DFAST_FN(dictMatchState, 7)
+
+
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{   /* Forwards to the noDict wrapper generated for ms->cParams.minMatch and returns its result */
+    const U32 mls = ms->cParams.minMatch;
+    switch(mls)
+    {
+    default: /* includes case 3 : every value other than 5, 6, 7 uses the mls==4 variant */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{   /* Forwards to the dictMatchState wrapper generated for ms->cParams.minMatch and returns its result */
+    const U32 mls = ms->cParams.minMatch;
+    switch(mls)
+    {
+    default: /* includes case 3 : every value other than 5, 6, 7 uses the mls==4 variant */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const mls /* template */)
+{   /* Double-hash search where indexes below prefixStartIndex are read through ms->window.dictBase; returns the last literals size */
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
+    U32* const hashLong = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
+    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+    const U32   dictStartIndex = lowLimit;
+    const U32   dictLimit = ms->window.dictLimit;
+    const U32   prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit;   /* the larger of dictLimit and lowLimit */
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
+
+    /* if extDict is invalidated due to maxDistance, switch to "regular" variant */
+    if (prefixStartIndex == dictStartIndex)
+        return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize);
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+        const U32 matchIndex = hashSmall[hSmall];
+        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;   /* indexes below prefixStartIndex resolve against dictBase */
+        const BYTE* match = matchBase + matchIndex;
+
+        const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+        const U32 matchLongIndex = hashLong[hLong];
+        const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* matchLong = matchLongBase + matchLongIndex;
+
+        const U32 curr = (U32)(ip-base);
+        const U32 repIndex = curr + 1 - offset_1;   /* offset_1 expected <= curr +1 */
+        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* const repMatch = repBase + repIndex;
+        size_t mLength;
+        hashSmall[hSmall] = hashLong[hLong] = curr;   /* update hash table */
+
+        if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
+            & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */
+          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+        } else {
+            if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {   /* long (8-byte) match at ip */
+                const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
+                const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8;
+                offset = curr - matchLongIndex;
+                while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+            } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {   /* short match at ip : first try a long match at ip+1 */
+                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndex3 = hashLong[h3];
+                const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base;
+                const BYTE* match3 = match3Base + matchIndex3;
+                U32 offset;
+                hashLong[h3] = curr + 1;
+                if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+                    const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8;
+                    ip++;
+                    offset = curr+1 - matchIndex3;
+                    while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+                } else {
+                    const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+                    offset = curr - matchIndex;
+                    while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                }
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+            } else {
+                ip += ((ip-anchor) >> kSearchStrength) + 1;   /* no match : the skip grows with the distance from anchor */
+                continue;
+        }   }
+
+        /* move to next sequence start */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Complementary insertion */
+            /* done after iLimit test, as candidates could be > iend-8 */
+            {   U32 const indexToInsert = curr+2;
+                hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+                hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+                hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+                hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            }
+
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3)   /* intentional underflow : ensure repIndex2 doesn't overlap dict + prefix */
+                    & (offset_2 <= current2 - dictStartIndex))
+                  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+
+ZSTD_GEN_DFAST_FN(extDict, 4)   /* defines ZSTD_compressBlock_doubleFast_extDict_4(), forwarding to the extDict generic function */
+ZSTD_GEN_DFAST_FN(extDict, 5)
+ZSTD_GEN_DFAST_FN(extDict, 6)
+ZSTD_GEN_DFAST_FN(extDict, 7)
+
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{   /* Forwards to the extDict wrapper generated for ms->cParams.minMatch and returns its result */
+    U32 const mls = ms->cParams.minMatch;
+    switch(mls)
+    {
+    default: /* includes case 3 : every value other than 5, 6, 7 uses the mls==4 variant */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
diff --git a/src/zstd/compress/zstd_double_fast.h b/src/zstd/compress/zstd_double_fast.h
new file mode 100644
index 000000000..ce6ed8c97
--- /dev/null
+++ b/src/zstd/compress/zstd_double_fast.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_DOUBLE_FAST_H
+#define ZSTD_DOUBLE_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "../common/mem.h"      /* U32 */
+#include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t; presumably also the ZSTD_matchState_t / seqStore_t types used below */
+
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR   /* defining this macro removes the declarations below */
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                              ZSTD_tableFillPurpose_e tfp);
+
+size_t ZSTD_compressBlock_doubleFast(   /* first of three block-compressor entry points that share one signature */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast   /* selector macros: function names here, NULL in the #else branch */
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict
+#else   /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR is defined: no functions are declared */
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL
+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_DOUBLE_FAST_H */
diff --git a/src/zstd/compress/zstd_fast.c b/src/zstd/compress/zstd_fast.c
new file mode 100644
index 000000000..6c4554cfc
--- /dev/null
+++ b/src/zstd/compress/zstd_fast.c
@@ -0,0 +1,968 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
+#include "zstd_fast.h"
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,   /* ms->hashTable is written through ZSTD_writeTaggedIndex() */
+                        const void* const end,   /* upper bound of the range to index; see the loop condition */
+                        ZSTD_dictTableLoadMethod_e dtlm)   /* asserted below to be ZSTD_dtlm_full */
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;   /* hashLog bits plus extra low bits that are shifted out again when indexing below */
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;   /* scanning starts at index ms->nextToUpdate */
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_full);
+
+    /* Always insert every fastHashFillStep-th position into the hash table.
+     * Insert the positions in between only if their hash entry is still empty (== 0).
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
+            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }
+
+        if (dtlm == ZSTD_dtlm_fast) continue;   /* never taken while the assert above holds */
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
+                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
+                }   }   }   }
+}
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,   /* ms->hashTable receives plain position indices */
+                        const void* const end,   /* upper bound of the range to index; see the loop condition */
+                        ZSTD_dictTableLoadMethod_e dtlm)   /* asserted below to be ZSTD_dtlm_fast */
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog;
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;   /* scanning starts at index ms->nextToUpdate */
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_fast);
+
+    /* Always insert every fastHashFillStep-th position into the hash table.
+     * Insert the positions in between (ZSTD_dtlm_full only) if their hash entry is still empty (== 0).
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
+        hashTable[hash0] = curr;
+        if (dtlm == ZSTD_dtlm_fast) continue;   /* always taken while the assert above holds, so the block below is currently unreachable */
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hash] == 0) {  /* not yet filled */
+                    hashTable[hash] = curr + p;
+    }   }   }   }
+}
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,   /* forwards to the CDict or the CCtx filler, chosen by tfp */
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {   /* CDict filler stores entries via ZSTD_writeTaggedIndex() */
+        ZSTD_fillHashTableForCDict(ms, end, dtlm);
+    } else {   /* any other purpose: CCtx filler stores plain indices */
+        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
+    }
+}
+
+
+/**
+ * If you squint hard enough (and ignore repcodes), the search operation at any
+ * given position is broken into 4 stages:
+ *
+ * 1. Hash   (map position to hash value via input read)
+ * 2. Lookup (map hash val to index via hashtable read)
+ * 3. Load   (map index to value at that position via input read)
+ * 4. Compare
+ *
+ * Each of these steps involves a memory read at an address which is computed
+ * from the previous step. This means these steps must be sequenced and their
+ * latencies are cumulative.
+ *
+ * Rather than do 1->2->3->4 sequentially for a single position before moving
+ * onto the next, this implementation interleaves these operations across the
+ * next few positions:
+ *
+ * R = Repcode Read & Compare
+ * H = Hash
+ * T = Table Lookup
+ * M = Match Read & Compare
+ *
+ * Pos | Time -->
+ * ----+-------------------
+ * N   | ... M
+ * N+1 | ...   TM
+ * N+2 |    R H   T M
+ * N+3 |         H    TM
+ * N+4 |           R H   T M
+ * N+5 |                H   ...
+ * N+6 |                  R ...
+ *
+ * This is very much analogous to the pipelining of execution in a CPU. And just
+ * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
+ * branch).
+ *
+ * When this happens, we throw away our current state, and do the following prep
+ * to re-enter the loop:
+ *
+ * Pos | Time -->
+ * ----+-------------------
+ * N   | H T
+ * N+1 |  H
+ *
+ * This is also the work we do at the beginning to enter the loop initially.
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_fast_noDict_generic(   /* returns the trailing literals size, (size_t)(iend - anchor) */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],   /* rep[0] and rep[1] are read on entry and rewritten at _cleanup */
+        void const* src, size_t srcSize,
+        U32 const mls, U32 const hasStep)   /* mls is forwarded to ZSTD_hashPtr(); hasStep == 0 fixes stepSize at 2 */
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 : the !(targetLength) term adds 1 only when targetLength is 0 */
+    size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
+    const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+
+    const BYTE* anchor = istart;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+    U32 rep_offset1 = rep[0];
+    U32 rep_offset2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    U32 mval; /* src value at match idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+
+    /* ip0 and ip1 are always adjacent. The targetLength skipping and
+     * uncompressibility acceleration is applied to every other position,
+     * matching the behavior of #1562. step therefore represents the gap
+     * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
+
+    DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
+    ip0 += (ip0 == prefixStart);   /* start one byte later when the block begins exactly at prefixStart */
+    {   U32 const curr = (U32)(ip0 - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
+        U32 const maxRep = curr - windowLow;   /* repcodes larger than this are parked in offsetSaved{1,2} and zeroed */
+        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
+        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
+    }
+
+    /* start each op */
+_start: /* Requires: ip0 */
+
+    step = stepSize;
+    nextStep = ip0 + kStepIncr;
+
+    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+    ip1 = ip0 + 1;
+    ip2 = ip0 + step;
+    ip3 = ip2 + 1;
+
+    if (ip3 >= ilimit) {
+        goto _cleanup;
+    }
+
+    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+    idx = hashTable[hash0];
+
+    do {
+        /* load repcode match for ip[2] */
+        const U32 rval = MEM_read32(ip2 - rep_offset1);
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        /* check repcode at ip[2] */
+        if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {   /* bitwise & : both operands are evaluated, no short-circuit */
+            ip0 = ip2;
+            match0 = ip0 - rep_offset1;
+            mLength = ip0[-1] == match0[-1];   /* 1 if the byte just before also matches, else 0 */
+            ip0 -= mLength;
+            match0 -= mLength;
+            offcode = REPCODE1_TO_OFFBASE;
+            mLength += 4;
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 is before the
+             * repcode (ip2). */
+            hashTable[hash1] = (U32)(ip1 - base);
+
+            goto _match;
+        }
+
+        /* load match for ip[0] */
+        if (idx >= prefixStartIndex) {
+            mval = MEM_read32(base + idx);
+        } else {
+            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
+        }
+
+        /* check match at ip[0] */
+        if (MEM_read32(ip0) == mval) {
+            /* found a match! */
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 == ip0 + 1, so
+             * we know we will resume searching after ip1 */
+            hashTable[hash1] = (U32)(ip1 - base);
+
+            goto _offset;
+        }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip3;
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        /* load match for ip[0] */
+        if (idx >= prefixStartIndex) {
+            mval = MEM_read32(base + idx);
+        } else {
+            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
+        }
+
+        /* check match at ip[0] */
+        if (MEM_read32(ip0) == mval) {
+            /* found a match! */
+
+            /* first write next hash table entry; we've already calculated it */
+            if (step <= 4) {
+                /* We need to avoid writing an index into the hash table >= the
+                 * position at which we will pick up our searching after we've
+                 * taken this match.
+                 *
+                 * The minimum possible match has length 4, so the earliest ip0
+                 * can be after we take this match will be the current ip0 + 4.
+                 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
+                 * write this position.
+                 */
+                hashTable[hash1] = (U32)(ip1 - base);
+            }
+
+            goto _offset;
+        }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip0 + step;
+        ip3 = ip1 + step;
+
+        /* calculate step */
+        if (ip2 >= nextStep) {
+            step++;
+            PREFETCH_L1(ip1 + 64);
+            PREFETCH_L1(ip1 + 128);
+            nextStep += kStepIncr;
+        }
+    } while (ip3 < ilimit);
+
+_cleanup:
+    /* Note that there are probably still a couple positions we could search.
+     * However, it seems to be a meaningful performance hit to try to search
+     * them. So let's not. */
+
+    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
+     * When the offsets are still zero, we need to restore them after the block to have a correct
+     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
+     * offsets were invalid. We need to figure out which offset to refill with.
+     *     - If both offsets are zero they are in the same order.
+     *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
+     *     - If only one is zero, we need to decide which offset to restore.
+     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
+     *         - It is impossible for rep_offset2 to be non-zero.
+     *
+     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
+     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
+     */
+    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
+    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx */
+
+    /* Compute the offset code. */
+    match0 = base + idx;
+    rep_offset2 = rep_offset1;
+    rep_offset1 = (U32)(ip0-match0);
+    offcode = OFFSET_TO_OFFBASE(rep_offset1);
+    mLength = 4;
+
+    /* Count the backwards match length. */
+    while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {   /* never steps before anchor or prefixStart */
+        ip0--;
+        match0--;
+        mLength++;
+    }
+
+_match: /* Requires: ip0, match0, offcode */
+
+    /* Count the forward length. */
+    mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
+
+    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+    ip0 += mLength;
+    anchor = ip0;
+
+    /* Fill table and check for immediate repcode. */
+    if (ip0 <= ilimit) {
+        /* Fill Table */
+        assert(base+current0+2 > istart);  /* check base overflow */
+        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+        if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
+            while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
+                { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
+                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                ip0 += rLength;
+                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
+                anchor = ip0;
+                continue;   /* faster when present (confirmed on gcc-8) ... (?) */
+    }   }   }
+
+    goto _start;   /* re-enter the search from the new ip0 */
+}
+
+#define ZSTD_GEN_FAST_FN(dictMode, mls, step)   /* emits ZSTD_compressBlock_fast_<dictMode>_<mls>_<step>() */   \
+    static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step(                                      \
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                    \
+            void const* src, size_t srcSize)                                                       \
+    {   /* forwards to the <dictMode> generic body with mls and step as literal arguments */       \
+        return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
+    }
+
+ZSTD_GEN_FAST_FN(noDict, 4, 1)   /* hasStep == 1 variants: chosen by ZSTD_compressBlock_fast() when targetLength > 1 */
+ZSTD_GEN_FAST_FN(noDict, 5, 1)
+ZSTD_GEN_FAST_FN(noDict, 6, 1)
+ZSTD_GEN_FAST_FN(noDict, 7, 1)
+
+ZSTD_GEN_FAST_FN(noDict, 4, 0)   /* hasStep == 0 variants: the generic body then uses a fixed stepSize of 2 */
+ZSTD_GEN_FAST_FN(noDict, 5, 0)
+ZSTD_GEN_FAST_FN(noDict, 6, 0)
+ZSTD_GEN_FAST_FN(noDict, 7, 0)
+
+size_t ZSTD_compressBlock_fast(   /* noDict entry point: picks a specialization from targetLength and minMatch, returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState == NULL);   /* ZSTD_compressBlock_fast_dictMatchState() asserts the opposite */
+    if (ms->cParams.targetLength > 1) {   /* hasStep == 1 variants: stepSize is derived from targetLength */
+        switch(mls)
+        {
+        default: /* includes case 3 : every value other than 5, 6 or 7 uses the mls == 4 variant */
+        case 4 :
+            return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
+        case 5 :
+            return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
+        case 6 :
+            return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
+        case 7 :
+            return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
+        }
+    } else {   /* targetLength is 0 or 1 : hasStep == 0 variants, stepSize fixed at 2 */
+        switch(mls)
+        {
+        default: /* includes case 3 : every value other than 5, 6 or 7 uses the mls == 4 variant */
+        case 4 :
+            return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
+        case 5 :
+            return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
+        case 6 :
+            return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
+        case 7 :
+            return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
+        }
+
+    }
+}
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_fast_dictMatchState_generic(   /* returns the trailing literals size, (size_t)(iend - anchor) */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],   /* rep[0] and rep[1] are read on entry and rewritten at _cleanup */
+        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)   /* mls is forwarded to ZSTD_hashPtr(); hasStep is unused here */
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 : the !(targetLength) term adds 1 only when targetLength is 0 */
+    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
+    const BYTE* anchor = istart;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
+    const U32* const dictHashTable = dms->hashTable;
+    const U32 dictStartIndex       = dms->window.dictLimit;
+    const BYTE* const dictBase     = dms->window.base;
+    const BYTE* const dictStart    = dictBase + dictStartIndex;
+    const BYTE* const dictEnd      = dms->window.nextSrc;
+    const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);   /* subtracted from a local index below prefixStartIndex to get an offset from dictBase */
+    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
+    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;   /* the extra low bits are shifted out again when indexing dictHashTable */
+
+    /* if a dictionary is still attached, it necessarily means that
+     * it is within window size. So we just check it. */
+    const U32 maxDistance = 1U << cParams->windowLog;
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+    assert(endIndex - prefixStartIndex <= maxDistance);
+    (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
+
+    (void)hasStep; /* not currently specialized on whether it's accelerated */
+
+    /* ensure there will be no underflow
+     * when translating a dict index into a local index */
+    assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
+
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashTable, hashTableBytes);
+    }
+
+    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
+    ip0 += (dictAndPrefixLength == 0);   /* start one byte later when no dictionary or prefix content precedes istart */
+    /* dictMatchState repCode checks don't currently handle repCode == 0
+     * disabling. */
+    assert(offset_1 <= dictAndPrefixLength);
+    assert(offset_2 <= dictAndPrefixLength);
+
+    /* Outer search loop */
+    assert(stepSize >= 1);
+    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
+        size_t mLength;
+        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+
+        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
+        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
+
+        U32 matchIndex = hashTable[hash0];
+        U32 curr = (U32)(ip0 - base);
+        size_t step = stepSize;
+        const size_t kStepIncr = 1 << kSearchStrength;   /* note: the noDict variant uses 1 << (kSearchStrength - 1) */
+        const BYTE* nextStep = ip0 + kStepIncr;
+
+        /* Inner search loop */
+        while (1) {
+            const BYTE* match = base + matchIndex;
+            const U32 repIndex = curr + 1 - offset_1;
+            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
+            hashTable[hash0] = curr;   /* update hash table */
+
+            if (((U32) ((prefixStartIndex - 1) - repIndex) >=
+                 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
+                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
+                ip0++;
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+                break;
+            }
+
+            if (dictTagsMatch) {
+                /* Found a possible dict match */
+                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+                const BYTE* dictMatch = dictBase + dictMatchIndex;
+                if (dictMatchIndex > dictStartIndex &&
+                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
+                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
+                    if (matchIndex <= prefixStartIndex) {
+                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
+                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
+                        while (((ip0 > anchor) & (dictMatch > dictStart))
+                            && (ip0[-1] == dictMatch[-1])) {
+                            ip0--;
+                            dictMatch--;
+                            mLength++;
+                        } /* catch up */
+                        offset_2 = offset_1;
+                        offset_1 = offset;
+                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                        break;
+                    }
+                }
+            }
+
+            if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
+                /* found a regular match */
+                U32 const offset = (U32) (ip0 - match);
+                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
+                while (((ip0 > anchor) & (match > prefixStart))
+                       && (ip0[-1] == match[-1])) {
+                    ip0--;
+                    match--;
+                    mLength++;
+                } /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                break;
+            }
+
+            /* Prepare for next iteration */
+            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
+            matchIndex = hashTable[hash1];
+
+            if (ip1 >= nextStep) {
+                step++;
+                nextStep += kStepIncr;
+            }
+            ip0 = ip1;
+            ip1 = ip1 + step;
+            if (ip1 > ilimit) goto _cleanup;   /* no match found before running out of searchable input */
+
+            curr = (U32)(ip0 - base);
+            hash0 = hash1;
+        }   /* end inner search loop */
+
+        /* match found */
+        assert(mLength);
+        ip0 += mLength;
+        anchor = ip0;
+
+        if (ip0 <= ilimit) {
+            /* Fill Table */
+            assert(base+curr+2 > istart);  /* check base overflow */
+            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
+            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+            /* check immediate repcode */
+            while (ip0 <= ilimit) {
+                U32 const current2 = (U32)(ip0-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
+                        dictBase - dictIndexDelta + repIndex2 :
+                        base + repIndex2;
+                if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional underflow : same test as the repcode check in the inner search loop */)
+                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
+                    ip0 += repLength2;
+                    anchor = ip0;
+                    continue;
+                }
+                break;
+            }
+        }
+
+        /* Prepare for next iteration */
+        assert(ip0 == anchor);
+        ip1 = ip0 + stepSize;
+    }
+
+_cleanup:
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+
+
+ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)   /* step argument is 0; the dictMatchState generic body discards hasStep with (void)hasStep */
+ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
+
+size_t ZSTD_compressBlock_fast_dictMatchState(   /* picks the dictMatchState specialization for ms->cParams.minMatch and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState != NULL);   /* the generic body dereferences ms->dictMatchState unconditionally */
+    switch(mls)
+    {
+    default: /* includes case 3 : every value other than 5, 6 or 7 uses the mls == 4 variant */
+    case 4 :
+        return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_fast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* anchor = istart;
+    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
+    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+    const U32   dictStartIndex = lowLimit;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const U32   dictLimit = ms->window.dictLimit;
+    const U32   prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    const BYTE* idxBase; /* base pointer for idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
+
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
+
+    (void)hasStep; /* not currently specialized on whether it's accelerated */
+
+    DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
+
+    /* switch to "regular" variant if extDict is invalidated due to maxDistance */
+    if (prefixStartIndex == dictStartIndex)
+        return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
+
+    {   U32 const curr = (U32)(ip0 - base);
+        U32 const maxRep = curr - dictStartIndex;
+        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+
+    /* start each op */
+_start: /* Requires: ip0 */
+
+    step = stepSize;
+    nextStep = ip0 + kStepIncr;
+
+    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+    ip1 = ip0 + 1;
+    ip2 = ip0 + step;
+    ip3 = ip2 + 1;
+
+    if (ip3 >= ilimit) {
+        goto _cleanup;
+    }
+
+    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+    idx = hashTable[hash0];
+    idxBase = idx < prefixStartIndex ? dictBase : base;
+
+    do {
+        {   /* load repcode match for ip[2] */
+            U32 const current2 = (U32)(ip2 - base);
+            U32 const repIndex = current2 - offset_1;
+            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+            U32 rval;
+            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
+                 & (offset_1 > 0) ) {
+                rval = MEM_read32(repBase + repIndex);
+            } else {
+                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
+            }
+
+            /* write back hash table entry */
+            current0 = (U32)(ip0 - base);
+            hashTable[hash0] = current0;
+
+            /* check repcode at ip[2] */
+            if (MEM_read32(ip2) == rval) {
+                ip0 = ip2;
+                match0 = repBase + repIndex;
+                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                assert((match0 != prefixStart) & (match0 != dictStart));
+                mLength = ip0[-1] == match0[-1];
+                ip0 -= mLength;
+                match0 -= mLength;
+                offcode = REPCODE1_TO_OFFBASE;
+                mLength += 4;
+                goto _match;
+        }   }
+
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip3;
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip0 + step;
+        ip3 = ip1 + step;
+
+        /* calculate step */
+        if (ip2 >= nextStep) {
+            step++;
+            PREFETCH_L1(ip1 + 64);
+            PREFETCH_L1(ip1 + 128);
+            nextStep += kStepIncr;
+        }
+    } while (ip3 < ilimit);
+
+_cleanup:
+    /* Note that there are probably still a couple positions we could search.
+     * However, it seems to be a meaningful performance hit to try to search
+     * them. So let's not. */
+
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx, idxBase */
+
+    /* Compute the offset code. */
+    {   U32 const offset = current0 - idx;
+        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
+        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
+        match0 = idxBase + idx;
+        offset_2 = offset_1;
+        offset_1 = offset;
+        offcode = OFFSET_TO_OFFBASE(offset);
+        mLength = 4;
+
+        /* Count the backwards match length. */
+        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
+            ip0--;
+            match0--;
+            mLength++;
+    }   }
+
+_match: /* Requires: ip0, match0, offcode, matchEnd */
+
+    /* Count the forward length. */
+    assert(matchEnd != 0);
+    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
+
+    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+    ip0 += mLength;
+    anchor = ip0;
+
+    /* write next hash table entry */
+    if (ip1 < ip0) {
+        hashTable[hash1] = (U32)(ip1 - base);
+    }
+
+    /* Fill table and check for immediate repcode. */
+    if (ip0 <= ilimit) {
+        /* Fill Table */
+        assert(base+current0+2 > istart);  /* check base overflow */
+        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+        while (ip0 <= ilimit) {
+            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
+            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0))  /* intentional underflow */
+                 && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
+                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
+                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                ip0 += repLength2;
+                anchor = ip0;
+                continue;
+            }
+            break;
+    }   }
+
+    goto _start;
+}
+
+ZSTD_GEN_FAST_FN(extDict, 4, 0)   /* presumably generates ZSTD_compressBlock_fast_extDict_4_0 (macro defined outside this view - TODO confirm) */
+ZSTD_GEN_FAST_FN(extDict, 5, 0)   /* likewise for the _5_0 variant */
+ZSTD_GEN_FAST_FN(extDict, 6, 0)   /* likewise for the _6_0 variant */
+ZSTD_GEN_FAST_FN(extDict, 7, 0)   /* likewise for the _7_0 variant */
+/* ZSTD_compressBlock_fast_extDict() : forwards to the extDict variant selected by ms->cParams.minMatch and returns that variant's result. */
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState == NULL);   /* this entry point expects no attached dictMatchState */
+    switch(mls)
+    {
+    default: /* includes case 3 : every value other than 5, 6 or 7 uses the _4_0 variant */
+    case 4 :
+        return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
+    }
+}
diff --git a/src/zstd/zstd_fast.h b/src/zstd/compress/zstd_fast.h
similarity index 58%
rename from src/zstd/zstd_fast.h
rename to src/zstd/compress/zstd_fast.h
index 43752a49e..9e4236b47 100644
--- a/src/zstd/zstd_fast.h
+++ b/src/zstd/compress/zstd_fast.h
@@ -11,27 +11,27 @@
 #ifndef ZSTD_FAST_H
 #define ZSTD_FAST_H
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
-#include "mem.h" /* U32 */
+#include "../common/mem.h"      /* U32 */
 #include "zstd_compress_internal.h"
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        void const * end, ZSTD_dictTableLoadMethod_e dtlm,
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm,
                         ZSTD_tableFillPurpose_e tfp);
 size_t ZSTD_compressBlock_fast(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_fast_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_fast_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
 
diff --git a/src/zstd/compress/zstd_lazy.c b/src/zstd/compress/zstd_lazy.c
new file mode 100644
index 000000000..67dd55fdb
--- /dev/null
+++ b/src/zstd/compress/zstd_lazy.c
@@ -0,0 +1,2199 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_lazy.h"
+#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
+
+#define kLazySkippingStep 8
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+/* ZSTD_updateDUBT() : registers every position from ms->nextToUpdate up to (excluding) ip as an unsorted candidate, then sets ms->nextToUpdate to ip's index. */
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
+                const BYTE* ip, const BYTE* iend,
+                U32 mls)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hashLog = cParams->hashLog;
+
+    U32* const bt = ms->chainTable;   /* the chain table storage is used as the tree : 2 U32 slots per position (see 2*(idx&btMask) below) */
+    U32  const btLog  = cParams->chainLog - 1;
+    U32  const btMask = (1 << btLog) - 1;
+
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);   /* index of ip : first position NOT inserted by this call */
+    U32 idx = ms->nextToUpdate;
+
+    if (idx != target)
+        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
+                    idx, target, ms->window.dictLimit);
+    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
+    (void)iend;   /* iend is only consumed by the assert above */
+
+    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
+    for ( ; idx < target ; idx++) {
+        size_t const h  = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
+        U32    const matchIndex = hashTable[h];   /* previous occupant of this hash bucket */
+
+        U32*   const nextCandidatePtr = bt + 2*(idx&btMask);
+        U32*   const sortMarkPtr  = nextCandidatePtr + 1;
+
+        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
+        hashTable[h] = idx;   /* Update Hash Table */
+        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
+        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;   /* second slot flags the position as not yet sorted */
+    }
+    ms->nextToUpdate = target;
+}
+
+
+/** ZSTD_insertDUBT1() :
+ *  sort one already inserted but unsorted position (curr) into the tree stored in ms->chainTable
+ *  assumption : curr >= btlow == (curr - btmask)
+ *  doesn't fail ; runs at most nbCompares iterations and stops descending once a visited node is <= btLow */
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
+                 U32 curr, const BYTE* inputEnd,
+                 U32 nbCompares, U32 btLow,
+                 const ZSTD_dictMode_e dictMode)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const bt = ms->chainTable;
+    U32  const btLog  = cParams->chainLog - 1;
+    U32  const btMask = (1 << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;   /* curr is addressed through dictBase when it is below dictLimit */
+    const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    U32* smallerPtr = bt + 2*(curr&btMask);
+    U32* largerPtr  = smallerPtr + 1;
+    U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
+    U32 dummy32;   /* to be nullified at the end */
+    U32 const windowValid = ms->window.lowLimit;
+    U32 const maxDistance = 1U << cParams->windowLog;
+    U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;   /* raise the low bound so candidates stay within maxDistance of curr */
+
+
+    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
+                curr, dictLimit, windowLow);
+    assert(curr >= btLow);
+    assert(ip < iend);   /* condition for ZSTD_count */
+
+    for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(matchIndex < curr);
+        /* note : all candidates are now supposed sorted,
+         * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
+         * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
+
+        if ( (dictMode != ZSTD_extDict)
+          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
+          || (curr < dictLimit) /* both in extDict */) {
+            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
+                                     || (matchIndex+matchLength >= dictLimit)) ?
+                                        base : dictBase;
+            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
+                 || (curr < dictLimit) );
+            match = mBase + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+        } else {
+            match = dictBase + matchIndex;   /* candidate starts in the dictBase segment while curr does not : use the 2-segment count */
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* preparation for next read of match[matchLength] */
+        }
+
+        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
+                    curr, matchIndex, (U32)matchLength);
+
+        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
+                        matchIndex, btLow, nextPtr[1]);
+            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
+                        matchIndex, btLow, nextPtr[0]);
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;   /* zero whatever both pointers designate at exit (a tree slot or dummy32) */
+}
+
+/* ZSTD_DUBT_findBetterDictMatch() : walks the tree of ms->dictMatchState looking for a match better than bestLength ; may update *offsetPtr ; returns the resulting best length. */
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_DUBT_findBetterDictMatch (
+        const ZSTD_matchState_t* ms,
+        const BYTE* const ip, const BYTE* const iend,
+        size_t* offsetPtr,
+        size_t bestLength,
+        U32 nbCompares,
+        U32 const mls,
+        const ZSTD_dictMode_e dictMode)
+{
+    const ZSTD_matchState_t * const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;   /* the dictionary state has its own parameters (hashLog, chainLog) */
+    const U32 * const dictHashTable = dms->hashTable;
+    U32         const hashLog = dmsCParams->hashLog;
+    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32               dictMatchIndex = dictHashTable[h];
+
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + ms->window.dictLimit;
+    U32         const curr = (U32)(ip-base);
+    const BYTE* const dictBase = dms->window.base;
+    const BYTE* const dictEnd = dms->window.nextSrc;
+    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
+    U32         const dictLowLimit = dms->window.lowLimit;
+    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;   /* added to a dictionary index to obtain the index used with ms (see matchIndex below) */
+
+    U32*        const dictBt = dms->chainTable;   /* only read in this function */
+    U32         const btLog  = dmsCParams->chainLog - 1;
+    U32         const btMask = (1 << btLog) - 1;
+    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
+
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+
+    (void)dictMode;   /* only consumed by the assert below */
+    assert(dictMode == ZSTD_dictMatchState);
+
+    for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
+        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match = dictBase + dictMatchIndex;
+        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+        if (dictMatchIndex+matchLength >= dictHighLimit)
+            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */
+
+        if (matchLength > bestLength) {
+            U32 matchIndex = dictMatchIndex + dictIndexDelta;
+            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {   /* 4x the length gain must exceed the ZSTD_highbit32() difference between candidate offset and current *offsetPtr */
+                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
+                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+            }
+            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
+                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+            }
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+        } else {
+            /* match is larger than current */
+            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
+            commonLengthLarger = matchLength;
+            dictMatchIndex = nextPtr[0];
+        }
+    }
+
+    if (bestLength >= MINMATCH) {   /* this block only produces a debug trace */
+        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
+        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+    }
+    return bestLength;
+
+}
+
+/* ZSTD_DUBT_findBestMatch() : sorts pending unsorted candidates of ip's hash bucket, then searches the tree while inserting curr ; returns best length found (offset code written to *offBasePtr). */
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iend,
+                        size_t* offBasePtr,
+                        U32 const mls,
+                        const ZSTD_dictMode_e dictMode)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32          matchIndex  = hashTable[h];
+
+    const BYTE* const base = ms->window.base;
+    U32    const curr = (U32)(ip-base);
+    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
+
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
+    U32    const unsortLimit = MAX(btLow, windowLow);   /* candidates at or below this index are not walked as unsorted */
+
+    U32*         nextCandidate = bt + 2*(matchIndex&btMask);
+    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+    U32          nbCompares = 1U << cParams->searchLog;
+    U32          nbCandidates = nbCompares;
+    U32          previousCandidate = 0;
+
+    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
+    assert(ip <= iend-8);   /* required for h calculation */
+    assert(dictMode != ZSTD_dedicatedDictSearch);
+
+    /* reach end of unsorted candidates list */
+    while ( (matchIndex > unsortLimit)
+         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
+         && (nbCandidates > 1) ) {
+        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
+                    matchIndex);
+        *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
+        previousCandidate = matchIndex;
+        matchIndex = *nextCandidate;
+        nextCandidate = bt + 2*(matchIndex&btMask);
+        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+        nbCandidates --;
+    }
+
+    /* nullify last candidate if it's still unsorted
+     * simplification, detrimental to compression ratio, beneficial for speed */
+    if ( (matchIndex > unsortLimit)
+      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
+        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
+                    matchIndex);
+        *nextCandidate = *unsortedMark = 0;
+    }
+
+    /* batch sort stacked candidates */
+    matchIndex = previousCandidate;
+    while (matchIndex) {  /* will end on matchIndex == 0 */
+        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
+        U32 const nextCandidateIdx = *nextCandidateIdxPtr;   /* read the reversed-chain link before the insertion below rewrites this slot */
+        ZSTD_insertDUBT1(ms, matchIndex, iend,
+                         nbCandidates, unsortLimit, dictMode);
+        matchIndex = nextCandidateIdx;
+        nbCandidates++;   /* comparison budget grows by one for the next stacked candidate */
+    }
+
+    /* find longest match */
+    {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+        const BYTE* const dictBase = ms->window.dictBase;
+        const U32 dictLimit = ms->window.dictLimit;
+        const BYTE* const dictEnd = dictBase + dictLimit;
+        const BYTE* const prefixStart = base + dictLimit;
+        U32* smallerPtr = bt + 2*(curr&btMask);
+        U32* largerPtr  = bt + 2*(curr&btMask) + 1;
+        U32 matchEndIdx = curr + 8 + 1;   /* starting value : keeps nextToUpdate (= matchEndIdx - 8, below) strictly above curr */
+        U32 dummy32;   /* to be nullified at the end */
+        size_t bestLength = 0;
+
+        matchIndex  = hashTable[h];
+        hashTable[h] = curr;   /* Update Hash Table */
+
+        for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
+            U32* const nextPtr = bt + 2*(matchIndex & btMask);
+            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+            const BYTE* match;
+
+            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
+                match = base + matchIndex;
+                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+            } else {
+                match = dictBase + matchIndex;
+                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+                if (matchIndex+matchLength >= dictLimit)
+                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+            }
+
+            if (matchLength > bestLength) {
+                if (matchLength > matchEndIdx - matchIndex)
+                    matchEndIdx = matchIndex + (U32)matchLength;   /* track the furthest index reached by any match end */
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+                    if (dictMode == ZSTD_dictMatchState) {
+                        nbCompares = 0; /* in addition to avoiding checking any
+                                         * further in this loop, make sure we
+                                         * skip checking in the dictionary. */
+                    }
+                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+                }
+            }
+
+            if (match[matchLength] < ip[matchLength]) {
+                /* match is smaller than current */
+                *smallerPtr = matchIndex;             /* update smaller idx */
+                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            } else {
+                /* match is larger than current */
+                *largerPtr = matchIndex;
+                commonLengthLarger = matchLength;
+                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                largerPtr = nextPtr;
+                matchIndex = nextPtr[0];
+        }   }
+
+        *smallerPtr = *largerPtr = 0;   /* zero whatever both pointers designate at exit (a tree slot or dummy32) */
+
+        assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
+        if (dictMode == ZSTD_dictMatchState && nbCompares) {   /* remaining comparison budget is handed to the dictionary search */
+            bestLength = ZSTD_DUBT_findBetterDictMatch(
+                    ms, ip, iend,
+                    offBasePtr, bestLength, nbCompares,
+                    mls, dictMode);
+        }
+
+        assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
+        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
+        if (bestLength >= MINMATCH) {   /* this block only produces a debug trace */
+            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
+            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
+        }
+        return bestLength;
+    }
+}
+
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match ; returns 0 without searching when ip lies before ms->nextToUpdate */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+                const BYTE* const ip, const BYTE* const iLimit,
+                      size_t* offBasePtr,
+                const U32 mls /* template */,
+                const ZSTD_dictMode_e dictMode)
+{
+    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateDUBT(ms, ip, iLimit, mls);   /* first register all positions up to ip as unsorted candidates */
+    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);   /* then sort them and search */
+}
+
+/***********************************
+* Dedicated dict search
+***********************************/
+
+void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
+{
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32* const hashTable = ms->hashTable;
+    U32* const chainTable = ms->chainTable;
+    U32 const chainSize = 1 << ms->cParams.chainLog;
+    U32 idx = ms->nextToUpdate;
+    U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
+    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32 const cacheSize = bucketSize - 1;
+    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
+    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
+
+    /* We know the hashtable is oversized by a factor of `bucketSize`.
+     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
+     * single entry. We will use the rest of the space to construct a temporary
+     * chaintable.
+     */
+    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32* const tmpHashTable = hashTable;
+    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
+    U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
+    U32 hashIdx;
+
+    assert(ms->cParams.chainLog <= 24);
+    assert(ms->cParams.hashLog > ms->cParams.chainLog);
+    assert(idx != 0);
+    assert(tmpMinChain <= minChain);
+
+    /* fill conventional hash table and conventional chain table */
+    for ( ; idx < target; idx++) {
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
+        if (idx >= tmpMinChain) {
+            tmpChainTable[idx - tmpMinChain] = hashTable[h];
+        }
+        tmpHashTable[h] = idx;
+    }
+
+    /* sort chains into ddss chain table */
+    {
+        U32 chainPos = 0;
+        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
+            U32 count;
+            U32 countBeyondMinChain = 0;
+            U32 i = tmpHashTable[hashIdx];
+            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
+                /* skip through the chain to the first position that won't be
+                 * in the hash cache bucket */
+                if (i < minChain) {
+                    countBeyondMinChain++;
+                }
+                i = tmpChainTable[i - tmpMinChain];
+            }
+            if (count == cacheSize) {
+                for (count = 0; count < chainLimit;) {
+                    if (i < minChain) {
+                        if (!i || ++countBeyondMinChain > cacheSize) {
+                            /* only allow pulling `cacheSize` number of entries
+                             * into the cache or chainTable beyond `minChain`,
+                             * to replace the entries pulled out of the
+                             * chainTable into the cache. This lets us reach
+                             * back further without increasing the total number
+                             * of entries in the chainTable, guaranteeing the
+                             * DDSS chain table will fit into the space
+                             * allocated for the regular one. */
+                            break;
+                        }
+                    }
+                    chainTable[chainPos++] = i;
+                    count++;
+                    if (i < tmpMinChain) {
+                        break;
+                    }
+                    i = tmpChainTable[i - tmpMinChain];
+                }
+            } else {
+                count = 0;
+            }
+            if (count) {
+                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
+            } else {
+                tmpHashTable[hashIdx] = 0;
+            }
+        }
+        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
+    }
+
+    /* move chain pointers into the last entry of each hash bucket */
+    for (hashIdx = (1 << hashLog); hashIdx; ) {
+        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
+        U32 const chainPackedPointer = tmpHashTable[hashIdx];
+        U32 i;
+        for (i = 0; i < cacheSize; i++) {
+            hashTable[bucketIdx + i] = 0;
+        }
+        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
+    }
+
+    /* fill the buckets of the hash table */
+    for (idx = ms->nextToUpdate; idx < target; idx++) {
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
+                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
+        U32 i;
+        /* Shift hash cache down 1. */
+        for (i = cacheSize - 1; i; i--)
+            hashTable[h + i] = hashTable[h + i - 1];
+        hashTable[h] = idx;
+    }
+
+    ms->nextToUpdate = target;
+}
+
+/* Returns the longest match length found in the dedicated dict search structure.
+ * If none are longer than the argument ml, then ml will be returned (and *offsetPtr is left untouched).
+ */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
+                                            const ZSTD_matchState_t* const dms,
+                                            const BYTE* const ip, const BYTE* const iLimit,
+                                            const BYTE* const prefixStart, const U32 curr,
+                                            const U32 dictLimit, const size_t ddsIdx) {
+    const U32 ddsLowestIndex  = dms->window.dictLimit;
+    const BYTE* const ddsBase = dms->window.base;
+    const BYTE* const ddsEnd  = dms->window.nextSrc;
+    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
+    const U32 ddsIndexDelta   = dictLimit - ddsSize;   /* added to a dictionary index before it is subtracted from curr */
+    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;   /* the last slot of a bucket holds the packed chain pointer, so at most bucketSize-1 positions are scanned */
+    U32 ddsAttempt;
+    U32 matchIndex;
+
+    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {   /* prefetch the data behind every position slot of the bucket */
+        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+    }
+
+    {
+        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+        U32 const chainIndex = chainPackedPointer >> 8;   /* upper 24 bits: start offset inside dms->chainTable */
+
+        PREFETCH_L1(&dms->chainTable[chainIndex]);
+    }
+
+    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {   /* phase 1: candidates stored directly in the hash bucket */
+        size_t currentMl=0;
+        const BYTE* match;
+        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+        match = ddsBase + matchIndex;
+
+        if (!matchIndex) {   /* a zero slot ends the search immediately: the chain below is not visited either */
+            return ml;
+        }
+
+        /* guaranteed by table construction */
+        (void)ddsLowestIndex;   /* only referenced by assert(); avoids an unused-variable warning when asserts are disabled */
+        assert(matchIndex >= ddsLowestIndex);
+        assert(match+4 <= ddsEnd);
+        if (MEM_read32(match) == MEM_read32(ip)) {
+            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+        }
+
+        /* save best solution */
+        if (currentMl > ml) {
+            ml = currentMl;
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
+            if (ip+currentMl == iLimit) {
+                /* best possible, avoids read overflow on next attempt */
+                return ml;
+            }
+        }
+    }
+
+    {   /* phase 2: remaining attempts walk the contiguous chain referenced by the bucket's last slot */
+        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+        U32 chainIndex = chainPackedPointer >> 8;
+        U32 const chainLength = chainPackedPointer & 0xFF;   /* low 8 bits: number of entries in the chain */
+        U32 const chainAttempts = nbAttempts - ddsAttempt;   /* attempts left after the bucket scan (ddsAttempt == bucketLimit <= nbAttempts here) */
+        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+        U32 chainAttempt;
+
+        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {   /* prefetch all chain candidates up front */
+            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+        }
+
+        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+            size_t currentMl=0;
+            const BYTE* match;
+            matchIndex = dms->chainTable[chainIndex];
+            match = ddsBase + matchIndex;
+
+            /* guaranteed by table construction */
+            assert(matchIndex >= ddsLowestIndex);
+            assert(match+4 <= ddsEnd);
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+            }
+
+            /* save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+        }
+    }
+    return ml;
+}
+
+
+/* *********************************
+*  Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]   /* expands to an lvalue; relies on a variable named chainTable being in scope at the use site */
+
+/* Update chains up to ip (excluded), then return the hash table entry for ip (the most recent candidate index).
+   Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertAndFindFirstIndex_internal(
+                        ZSTD_matchState_t* ms,
+                        const ZSTD_compressionParameters* const cParams,
+                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
+{
+    U32* const hashTable  = ms->hashTable;
+    const U32 hashLog = cParams->hashLog;
+    U32* const chainTable = ms->chainTable;   /* accessed through the NEXT_IN_CHAIN() macro */
+    const U32 chainMask = (1 << cParams->chainLog) - 1;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    while(idx < target) { /* catch up */
+        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];   /* link idx to the previous position that had the same hash */
+        hashTable[h] = idx;                             /* idx becomes the newest entry for this hash */
+        idx++;
+        /* Stop inserting every position when in the lazy skipping mode. */
+        if (lazySkipping)
+            break;
+    }
+
+    ms->nextToUpdate = target;   /* set to target even when lazySkipping stopped the loop after a single insertion */
+    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {   /* non-template entry point: uses ms->cParams (minMatch as mls) with lazy skipping disabled */
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
+}
+
+/* inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_HcFindBestMatch(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const chainTable = ms->chainTable;   /* accessed through the NEXT_IN_CHAIN() macro */
+    const U32 chainSize = (1 << cParams->chainLog);
+    const U32 chainMask = chainSize-1;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 curr = (U32)(ip-base);
+    const U32 maxDistance = 1U << cParams->windowLog;
+    const U32 lowestValid = ms->window.lowLimit;
+    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+    const U32 isDictionary = (ms->loadedDictEnd != 0);
+    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;   /* candidates below this index are not examined */
+    const U32 minChain = curr > chainSize ? curr - chainSize : 0;         /* chain table is indexed modulo chainSize: links at or below this index are not followed */
+    U32 nbAttempts = 1U << cParams->searchLog;   /* budget shared by the local search and the dictionary search below */
+    size_t ml=4-1;   /* a candidate is only recorded when its length exceeds ml, i.e. is at least 4 */
+
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;   /* only dereferenced in the dedicatedDictSearch and dictMatchState modes */
+    const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
+                         ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
+    const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
+                        ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;   /* index of the first slot of ip's bucket in dms->hashTable */
+
+    U32 matchIndex;
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        const U32* entry = &dms->hashTable[ddsIdx];
+        PREFETCH_L1(entry);
+    }
+
+    /* HC4 match finder */
+    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
+
+    for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {   /* bitwise & : both conditions are always evaluated, no short-circuit */
+        size_t currentMl=0;
+        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+            const BYTE* const match = base + matchIndex;
+            assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
+            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
+                currentMl = ZSTD_count(ip, match, iLimit);
+        } else {
+            const BYTE* const match = dictBase + matchIndex;
+            assert(match+4 <= dictEnd);
+            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+        }
+
+        /* save best solution */
+        if (currentMl > ml) {
+            ml = currentMl;
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+        }
+
+        if (matchIndex <= minChain) break;   /* do not follow chain links at or below minChain */
+        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+    }
+
+    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
+                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+    } else if (dictMode == ZSTD_dictMatchState) {   /* continue with the attempts left over, walking the dictionary's own hash chain */
+        const U32* const dmsChainTable = dms->chainTable;
+        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
+        const U32 dmsChainMask         = dmsChainSize - 1;
+        const U32 dmsLowestIndex       = dms->window.dictLimit;
+        const BYTE* const dmsBase      = dms->window.base;
+        const BYTE* const dmsEnd       = dms->window.nextSrc;
+        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
+        const U32 dmsIndexDelta        = dictLimit - dmsSize;   /* added to a dictionary index before it is subtracted from curr */
+        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
+
+        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
+
+        for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+            size_t currentMl=0;
+            const BYTE* const match = dmsBase + matchIndex;
+            assert(match+4 <= dmsEnd);
+            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+
+            /* save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                assert(curr > matchIndex + dmsIndexDelta);
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+
+            if (matchIndex <= dmsMinChain) break;
+
+            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
+        }
+    }
+
+    return ml;   /* stays at 3 (4-1) when no candidate of length >= 4 was found */
+}
+
+/* *********************************
+* (SIMD) Row-based matchfinder
+***********************************/
+/* Constants for row-based hash */
+#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)   /* selects the low ZSTD_ROW_HASH_TAG_BITS bits of a hash, i.e. the tag stored in the tagTable */
+#define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */
+
+#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)   /* used as `idx & mask` to index the hash cache; presumably requires a power-of-2 cache size (defined elsewhere) */
+
+typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 representing a mask of matches */
+
+/* ZSTD_VecMask_next():
+ * Starting from the LSB, returns the idx of the next non-zero bit.
+ * Basically counting the nb of trailing zeroes.
+ */
+MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+    return ZSTD_countTrailingZeros64(val);   /* NOTE(review): result for val == 0 depends on that helper (defined elsewhere) - confirm callers only pass non-zero masks */
+}
+
+/* ZSTD_row_nextIndex():
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
+ * value (stored in the first byte of the row) to reflect the update. Essentially cycles backwards from [1, {entries per row})
+ */
+FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
+    U32 next = (*tagRow-1) & rowMask;   /* step the head back by one, wrapping within the row */
+    next += (next == 0) ? rowMask : 0; /* skip first position */
+    *tagRow = (BYTE)next;              /* store the new head back into byte 0 of the row */
+    return next;
+}
+
+/* ZSTD_isAligned():
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2. Returns 1 if aligned, 0 otherwise.
+ */
+MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+    assert((align & (align - 1)) == 0);          /* power-of-2 check */
+    return (((size_t)ptr) & (align - 1)) == 0;   /* aligned when all address bits below `align` are zero */
+}
+
+/* ZSTD_row_prefetch():
+ * Performs prefetching for the hashTable and tagTable at a given row (relRow is the index of the row's first entry in both tables).
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
+    PREFETCH_L1(hashTable + relRow);
+    if (rowLog >= 5) {
+        PREFETCH_L1(hashTable + relRow + 16);   /* 16 U32 entries = 64 bytes further into the row */
+        /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
+    }
+    PREFETCH_L1(tagTable + relRow);
+    if (rowLog == 6) {
+        PREFETCH_L1(tagTable + relRow + 32);   /* second half of a 64-byte tag row */
+    }
+    assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
+    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
+    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (16,32,64) */
+}
+
+/* ZSTD_row_fillHashCache():
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
+ * but not beyond iLimit (the position at iLimit itself is still hashed).
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+                                   U32 const rowLog, U32 const mls,
+                                   U32 idx, const BYTE* const iLimit)
+{
+    U32 const* const hashTable = ms->hashTable;
+    BYTE const* const tagTable = ms->tagTable;
+    U32 const hashLog = ms->rowHashLog;
+    U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);   /* number of positions from base+idx through iLimit, inclusive */
+    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
+
+    for (; idx < lim; ++idx) {
+        U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
+        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;   /* drop the tag bits, then scale the row number to an entry index */
+        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;         /* cache slot is selected by the position modulo the cache size */
+    }
+
+    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
+                                                     ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
+                                                     ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
+}
+
+/* ZSTD_row_nextCachedHash():
+ * Returns the cached hash of base + idx, and replaces that entry of the hash cache with the hash of the position at
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the hashTable and tagTable rows of that newly hashed position.
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+                                                  BYTE const* tagTable, BYTE const* base,
+                                                  U32 idx, U32 const hashLog,
+                                                  U32 const rowLog, U32 const mls,
+                                                  U64 const hashSalt)
+{
+    U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);   /* prefetch for the look-ahead position, not for idx itself */
+    {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];   /* read the old value before the slot is overwritten */
+        cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
+        return hash;
+    }
+}
+
+/* ZSTD_row_update_internalImpl():
+ * Updates the hash table and tag table with positions starting from updateStartIdx until updateEndIdx (excluded).
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
+                                  U32 updateStartIdx, U32 const updateEndIdx,
+                                  U32 const mls, U32 const rowLog,
+                                  U32 const rowMask, U32 const useCache)
+{
+    U32* const hashTable = ms->hashTable;
+    BYTE* const tagTable = ms->tagTable;
+    U32 const hashLog = ms->rowHashLog;
+    const BYTE* const base = ms->window.base;
+
+    DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
+    for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
+        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
+                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;   /* index of the row's first entry, shared by hashTable and tagTable */
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = tagTable + relRow;
+        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);   /* also moves the row's head */
+
+        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));   /* the cached hash must equal a freshly computed one */
+        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
+        row[pos] = updateStartIdx;
+    }
+}
+
+/* ZSTD_row_update_internal():
+ * Inserts the positions from ms->nextToUpdate up to (but not including) ip into the hash table, and updates ms->nextToUpdate.
+ * Skips sections of long matches as is necessary.
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+                              U32 const mls, U32 const rowLog,
+                              U32 const rowMask, U32 const useCache)
+{
+    U32 idx = ms->nextToUpdate;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    const U32 kSkipThreshold = 384;                    /* gaps larger than this are only partially inserted */
+    const U32 kMaxMatchStartPositionsToUpdate = 96;    /* positions inserted at the start of a skipped gap */
+    const U32 kMaxMatchEndPositionsToUpdate = 32;      /* positions inserted at the end of a skipped gap */
+
+    if (useCache) {
+        /* Only skip positions when using hash cache, i.e.
+         * if we are loading a dict, don't skip anything.
+         * If we decide to skip, then we only update a set number
+         * of positions at the beginning and end of the match.
+         */
+        if (UNLIKELY(target - idx > kSkipThreshold)) {
+            U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
+            ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
+            idx = target - kMaxMatchEndPositionsToUpdate;
+            ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);   /* the cache must be rebuilt for the new idx after the jump */
+        }
+    }
+    assert(target >= idx);
+    ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
+    ms->nextToUpdate = target;
+}
+
+/* ZSTD_row_update():
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
+ * processing. Derives rowLog and mls from ms->cParams and never uses the hash cache.
+ */
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);   /* presumably clamps searchLog into [4, 6] (BOUNDED is defined elsewhere) */
+    const U32 rowMask = (1u << rowLog) - 1;
+    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+}
+
+/* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster. The result is 1 unless the little-endian NEON path is compiled in.
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    (void)rowEntries;   /* avoids an unused-parameter warning when asserts are disabled and NEON is not available */
+#if defined(ZSTD_ARCH_ARM_NEON)
+    /* NEON path only works for little endian */
+    if (!MEM_isLittleEndian()) {
+        return 1;
+    }
+    if (rowEntries == 16) {
+        return 4;   /* 16 entries x 4 bits fill a 64-bit mask */
+    }
+    if (rowEntries == 32) {
+        return 2;   /* 32 entries x 2 bits fill a 64-bit mask */
+    }
+    if (rowEntries == 64) {
+        return 1;
+    }
+#endif
+    return 1;
+}
+
+#if defined(ZSTD_ARCH_X86_SSE2)   /* SSE2 helper: one bit per tag byte, 16 tags per 128-bit chunk */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
+{
+    const __m128i comparisonMask = _mm_set1_epi8((char)tag);   /* tag replicated into all 16 byte lanes */
+    int matches[4] = {0};
+    int i;
+    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
+    for (i=0; i<nbChunks; i++) {
+        const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));   /* unaligned 16-byte load */
+        const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
+        matches[i] = _mm_movemask_epi8(equalMask);   /* 16-bit mask: bit k set when byte k of the chunk equals tag */
+    }
+    if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);   /* rotation width equals the row size (16/32/64) */
+    if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
+    assert(nbChunks == 4);
+    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
+}
+#endif
+
+#if defined(ZSTD_ARCH_ARM_NEON)   /* NEON helper: produces 4, 2 or 1 mask bits per tag byte for 16, 32 or 64 entries */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    if (rowEntries == 16) {
+        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
+         * After that groups of 4 bits represent the equalMask. We lower
+         * all bits except the highest in these groups by doing AND with
+         * 0x88 = 0b10001000.
+         */
+        const uint8x16_t chunk = vld1q_u8(src);
+        const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+        const uint8x8_t res = vshrn_n_u16(equalMask, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;   /* rotate first, then keep one bit per 4-bit group */
+    } else if (rowEntries == 32) {
+        /* Same idea as with rowEntries == 16 but doing AND with
+         * 0x55 = 0b01010101.
+         */
+        const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);   /* de-interleaving load of 32 bytes as 16-bit elements */
+        const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+        const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
+        const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
+        const uint8x8_t res = vsli_n_u8(t0, t1, 4);   /* t1 shifted left by 4 and inserted above the low 4 bits of t0 */
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;   /* keep one bit per 2-bit group */
+    } else { /* rowEntries == 64 */
+        const uint8x16x4_t chunk = vld4q_u8(src);   /* de-interleaving load of 64 bytes into 4 vectors */
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+        const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+        const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+        const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+        const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);   /* the shift-right-insert steps below merge the four comparison results bit by bit */
+        const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+        const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+        const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+        const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);   /* narrow 128 bits down to the final 64-bit mask */
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped);   /* group width is 1 here, so no extra masking is applied */
+    }
+}
+#endif
+
+/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
+ * matches the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
+ * must rotate the "matches" bitfield to match up with the actual layout of the
+ * entries within the hashTable */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
+{
+    const BYTE* const src = tagRow;
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);   /* the whole mask must fit in 64 bits */
+
+#if defined(ZSTD_ARCH_X86_SSE2)
+
+    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);   /* one 16-byte chunk per 16 entries */
+
+#else /* SW or NEON-LE */
+
+# if defined(ZSTD_ARCH_ARM_NEON)
+  /* This NEON path only works for little endian - otherwise use SWAR below */
+    if (MEM_isLittleEndian()) {
+        return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
+    }
+# endif /* ZSTD_ARCH_ARM_NEON */
+    /* SWAR */
+    {   const int chunkSize = sizeof(size_t);   /* tags are processed one machine word (4 or 8 bytes) at a time */
+        const size_t shiftAmount = ((chunkSize * 8) - chunkSize);   /* moves the top chunkSize bits of a word down to the bottom */
+        const size_t xFF = ~((size_t)0);
+        const size_t x01 = xFF / 0xFF;   /* 0x01 repeated in every byte */
+        const size_t x80 = x01 << 7;     /* 0x80 repeated in every byte */
+        const size_t splatChar = tag * x01;   /* tag repeated in every byte */
+        ZSTD_VecMask matches = 0;
+        int i = rowEntries - chunkSize;   /* start with the last word of the row and walk backwards */
+        assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
+        if (MEM_isLittleEndian()) { /* runtime check so have two loops */
+            const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
+            do {
+                size_t chunk = MEM_readST(&src[i]);
+                chunk ^= splatChar;   /* bytes equal to tag become 0 */
+                chunk = (((chunk | x80) - x01) | chunk) & x80;   /* high bit of each byte is set iff that byte is non-zero (no match) */
+                matches <<= chunkSize;
+                matches |= (chunk * extractMagic) >> shiftAmount;   /* gather the per-byte high bits into chunkSize adjacent bits */
+                i -= chunkSize;
+            } while (i >= 0);
+        } else { /* big endian: reverse bits during extraction */
+            const size_t msb = xFF ^ (xFF >> 1);
+            const size_t extractMagic = (msb / 0x1FF) | msb;
+            do {
+                size_t chunk = MEM_readST(&src[i]);
+                chunk ^= splatChar;
+                chunk = (((chunk | x80) - x01) | chunk) & x80;
+                matches <<= chunkSize;
+                matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
+                i -= chunkSize;
+            } while (i >= 0);
+        }
+        matches = ~matches;   /* the loops flagged non-matching bytes; invert so that set bits mean "tag matches" */
+        if (rowEntries == 16) {
+            return ZSTD_rotateRight_U16((U16)matches, headGrouped);   /* the casts also discard the inverted bits above rowEntries */
+        } else if (rowEntries == 32) {
+            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
+        } else {
+            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
+        }
+    }
+#endif
+}
+
+/* The high-level approach of the SIMD row based match finder is as follows:
+ * - Figure out where to insert the new entry:
+ *      - Generate a hash for current input position and split it into one byte of tag and `rowHashLog` bits of index.
+ *           - The hash is salted by a value that changes on every context reset, so when the same table is used
+ *             we will avoid collisions that would otherwise slow us down by introducing phantom matches.
+ *      - The hashTable is effectively split into groups or "rows" of 15, 31 or 63 entries of U32, and the index determines
+ *        which row to insert into.
+ *      - Determine the correct position within the row to insert the entry into. Each row of 15, 31 or 63 can
+ *        be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16, 32 or 64 bytes
+ *        per row).
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
+ *   generate a bitfield that we can cycle through to check the collisions in the hash table.
+ * - Pick the longest match.
+ * - Insert the tag into the equivalent row and position in the tagTable.
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_RowFindBestMatch(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,  /* out: receives the offBase of each longer match recorded below */
+                        const U32 mls, const ZSTD_dictMode_e dictMode,
+                        const U32 rowLog)
+{
+    U32* const hashTable = ms->hashTable;
+    BYTE* const tagTable = ms->tagTable;
+    U32* const hashCache = ms->hashCache;
+    const U32 hashLog = ms->rowHashLog;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 curr = (U32)(ip-base);
+    const U32 maxDistance = 1U << cParams->windowLog;
+    const U32 lowestValid = ms->window.lowLimit;
+    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+    const U32 isDictionary = (ms->loadedDictEnd != 0);
+    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+    const U32 rowEntries = (1U << rowLog);
+    const U32 rowMask = rowEntries - 1;
+    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
+    const U64 hashSalt = ms->hashSalt;
+    U32 nbAttempts = 1U << cappedSearchLog;
+    size_t ml=4-1;  /* best length so far; starts at 3 so a candidate must reach 4 bytes to be recorded */
+    U32 hash;
+
+    /* DMS/DDS variables that may be referenced later */
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+
+    /* Initialize the following variables to satisfy static analyzer */
+    size_t ddsIdx = 0;
+    U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+    U32 dmsTag = 0;
+    U32* dmsRow = NULL;
+    BYTE* dmsTagRow = NULL;
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+        {   /* Prefetch DDS hashtable entry */
+            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+            PREFETCH_L1(&dms->hashTable[ddsIdx]);
+        }
+        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
+    }
+
+    if (dictMode == ZSTD_dictMatchState) {
+        /* Prefetch DMS rows */
+        U32* const dmsHashTable = dms->hashTable;
+        BYTE* const dmsTagTable = dms->tagTable;
+        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
+        dmsRow = dmsHashTable + dmsRelRow;
+        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
+    }
+
+    /* Update the hashTable and tagTable up to (but not including) ip */
+    if (!ms->lazySkipping) {
+        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+    } else {
+        /* Stop inserting every position when in the lazy skipping mode.
+         * The hash cache is also not kept up to date in this mode.
+         */
+        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+        ms->nextToUpdate = curr;
+    }
+    ms->hashSaltEntropy += hash; /* collect salt entropy */
+
+    {   /* Get the hash for ip, compute the appropriate row */
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = (BYTE*)(tagTable + relRow);
+        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
+        U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+        size_t numMatches = 0;
+        size_t currMatch = 0;
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
+
+        /* Cycle through the matches and prefetch */
+        for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
+            U32 const matchIndex = row[matchPos];
+            if(matchPos == 0) continue;  /* byte 0 of the tag row is read as the head index (see headGrouped), so a hit there is skipped */
+            assert(numMatches < rowEntries);
+            if (matchIndex < lowLimit)
+                break;
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                PREFETCH_L1(base + matchIndex);
+            } else {
+                PREFETCH_L1(dictBase + matchIndex);
+            }
+            matchBuffer[numMatches++] = matchIndex;
+            --nbAttempts;
+        }
+
+        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+           in ZSTD_row_update_internal() at the next search. */
+        {
+            U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+            tagRow[pos] = (BYTE)tag;
+            row[pos] = ms->nextToUpdate++;
+        }
+
+        /* Return the longest match */
+        for (; currMatch < numMatches; ++currMatch) {
+            U32 const matchIndex = matchBuffer[currMatch];
+            size_t currentMl=0;
+            assert(matchIndex < curr);
+            assert(matchIndex >= lowLimit);
+
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                const BYTE* const match = base + matchIndex;
+                assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
+                /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+                if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
+                    currentMl = ZSTD_count(ip, match, iLimit);
+            } else {
+                const BYTE* const match = dictBase + matchIndex;
+                assert(match+4 <= dictEnd);
+                if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+            }
+
+            /* Save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+        }
+    }
+
+    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
+                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+    } else if (dictMode == ZSTD_dictMatchState) {
+        /* TODO: Measure and potentially add prefetching to DMS */
+        const U32 dmsLowestIndex       = dms->window.dictLimit;
+        const BYTE* const dmsBase      = dms->window.base;
+        const BYTE* const dmsEnd       = dms->window.nextSrc;
+        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
+        const U32 dmsIndexDelta        = dictLimit - dmsSize;  /* added to a DMS index before subtracting it from curr (see offset computation below) */
+
+        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
+            U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+            size_t numMatches = 0;
+            size_t currMatch = 0;
+            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
+
+            for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
+                U32 const matchIndex = dmsRow[matchPos];
+                if(matchPos == 0) continue;  /* same rule as the main table: byte 0 of the tag row is read as the head index */
+                if (matchIndex < dmsLowestIndex)
+                    break;
+                PREFETCH_L1(dmsBase + matchIndex);
+                matchBuffer[numMatches++] = matchIndex;
+                --nbAttempts;
+            }
+
+            /* Return the longest match */
+            for (; currMatch < numMatches; ++currMatch) {
+                U32 const matchIndex = matchBuffer[currMatch];
+                size_t currentMl=0;
+                assert(matchIndex >= dmsLowestIndex);
+                assert(matchIndex < curr);
+
+                {   const BYTE* const match = dmsBase + matchIndex;
+                    assert(match+4 <= dmsEnd);
+                    if (MEM_read32(match) == MEM_read32(ip))
+                        currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+                }
+
+                if (currentMl > ml) {
+                    ml = currentMl;
+                    assert(curr > matchIndex + dmsIndexDelta);
+                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
+                    if (ip+currentMl == iLimit) break;
+                }
+            }
+        }
+    }
+    return ml;
+}
+
+
+/**
+ * Generate search functions templated on (dictMode, mls, rowLog).
+ * These functions are outlined for code size & compilation time.
+ * ZSTD_searchMax() dispatches to the correct implementation function.
+ *
+ * TODO: The start of the search function involves loading and calculating a
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
+ * done in an initialization function, and saved somewhere in the match state.
+ * Then we could pass a pointer to the saved state instead of the match state,
+ * and avoid duplicate computations.
+ *
+ * TODO: Move the match re-winding into searchMax. This improves compression
+ * ratio, and unlocks further simplifications with the next TODO.
+ *
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
+ * and repcode search are in searchMax, there is no more logic in the match
+ * finder loop that requires knowledge about the dictMode. So we should be
+ * able to avoid force inlining it, and we can join the extDict loop with
+ * the single segment loop. It should go in searchMax instead of its own
+ * function to avoid having multiple virtual function calls per search.
+ */
+
+#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls  /* e.g. ZSTD_BtFindBestMatch_noDict_4 */
+#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls  /* e.g. ZSTD_HcFindBestMatch_noDict_4 */
+#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog  /* e.g. ZSTD_RowFindBestMatch_noDict_4_5 */
+
+#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE  /* attribute prefix placed on every generated search function */
+
+#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls)                                           \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)(                      \
+            ZSTD_matchState_t* ms,                                                     \
+            const BYTE* ip, const BYTE* const iLimit,                                  \
+            size_t* offBasePtr)                                                        \
+    {                                                                                  \
+        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                           \
+        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
+    }                                                                                  \
+
+#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls)                                          \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)(                     \
+            ZSTD_matchState_t* ms,                                                    \
+            const BYTE* ip, const BYTE* const iLimit,                                 \
+            size_t* offsetPtr)                                                        \
+    {                                                                                 \
+        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                          \
+        return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
+    }                                                                                 \
+
+#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)                                          \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(                     \
+            ZSTD_matchState_t* ms,                                                             \
+            const BYTE* ip, const BYTE* const iLimit,                                          \
+            size_t* offsetPtr)                                                                 \
+    {                                                                                          \
+        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                                   \
+        assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog);                               \
+        return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
+    }                                                                                          \
+
+#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
+    X(dictMode, mls, 4)                        \
+    X(dictMode, mls, 5)                        \
+    X(dictMode, mls, 6)
+
+#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
+    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4)      \
+    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5)      \
+    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
+
+#define ZSTD_FOR_EACH_MLS(X, dictMode) \
+    X(dictMode, 4)                     \
+    X(dictMode, 5)                     \
+    X(dictMode, 6)
+
+#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
+    X(__VA_ARGS__, noDict)              \
+    X(__VA_ARGS__, extDict)             \
+    X(__VA_ARGS__, dictMatchState)      \
+    X(__VA_ARGS__, dedicatedDictSearch)  /* X receives the forwarded arguments first and the dict mode last */
+
+/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
+/* Generate binary tree search fns for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
+/* Generate hash chain search fns for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
+
+typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;  /* match finder selector switched on by ZSTD_SWITCH_SEARCH_METHOD */
+
+#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls)                         \
+    case mls:                                                             \
+        return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);  /* ms, ip, iend, offsetPtr must be in scope at the expansion site */
+#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls)                         \
+    case mls:                                                             \
+        return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog)                         \
+    case rowLog:                                                                   \
+        return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
+
+#define ZSTD_SWITCH_MLS(X, dictMode)   \
+    switch (mls) {                     \
+        ZSTD_FOR_EACH_MLS(X, dictMode) \
+    }
+
+#define ZSTD_SWITCH_ROWLOG(dictMode, mls)                                    \
+    case mls:                                                                \
+        switch (rowLog) {                                                    \
+            ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
+        }                                                                    \
+        ZSTD_UNREACHABLE;                                                    \
+        break;
+
+#define ZSTD_SWITCH_SEARCH_METHOD(dictMode)                       \
+    switch (searchMethod) {                                       \
+        case search_hashChain:                                    \
+            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
+            break;                                                \
+        case search_binaryTree:                                   \
+            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
+            break;                                                \
+        case search_rowHash:                                      \
+            ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode)         \
+            break;                                                \
+    }                                                             \
+    ZSTD_UNREACHABLE;  /* every generated case label returns from inside its switch */
+
+/**
+ * Searches for the longest match at @p ip.
+ * Dispatches to the correct implementation function based on the
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
+ * here instead of using an indirect function call through a function
+ * pointer because after Spectre and Meltdown mitigations, indirect
+ * function calls can be very costly, especially in the kernel.
+ *
+ * NOTE: dictMode and searchMethod should be templated, so those switch
+ * statements should be optimized out. Only the mls & rowLog switches
+ * should be left.
+ *
+ * @param ms The match state.
+ * @param ip The position to search at.
+ * @param iend The end of the input data.
+ * @param[out] offsetPtr Stores the match offset into this pointer.
+ * @param mls The minimum search length, in the range [4, 6].
+ * @param rowLog The row log (if applicable), in the range [4, 6].
+ * @param searchMethod The search method to use (templated).
+ * @param dictMode The dictMode (templated).
+ *
+ * @returns The length of the longest match found, or < mls if no match is found.
+ * If a match is found its offset is stored in @p offsetPtr.
+ */
+FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
+    ZSTD_matchState_t* ms,
+    const BYTE* ip,
+    const BYTE* iend,
+    size_t* offsetPtr,
+    U32 const mls,
+    U32 const rowLog,
+    searchMethod_e const searchMethod,
+    ZSTD_dictMode_e const dictMode)
+{
+    if (dictMode == ZSTD_noDict) {  /* one branch per dict mode; each expands to a switch over searchMethod, then mls (and rowLog) */
+        ZSTD_SWITCH_SEARCH_METHOD(noDict)
+    } else if (dictMode == ZSTD_extDict) {
+        ZSTD_SWITCH_SEARCH_METHOD(extDict)
+    } else if (dictMode == ZSTD_dictMatchState) {
+        ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
+    } else if (dictMode == ZSTD_dedicatedDictSearch) {
+        ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
+    }
+    ZSTD_UNREACHABLE;
+    return 0;  /* follows ZSTD_UNREACHABLE; presumably present only so that every path returns a value */
+}
+
+/* *******************************
+*  Common parser - lazy strategy
+*********************************/
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_lazy_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        const void* src, size_t srcSize,
+                        const searchMethod_e searchMethod, const U32 depth,  /* depth 0 stores the first match found; >=1 and ==2 enable the lookahead searches below */
+                        ZSTD_dictMode_e const dictMode)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
+    const BYTE* const base = ms->window.base;
+    const U32 prefixLowestIndex = ms->window.dictLimit;
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
+
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    const int isDMS = dictMode == ZSTD_dictMatchState;
+    const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
+    const int isDxS = isDMS || isDDS;
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32 dictLowestIndex      = isDxS ? dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = isDxS ? dms->window.base : NULL;
+    const BYTE* const dictLowest   = isDxS ? dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd      = isDxS ? dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = isDxS ?
+                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
+                                     0;
+    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
+
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
+    ip += (dictAndPrefixLength == 0);  /* with no prefix and no dictionary content before ip, start one byte in */
+    if (dictMode == ZSTD_noDict) {
+        U32 const curr = (U32)(ip - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
+        U32 const maxRep = curr - windowLow;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+    if (isDxS) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
+
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
+    if (searchMethod == search_rowHash) {
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+    }
+
+    /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
+     */
+    __asm__(".p2align 5");
+#endif
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offBase = REPCODE1_TO_OFFBASE;
+        const BYTE* start=ip+1;
+        DEBUGLOG(7, "search baseline (depth 0)");
+
+        /* check repCode */
+        if (isDxS) {
+            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
+            const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
+                                && repIndex < prefixLowestIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                if (depth==0) goto _storeSequence;
+            }
+        }
+        if ( dictMode == ZSTD_noDict
+          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            if (depth==0) goto _storeSequence;
+        }
+
+        /* first search (depth 0) */
+        {   size_t offbaseFound = 999999999;  /* placeholder; only copied into offBase when ml2 > matchLength */
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offBase = offbaseFound;
+        }
+
+        if (matchLength < 4) {
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */;
+            ip += step;
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            DEBUGLOG(7, "search depth 1");
+            ip ++;
+            if ( (dictMode == ZSTD_noDict)
+              && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                int const gain2 = (int)(mlRep * 3);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+                if ((mlRep >= 4) && (gain2 > gain1))
+                    matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+            }
+            if (isDxS) {
+                const U32 repIndex = (U32)(ip - base) - offset_1;
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                    int const gain2 = (int)(mlRep * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                }
+            }
+            {   size_t ofbCandidate=999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                DEBUGLOG(7, "search depth 2");
+                ip ++;
+                if ( (dictMode == ZSTD_noDict)
+                  && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                    int const gain2 = (int)(mlRep * 4);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                }
+                if (isDxS) {
+                    const U32 repIndex = (U32)(ip - base) - offset_1;
+                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+                    if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                        int const gain2 = (int)(mlRep * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+                        if ((mlRep >= 4) && (gain2 > gain1))
+                            matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                    }
+                }
+                {   size_t ofbCandidate=999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* NOTE:
+         * Pay attention that `start[-value]` can lead to strange undefined behavior
+         * notably if `value` is unsigned, resulting in a large positive `-value`.
+         */
+        /* catch up */
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            if (dictMode == ZSTD_noDict) {
+                while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+                     && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) )  /* only search for offset within prefix */
+                    { start--; matchLength++; }
+            }
+            if (isDxS) {
+                U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
+                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
+                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
+                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
+            }
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
+        }
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = (size_t)(start - anchor);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
+            anchor = ip = start + matchLength;
+        }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
+
+        /* check immediate repcode */
+        if (isDxS) {
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex = current2 - offset_2;
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                        dictBase - dictIndexDelta + repIndex :
+                        base + repIndex;
+                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional underflow */)
+                   && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
+                    offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+                    ip += matchLength;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+            }
+        }
+
+        if (dictMode == ZSTD_noDict) {
+            while ( ((ip <= ilimit) & (offset_2>0))
+                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+                /* store sequence */
+                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
+
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+#endif /* build exclusions */
+
+
+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{   /* lazy parser at depth 0, hash-chain search, no dictionary */
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState(   /* greedy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 0, ZSTD_dictMatchState) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(   /* greedy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 0, ZSTD_dedicatedDictSearch) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_greedy_row(   /* greedy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 0, ZSTD_noDict) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(   /* greedy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 0, ZSTD_dictMatchState) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(   /* greedy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 0, ZSTD_dedicatedDictSearch) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy(   /* lazy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 1, ZSTD_noDict) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState(   /* lazy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 1, ZSTD_dictMatchState) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(   /* lazy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 1, ZSTD_dedicatedDictSearch) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_lazy_row(   /* lazy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 1, ZSTD_noDict) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(   /* lazy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 1, ZSTD_dictMatchState) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(   /* lazy entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 1, ZSTD_dedicatedDictSearch) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy2(   /* lazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 2, ZSTD_noDict) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState(   /* lazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 2, ZSTD_dictMatchState) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(   /* lazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_hashChain, 2, ZSTD_dedicatedDictSearch) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_lazy2_row(   /* lazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 2, ZSTD_noDict) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(   /* lazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 2, ZSTD_dictMatchState) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(   /* lazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_rowHash, 2, ZSTD_dedicatedDictSearch) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btlazy2(   /* btlazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_binaryTree, 2, ZSTD_noDict) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(   /* btlazy2 entry point: forwards to ZSTD_compressBlock_lazy_generic with (search_binaryTree, 2, ZSTD_dictMatchState) and returns its result */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+}
+#endif
+
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
+FORCE_INLINE_TEMPLATE   /* extDict block compressor shared by the greedy/lazy/lazy2/btlazy2 *_extDict entry points of this file */
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_lazy_extDict_generic(   /* returns the size of the trailing literals (iend - anchor) */
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,   /* ms: window, parameters and lazySkipping flag; seqStore: receives the sequences */
+                        U32 rep[ZSTD_REP_NUM],   /* in/out: rep[0] and rep[1] are read on entry and rewritten on exit */
+                        const void* src, size_t srcSize,   /* the block to compress */
+                        const searchMethod_e searchMethod, const U32 depth)   /* depth: 0 keeps the first match; >=1 retries at the next position; 2 retries one position further */
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;   /* start of the literals not yet stored */
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;   /* row-hash mode stops ZSTD_ROW_HASH_CACHE_SIZE bytes earlier */
+    const BYTE* const base = ms->window.base;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const dictEnd  = dictBase + dictLimit;
+    const BYTE* const dictStart  = dictBase + ms->window.lowLimit;
+    const U32 windowLog = ms->cParams.windowLog;
+    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);   /* NOTE(review): BOUNDED presumably clamps into [4,6] - macro not visible here, confirm */
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
+
+    U32 offset_1 = rep[0], offset_2 = rep[1];   /* working copies of the two repeat offsets */
+
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
+
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
+    /* init */
+    ip += (ip == prefixStart);   /* skip the first byte when the block starts exactly at prefixStart */
+    if (searchMethod == search_rowHash) {
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+    }
+
+    /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+     */
+    __asm__(".p2align 5");
+#endif
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offBase = REPCODE1_TO_OFFBASE;   /* stays a repcode unless a searched match wins below */
+        const BYTE* start=ip+1;   /* the repcode check below looks at ip+1 */
+        U32 curr = (U32)(ip-base);
+
+        /* check repCode */
+        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
+            const U32 repIndex = (U32)(curr+1 - offset_1);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;   /* indices below dictLimit are read through dictBase */
+            const BYTE* const repMatch = repBase + repIndex;
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
+               & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
+            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                if (depth==0) goto _storeSequence;   /* depth 0: store the repcode match without searching further */
+        }   }
+
+        /* first search (depth 0) */
+        {   size_t ofbCandidate = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+            if (ml2 > matchLength)   /* a longer searched match replaces the repcode candidate */
+                matchLength = ml2, start = ip, offBase = ofbCandidate;
+        }
+
+        if (matchLength < 4) {
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);   /* step grows with the distance since the last stored sequence */
+            ip += step + 1;   /* jump faster over incompressible sections */
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            curr++;
+            /* check repCode */
+            if (offBase) {
+                const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                const U32 repIndex = (U32)(curr - offset_1);
+                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                const BYTE* const repMatch = repBase + repIndex;
+                if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
+                   & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+                if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                    /* repcode detected */
+                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                    int const gain2 = (int)(repLength * 3);   /* score of the repcode candidate */
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);   /* score of the current best */
+                    if ((repLength >= 4) && (gain2 > gain1))
+                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
+            }   }
+
+            /* search match, depth 1 */
+            {   size_t ofbCandidate = 999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                curr++;
+                /* check repCode */
+                if (offBase) {
+                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                    const U32 repIndex = (U32)(curr - offset_1);
+                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                    const BYTE* const repMatch = repBase + repIndex;
+                    if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
+                       & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                        /* repcode detected */
+                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                        int const gain2 = (int)(repLength * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+                        if ((repLength >= 4) && (gain2 > gain1))
+                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                }   }
+
+                /* search match, depth 2 */
+                {   size_t ofbCandidate = 999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* catch up */
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
+            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;   /* lower bound of the segment the match lies in */
+            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* extend the match backwards while the preceding bytes agree */
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);   /* push the new offset into the repeat-offset history */
+        }
+
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = (size_t)(start - anchor);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
+            anchor = ip = start + matchLength;   /* resume right after the stored match */
+        }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
+
+        /* check immediate repcode */
+        while (ip <= ilimit) {
+            const U32 repCurrent = (U32)(ip-base);
+            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
+            const U32 repIndex = repCurrent - offset_2;
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
+               & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+            if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);   /* literal length 0: the match starts at anchor */
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+            break;
+    }   }
+
+    /* Save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+#endif /* build exclusions */
+
+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_greedy_extDict(   /* calls ZSTD_compressBlock_lazy_extDict_generic with searchMethod=search_hashChain, depth=0; returns the last literals size */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
+}
+
+size_t ZSTD_compressBlock_greedy_extDict_row(   /* calls ZSTD_compressBlock_lazy_extDict_generic with searchMethod=search_rowHash, depth=0; returns the last literals size */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy_extDict(   /* calls ZSTD_compressBlock_lazy_extDict_generic with searchMethod=search_hashChain, depth=1; returns the last literals size */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict_row(   /* calls ZSTD_compressBlock_lazy_extDict_generic with searchMethod=search_rowHash, depth=1; returns the last literals size */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy2_extDict(   /* calls ZSTD_compressBlock_lazy_extDict_generic with searchMethod=search_hashChain, depth=2; returns the last literals size */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict_row(   /* calls ZSTD_compressBlock_lazy_extDict_generic with searchMethod=search_rowHash, depth=2; returns the last literals size */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btlazy2_extDict(   /* calls ZSTD_compressBlock_lazy_extDict_generic with searchMethod=search_binaryTree, depth=2; returns the last literals size */
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
+}
+#endif
diff --git a/src/zstd/compress/zstd_lazy.h b/src/zstd/compress/zstd_lazy.h
new file mode 100644
index 000000000..3635813bd
--- /dev/null
+++ b/src/zstd/compress/zstd_lazy.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LAZY_H
+#define ZSTD_LAZY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"
+
+/**
+ * Dedicated Dictionary Search Structure bucket log. In the
+ * ZSTD_dedicatedDictSearch mode, each hashTable bucket holds
+ * 2 ** ZSTD_LAZY_DDSS_BUCKET_LOG entries, rather than just
+ * one.
+ */
+#define ZSTD_LAZY_DDSS_BUCKET_LOG 2
+
+#define ZSTD_ROW_HASH_TAG_BITS 8        /* number of bits used for the tag */
+
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
+
+void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
+
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). Preemptively increases the value of ZSTD_DUBT_UNSORTED_MARK */
+#endif   /* helpers declared only when at least one of the greedy/lazy/lazy2/btlazy2 compressors is built */
+
+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR   /* greedy entry points, then ZSTD_COMPRESSBLOCK_GREEDY* aliases that name them */
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy
+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row
+#else   /* greedy compressor excluded from the build: every alias expands to NULL */
+#define ZSTD_COMPRESSBLOCK_GREEDY NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL
+#endif   /* ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR */
+
+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR   /* lazy entry points, then ZSTD_COMPRESSBLOCK_LAZY* aliases that name them */
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy
+#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row
+#else   /* lazy compressor excluded from the build: every alias expands to NULL */
+#define ZSTD_COMPRESSBLOCK_LAZY NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL
+#endif   /* ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR */
+
+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR   /* lazy2 entry points, then ZSTD_COMPRESSBLOCK_LAZY2* aliases that name them */
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2
+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row
+#else   /* lazy2 compressor excluded from the build: every alias expands to NULL */
+#define ZSTD_COMPRESSBLOCK_LAZY2 NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL
+#endif   /* ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR */
+
+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR   /* btlazy2 entry points, then ZSTD_COMPRESSBLOCK_BTLAZY2* aliases that name them */
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict
+#else   /* btlazy2 compressor excluded from the build: every alias expands to NULL */
+#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL
+#endif   /* ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LAZY_H */
diff --git a/src/zstd/compress/zstd_ldm.c b/src/zstd/compress/zstd_ldm.c
new file mode 100644
index 000000000..17c069fe1
--- /dev/null
+++ b/src/zstd/compress/zstd_ldm.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_ldm.h"
+
+#include "../common/debug.h"
+#include "../common/xxhash.h"
+#include "zstd_fast.h"          /* ZSTD_fillHashTable() */
+#include "zstd_double_fast.h"   /* ZSTD_fillDoubleHashTable() */
+#include "zstd_ldm_geartab.h"
+
+#define LDM_BUCKET_SIZE_LOG 3
+#define LDM_MIN_MATCH_LENGTH 64
+#define LDM_HASH_RLOG 7
+
+typedef struct {
+    U64 rolling;    /* current rolling (gear) hash value, advanced one input byte at a time */
+    U64 stopMask;   /* a position is a split point when (rolling & stopMask) == 0 */
+} ldmRollingHashState_t;
+
+/** ZSTD_ldm_gear_init():
+ *
+ * Prepares a rolling (gear) hash state according to `params`:
+ * seeds the rolling value and derives the split mask. */
+static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* params)
+{
+    unsigned const rateLog = params->hashRateLog;
+    unsigned const maskWidth = MIN(params->minMatchLength, 64);
+    /* rateLog low bits set; this alone already yields the requested rate */
+    U64 const lowMask = ((U64)1 << rateLog) - 1;
+    unsigned shift = 0;
+
+    /* Requirements on the split criterion:
+     *   1. it should fire once every 2^(hashRateLog) bytes on average;
+     *   2. ideally it should depend on a window of minMatchLength bytes.
+     *
+     * With a gear hash, bit n is a function of the last n bytes, so
+     * high-weight bits give a better quality criterion. The mask
+     * therefore keeps hashRateLog bits (condition 1) and places them as
+     * high as condition 2 permits.
+     *
+     * In the degenerate case (rate of 0, or more rate bits than the
+     * mask can hold) no shift is applied and only the rate is honored.
+     */
+    if (rateLog > 0 && rateLog <= maskWidth) {
+        shift = maskWidth - rateLog;
+    }
+
+    state->rolling = ~(U32)0;
+    state->stopMask = lowMask << shift;
+}
+
+/** ZSTD_ldm_gear_reset()
+ * Feeds [data, data + minMatchLength) into the hash without registering any
+ * splits, then stores the result back into `state`. Used when skipping over
+ * data, either at the beginning of a block, or when skipping sections.
+ */
+static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state,
+                                BYTE const* data, size_t minMatchLength)
+{
+    U64 hash = state->rolling;
+    size_t n = 0;
+#define GEAR_ITER_ONCE() do {                                  \
+        hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
+        n += 1;                                                \
+    } while (0)
+    while (n + 3 < minMatchLength) {
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+    }
+    while (n < minMatchLength) {
+        GEAR_ITER_ONCE();
+    }
+#undef GEAR_ITER_ONCE
+    state->rolling = hash;   /* commit the result: without this store the reset has no effect */
+}
+
+/** ZSTD_ldm_gear_feed():
+ *
+ * Hashes up to `size` bytes starting at `data`. Each time the masked hash
+ * is 0, the number of bytes consumed so far (triggering byte included) is
+ * appended to `splits` and *numSplits is incremented. Stops early as soon
+ * as *numSplits reaches LDM_BATCH_SIZE. The rolling hash is saved to `state`.
+ *
+ * Precondition: The splits array must not be full.
+ * Returns: The number of bytes processed. */
+static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state,
+                                 BYTE const* data, size_t size,
+                                 size_t* splits, unsigned* numSplits)
+{
+    size_t n;
+    U64 hash, mask;
+
+    hash = state->rolling;
+    mask = state->stopMask;
+    n = 0;
+    /* One hash step over data[n]; jumps to `done` once the batch is full. */
+#define GEAR_ITER_ONCE() do { \
+        hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
+        n += 1; \
+        if (UNLIKELY((hash & mask) == 0)) { \
+            splits[*numSplits] = n; \
+            *numSplits += 1; \
+            if (*numSplits == LDM_BATCH_SIZE) \
+                goto done; \
+        } \
+    } while (0)
+
+    while (n + 3 < size) {   /* four steps per iteration */
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+    }
+    while (n < size) {       /* at most 3 bytes remain here */
+        GEAR_ITER_ONCE();
+    }
+
+#undef GEAR_ITER_ONCE
+
+done:   /* also reached by fall-through when all input was consumed */
+    state->rolling = hash;
+    return n;
+}
+
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams)
+{
+    params->windowLog = cParams->windowLog;
+    ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
+    DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
+    if (params->bucketSizeLog == 0) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
+    if (params->minMatchLength == 0) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
+    if (params->hashLog == 0) {
+        params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
+        assert(params->hashLog <= ZSTD_HASHLOG_MAX);
+    }
+    /* Default rate is windowLog - hashLog; it stays 0 when windowLog < hashLog. */
+    if (params->hashRateLog == 0 && params->windowLog >= params->hashLog) {
+        params->hashRateLog = params->windowLog - params->hashLog;
+    }
+    /* bucketSizeLog is capped at hashLog. */
+    if (params->bucketSizeLog > params->hashLog) params->bucketSizeLog = params->hashLog;
+}
+
+size_t ZSTD_ldm_getTableSize(ldmParams_t params)
+{
+    size_t const bucketLog = MIN(params.bucketSizeLog, params.hashLog);
+    size_t const nbEntries = ((size_t)1) << params.hashLog;
+    size_t const nbBuckets = ((size_t)1) << (params.hashLog - bucketLog);
+    size_t const entriesBytes = ZSTD_cwksp_alloc_size(nbEntries * sizeof(ldmEntry_t));
+    size_t const offsetsBytes = ZSTD_cwksp_alloc_size(nbBuckets);   /* nbBuckets bytes requested */
+    return (params.enableLdm == ZSTD_ps_enable) ? offsetsBytes + entriesBytes : 0;
+}
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{
+    return (params.enableLdm != ZSTD_ps_enable) ? 0 : (maxChunkSize / params.minMatchLength);   /* 0 unless LDM is enabled */
+}
+
+/** ZSTD_ldm_getBucket() :
+ *  Locates the first entry of the bucket selected by `hash`. */
+static ldmEntry_t* ZSTD_ldm_getBucket(
+        ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
+{
+    return &ldmState->hashTable[hash << ldmParams.bucketSizeLog];
+}
+
+/** ZSTD_ldm_insertEntry() :
+ *  Writes `entry` into the next slot of the bucket selected by `hash`,
+ *  then advances that bucket's write cursor, wrapping at the bucket size. */
+static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
+                                 size_t const hash, const ldmEntry_t entry,
+                                 ldmParams_t const ldmParams)
+{
+    unsigned const slotMask = (1u << ldmParams.bucketSizeLog) - 1;
+    unsigned const slot = ldmState->bucketOffsets[hash];
+    ldmEntry_t* const bucket = ZSTD_ldm_getBucket(ldmState, hash, ldmParams);
+    bucket[slot] = entry;
+    ldmState->bucketOffsets[hash] = (BYTE)((slot + 1) & slotMask);
+}
+
+/** ZSTD_ldm_countBackwardsMatch() :
+ *  Counts how many bytes immediately preceding pIn and pMatch are equal.
+ *
+ *  The scan never moves pIn below pAnchor nor pMatch below pMatchBase. */
+static size_t ZSTD_ldm_countBackwardsMatch(
+            const BYTE* pIn, const BYTE* pAnchor,
+            const BYTE* pMatch, const BYTE* pMatchBase)
+{
+    const BYTE* const pInStart = pIn;
+
+    for ( ; pIn > pAnchor && pMatch > pMatchBase; pIn--, pMatch--) {
+        if (pIn[-1] != pMatch[-1]) break;
+    }
+    /* both cursors retreat in lockstep, so the distance walked by pIn is the length */
+    return (size_t)(pInStart - pIn);
+}
+
+/** ZSTD_ldm_countBackwardsMatch_2segments() :
+ *  Counts the bytes matching backwards from pMatch, letting the match
+ *  run across 2 different segments.
+ *
+ *  When the scan reaches `pMatchBase`, it resumes from `pExtDictEnd`. */
+static size_t ZSTD_ldm_countBackwardsMatch_2segments(
+                    const BYTE* pIn, const BYTE* pAnchor,
+                    const BYTE* pMatch, const BYTE* pMatchBase,
+                    const BYTE* pExtDictStart, const BYTE* pExtDictEnd)
+{
+    size_t total = ZSTD_ldm_countBackwardsMatch(pIn, pAnchor, pMatch, pMatchBase);
+    int const reachedBase = (pMatch - total == pMatchBase);
+    /* Continue into the extDict only if the first segment was fully consumed and is not itself the extDict. */
+    if (reachedBase && pMatchBase != pExtDictStart) {
+        DEBUGLOG(7, "ZSTD_ldm_countBackwardsMatch_2segments: found 2-parts backwards match (length in prefix==%zu)", total);
+        total += ZSTD_ldm_countBackwardsMatch(pIn - total, pAnchor, pExtDictEnd, pExtDictStart);
+        DEBUGLOG(7, "final backwards match length = %zu", total);
+    }
+    return total;
+}
+
+/** ZSTD_ldm_fillFastTables() :
+ *
+ *  Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies,
+ *  passing `end` as the end position to the corresponding fill function.
+ *  This is similar to ZSTD_loadDictionaryContent.
+ *  The tables for the other strategies are filled within their
+ *  block compressors, so nothing is done for them. Always returns 0. */
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+                                      void const* end)
+{
+    const BYTE* const iend = (const BYTE*)end;
+
+    switch(ms->cParams.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
+        break;
+
+    case ZSTD_dfast:
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+    case ZSTD_btlazy2:
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+    case ZSTD_btultra2:
+        break;   /* valid strategy, but no table is filled from here */
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+    }
+
+    return 0;
+}
+
+void ZSTD_ldm_fillHashTable(
+            ldmState_t* ldmState, const BYTE* ip,
+            const BYTE* iend, ldmParams_t const* params)
+{
+    U32 const minMatchLength = params->minMatchLength;
+    U32 const hashMask = ((U32)1 << (params->hashLog - params->bucketSizeLog)) - 1;
+    BYTE const* const base = ldmState->window.base;
+    BYTE const* const istart = ip;
+    size_t* const splits = ldmState->splitIndices;
+    ldmRollingHashState_t hashState;
+
+    DEBUGLOG(5, "ZSTD_ldm_fillHashTable");
+
+    ZSTD_ldm_gear_init(&hashState, params);
+    while (ip < iend) {
+        unsigned numSplits = 0;
+        unsigned n;
+        /* gather up to one batch of split points from the remaining input */
+        size_t const hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits);
+
+        for (n = 0; n < numSplits; n++) {
+            BYTE const* const splitEnd = ip + splits[n];
+            U64 xxhash;
+            ldmEntry_t entry;
+
+            /* skip splits whose window would begin before the input start */
+            if (splitEnd < istart + minMatchLength) continue;
+
+            xxhash = XXH64(splitEnd - minMatchLength, minMatchLength, 0);
+            entry.offset = (U32)((splitEnd - minMatchLength) - base);
+            entry.checksum = (U32)(xxhash >> 32);
+            /* low bits of the hash pick the bucket; the high 32 bits are the checksum */
+            ZSTD_ldm_insertEntry(ldmState, (U32)(xxhash & hashMask), entry, *params);
+        }
+
+        ip += hashed;
+    }
+}
+
+
+/** ZSTD_ldm_limitTableUpdate() :
+ *
+ *  When ms->nextToUpdate lags more than 1024 positions behind anchor,
+ *  moves it forward so that at most 512 positions separate it from anchor
+ *  (after a long match, only update tables a limited amount). */
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+{
+    U32 const anchorIdx = (U32)(anchor - ms->window.base);
+    if (anchorIdx > ms->nextToUpdate + 1024) {
+        U32 const excess = anchorIdx - ms->nextToUpdate - 1024;
+        ms->nextToUpdate = anchorIdx - MIN(512, excess);
+    }
+}
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_ldm_generateSequences_internal(
+        ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    /* LDM parameters */
+    int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+    U32 const minMatchLength = params->minMatchLength;
+    U32 const entsPerBucket = 1U << params->bucketSizeLog;
+    U32 const hBits = params->hashLog - params->bucketSizeLog;
+    /* Prefix and extDict parameters */
+    U32 const dictLimit = ldmState->window.dictLimit;
+    U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
+    BYTE const* const base = ldmState->window.base;
+    BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
+    BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
+    BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
+    BYTE const* const lowPrefixPtr = base + dictLimit;
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    BYTE const* const ilimit = iend - HASH_READ_SIZE;
+    /* Input positions */
+    BYTE const* anchor = istart;
+    BYTE const* ip = istart;
+    /* Rolling hash state */
+    ldmRollingHashState_t hashState;
+    /* Arrays for staged-processing */
+    size_t* const splits = ldmState->splitIndices;
+    ldmMatchCandidate_t* const candidates = ldmState->matchCandidates;
+    unsigned numSplits;
+
+    if (srcSize < minMatchLength)
+        return iend - anchor;   /* anchor == istart here, so this is srcSize: no sequence generated */
+
+    /* Initialize the rolling hash state with the first minMatchLength bytes */
+    ZSTD_ldm_gear_init(&hashState, params);
+    ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength);
+    ip += minMatchLength;
+
+    while (ip < ilimit) {
+        size_t hashed;
+        unsigned n;
+
+        numSplits = 0;
+        hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip,
+                                    splits, &numSplits);
+
+        for (n = 0; n < numSplits; n++) {   /* stage 1: hash each split's window and prefetch its bucket */
+            BYTE const* const split = ip + splits[n] - minMatchLength;
+            U64 const xxhash = XXH64(split, minMatchLength, 0);
+            U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1));
+
+            candidates[n].split = split;
+            candidates[n].hash = hash;
+            candidates[n].checksum = (U32)(xxhash >> 32);
+            candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params);
+            PREFETCH_L1(candidates[n].bucket);
+        }
+
+        for (n = 0; n < numSplits; n++) {   /* stage 2: search each candidate's bucket for the longest match */
+            size_t forwardMatchLength = 0, backwardMatchLength = 0,
+                   bestMatchLength = 0, mLength;
+            U32 offset;
+            BYTE const* const split = candidates[n].split;
+            U32 const checksum = candidates[n].checksum;
+            U32 const hash = candidates[n].hash;
+            ldmEntry_t* const bucket = candidates[n].bucket;
+            ldmEntry_t const* cur;
+            ldmEntry_t const* bestEntry = NULL;
+            ldmEntry_t newEntry;
+
+            newEntry.offset = (U32)(split - base);
+            newEntry.checksum = checksum;
+
+            /* If a split point would generate a sequence overlapping with
+             * the previous one, we merely register it in the hash table and
+             * move on */
+            if (split < anchor) {
+                ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
+                continue;
+            }
+
+            for (cur = bucket; cur < bucket + entsPerBucket; cur++) {
+                size_t curForwardMatchLength, curBackwardMatchLength,
+                       curTotalMatchLength;
+                if (cur->checksum != checksum || cur->offset <= lowestIndex) {
+                    continue;   /* checksum mismatch, or offset not above the lowest usable index */
+                }
+                if (extDict) {
+                    BYTE const* const curMatchBase =
+                        cur->offset < dictLimit ? dictBase : base;
+                    BYTE const* const pMatch = curMatchBase + cur->offset;
+                    BYTE const* const matchEnd =
+                        cur->offset < dictLimit ? dictEnd : iend;
+                    BYTE const* const lowMatchPtr =
+                        cur->offset < dictLimit ? dictStart : lowPrefixPtr;
+                    curForwardMatchLength =
+                        ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments(
+                            split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd);
+                } else { /* !extDict */
+                    BYTE const* const pMatch = base + cur->offset;
+                    curForwardMatchLength = ZSTD_count(split, pMatch, iend);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr);
+                }
+                curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength;
+
+                if (curTotalMatchLength > bestMatchLength) {
+                    bestMatchLength = curTotalMatchLength;
+                    forwardMatchLength = curForwardMatchLength;
+                    backwardMatchLength = curBackwardMatchLength;
+                    bestEntry = cur;
+                }
+            }
+
+            /* No match found -- insert an entry into the hash table
+             * and process the next candidate match */
+            if (bestEntry == NULL) {
+                ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
+                continue;
+            }
+
+            /* Match found */
+            offset = (U32)(split - base) - bestEntry->offset;
+            mLength = forwardMatchLength + backwardMatchLength;
+            {
+                rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
+
+                /* Out of sequence storage */
+                if (rawSeqStore->size == rawSeqStore->capacity)
+                    return ERROR(dstSize_tooSmall);
+                seq->litLength = (U32)(split - backwardMatchLength - anchor);
+                seq->matchLength = (U32)mLength;
+                seq->offset = offset;
+                rawSeqStore->size++;
+            }
+
+            /* Insert the current entry into the hash table --- it must be
+             * done after the previous block to avoid clobbering bestEntry */
+            ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
+
+            anchor = split + forwardMatchLength;
+
+            /* If we find a match that ends after the data that we've hashed
+             * then we have a repeating, overlapping, pattern. E.g. all zeros.
+             * If one repetition of the pattern matches our `stopMask` then all
+             * repetitions will. We don't need to insert them all into our table,
+             * only the first one. So skip over overlapping matches.
+             * This is a major speed boost (20x) for compressing a single byte
+             * repeated, when that byte ends up in the table.
+             */
+            if (anchor > ip + hashed) {
+                ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength);
+                /* Continue the outer loop at anchor (ip + hashed == anchor). */
+                ip = anchor - hashed;
+                break;
+            }
+        }
+
+        ip += hashed;
+    }
+
+    return iend - anchor;   /* bytes following the end of the last emitted match */
+}
+
+/*! ZSTD_ldm_reduceTable() :
+ *  Subtracts `reducerValue` from every offset in the table, clamping at 0. */
+static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
+                                 U32 const reducerValue)
+{
+    ldmEntry_t* entry = table;
+    ldmEntry_t* const tableEnd = table + size;
+    for ( ; entry < tableEnd; entry++) {
+        entry->offset = (entry->offset < reducerValue) ? 0 : entry->offset - reducerValue;
+    }
+}
+
+size_t ZSTD_ldm_generateSequences(
+        ldmState_t* ldmState, rawSeqStore_t* sequences,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    U32 const maxDist = 1U << params->windowLog;
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    size_t const kMaxChunkSize = 1 << 20;
+    size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);   /* rounded up */
+    size_t chunk;
+    size_t leftoverSize = 0;   /* literals not yet attached to any sequence */
+
+    assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
+    /* Check that ZSTD_window_update() has been called for this chunk prior
+     * to passing it to this function.
+     */
+    assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
+    /* The input could be very large (in zstdmt), so it must be broken up into
+     * chunks to enforce the maximum distance and handle overflow correction.
+     */
+    assert(sequences->pos <= sequences->size);
+    assert(sequences->size <= sequences->capacity);
+    for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
+        BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
+        size_t const remaining = (size_t)(iend - chunkStart);
+        BYTE const *const chunkEnd =
+            (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
+        size_t const chunkSize = chunkEnd - chunkStart;
+        size_t newLeftoverSize;
+        size_t const prevSize = sequences->size;
+
+        assert(chunkStart < iend);
+        /* 1. Perform overflow correction if necessary. */
+        if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) {
+            U32 const ldmHSize = 1U << params->hashLog;
+            U32 const correction = ZSTD_window_correctOverflow(
+                &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart);
+            ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
+            /* invalidate dictionaries on overflow correction */
+            ldmState->loadedDictEnd = 0;
+        }
+        /* 2. We enforce the maximum offset allowed.
+         *
+         * kMaxChunkSize should be small enough that we don't lose too much of
+         * the window through early invalidation.
+         * TODO: * Test the chunk size.
+         *       * Try invalidation after the sequence generation and test the
+         *         offset against maxDist directly.
+         *
+         * NOTE: Because of dictionaries + sequence splitting we MUST make sure
+         * that any offset used is valid at the END of the sequence, since it may
+         * be split into two sequences. This condition holds when using
+         * ZSTD_window_enforceMaxDist(), but if we move to checking offsets
+         * against maxDist directly, we'll have to carefully handle that case.
+         */
+        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL);
+        /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
+        newLeftoverSize = ZSTD_ldm_generateSequences_internal(
+            ldmState, sequences, params, chunkStart, chunkSize);
+        if (ZSTD_isError(newLeftoverSize))
+            return newLeftoverSize;
+        /* 4. We add the leftover literals from previous iterations to the first
+         *    newly generated sequence, or add the `newLeftoverSize` if none are
+         *    generated.
+         */
+        /* Prepend the leftover literals from the previous chunks */
+        if (prevSize < sequences->size) {
+            sequences->seq[prevSize].litLength += (U32)leftoverSize;
+            leftoverSize = newLeftoverSize;
+        } else {
+            assert(newLeftoverSize == chunkSize);
+            leftoverSize += chunkSize;
+        }
+    }
+    return 0;   /* the final value of leftoverSize is not reported to the caller */
+}
+
+void
+ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch)
+{
+    while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {   /* consume srcSize bytes, sequence by sequence */
+        rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
+        if (srcSize <= seq->litLength) {
+            /* Skip past srcSize literals */
+            seq->litLength -= (U32)srcSize;
+            return;
+        }
+        srcSize -= seq->litLength;   /* all literals of this sequence are skipped */
+        seq->litLength = 0;
+        if (srcSize < seq->matchLength) {
+            /* Skip past the first srcSize of the match */
+            seq->matchLength -= (U32)srcSize;
+            if (seq->matchLength < minMatch) {
+                /* The match is too short, omit it */
+                if (rawSeqStore->pos + 1 < rawSeqStore->size) {
+                    seq[1].litLength += seq[0].matchLength;   /* its bytes become literals of the next sequence */
+                }
+                rawSeqStore->pos++;
+            }
+            return;
+        }
+        srcSize -= seq->matchLength;   /* whole match skipped: move on to the next sequence */
+        seq->matchLength = 0;
+        rawSeqStore->pos++;
+    }
+}
+
+/**
+ * Fetches the sequence at the current position of rawSeqStore, truncating it
+ * when it does not fit entirely within the `remaining` bytes of the block;
+ * in that case the store is advanced by `remaining` bytes.
+ *
+ * Returns the sequence to emit now. A returned offset == 0 means that the
+ * rest of the block should be literals.
+ */
+static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
+                                 U32 const remaining, U32 const minMatch)
+{
+    rawSeq seq = rawSeqStore->seq[rawSeqStore->pos];
+    U32 const seqLength = seq.litLength + seq.matchLength;
+    assert(seq.offset > 0);
+    /* Likely: the whole sequence fits */
+    if (remaining >= seqLength) {
+        rawSeqStore->pos++;
+        return seq;
+    }
+    /* Partial sequence: shorten it (offset == 0 ==> rest is literals). */
+    if (remaining > seq.litLength) {
+        seq.matchLength = remaining - seq.litLength;
+        if (seq.matchLength < minMatch) seq.offset = 0;   /* truncated match too short to keep */
+    } else {
+        seq.offset = 0;
+    }
+    /* Consume `remaining` bytes from the stored sequences. */
+    ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
+    return seq;
+}
+
+void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
+    U32 toSkip = (U32)(rawSeqStore->posInSequence + nbBytes);   /* counted from the start of the current sequence */
+    while (toSkip != 0 && rawSeqStore->pos < rawSeqStore->size) {
+        rawSeq const* const seq = &rawSeqStore->seq[rawSeqStore->pos];
+        U32 const seqLength = seq->litLength + seq->matchLength;
+        if (toSkip < seqLength) {
+            rawSeqStore->posInSequence = toSkip;   /* stop inside this sequence */
+            break;
+        }
+        toSkip -= seqLength;
+        rawSeqStore->pos++;
+    }
+    if (toSkip == 0 || rawSeqStore->pos == rawSeqStore->size) {
+        rawSeqStore->posInSequence = 0;
+    }
+}
+
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+    ZSTD_paramSwitch_e useRowMatchFinder,
+    void const* src, size_t srcSize)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    unsigned const minMatch = cParams->minMatch;
+    ZSTD_blockCompressor const blockCompressor =
+        ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms));
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    /* Input positions */
+    BYTE const* ip = istart;
+
+    DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
+    /* If using opt parser, use LDMs only as candidates rather than always accepting them */
+    if (cParams->strategy >= ZSTD_btopt) {
+        size_t lastLLSize;
+        ms->ldmSeqStore = rawSeqStore;   /* hand the raw sequences to the block compressor through ms */
+        lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize);
+        ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore, srcSize);
+        return lastLLSize;
+    }
+
+    assert(rawSeqStore->pos <= rawSeqStore->size);
+    assert(rawSeqStore->size <= rawSeqStore->capacity);
+    /* Loop through each sequence and apply the block compressor to the literals */
+    while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
+        /* maybeSplitSequence updates rawSeqStore->pos */
+        rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+                                                   (U32)(iend - ip), minMatch);
+        /* End signal */
+        if (sequence.offset == 0)
+            break;
+
+        assert(ip + sequence.litLength + sequence.matchLength <= iend);
+
+        /* Fill tables for block compressor */
+        ZSTD_ldm_limitTableUpdate(ms, ip);
+        ZSTD_ldm_fillFastTables(ms, ip);
+        /* Run the block compressor */
+        DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength);
+        {
+            int i;
+            size_t const newLitLength =
+                blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
+            ip += sequence.litLength;
+            /* Update the repcodes */
+            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
+                rep[i] = rep[i-1];
+            rep[0] = sequence.offset;   /* the long-distance match offset becomes rep[0] */
+            /* Store the sequence */
+            ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
+                          OFFSET_TO_OFFBASE(sequence.offset),
+                          sequence.matchLength);
+            ip += sequence.matchLength;
+        }
+    }
+    /* Fill the tables for the block compressor */
+    ZSTD_ldm_limitTableUpdate(ms, ip);
+    ZSTD_ldm_fillFastTables(ms, ip);
+    /* Compress the last literals */
+    return blockCompressor(ms, seqStore, rep, ip, iend - ip);   /* returns whatever the block compressor returns */
+}
diff --git a/src/zstd/zstd_ldm.h b/src/zstd/compress/zstd_ldm.h
similarity index 82%
rename from src/zstd/zstd_ldm.h
rename to src/zstd/compress/zstd_ldm.h
index 1da3f2627..f147021d2 100644
--- a/src/zstd/zstd_ldm.h
+++ b/src/zstd/compress/zstd_ldm.h
@@ -11,22 +11,22 @@
 #ifndef ZSTD_LDM_H
 #define ZSTD_LDM_H
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
-#include "zstd_compress_internal.h" /* ldmParams_t, U32 */
-#include "zstd.h"                   /* ZSTD_CCtx, size_t */
+#include "zstd_compress_internal.h"   /* ldmParams_t, U32 */
+#include "../zstd.h"   /* ZSTD_CCtx, size_t */
 
 /*-*************************************
- *  Long distance matching
- ***************************************/
+*  Long distance matching
+***************************************/
 
 #define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT
 
 void ZSTD_ldm_fillHashTable(
-    ldmState_t* state, const BYTE* ip,
-    const BYTE* iend, ldmParams_t const * params);
+            ldmState_t* state, const BYTE* ip,
+            const BYTE* iend, ldmParams_t const* params);
 
 /**
  * ZSTD_ldm_generateSequences():
@@ -43,8 +43,8 @@ void ZSTD_ldm_fillHashTable(
  *       sequences.
  */
 size_t ZSTD_ldm_generateSequences(
-    ldmState_t* ldms, rawSeqStore_t* sequences,
-    ldmParams_t const * params, void const * src, size_t srcSize);
+            ldmState_t* ldms, rawSeqStore_t* sequences,
+            ldmParams_t const* params, void const* src, size_t srcSize);
 
 /**
  * ZSTD_ldm_blockCompress():
@@ -65,9 +65,9 @@ size_t ZSTD_ldm_generateSequences(
  * NOTE: This function does not return any errors.
  */
 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
-                              ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-                              ZSTD_paramSwitch_e useRowMatchFinder,
-                              void const * src, size_t srcSize);
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+            ZSTD_paramSwitch_e useRowMatchFinder,
+            void const* src, size_t srcSize);
 
 /**
  * ZSTD_ldm_skipSequences():
@@ -77,7 +77,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
  * Must be called for data that is not passed to ZSTD_ldm_blockCompress().
  */
 void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize,
-                            U32 const minMatch);
+    U32 const minMatch);
 
 /* ZSTD_ldm_skipRawSeqStoreBytes():
  * Moves forward in rawSeqStore by nbBytes, updating fields 'pos' and 'posInSequence'.
@@ -108,9 +108,9 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize);
  *  Ensures that the minMatchLength >= targetLength during optimal parsing.
  */
 void ZSTD_ldm_adjustParameters(ldmParams_t* params,
-                               ZSTD_compressionParameters const * cParams);
+                               ZSTD_compressionParameters const* cParams);
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
 
diff --git a/src/zstd/zstd_ldm_geartab.h b/src/zstd/compress/zstd_ldm_geartab.h
similarity index 88%
rename from src/zstd/zstd_ldm_geartab.h
rename to src/zstd/compress/zstd_ldm_geartab.h
index 355c76972..ef34bc5c9 100644
--- a/src/zstd/zstd_ldm_geartab.h
+++ b/src/zstd/compress/zstd_ldm_geartab.h
@@ -11,8 +11,8 @@
 #ifndef ZSTD_LDM_GEARTAB_H
 #define ZSTD_LDM_GEARTAB_H
 
-#include "compiler.h" /* UNUSED_ATTR */
-#include "mem.h"      /* U64 */
+#include "../common/compiler.h" /* UNUSED_ATTR */
+#include "../common/mem.h"      /* U64 */
 
 static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
     0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc,
@@ -20,8 +20,8 @@ static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
     0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e,
     0x9c8528f65badeaca, 0x86563706e2097529, 0x2902475fa375d889,
     0xafb32a9739a5ebe6, 0xce2714da3883e639, 0x21eaf821722e69e,
-    0x37b628620b628, 0x49a8d455d88caf5, 0x8556d711e6958140,
-    0x4f7ae74fc605c1f, 0x829f0c3468bd3a20, 0x4ffdc885c625179e,
+    0x37b628620b628,    0x49a8d455d88caf5,  0x8556d711e6958140,
+    0x4f7ae74fc605c1f,  0x829f0c3468bd3a20, 0x4ffdc885c625179e,
     0x8473de048a3daf1b, 0x51008822b05646b2, 0x69d75d12b2d1cc5f,
     0x8c9d4a19159154bc, 0xc3cc10f4abbd4003, 0xd06ddc1cecb97391,
     0xbe48e6e7ed80302e, 0x3481db31cee03547, 0xacc3f67cdaa1d210,
@@ -42,7 +42,7 @@ static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
     0x6625682f6e29186b, 0x399553457ac06e50, 0x35dffb4c23abb74,
     0x429db2591f54aade, 0xc52802a8037d1009, 0x6acb27381f0b25f3,
     0xf45e2551ee4f823b, 0x8b0ea2d99580c2f7, 0x3bed519cbcb4e1e1,
-    0xff452823dbb010a, 0x9d42ed614f3dd267, 0x5b9313c06257c57b,
+    0xff452823dbb010a,  0x9d42ed614f3dd267, 0x5b9313c06257c57b,
     0xa114b8008b5e1442, 0xc1fe311c11c13d4b, 0x66e8763ea34c5568,
     0x8b982af1c262f05d, 0xee8876faaa75fbb7, 0x8a62a4d0d172bb2a,
     0xc13d94a3b7449a97, 0x6dbbba9dc15d037c, 0xc786101f1d92e0f1,
@@ -51,9 +51,9 @@ static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
     0x4eb1a539465a3eca, 0xbe09dbf03b05d5ca, 0x774e5a362b5472ba,
     0x47a1221229d183cd, 0x504b0ca18ef5a2df, 0xdffbdfbde2456eb9,
     0x46cd2b2fbee34634, 0xf2aef8fe819d98c3, 0x357f5276d4599d61,
-    0x24a5483879c453e3, 0x88026889192b4b9, 0x28da96671782dbec,
+    0x24a5483879c453e3, 0x88026889192b4b9,  0x28da96671782dbec,
     0x4ef37c40588e9aaa, 0x8837b90651bc9fb3, 0xc164f741d3f0e5d6,
-    0xbc135a0a704b70ba, 0x69cd868f7622ada, 0xbc37ba89e0b9c0ab,
+    0xbc135a0a704b70ba, 0x69cd868f7622ada,  0xbc37ba89e0b9c0ab,
     0x47c14a01323552f6, 0x4f00794bacee98bb, 0x7107de7d637a69d5,
     0x88af793bb6f2255e, 0xf3c6466b8799b598, 0xc288c616aa7f3b59,
     0x81ca63cf42fca3fd, 0x88d85ace36a2674b, 0xd056bd3792389e7,
@@ -65,21 +65,21 @@ static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
     0x3d7807f5bde64486, 0x17be6d8f5bbb7639, 0x903f0cd44dc35dc,
     0x67b672eafdf1196c, 0xa676ff93ed4c82f1, 0x521d1004c5053d9d,
     0x37ba9ad09ccc9202, 0x84e54d297aacfb51, 0xa0b4b776a143445,
-    0x820d471e20b348e, 0x1874383cb83d46dc, 0x97edeec7a1efe11c,
+    0x820d471e20b348e,  0x1874383cb83d46dc, 0x97edeec7a1efe11c,
     0xb330e50b1bdc42aa, 0x1dd91955ce70e032, 0xa514cdb88f2939d5,
     0x2791233fd90db9d3, 0x7b670a4cc50f7a9b, 0x77c07d2a05c6dfa5,
     0xe3778b6646d0a6fa, 0xb39c8eda47b56749, 0x933ed448addbef28,
-    0xaf846af6ab7d0bf4, 0xe5af208eb666e49, 0x5e6622f73534cd6a,
+    0xaf846af6ab7d0bf4, 0xe5af208eb666e49,  0x5e6622f73534cd6a,
     0x297daeca42ef5b6e, 0x862daef3d35539a6, 0xe68722498f8e1ea9,
     0x981c53093dc0d572, 0xfa09b0bfbf86fbf5, 0x30b1e96166219f15,
     0x70e7d466bdc4fb83, 0x5a66736e35f2a8e9, 0xcddb59d2b7c1baef,
     0xd6c7d247d26d8996, 0xea4e39eac8de1ba3, 0x539c8bb19fa3aff2,
-    0x9f90e4c5fd508d8, 0xa34e5956fbaf3385, 0x2e2f8e151d3ef375,
+    0x9f90e4c5fd508d8,  0xa34e5956fbaf3385, 0x2e2f8e151d3ef375,
     0x173691e9b83faec1, 0xb85a8d56bf016379, 0x8382381267408ae3,
     0xb90f901bbdc0096d, 0x7c6ad32933bcec65, 0x76bb5e2f2c8ad595,
     0x390f851a6cf46d28, 0xc3e6064da1c2da72, 0xc52a0c101cfa5389,
     0xd78eaf84a3fbc530, 0x3781b9e2288b997e, 0x73c2f6dea83d05c4,
-    0x4228e364c5b5ed7, 0x9d7a3edf0da43911, 0x8edcfeda24686756,
+    0x4228e364c5b5ed7,  0x9d7a3edf0da43911, 0x8edcfeda24686756,
     0x5e7667a7b7a9b3a1, 0x4c4f389fa143791d, 0xb08bc1023da7cddc,
     0x7ab4be3ae529b1cc, 0x754e6132dbe74ff9, 0x71635442a839df45,
     0x2f6fb1643fbe52de, 0x961e0a42cf7a8177, 0xf3b45d83d89ef2ea,
@@ -100,6 +100,7 @@ static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
     0x3b0311f0a27c0656, 0xdb17bf91c0db96e7, 0x8cd4fd6b4e85a5b2,
     0xfab071054ba6409d, 0x40d6fe831fa9dfd9, 0xaf358debad7d791e,
     0xeb8d0e25a65e3e58, 0xbbcbd3df14e08580, 0xcf751f27ecdab2b,
-    0x2b4da14f2613d8f4};
+    0x2b4da14f2613d8f4
+};
 
 #endif /* ZSTD_LDM_GEARTAB_H */
diff --git a/src/zstd/compress/zstd_opt.c b/src/zstd/compress/zstd_opt.c
new file mode 100644
index 000000000..e63073e5a
--- /dev/null
+++ b/src/zstd/compress/zstd_opt.c
@@ -0,0 +1,1576 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "hist.h"
+#include "zstd_opt.h"
+
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+
+#define ZSTD_LITFREQ_ADD    2   /* increment applied to litFreq[] for each literal recorded by ZSTD_updateStats(), so that frequencies adapt faster to new stats */
+#define ZSTD_MAX_PRICE     (1<<30)
+
+#define ZSTD_PREDEF_THRESHOLD 8   /* if srcSize <= ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+
+
+/*-*************************************
+*  Price functions for optimal parser
+***************************************/
+
+#if 0    /* approximation at bit level (for tests) */
+#  define BITCOST_ACCURACY 0
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+#  define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
+#elif 0  /* fractional bit accuracy (for tests) */
+#  define BITCOST_ACCURACY 8
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+#  define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
+#else    /* active configuration : opt==0 => whole-bit approximation, opt!=0 => fractional-bit accuracy */
+#  define BITCOST_ACCURACY 8
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)   /* prices are expressed in units of 1/BITCOST_MULTIPLIER bit */
+#  define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#endif
+
+/* ZSTD_bitWeight() :
+ * provide estimated "cost" of a stat in full bits only (no fractional part), scaled by BITCOST_MULTIPLIER */
+MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+{
+    return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);   /* +1 keeps the argument non-zero when stat==0 */
+}
+
+/* ZSTD_fracWeight() :
+ * provide fractional-bit "cost" of a stat, scaled by BITCOST_MULTIPLIER,
+ * using linear interpolation approximation */
+MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+{
+    U32 const stat = rawStat + 1;   /* same +1 bias as ZSTD_bitWeight() */
+    U32 const hb = ZSTD_highbit32(stat);
+    U32 const BWeight = hb * BITCOST_MULTIPLIER;   /* whole-bit part */
+    /* Fweight was meant for "Fractional weight"
+     * but it's effectively a value between 1 and 2
+     * using fixed point arithmetic (BITCOST_ACCURACY fractional bits) */
+    U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+    U32 const weight = BWeight + FWeight;
+    assert(hb + BITCOST_ACCURACY < 31);   /* condition under which (stat << BITCOST_ACCURACY) above fits in 32 bits */
+    return weight;
+}
+
+#if (DEBUGLEVEL>=2)
+/* debugging function,
+ * @return price in bytes as fractional value : price / (BITCOST_MULTIPLIER units per bit * 8 bits per byte)
+ * for debug messages only */
+MEM_STATIC double ZSTD_fCost(int price)
+{
+    return (double)price / (BITCOST_MULTIPLIER*8);
+}
+#endif
+
+static int ZSTD_compressedLiterals(optState_t const* const optPtr)
+{
+    return optPtr->literalCompressionMode != ZSTD_ps_disable;   /* @return 1 unless literal compression is explicitly disabled, 0 otherwise */
+}
+
+static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
+{   /* caches WEIGHT() of each statistics sum; the price functions subtract a per-symbol WEIGHT() from these cached values */
+    if (ZSTD_compressedLiterals(optPtr))   /* litSumBasePrice is left untouched when literals are not compressed */
+        optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel);
+    optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel);
+    optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel);
+    optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel);
+}
+
+
+static U32 sum_u32(const unsigned table[], size_t nbElts)   /* @return : table[0] + ... + table[nbElts-1], 0 when nbElts==0 */
+{
+    size_t n;
+    U32 total = 0;
+    for (n=0; n<nbElts; n++) {
+        total += table[n];
+    }
+    return total;
+}
+
+typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;   /* selects the constant added to each entry after shifting, see ZSTD_downscaleStats() */
+
+static U32   /* @return : sum of all table entries after rescaling */
+ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
+{
+    U32 s, sum=0;
+    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
+            (unsigned)lastEltIndex+1, (unsigned)shift );
+    assert(shift < 30);
+    for (s=0; s<lastEltIndex+1; s++) {   /* lastEltIndex is inclusive : lastEltIndex+1 entries are rewritten in place */
+        unsigned const base = base1 ? 1 : (table[s]>0);   /* base_1guaranteed : always +1 ; base_0possible : +1 only for non-zero entries, so zeros stay zero */
+        unsigned const newStat = base + (table[s] >> shift);
+        sum += newStat;
+        table[s] = newStat;
+    }
+    return sum;
+}
+
+/* ZSTD_scaleStats() :
+ * reduce all elt frequencies in table if sum too large, i.e. when (sum >> logTarget) > 1
+ * return the resulting sum of elements (the unchanged sum when no scaling was needed) */
+static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
+{
+    U32 const prevsum = sum_u32(table, lastEltIndex+1);
+    U32 const factor = prevsum >> logTarget;
+    DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
+    assert(logTarget < 30);
+    if (factor <= 1) return prevsum;   /* table left untouched */
+    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);   /* base_1guaranteed : every entry ends up >= 1 */
+}
+
+/* ZSTD_rescaleFreqs() :
+ * if first block (detected by optPtr->litLengthSum == 0) : init statistics
+ *    take hints from dictionary if there is one (detected by huf.repeatMode == HUF_repeat_valid)
+ *    and init from zero if there is none,
+ *    using src for literals stats, and baseline stats for sequence symbols
+ * otherwise downscale existing stats, to be used as seed for next block.
+ * In all cases, ends by refreshing the cached base prices via ZSTD_setBasePrices(). */
+static void
+ZSTD_rescaleFreqs(optState_t* const optPtr,
+            const BYTE* const src, size_t const srcSize,
+                  int const optLevel)
+{
+    int const compressedLiterals = ZSTD_compressedLiterals(optPtr);
+    DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
+    optPtr->priceType = zop_dynamic;
+
+    if (optPtr->litLengthSum == 0) {  /* no literals stats collected -> first block assumed -> init */
+
+        /* heuristic: use pre-defined stats for too small inputs */
+        if (srcSize <= ZSTD_PREDEF_THRESHOLD) {
+            DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
+            optPtr->priceType = zop_predef;
+        }
+
+        assert(optPtr->symbolCosts != NULL);
+        if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
+
+            /* huffman stats covering the full value set : table presumed generated by dictionary.
+             * Note : this overrides zop_predef possibly selected just above. */
+            optPtr->priceType = zop_dynamic;
+
+            if (compressedLiterals) {
+                /* generate literals statistics from huffman table */
+                unsigned lit;
+                assert(optPtr->litFreq != NULL);
+                optPtr->litSum = 0;
+                for (lit=0; lit<=MaxLit; lit++) {
+                    U32 const scaleLog = 11;   /* scale to 2K */
+                    U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit);
+                    assert(bitCost <= scaleLog);
+                    optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litSum += optPtr->litFreq[lit];
+            }   }
+
+            {   unsigned ll;
+                FSE_CState_t llstate;
+                FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable);
+                optPtr->litLengthSum = 0;
+                for (ll=0; ll<=MaxLL; ll++) {
+                    U32 const scaleLog = 10;   /* scale to 1K */
+                    U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
+                    assert(bitCost < scaleLog);
+                    optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litLengthSum += optPtr->litLengthFreq[ll];
+            }   }
+
+            {   unsigned ml;
+                FSE_CState_t mlstate;
+                FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable);
+                optPtr->matchLengthSum = 0;
+                for (ml=0; ml<=MaxML; ml++) {
+                    U32 const scaleLog = 10;   /* scale to 1K */
+                    U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
+                    assert(bitCost < scaleLog);
+                    optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
+            }   }
+
+            {   unsigned of;
+                FSE_CState_t ofstate;
+                FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable);
+                optPtr->offCodeSum = 0;
+                for (of=0; of<=MaxOff; of++) {
+                    U32 const scaleLog = 10;   /* scale to 1K */
+                    U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
+                    assert(bitCost < scaleLog);
+                    optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->offCodeSum += optPtr->offCodeFreq[of];
+            }   }
+
+        } else {  /* first block, no dictionary */
+
+            assert(optPtr->litFreq != NULL);
+            if (compressedLiterals) {
+                /* base initial cost of literals on direct frequency within src */
+                unsigned lit = MaxLit;
+                HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
+                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);   /* counts divided by 256 ; base_0possible keeps absent literals at 0 */
+            }
+
+            {   unsigned const baseLLfreqs[MaxLL+1] = {
+                    4, 2, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1
+                };
+                ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs));
+                optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1);
+            }
+
+            {   unsigned ml;
+                for (ml=0; ml<=MaxML; ml++)
+                    optPtr->matchLengthFreq[ml] = 1;   /* flat distribution for match length codes */
+            }
+            optPtr->matchLengthSum = MaxML+1;
+
+            {   unsigned const baseOFCfreqs[MaxOff+1] = {
+                    6, 2, 1, 1, 2, 3, 4, 4,
+                    4, 3, 2, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1,
+                    1, 1, 1, 1, 1, 1, 1, 1
+                };
+                ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs));
+                optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
+            }
+
+        }
+
+    } else {   /* new block : scale down accumulated statistics */
+
+        if (compressedLiterals)
+            optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
+        optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11);
+        optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11);
+        optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11);
+    }
+
+    ZSTD_setBasePrices(optPtr, optLevel);
+}
+
+/* ZSTD_rawLiteralsCost() :
+ * price of literals (only) in specified segment (whose length can be 0).
+ * does not include price of literalLength symbol */
+static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+                                const optState_t* const optPtr,
+                                int optLevel)
+{
+    DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength);
+    if (litLength == 0) return 0;
+
+    if (!ZSTD_compressedLiterals(optPtr))
+        return (litLength << 3) * BITCOST_MULTIPLIER;  /* Uncompressed - 8 bits per literal. */
+
+    if (optPtr->priceType == zop_predef)
+        return (litLength*6) * BITCOST_MULTIPLIER;  /* 6 bit per literal - no statistic used */
+
+    /* dynamic statistics : start from litSumBasePrice per literal, then subtract each literal's own weight */
+    {   U32 price = optPtr->litSumBasePrice * litLength;
+        U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
+        U32 u;
+        assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
+        for (u=0; u < litLength; u++) {
+            U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+            if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax;   /* clamp : each literal contributes at least BITCOST_MULTIPLIER (1 bit) */
+            price -= litPrice;
+        }
+        return price;
+    }
+}
+
+/* ZSTD_litLengthPrice() :
+ * cost of literalLength symbol (does not include the cost of the literals themselves) */
+static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel)
+{
+    assert(litLength <= ZSTD_BLOCKSIZE_MAX);
+    if (optPtr->priceType == zop_predef)
+        return WEIGHT(litLength, optLevel);   /* fixed scheme, does not use statistics */
+
+    /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
+     * because it isn't representable in the zstd format.
+     * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
+     * In such a case, the block would be all literals.
+     */
+    if (litLength == ZSTD_BLOCKSIZE_MAX)
+        return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);   /* recursion depth is 1 : the argument is now below ZSTD_BLOCKSIZE_MAX */
+
+    /* dynamic statistics : extra bits of the code + (base price - weight of this code's frequency) */
+    {   U32 const llCode = ZSTD_LLcode(litLength);
+        return (LL_bits[llCode] * BITCOST_MULTIPLIER)
+             + optPtr->litLengthSumBasePrice
+             - WEIGHT(optPtr->litLengthFreq[llCode], optLevel);
+    }
+}
+
+/* ZSTD_getMatchPrice() :
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
+ * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. NOTE(review): no function of that name is defined above; presumably ZSTD_rawLiteralsCost() + ZSTD_litLengthPrice() - confirm.
+ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
+ * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_getMatchPrice(U32 const offBase,
+                   U32 const matchLength,
+             const optState_t* const optPtr,
+                   int const optLevel)
+{
+    U32 price;
+    U32 const offCode = ZSTD_highbit32(offBase);
+    U32 const mlBase = matchLength - MINMATCH;
+    assert(matchLength >= MINMATCH);
+
+    if (optPtr->priceType == zop_predef)  /* fixed scheme, does not use statistics */
+        return WEIGHT(mlBase, optLevel)
+             + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
+
+    /* dynamic statistics : offCode extra bits + (base price - weight of this offset code's frequency) */
+    price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
+    if ((optLevel<2) /*static*/ && offCode >= 20)
+        price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */
+
+    /* match Length : extra bits of the code + (base price - weight of this code's frequency) */
+    {   U32 const mlCode = ZSTD_MLcode(mlBase);
+        price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel));
+    }
+
+    price += BITCOST_MULTIPLIER / 5;   /* heuristic : make matches a bit more costly to favor fewer sequences -> faster decompression speed */
+
+    DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price);
+    return price;
+}
+
+/* ZSTD_updateStats() : records one sequence (litLength literals, then a match) into the frequency tables and their sums.
+ * assumption : literals + litLength <= iend */
+static void ZSTD_updateStats(optState_t* const optPtr,
+                             U32 litLength, const BYTE* literals,
+                             U32 offBase, U32 matchLength)
+{
+    /* literals : skipped when literals are not compressed */
+    if (ZSTD_compressedLiterals(optPtr)) {
+        U32 u;
+        for (u=0; u < litLength; u++)
+            optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+        optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;   /* keeps litSum equal to the sum of litFreq[] increments */
+    }
+
+    /* literal Length */
+    {   U32 const llCode = ZSTD_LLcode(litLength);
+        optPtr->litLengthFreq[llCode]++;
+        optPtr->litLengthSum++;
+    }
+
+    /* offset code : follows storeSeq() numeric representation */
+    {   U32 const offCode = ZSTD_highbit32(offBase);
+        assert(offCode <= MaxOff);
+        optPtr->offCodeFreq[offCode]++;
+        optPtr->offCodeSum++;
+    }
+
+    /* match Length : code is derived from (matchLength - MINMATCH) */
+    {   U32 const mlBase = matchLength - MINMATCH;
+        U32 const mlCode = ZSTD_MLcode(mlBase);
+        optPtr->matchLengthFreq[mlCode]++;
+        optPtr->matchLengthSum++;
+    }
+}
+
+
+/* ZSTD_readMINMATCH() :
+ * function safe only for comparisons (for length 3, the result is a shifted 32-bit read, not a plain value)
+ * assumption : memPtr must be at least 4 bytes before end of buffer, since 4 bytes are read even when length==3 */
+MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
+{
+    switch (length)
+    {
+    default :   /* any length other than 3 is handled like 4 */
+    case 4 : return MEM_read32(memPtr);
+    case 3 : if (MEM_isLittleEndian())   /* the shift discards one of the 4 bytes read; presumably the 4th in memory, if MEM_read32() is a native-endian load - confirm */
+                return MEM_read32(memPtr)<<8;
+             else
+                return MEM_read32(memPtr)>>8;
+    }
+}
+
+
+/* Update hashTable3 up to ip (excluded), then @return the index stored in hashTable3 for the hash of ip.
+   Assumption : always within prefix (i.e. not within extDict) */
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
+                                       U32* nextToUpdate3,
+                                       const BYTE* const ip)
+{
+    U32* const hashTable3 = ms->hashTable3;
+    U32 const hashLog3 = ms->hashLog3;
+    const BYTE* const base = ms->window.base;
+    U32 idx = *nextToUpdate3;
+    U32 const target = (U32)(ip - base);
+    size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3);
+    assert(hashLog3 > 0);
+
+    while(idx < target) {   /* insert every position in [*nextToUpdate3, target) */
+        hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx;
+        idx++;
+    }
+
+    *nextToUpdate3 = target;   /* ip itself is not inserted here */
+    return hashTable3[hash3];
+}
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+ * @param ip assumed <= iend-8 .
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position
+ * @return : nb of positions added, i.e. how far the caller advances its index (>= 1, see final assert) */
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertBt1(
+                const ZSTD_matchState_t* ms,
+                const BYTE* const ip, const BYTE* const iend,
+                U32 const target,
+                U32 const mls, const int extDict)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;   /* NOTE(review): signed 1 here vs 1U in ZSTD_insertBtAndGetAllMatches(); equivalent as long as btLog < 31 */
+    U32 matchIndex = hashTable[h];
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    const U32 curr = (U32)(ip-base);
+    const U32 btLow = btMask >= curr ? 0 : curr - btMask;
+    U32* smallerPtr = bt + 2*(curr&btMask);   /* each position owns 2 consecutive slots in bt : [smaller, larger] */
+    U32* largerPtr  = smallerPtr + 1;
+    U32 dummy32;   /* to be nullified at the end : sink used once the search walks beyond tree size */
+    /* windowLow is based on target because
+     * we only need positions that will be in the window at the end of the tree update.
+     */
+    U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog);
+    U32 matchEndIdx = curr+8+1;
+    size_t bestLength = 8;   /* only matches longer than 8 bytes can extend matchEndIdx */
+    U32 nbCompares = 1U << cParams->searchLog;
+#ifdef ZSTD_C_PREDICT
+    U32 predictedSmall = *(bt + 2*((curr-1)&btMask) + 0);
+    U32 predictedLarge = *(bt + 2*((curr-1)&btMask) + 1);
+    predictedSmall += (predictedSmall>0);
+    predictedLarge += (predictedLarge>0);
+#endif /* ZSTD_C_PREDICT */
+
+    DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr);
+
+    assert(curr <= target);
+    assert(ip <= iend-8);   /* required for h calculation */
+    hashTable[h] = curr;   /* Update Hash Table */
+
+    assert(windowLow > 0);
+    for (; nbCompares && (matchIndex >= windowLow); --nbCompares) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(matchIndex < curr);
+
+#ifdef ZSTD_C_PREDICT   /* note : can create issues when hlog small <= 11 */
+        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
+        if (matchIndex == predictedSmall) {
+            /* no need to check length, result known */
+            *smallerPtr = matchIndex;
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            predictedSmall = predictPtr[1] + (predictPtr[1]>0);
+            continue;
+        }
+        if (matchIndex == predictedLarge) {
+            *largerPtr = matchIndex;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+            predictedLarge = predictPtr[0] + (predictPtr[0]>0);
+            continue;
+        }
+#endif
+
+        if (!extDict || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if actually extDict */
+            match = base + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            bestLength = matchLength;
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;   /* farthest position reached by any match so far */
+        }
+
+        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;   /* terminate both open branches (or write into dummy32) */
+    {   U32 positions = 0;
+        if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384));   /* speed optimization */
+        assert(matchEndIdx > curr + 8);
+        return MAX(positions, matchEndIdx - (curr + 8));
+    }
+}
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_updateTree_internal(   /* inserts positions from ms->nextToUpdate up to ip (excluded) into the binary tree */
+                ZSTD_matchState_t* ms,
+                const BYTE* const ip, const BYTE* const iend,
+                const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
+                idx, target, dictMode);
+
+    while(idx < target) {
+        U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict);   /* returns how many positions to advance */
+        assert(idx < (U32)(idx + forward));   /* forward progress, and no U32 wraparound */
+        idx += forward;
+    }
+    assert((size_t)(ip - base) <= (size_t)(U32)(-1));
+    assert((size_t)(iend - base) <= (size_t)(U32)(-1));
+    ms->nextToUpdate = target;   /* set to target even when the last insertion stepped past it */
+}
+
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {   /* non-static wrapper around ZSTD_updateTree_internal() */
+    ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);   /* mls taken from ms->cParams.minMatch, dictMode fixed to ZSTD_noDict */
+}
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32
+ZSTD_insertBtAndGetAllMatches (
+                ZSTD_match_t* matches,  /* store result (found matches) in this table (presumed large enough) */
+                ZSTD_matchState_t* ms,
+                U32* nextToUpdate3,
+                const BYTE* const ip, const BYTE* const iLimit,
+                const ZSTD_dictMode_e dictMode,
+                const U32 rep[ZSTD_REP_NUM],
+                const U32 ll0,  /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+                const U32 lengthToBeat,
+                const U32 mls /* template */)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    const BYTE* const base = ms->window.base;
+    U32 const curr = (U32)(ip-base);
+    U32 const hashLog = cParams->hashLog;
+    U32 const minMatch = (mls==3) ? 3 : 4;
+    U32* const hashTable = ms->hashTable;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32 matchIndex  = hashTable[h];
+    U32* const bt   = ms->chainTable;
+    U32 const btLog = cParams->chainLog - 1;
+    U32 const btMask= (1U << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const dictBase = ms->window.dictBase;
+    U32 const dictLimit = ms->window.dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
+    U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
+    U32 const matchLow = windowLow ? windowLow : 1;
+    U32* smallerPtr = bt + 2*(curr&btMask);
+    U32* largerPtr  = bt + 2*(curr&btMask) + 1;
+    U32 matchEndIdx = curr+8+1;   /* farthest referenced position of any match => detects repetitive patterns */
+    U32 dummy32;   /* to be nullified at the end */
+    U32 mnum = 0;
+    U32 nbCompares = 1U << cParams->searchLog;
+
+    const ZSTD_matchState_t* dms    = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
+    const ZSTD_compressionParameters* const dmsCParams =
+                                      dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL;
+    const BYTE* const dmsBase       = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
+    const BYTE* const dmsEnd        = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL;
+    U32         const dmsHighLimit  = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0;
+    U32         const dmsLowLimit   = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0;
+    U32         const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0;
+    U32         const dmsHashLog    = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog;
+    U32         const dmsBtLog      = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog;
+    U32         const dmsBtMask     = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0;
+    U32         const dmsBtLow      = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit;
+
+    size_t bestLength = lengthToBeat-1;
+    DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", curr);
+
+    /* check repCode */
+    assert(ll0 <= 1);   /* necessarily 1 or 0 */
+    {   U32 const lastR = ZSTD_REP_NUM + ll0;
+        U32 repCode;
+        for (repCode = ll0; repCode < lastR; repCode++) {
+            U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+            U32 const repIndex = curr - repOffset;
+            U32 repLen = 0;
+            assert(curr >= dictLimit);
+            if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < curr-dictLimit) {  /* equivalent to `curr > repIndex >= dictLimit` */
+                /* We must validate the repcode offset because when we're using a dictionary the
+                 * valid offset range shrinks when the dictionary goes out of bounds.
+                 */
+                if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) {
+                    repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch;
+                }
+            } else {  /* repIndex < dictLimit || repIndex >= curr */
+                const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ?
+                                             dmsBase + repIndex - dmsIndexDelta :
+                                             dictBase + repIndex;
+                assert(curr >= windowLow);
+                if ( dictMode == ZSTD_extDict
+                  && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow)  /* equivalent to `curr > repIndex >= windowLow` */
+                     & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */)
+                  && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+                    repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch;
+                }
+                if (dictMode == ZSTD_dictMatchState
+                  && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta))  /* equivalent to `curr > repIndex >= dmsLowLimit` */
+                     & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                  && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+                    repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch;
+            }   }
+            /* save longer solution */
+            if (repLen > bestLength) {
+                DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
+                            repCode, ll0, repOffset, repLen);
+                bestLength = repLen;
+                matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1);  /* expect value between 1 and 3 */
+                matches[mnum].len = (U32)repLen;
+                mnum++;
+                if ( (repLen > sufficient_len)
+                   | (ip+repLen == iLimit) ) {  /* best possible */
+                    return mnum;
+    }   }   }   }
+
+    /* HC3 match finder */
+    if ((mls == 3) /*static*/ && (bestLength < mls)) {
+        U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip);
+        if ((matchIndex3 >= matchLow)
+          & (curr - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) {
+            size_t mlen;
+            if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) {
+                const BYTE* const match = base + matchIndex3;
+                mlen = ZSTD_count(ip, match, iLimit);
+            } else {
+                const BYTE* const match = dictBase + matchIndex3;
+                mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart);
+            }
+
+            /* save best solution */
+            if (mlen >= mls /* == 3 > bestLength */) {
+                DEBUGLOG(8, "found small match with hlog3, of length %u",
+                            (U32)mlen);
+                bestLength = mlen;
+                assert(curr > matchIndex3);
+                assert(mnum==0);  /* no prior solution */
+                matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
+                matches[0].len = (U32)mlen;
+                mnum = 1;
+                if ( (mlen > sufficient_len) |
+                     (ip+mlen == iLimit) ) {  /* best possible length */
+                    ms->nextToUpdate = curr+1;  /* skip insertion */
+                    return 1;
+        }   }   }
+        /* no dictMatchState lookup: dicts don't have a populated HC3 table */
+    }  /* if (mls == 3) */
+
+    hashTable[h] = curr;   /* Update Hash Table */
+
+    for (; nbCompares && (matchIndex >= matchLow); --nbCompares) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        const BYTE* match;
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(curr > matchIndex);
+
+        if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);  /* ensure the condition is correct when !extDict */
+            match = base + matchIndex;
+            if (matchIndex >= dictLimit) assert(memcmp(match, ip, matchLength) == 0);  /* ensure early section of match is equal as expected */
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
+        } else {
+            match = dictBase + matchIndex;
+            assert(memcmp(match, ip, matchLength) == 0);  /* ensure early section of match is equal as expected */
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* prepare for match[matchLength] read */
+        }
+
+        if (matchLength > bestLength) {
+            DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
+                    (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
+            assert(matchEndIdx > matchIndex);
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+            bestLength = matchLength;
+            matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
+            matches[mnum].len = (U32)matchLength;
+            mnum++;
+            if ( (matchLength > ZSTD_OPT_NUM)
+               | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
+                if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */
+                break; /* drop, to preserve bt consistency (miss a little bit of compression) */
+        }   }
+
+        if (match[matchLength] < ip[matchLength]) {
+            /* match smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new candidate => larger than match, which was smaller than current */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous, closer to current */
+        } else {
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+
+    assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
+    if (dictMode == ZSTD_dictMatchState && nbCompares) {
+        size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls);
+        U32 dictMatchIndex = dms->hashTable[dmsH];
+        const U32* const dmsBt = dms->chainTable;
+        commonLengthSmaller = commonLengthLarger = 0;
+        for (; nbCompares && (dictMatchIndex > dmsLowLimit); --nbCompares) {
+            const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask);
+            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+            const BYTE* match = dmsBase + dictMatchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart);
+            if (dictMatchIndex+matchLength >= dmsHighLimit)
+                match = base + dictMatchIndex + dmsIndexDelta;   /* to prepare for next usage of match[matchLength] */
+
+            if (matchLength > bestLength) {
+                matchIndex = dictMatchIndex + dmsIndexDelta;
+                DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
+                        (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
+                if (matchLength > matchEndIdx - matchIndex)
+                    matchEndIdx = matchIndex + (U32)matchLength;
+                bestLength = matchLength;
+                matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
+                matches[mnum].len = (U32)matchLength;
+                mnum++;
+                if ( (matchLength > ZSTD_OPT_NUM)
+                   | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
+                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+            }   }
+
+            if (dictMatchIndex <= dmsBtLow) { break; }   /* beyond tree size, stop the search */
+            if (match[matchLength] < ip[matchLength]) {
+                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+                dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            } else {
+                /* match is larger than current */
+                commonLengthLarger = matchLength;
+                dictMatchIndex = nextPtr[0];
+    }   }   }  /* if (dictMode == ZSTD_dictMatchState) */
+
+    assert(matchEndIdx > curr+8);
+    ms->nextToUpdate = matchEndIdx - 8;  /* skip repetitive patterns */
+    return mnum;
+}
+
+/* Common signature of the ZSTD_btGetAllMatches_<dictMode>_<mls>() specializations.
+ * Returns a U32; callers in this file treat it as the number of entries written to @matches. */
+typedef U32 (*ZSTD_getAllMatchesFn)(
+    ZSTD_match_t* matches,
+    ZSTD_matchState_t* ms,
+    U32* nextToUpdate3,
+    const BYTE* ip, const BYTE* iHighLimit,
+    const U32 rep[ZSTD_REP_NUM],
+    U32 const ll0, U32 const lengthToBeat);
+
+/* Shared body of every ZSTD_btGetAllMatches_<dictMode>_<mls>() wrapper.
+ * Returns 0 without searching when @ip lies before ms->nextToUpdate (skipped area). */
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_btGetAllMatches_internal(
+        ZSTD_match_t* matches,
+        ZSTD_matchState_t* ms,
+        U32* nextToUpdate3,
+        const BYTE* ip, const BYTE* const iHighLimit,
+        const U32 rep[ZSTD_REP_NUM],
+        U32 const ll0, U32 const lengthToBeat,
+        const ZSTD_dictMode_e dictMode, const U32 mls)
+{
+    assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls);
+    DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls);
+    if (ip >= ms->window.base + ms->nextToUpdate) {
+        ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode);
+        return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls);
+    }
+    return 0;   /* skipped area */
+}
+
+#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls
+/* The macro above names one specialization (e.g. ZSTD_btGetAllMatches_noDict_3); the one below defines it as a static wrapper that forwards to ZSTD_btGetAllMatches_internal() with ZSTD_<dictMode> and mls as literal arguments. */
+#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls)            \
+    static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)(      \
+            ZSTD_match_t* matches,                             \
+            ZSTD_matchState_t* ms,                             \
+            U32* nextToUpdate3,                                \
+            const BYTE* ip,                                    \
+            const BYTE* const iHighLimit,                      \
+            const U32 rep[ZSTD_REP_NUM],                       \
+            U32 const ll0,                                     \
+            U32 const lengthToBeat)                            \
+    {                                                          \
+        return ZSTD_btGetAllMatches_internal(                  \
+                matches, ms, nextToUpdate3, ip, iHighLimit,    \
+                rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \
+    }
+/* Emits the four wrappers (mls = 3, 4, 5, 6) of one dictMode. */
+#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode)  \
+    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3)  \
+    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4)  \
+    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5)  \
+    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6)
+/* Instantiation: 3 dictModes x 4 mls values = 12 static wrappers. */
+GEN_ZSTD_BT_GET_ALL_MATCHES(noDict)
+GEN_ZSTD_BT_GET_ALL_MATCHES(extDict)
+GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState)
+/* Brace initializer listing the four wrappers of one dictMode, ordered by mls = 3, 4, 5, 6. */
+#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode)  \
+    {                                            \
+        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \
+        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \
+        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \
+        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6)  \
+    }
+
+/* Picks the match-finder wrapper generated for @dictMode and for ms->cParams.minMatch clamped to [3, 6]. */
+static ZSTD_getAllMatchesFn ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode)
+{
+    ZSTD_getAllMatchesFn const fnTable[3][4] = {   /* row: (int)dictMode, column: mls - 3 */
+        ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict),
+        ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict),
+        ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState)
+    };
+    U32 const mlsIdx = (U32)(BOUNDED(3, ms->cParams.minMatch, 6)) - 3;
+    assert((U32)dictMode < 3);
+    assert(mlsIdx < 4);
+    return fnTable[(int)dictMode][mlsIdx];
+}
+
+/*************************
+*  LDM helper functions  *
+*************************/
+
+/* Info needed to decide whether an ldm candidate gets included at a given block position */
+typedef struct {
+    rawSeqStore_t seqStore;   /* External match candidates store for this block (held by value) */
+    U32 startPosInBlock;      /* Start position of the current match candidate; UINT_MAX when none is usable */
+    U32 endPosInBlock;        /* End position (exclusive) of the current match candidate; UINT_MAX when none is usable */
+    U32 offset;               /* Offset of the match candidate (converted with OFFSET_TO_OFFBASE() when added to the matches) */
+} ZSTD_optLdm_t;
+
+/* ZSTD_optLdm_skipRawSeqStoreBytes():
+ * Advances @rawSeqStore by @nbBytes, counted from its current 'posInSequence'.
+ * Each fully consumed sequence bumps 'pos'; any leftover becomes the new 'posInSequence'.
+ */
+static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes)
+{
+    U32 toSkip = (U32)(rawSeqStore->posInSequence + nbBytes);
+    for ( ; toSkip != 0 && rawSeqStore->pos < rawSeqStore->size; rawSeqStore->pos++) {
+        const rawSeq* const seq = &rawSeqStore->seq[rawSeqStore->pos];
+        U32 const seqLength = seq->litLength + seq->matchLength;
+        if (toSkip < seqLength) {
+            /* the skip ends inside this sequence: 'pos' stays on it */
+            rawSeqStore->posInSequence = toSkip;
+            break;
+        }
+        toSkip -= seqLength;   /* sequence fully consumed */
+    }
+    if (toSkip == 0 || rawSeqStore->pos == rawSeqStore->size) {
+        rawSeqStore->posInSequence = 0;   /* ended on a sequence boundary, or ran out of sequences */
+    }
+}
+
+/* ZSTD_opt_getNextMatchAndUpdateSeqStore():
+ * Computes where the next ldm match starts and ends within the current block,
+ * then advances 'pos' and 'posInSequence' of the ldmSeqStore past what was consumed.
+ */
+static void
+ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock,
+                                       U32 blockBytesRemaining)
+{
+    rawSeqStore_t* const store = &optLdm->seqStore;
+    U32 const blockEnd = currPosInBlock + blockBytesRemaining;
+    rawSeq seq;
+    U32 litLeft, matchLeft, consumed;
+
+    /* Store is empty or exhausted: set both positions to MAX so that no LDM is used during this block */
+    if (store->size == 0 || store->pos >= store->size) {
+        optLdm->startPosInBlock = UINT_MAX;
+        optLdm->endPosInBlock = UINT_MAX;
+        return;
+    }
+
+    /* Split what is left of the current sequence, given 'posInSequence',
+     * into remaining literal bytes and remaining match bytes */
+    seq = store->seq[store->pos];
+    assert(store->posInSequence <= seq.litLength + seq.matchLength);
+    if (store->posInSequence < seq.litLength) {   /* still inside the literals */
+        litLeft = seq.litLength - (U32)store->posInSequence;
+        matchLeft = seq.matchLength;
+    } else {   /* literals exhausted: part of the match may already be consumed */
+        litLeft = 0;
+        matchLeft = seq.matchLength - ((U32)store->posInSequence - seq.litLength);
+    }
+
+    /* The remaining literals alone reach the end of the block: no ldm is possible */
+    if (litLeft >= blockBytesRemaining) {
+        optLdm->startPosInBlock = UINT_MAX;
+        optLdm->endPosInBlock = UINT_MAX;
+        ZSTD_optLdm_skipRawSeqStoreBytes(store, blockBytesRemaining);
+        return;
+    }
+
+    /* Matches may be < MINMATCH by this process. In that case, they get rejected
+     * when deciding whether or not to add the ldm */
+    optLdm->startPosInBlock = currPosInBlock + litLeft;
+    optLdm->endPosInBlock = optLdm->startPosInBlock + matchLeft;
+    optLdm->offset = seq.offset;
+
+    consumed = litLeft + matchLeft;   /* by default, consume what is left of the sequence */
+    if (optLdm->endPosInBlock > blockEnd) {
+        /* Match ends after the block ends: clamp it, and consume only up to the block end */
+        optLdm->endPosInBlock = blockEnd;
+        consumed = blockEnd - currPosInBlock;
+    }
+    ZSTD_optLdm_skipRawSeqStoreBytes(store, consumed);
+}
+
+/* ZSTD_optLdm_maybeAddMatch():
+ * Appends the current ldm candidate to 'matches' when @currPosInBlock lies within
+ * ['startPosInBlock', 'endPosInBlock') and at least MINMATCH bytes of it remain.
+ * A non-empty 'matches' only grows with a longer candidate, which keeps its ordering.
+ */
+static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
+                                      const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
+{
+    U32 const count = *nbMatches;
+    /* Bytes of the candidate from the current position to its end (wraps when the position
+     * is past the end; the range check below rejects that case before the value is used).
+     * Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
+    U32 const candidateMatchLength = optLdm->endPosInBlock - currPosInBlock;
+
+    /* Ensure that current block position is not outside of the match */
+    if (currPosInBlock < optLdm->startPosInBlock) return;
+    if (currPosInBlock >= optLdm->endPosInBlock) return;
+    if (candidateMatchLength < MINMATCH) return;
+    /* Not the first entry: it must beat the last one, and there must be room left */
+    if (count != 0 && (candidateMatchLength <= matches[count-1].len || count >= ZSTD_OPT_NUM)) return;
+    {   U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
+        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
+                 candidateOffBase, candidateMatchLength, currPosInBlock);
+        matches[count].len = candidateMatchLength;
+        matches[count].off = candidateOffBase;
+        *nbMatches = count + 1;
+    }
+}
+
+/* ZSTD_optLdm_processMatchCandidate():
+ * Keeps the ldm seq store in step with @currPosInBlock, fetches the next ldm candidate
+ * once the position has reached the end of the current one, then offers it to 'matches'.
+ */
+static void
+ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
+                                  ZSTD_match_t* matches, U32* nbMatches,
+                                  U32 currPosInBlock, U32 remainingBytes)
+{
+    rawSeqStore_t* const store = &optLdm->seqStore;
+    int const storeExhausted = (store->size == 0) || (store->pos >= store->size);
+    if (storeExhausted) return;   /* no ldm candidate left */
+
+    if (currPosInBlock >= optLdm->endPosInBlock) {
+        /* This function is not necessarily called exactly at the end of an ldm match:
+         * the position is often some bytes beyond 'endPosInBlock'.
+         * Skip that "overshoot" in the seq store before fetching the next candidate. */
+        U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock;
+        if (posOvershoot != 0) {
+            ZSTD_optLdm_skipRawSeqStoreBytes(store, posOvershoot);
+        }
+        ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes);
+    }
+    ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock);
+}
+
+
+/*-*******************************
+*  Optimal parser
+*********************************/
+
+#if 0 /* debug */
+/* Compiled out. Debug helper: logs table[0..lastEltID] through RAWLOG at level 2, comma-separated, then " \n". */
+static void
+listStats(const U32* table, int lastEltID)
+{
+    int const nbElts = lastEltID + 1;
+    int enb;
+    for (enb=0; enb < nbElts; enb++) {
+        (void)table;   /* presumably keeps @table referenced when RAWLOG expands to nothing - TODO confirm */
+        /* alternative format, also printing the index: RAWLOG(2, "%3i:%3i,  ", enb, table[enb]); */
+        RAWLOG(2, "%4i,", table[enb]);
+    }
+    RAWLOG(2, " \n");
+}
+
+#endif
+
+#define LIT_PRICE(_p) ((int)ZSTD_rawLiteralsCost((_p), 1, optStatePtr, optLevel))   /* ZSTD_rawLiteralsCost() at _p with length 1, as int; expands in a scope providing optStatePtr and optLevel */
+#define LL_PRICE(_l) ((int)ZSTD_litLengthPrice((_l), optStatePtr, optLevel))   /* ZSTD_litLengthPrice() of literal length _l, as int; same scope requirement */
+#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE((_l)-1))   /* price difference between literal lengths _l and _l-1; argument parenthesized so any expression may be passed */
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t
+ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+                               seqStore_t* seqStore,
+                               U32 rep[ZSTD_REP_NUM],
+                         const void* src, size_t srcSize,
+                         const int optLevel,
+                         const ZSTD_dictMode_e dictMode)
+{
+    optState_t* const optStatePtr = &ms->opt;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + ms->window.dictLimit;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+
+    ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode);
+
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
+    U32 nextToUpdate3 = ms->nextToUpdate;
+
+    ZSTD_optimal_t* const opt = optStatePtr->priceTable;
+    ZSTD_match_t* const matches = optStatePtr->matchTable;
+    ZSTD_optimal_t lastStretch;
+    ZSTD_optLdm_t optLdm;
+
+    ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t));
+
+    optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
+    optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
+    ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
+
+    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u",
+                (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate);
+    assert(optLevel <= 2);
+    ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
+    ip += (ip==prefixStart);
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 cur, last_pos = 0;
+
+        /* find first match */
+        {   U32 const litlen = (U32)(ip - anchor);
+            U32 const ll0 = !litlen;
+            U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
+            ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
+                                              (U32)(ip-istart), (U32)(iend-ip));
+            if (!nbMatches) {
+                DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart));
+                ip++;
+                continue;
+            }
+
+            /* Match found: let's store this solution, and eventually find more candidates.
+             * During this forward pass, @opt is used to store stretches,
+             * defined as "a match followed by N literals".
+             * Note how this is different from a Sequence, which is "N literals followed by a match".
+             * Storing stretches allows us to store different match predecessors
+             * for each literal position part of a literals run. */
+
+            /* initialize opt[0] */
+            opt[0].mlen = 0;  /* there are only literals so far */
+            opt[0].litlen = litlen;
+            /* No need to include the actual price of the literals before the first match
+             * because it is static for the duration of the forward pass, and is included
+             * in every subsequent price. But, we include the literal length because
+             * the cost variation of litlen depends on the value of litlen.
+             */
+            opt[0].price = LL_PRICE(litlen);
+            ZSTD_STATIC_ASSERT(sizeof(opt[0].rep[0]) == sizeof(rep[0]));
+            ZSTD_memcpy(&opt[0].rep, rep, sizeof(opt[0].rep));
+
+            /* large match -> immediate encoding */
+            {   U32 const maxML = matches[nbMatches-1].len;
+                U32 const maxOffBase = matches[nbMatches-1].off;
+                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
+                            nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
+
+                if (maxML > sufficient_len) {
+                    lastStretch.litlen = 0;
+                    lastStretch.mlen = maxML;
+                    lastStretch.off = maxOffBase;
+                    DEBUGLOG(6, "large match (%u>%u) => immediate encoding",
+                                maxML, sufficient_len);
+                    cur = 0;
+                    last_pos = maxML;
+                    goto _shortestPath;
+            }   }
+
+            /* set prices for first matches starting position == 0 */
+            assert(opt[0].price >= 0);
+            {   U32 pos;
+                U32 matchNb;
+                for (pos = 1; pos < minMatch; pos++) {
+                    opt[pos].price = ZSTD_MAX_PRICE;
+                    opt[pos].mlen = 0;
+                    opt[pos].litlen = litlen + pos;
+                }
+                for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+                    U32 const offBase = matches[matchNb].off;
+                    U32 const end = matches[matchNb].len;
+                    for ( ; pos <= end ; pos++ ) {
+                        int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
+                        int const sequencePrice = opt[0].price + matchPrice;
+                        DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
+                                    pos, ZSTD_fCost(sequencePrice));
+                        opt[pos].mlen = pos;
+                        opt[pos].off = offBase;
+                        opt[pos].litlen = 0; /* end of match */
+                        opt[pos].price = sequencePrice + LL_PRICE(0);
+                    }
+                }
+                last_pos = pos-1;
+                opt[pos].price = ZSTD_MAX_PRICE;
+            }
+        }
+
+        /* check further positions */
+        for (cur = 1; cur <= last_pos; cur++) {
+            const BYTE* const inr = ip + cur;
+            assert(cur <= ZSTD_OPT_NUM);
+            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur);
+
+            /* Fix current position with one literal if cheaper */
+            {   U32 const litlen = opt[cur-1].litlen + 1;
+                int const price = opt[cur-1].price
+                                + LIT_PRICE(ip+cur-1)
+                                + LL_INCPRICE(litlen);
+                assert(price < 1000000000); /* overflow check */
+                if (price <= opt[cur].price) {
+                    ZSTD_optimal_t const prevMatch = opt[cur];
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
+                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
+                                opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
+                    opt[cur] = opt[cur-1];
+                    opt[cur].litlen = litlen;
+                    opt[cur].price = price;
+                    if ( (optLevel >= 1) /* additional check only for higher modes */
+                      && (prevMatch.litlen == 0) /* replace a match */
+                      && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */
+                      && LIKELY(ip + cur < iend)
+                    ) {
+                        /* check next position, in case it would be cheaper */
+                        int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1);
+                        int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1);
+                        DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f",
+                                cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals));
+                        if ( (with1literal < withMoreLiterals)
+                          && (with1literal < opt[cur+1].price) ) {
+                            /* update offset history - before it disappears */
+                            U32 const prev = cur - prevMatch.mlen;
+                            repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0);
+                            assert(cur >= prevMatch.mlen);
+                            DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !",
+                                        ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals),
+                                        newReps.rep[0], newReps.rep[1], newReps.rep[2] );
+                            opt[cur+1] = prevMatch;  /* mlen & offbase */
+                            ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t));
+                            opt[cur+1].litlen = 1;
+                            opt[cur+1].price = with1literal;
+                            if (last_pos < cur+1) last_pos = cur+1;
+                        }
+                    }
+                } else {
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)",
+                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price));
+                }
+            }
+
+            /* Offset history is not updated during match comparison.
+             * Do it here, now that the match is selected and confirmed.
+             */
+            ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
+            assert(cur >= opt[cur].mlen);
+            if (opt[cur].litlen == 0) {
+                /* just finished a match => alter offset history */
+                U32 const prev = cur - opt[cur].mlen;
+                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0);
+                ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
+            }
+
+            /* last match must start at a minimum distance of 8 from oend */
+            if (inr > ilimit) continue;
+
+            if (cur == last_pos) break;
+
+            if ( (optLevel==0) /*static_test*/
+              && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
+                DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1);
+                continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+            }
+
+            assert(opt[cur].price >= 0);
+            {   U32 const ll0 = (opt[cur].litlen == 0);
+                int const previousPrice = opt[cur].price;
+                int const basePrice = previousPrice + LL_PRICE(0);
+                U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
+                U32 matchNb;
+
+                ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
+                                                  (U32)(inr-istart), (U32)(iend-inr));
+
+                if (!nbMatches) {
+                    DEBUGLOG(7, "rPos:%u : no match found", cur);
+                    continue;
+                }
+
+                {   U32 const longestML = matches[nbMatches-1].len;
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u",
+                                inr-istart, cur, nbMatches, longestML);
+
+                    if ( (longestML > sufficient_len)
+                      || (cur + longestML >= ZSTD_OPT_NUM)
+                      || (ip + cur + longestML >= iend) ) {
+                        lastStretch.mlen = longestML;
+                        lastStretch.off = matches[nbMatches-1].off;
+                        lastStretch.litlen = 0;
+                        last_pos = cur + longestML;
+                        goto _shortestPath;
+                }   }
+
+                /* set prices using matches found at position == cur */
+                for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+                    U32 const offset = matches[matchNb].off;
+                    U32 const lastML = matches[matchNb].len;
+                    U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+                    U32 mlen;
+
+                    DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
+                                matchNb, matches[matchNb].off, lastML, opt[cur].litlen);
+
+                    for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
+                        U32 const pos = cur + mlen;
+                        int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+
+                        if ((pos > last_pos) || (price < opt[pos].price)) {
+                            DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
+                                        pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+                            while (last_pos < pos) {
+                                /* fill empty positions, for future comparisons */
+                                last_pos++;
+                                opt[last_pos].price = ZSTD_MAX_PRICE;
+                                opt[last_pos].litlen = !0;  /* just needs to be != 0, to mean "not an end of match" */
+                            }
+                            opt[pos].mlen = mlen;
+                            opt[pos].off = offset;
+                            opt[pos].litlen = 0;
+                            opt[pos].price = price;
+                        } else {
+                            DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
+                                        pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+                            if (optLevel==0) break;  /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
+                        }
+            }   }   }
+            opt[last_pos+1].price = ZSTD_MAX_PRICE;
+        }  /* for (cur = 1; cur <= last_pos; cur++) */
+
+        lastStretch = opt[last_pos];
+        assert(cur >= lastStretch.mlen);
+        cur = last_pos - lastStretch.mlen;
+
+_shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
+        assert(opt[0].mlen == 0);
+        assert(last_pos >= lastStretch.mlen);
+        assert(cur == last_pos - lastStretch.mlen);
+
+        if (lastStretch.mlen==0) {
+            /* no solution : all matches have been converted into literals */
+            assert(lastStretch.litlen == (ip - anchor) + last_pos);
+            ip += last_pos;
+            continue;
+        }
+        assert(lastStretch.off > 0);
+
+        /* Update offset history */
+        if (lastStretch.litlen == 0) {
+            /* finishing on a match : update offset history */
+            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0);
+            ZSTD_memcpy(rep, &reps, sizeof(repcodes_t));
+        } else {
+            ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t));
+            assert(cur >= lastStretch.litlen);
+            cur -= lastStretch.litlen;
+        }
+
+        /* Let's write the shortest path solution.
+         * It is stored in @opt in reverse order,
+         * starting from @storeEnd (==cur+2),
+         * effectively partially @opt overwriting.
+         * Content is changed too:
+         * - So far, @opt stored stretches, aka a match followed by literals
+         * - Now, it will store sequences, aka literals followed by a match
+         */
+        {   U32 const storeEnd = cur + 2;
+            U32 storeStart = storeEnd;
+            U32 stretchPos = cur;
+
+            DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
+                        last_pos, cur); (void)last_pos;
+            assert(storeEnd < ZSTD_OPT_SIZE);
+            DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+                        storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off);
+            if (lastStretch.litlen > 0) {
+                /* last "sequence" is unfinished: just a bunch of literals */
+                opt[storeEnd].litlen = lastStretch.litlen;
+                opt[storeEnd].mlen = 0;
+                storeStart = storeEnd-1;
+                opt[storeStart] = lastStretch;
+            } {
+                opt[storeEnd] = lastStretch;  /* note: litlen will be fixed */
+                storeStart = storeEnd;
+            }
+            while (1) {
+                ZSTD_optimal_t nextStretch = opt[stretchPos];
+                opt[storeStart].litlen = nextStretch.litlen;
+                DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)",
+                            opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off);
+                if (nextStretch.mlen == 0) {
+                    /* reaching beginning of segment */
+                    break;
+                }
+                storeStart--;
+                opt[storeStart] = nextStretch; /* note: litlen will be fixed */
+                assert(nextStretch.litlen + nextStretch.mlen <= stretchPos);
+                stretchPos -= nextStretch.litlen + nextStretch.mlen;
+            }
+
+            /* save sequences */
+            DEBUGLOG(6, "sending selected sequences into seqStore");
+            {   U32 storePos;
+                for (storePos=storeStart; storePos <= storeEnd; storePos++) {
+                    U32 const llen = opt[storePos].litlen;
+                    U32 const mlen = opt[storePos].mlen;
+                    U32 const offBase = opt[storePos].off;
+                    U32 const advance = llen + mlen;
+                    DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
+                                anchor - istart, (unsigned)llen, (unsigned)mlen);
+
+                    if (mlen==0) {  /* only literals => must be last "sequence", actually starting a new stream of sequences */
+                        assert(storePos == storeEnd);   /* must be last sequence */
+                        ip = anchor + llen;     /* last "sequence" is a bunch of literals => don't progress anchor */
+                        continue;   /* will finish */
+                    }
+
+                    assert(anchor + llen <= iend);
+                    ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
+                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
+                    anchor += advance;
+                    ip = anchor;
+            }   }
+            DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]);
+
+            /* update all costs */
+            ZSTD_setBasePrices(optStatePtr, optLevel);
+        }
+    }   /* while (ip < ilimit) */
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+#endif /* build exclusions */
+
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+static size_t ZSTD_compressBlock_opt0(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+{   /* calls ZSTD_compressBlock_opt_generic() with optLevel fixed to 0; all other arguments are forwarded unchanged */
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
+}
+#endif /* ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR */
+
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+static size_t ZSTD_compressBlock_opt2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+{   /* calls ZSTD_compressBlock_opt_generic() with optLevel fixed to 2; all other arguments are forwarded unchanged */
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
+}
+#endif /* ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR */
+
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{   /* btopt entry point : ZSTD_compressBlock_opt0() with dictMode fixed to ZSTD_noDict */
+    DEBUGLOG(5, "ZSTD_compressBlock_btopt");
+    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
+}
+#endif /* ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR */
+
+
+
+
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+/* ZSTD_initStats_ultra():
+ * run a first compression pass over the block, just to seed the stats in ms->opt,
+ * then drop the produced sequences and move the window limits past the scanned data.
+ * Narrow contract (asserted below) : first block, no dictionary, no prefix, no ldm.
+ * This function cannot error out. @rep is only read : the pass works on a copy. */
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+                          seqStore_t* seqStore,
+                          U32 rep[ZSTD_REP_NUM],
+                    const void* src, size_t srcSize)
+{
+    U32 const scannedSize = (U32)srcSize;
+    U32 scratchRep[ZSTD_REP_NUM];  /* updated rep codes will sink here */
+
+    ZSTD_memcpy(scratchRep, rep, sizeof(scratchRep));
+    DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize);
+    assert(ms->opt.litLengthSum == 0);    /* first block */
+    assert(seqStore->sequences == seqStore->sequencesStart);   /* no ldm */
+    assert(ms->window.lowLimit == ms->window.dictLimit);   /* no dictionary */
+    assert(ms->window.dictLimit - ms->nextToUpdate <= 1);  /* no prefix (note: intentional overflow, defined as 2-complement) */
+
+    /* first pass : generate stats into ms->opt */
+    ZSTD_compressBlock_opt2(ms, seqStore, scratchRep, src, srcSize, ZSTD_noDict);
+
+    /* invalidate first scan from history, only keep entropy stats */
+    ZSTD_resetSeqStore(seqStore);
+    ms->window.base -= srcSize;
+    ms->window.dictLimit += scannedSize;
+    ms->nextToUpdate = ms->window.lowLimit = ms->window.dictLimit;
+}
+
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{   /* btultra entry point : ZSTD_compressBlock_opt2() with dictMode fixed to ZSTD_noDict */
+    DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btultra2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{
+    /* 2-passes strategy:
+     * a first pass over the first block collects statistics,
+     * which then seed the statistics of the real (second) pass.
+     * After the 1st pass, history is forgotten and the block is started anew.
+     * Consequently, this can only work if no data has been previously loaded in tables,
+     * aka, no dictionary, no prefix, no ldm preprocessing.
+     * The compression ratio gain is generally small (~0.5% on first block),
+     * the cost is 2x cpu time on first block. */
+    U32 const blockStart = (U32)((const BYTE*)src - ms->window.base);
+    int const firstBlock = (ms->opt.litLengthSum == 0);
+    int const noLdm = (seqStore->sequences == seqStore->sequencesStart);
+    int const noDict = (ms->window.dictLimit == ms->window.lowLimit);
+    int const frameStart = (blockStart == ms->window.dictLimit);  /* nothing already loaded nor skipped */
+    int const bigEnough = (srcSize > ZSTD_PREDEF_THRESHOLD);  /* input large enough to not employ default stats */
+
+    DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
+    assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+    if (firstBlock && noLdm && noDict && frameStart && bigEnough) {
+        ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
+    }
+
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{   /* ZSTD_compressBlock_opt0() with dictMode fixed to ZSTD_dictMatchState */
+    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{   /* ZSTD_compressBlock_opt0() with dictMode fixed to ZSTD_extDict */
+    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
+}
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{   /* ZSTD_compressBlock_opt2() with dictMode fixed to ZSTD_dictMatchState */
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{   /* ZSTD_compressBlock_opt2() with dictMode fixed to ZSTD_extDict */
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
+}
+#endif
+
+/* note : no btultra2 variant for extDict nor dictMatchState,
+ * because btultra2 is not meant to work with dictionaries
+ * and is only specific for the first block (no prefix) */
diff --git a/src/zstd/compress/zstd_opt.h b/src/zstd/compress/zstd_opt.h
new file mode 100644
index 000000000..d4e711315
--- /dev/null
+++ b/src/zstd/compress/zstd_opt.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_OPT_H
+#define ZSTD_OPT_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"
+
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+/* declared unless the btlazy2, btopt and btultra block compressors are all excluded; used in ZSTD_loadDictionaryContent() */
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
+#endif /* any of btlazy2 / btopt / btultra is built */
+
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt
+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict
+#else   /* btopt excluded from the build : the selector macros expand to NULL */
+#define ZSTD_COMPRESSBLOCK_BTOPT NULL
+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL
+#endif /* ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR */
+
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+/* note : no btultra2 variant for extDict nor dictMatchState,
+ * because btultra2 is not meant to work with dictionaries
+ * and is only specific for the first block (no prefix) */
+size_t ZSTD_compressBlock_btultra2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra
+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict
+#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2
+#else   /* btultra excluded from the build : the selector macros expand to NULL */
+#define ZSTD_COMPRESSBLOCK_BTULTRA NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL
+#endif /* ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_OPT_H */
diff --git a/src/zstd/compress/zstdmt_compress.c b/src/zstd/compress/zstdmt_compress.c
new file mode 100644
index 000000000..86ccce318
--- /dev/null
+++ b/src/zstd/compress/zstdmt_compress.c
@@ -0,0 +1,1882 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)   /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+/* ======   Dependencies   ====== */
+#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
+#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */
+#include "../common/mem.h"         /* MEM_STATIC */
+#include "../common/pool.h"        /* threadpool */
+#include "../common/threading.h"   /* mutex */
+#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
+#include "zstd_ldm.h"
+#include "zstdmt_compress.h"
+
+/* Guards code to support resizing the SeqPool.
+ * We will want to resize the SeqPool to save memory in the future.
+ * Until then, comment the code out since it is unused.
+ */
+#define ZSTD_RESIZE_SEQPOOL 0
+
+/* ======   Debug   ====== */
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \
+    && !defined(_MSC_VER) \
+    && !defined(__MINGW32__)
+
+#  include <stdio.h>
+#  include <unistd.h>
+#  include <sys/times.h>
+/* DEBUG_PRINTHEX(l,p,n) : RAWLOG, at level l, the n bytes starting at p as 2-digit uppercase hex values, then a newline */
+#  define DEBUG_PRINTHEX(l,p,n)                                       \
+    do {                                                              \
+        unsigned debug_u;                                             \
+        for (debug_u=0; debug_u<(n); debug_u++)                       \
+            RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
+        RAWLOG(l, " \n");                                             \
+    } while (0)
+
+static unsigned long long GetCurrentClockTimeMicroseconds(void)
+{   /* tick count reported by times(), rescaled to microseconds using sysconf(_SC_CLK_TCK) */
+    static clock_t ticksPerSec = 0;   /* cached; queried again as long as it is not > 0 */
+    struct tms ignored; unsigned long long nowTicks;
+    if (ticksPerSec <= 0) ticksPerSec = sysconf(_SC_CLK_TCK);
+    nowTicks = (unsigned long long)(clock_t) times(&ignored);
+    return (nowTicks * 1000000) / ticksPerSec;
+}
+/* ZSTD_PTHREAD_MUTEX_LOCK() : locks mutex; when DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL, also logs any acquisition that took more than 1000 microseconds */
+#define MUTEX_WAIT_TIME_DLEVEL 6
+#define ZSTD_PTHREAD_MUTEX_LOCK(mutex)                                                  \
+    do {                                                                                \
+        if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) {                                     \
+            unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds();    \
+            ZSTD_pthread_mutex_lock(mutex);                                             \
+            {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
+                unsigned long long const elapsedTime = (afterTime-beforeTime);          \
+                if (elapsedTime > 1000) {                                               \
+                    /* or whatever threshold you like; I'm using 1 millisecond here */  \
+                    DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL,                                    \
+                        "Thread took %llu microseconds to acquire mutex %s \n",         \
+                        elapsedTime, #mutex);                                           \
+            }   }                                                                       \
+        } else {                                                                        \
+            ZSTD_pthread_mutex_lock(mutex);                                             \
+        }                                                                               \
+    } while (0)
+
+#else   /* DEBUGLEVEL < 2 (or undefined), or MSVC / MinGW build */
+/* plain lock without timing; DEBUG_PRINTHEX expands to an empty statement */
+#  define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m)
+#  define DEBUG_PRINTHEX(l,p,n) do { } while (0)
+
+#endif
+
+
+/* =====   Buffer Pool   ===== */
+/* a single Buffer Pool can be invoked from multiple threads in parallel */
+
+typedef struct buffer_s {
+    void* start;       /* base of the region; NULL when ZSTDMT_getBuffer() could not allocate */
+    size_t capacity;   /* size of the region at start, as set by ZSTDMT_getBuffer(); 0 when start==NULL */
+} buffer_t;
+
+static const buffer_t g_nullBuffer = { NULL, 0 };   /* written into a pool slot once its buffer has been handed out */
+
+typedef struct ZSTDMT_bufferPool_s {
+    ZSTD_pthread_mutex_t poolMutex;   /* held by get/release/sizeof/setBufferSize while they touch the fields below */
+    size_t bufferSize;       /* size requested for newly allocated buffers; 64 KB at creation */
+    unsigned totalBuffers;   /* number of slots in buffers[] */
+    unsigned nbBuffers;      /* number of buffers currently stored; they occupy slots [0, nbBuffers) */
+    ZSTD_customMem cMem;     /* allocator used to free the pool, the slot array and the buffers */
+    buffer_t* buffers;       /* slot array, zero-initialized at creation */
+} ZSTDMT_bufferPool;
+
+static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
+{   /* frees, in order: every buffer held in the slots, the slot array, the mutex, the pool itself */
+    DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool);
+    if (bufPool == NULL) return;   /* compatibility with free on NULL */
+    if (bufPool->buffers != NULL) {
+        unsigned slot;
+        for (slot = 0; slot < bufPool->totalBuffers; slot++) {
+            DEBUGLOG(4, "free buffer %2u (address:%08X)", slot, (U32)(size_t)bufPool->buffers[slot].start);
+            ZSTD_customFree(bufPool->buffers[slot].start, bufPool->cMem);
+        }
+        ZSTD_customFree(bufPool->buffers, bufPool->cMem);
+    }
+    ZSTD_pthread_mutex_destroy(&bufPool->poolMutex);
+    ZSTD_customFree(bufPool, bufPool->cMem);
+}
+
+static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
+{   /* @return : an empty pool of maxNbBuffers slots (bufferSize 64 KB) allocated through cMem, or NULL on failure */
+    ZSTDMT_bufferPool* const bufPool =
+        (ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem);
+    if (bufPool==NULL) return NULL;
+    if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
+        ZSTD_customFree(bufPool, cMem);
+        return NULL;
+    }
+    bufPool->cMem = cMem;   /* must be set before ZSTDMT_freeBufferPool() can run : it frees through bufPool->cMem */
+    bufPool->buffers = (buffer_t*)ZSTD_customCalloc(maxNbBuffers * sizeof(buffer_t), cMem);
+    if (bufPool->buffers==NULL) {
+        ZSTDMT_freeBufferPool(bufPool);
+        return NULL;
+    }
+    bufPool->bufferSize = 64 KB;
+    bufPool->totalBuffers = maxNbBuffers;
+    bufPool->nbBuffers = 0;
+    return bufPool;
+}
+
+/* ZSTDMT_sizeof_bufferPool() :
+ * @return : size of the pool struct + its slot array + capacity of every slot.
+ * only works at initialization, not during compression */
+static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool)
+{
+    size_t total = sizeof(*bufPool) + bufPool->totalBuffers * sizeof(buffer_t);
+    unsigned slot;
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    for (slot = 0; slot < bufPool->totalBuffers; slot++) {
+        total += bufPool->buffers[slot].capacity;
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+    return total;
+}
+
+/* ZSTDMT_setBufferSize() :
+ * all future buffers provided by this buffer pool will have _at least_ this size
+ * note : it's better for all buffers to have same size,
+ * as they become freely interchangeable, reducing malloc/free usages and memory fragmentation */
+static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const bSize)
+{
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    DEBUGLOG(4, "ZSTDMT_setBufferSize: bSize = %u", (U32)bSize);
+    bufPool->bufferSize = bSize;   /* written under poolMutex; stored buffers are re-checked against it in ZSTDMT_getBuffer() */
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+}
+
+
+static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, unsigned maxNbBuffers)
+{   /* @return : a pool with >= maxNbBuffers slots (srcBufPool itself when already large enough), or NULL.
+     * When a larger pool is needed, srcBufPool is freed first and must no longer be used. */
+    ZSTDMT_bufferPool* largerPool;
+    ZSTD_customMem cMem;
+    size_t bSize;
+    if (srcBufPool == NULL) return NULL;
+    if (maxNbBuffers <= srcBufPool->totalBuffers) return srcBufPool;   /* good enough */
+    /* need a larger buffer pool : save the parameters to forward, then replace the pool */
+    cMem = srcBufPool->cMem;
+    bSize = srcBufPool->bufferSize;
+    ZSTDMT_freeBufferPool(srcBufPool);
+    largerPool = ZSTDMT_createBufferPool(maxNbBuffers, cMem);
+    if (largerPool != NULL) ZSTDMT_setBufferSize(largerPool, bSize);
+    return largerPool;
+}
+
+/** ZSTDMT_getBuffer() :
+ *  assumption : bufPool must be valid
+ * @return : a buffer, with start pointer and size; taken from the pool when the last stored one fits, freshly allocated otherwise
+ *  note: allocation may fail, in this case, start==NULL and size==0 */
+static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool)
+{
+    size_t const bSize = bufPool->bufferSize;   /* note : read before poolMutex is taken */
+    DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize);
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    if (bufPool->nbBuffers) {   /* try to use an existing buffer */
+        buffer_t const buf = bufPool->buffers[--(bufPool->nbBuffers)];   /* pop the most recently stored buffer */
+        size_t const availBufferSize = buf.capacity;
+        bufPool->buffers[bufPool->nbBuffers] = g_nullBuffer;   /* slot is emptied whether or not buf gets reused */
+        if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) {
+            /* large enough, but not too much : capacity >= bSize and capacity/8 <= bSize */
+            DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u",
+                        bufPool->nbBuffers, (U32)buf.capacity);
+            ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+            return buf;
+        }
+        /* size conditions not respected : scratch this buffer, create new one */
+        DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing");
+        ZSTD_customFree(buf.start, bufPool->cMem);   /* freed while still holding poolMutex */
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+    /* create new buffer (allocation happens outside of the lock) */
+    DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer");
+    {   buffer_t buffer;
+        void* const start = ZSTD_customMalloc(bSize, bufPool->cMem);
+        buffer.start = start;   /* note : start can be NULL if malloc fails ! */
+        buffer.capacity = (start==NULL) ? 0 : bSize;
+        if (start==NULL) {
+            DEBUGLOG(5, "ZSTDMT_getBuffer: buffer allocation failure !!");
+        } else {
+            DEBUGLOG(5, "ZSTDMT_getBuffer: created buffer of size %u", (U32)bSize);
+        }
+        return buffer;
+    }
+}
+
+#if ZSTD_RESIZE_SEQPOOL
+/** ZSTDMT_resizeBuffer() :
+ * assumption : bufPool must be valid
+ * @return : a buffer of at least bufPool->bufferSize bytes, with the input's content copied if a new one
+ *           was allocated; when that allocation fails, the (too small) input buffer is returned as is.
+ * NOTE(review): the input buffer is not freed here after a successful copy - confirm the caller owns it. */
+static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer)
+{
+    size_t const bSize = bufPool->bufferSize;
+    if (buffer.capacity < bSize) {
+        void* const start = ZSTD_customMalloc(bSize, bufPool->cMem);
+        buffer_t newBuffer;
+        newBuffer.start = start;
+        newBuffer.capacity = start == NULL ? 0 : bSize;
+        if (start != NULL) {
+            assert(newBuffer.capacity >= buffer.capacity);
+            ZSTD_memcpy(newBuffer.start, buffer.start, buffer.capacity);
+            DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)bSize);
+            return newBuffer;
+        }
+        DEBUGLOG(5, "ZSTDMT_resizeBuffer: buffer allocation failure !!");
+    }
+    return buffer;   /* already large enough, or allocation failed */
+}
+#endif
+
+/* store buffer for later re-use, up to pool capacity; a buffer that does not fit is freed */
+static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf)
+{
+    int stored = 0;
+    DEBUGLOG(5, "ZSTDMT_releaseBuffer");
+    if (buf.start == NULL) return;   /* compatible with release on NULL */
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    if (bufPool->nbBuffers < bufPool->totalBuffers) {
+        unsigned const slot = bufPool->nbBuffers++;
+        bufPool->buffers[slot] = buf;  /* stored for later use */
+        DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u", (U32)buf.capacity, (U32)slot);
+        stored = 1;
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+    if (stored) return;
+    DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing ");
+    ZSTD_customFree(buf.start, bufPool->cMem);   /* Reached bufferPool capacity (note: should not happen) */
+}
+
+/* We need 2 output buffers per worker since each dstBuff must be flushed after it is released.
+ * The 3 additional buffers are as follows:
+ *   1 buffer for input loading
+ *   1 buffer for "next input" when submitting current one
+ *   1 buffer stuck in queue */
+#define BUF_POOL_MAX_NB_BUFFERS(nbWorkers) (2*(nbWorkers) + 3)
+
+/* After a worker releases its rawSeqStore, it is immediately ready for reuse.
+ * So we only need one seq buffer per worker. */
+#define SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) (nbWorkers)
+
+/* =====   Seq Pool Wrapper   ====== */
+
+typedef ZSTDMT_bufferPool ZSTDMT_seqPool;
+
+static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool)
+{   /* a seqPool is a bufferPool (typedef) : same size computation */
+    return ZSTDMT_sizeof_bufferPool(seqPool);
+}
+
+static rawSeqStore_t bufferToSeq(buffer_t buffer)
+{   /* view @buffer as a rawSeq array : capacity is converted from bytes to a number of rawSeq */
+    rawSeqStore_t store = kNullRawSeqStore;
+    store.capacity = buffer.capacity / sizeof(rawSeq);
+    store.seq = (rawSeq*)buffer.start;
+    return store;
+}
+
+static buffer_t seqToBuffer(rawSeqStore_t seq)
+{   /* view the rawSeq array of @seq as a buffer : capacity is converted from a number of rawSeq to bytes */
+    buffer_t asBuffer;
+    asBuffer.capacity = seq.capacity * sizeof(rawSeq);
+    asBuffer.start = seq.seq;
+    return asBuffer;
+}
+
+static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool)
+{   /* a pool whose bufferSize is 0 hands out kNullRawSeqStore without touching its buffers */
+    if (seqPool->bufferSize != 0) {
+        return bufferToSeq(ZSTDMT_getBuffer(seqPool));
+    }
+    return kNullRawSeqStore;
+}
+
+#if ZSTD_RESIZE_SEQPOOL
+static rawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq)
+{ /* ZSTDMT_resizeBuffer() applied to the storage of @seq, result converted back to a rawSeqStore_t */
+  return bufferToSeq(ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq)));
+}
+#endif
+
+static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq)
+{ /* hands the storage of @seq back to the pool through ZSTDMT_releaseBuffer() */
+  ZSTDMT_releaseBuffer(seqPool, seqToBuffer(seq));
+}
+
+static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq)
+{ /* sets the pool's buffer size to nbSeq entries of type rawSeq, expressed in bytes */
+  ZSTDMT_setBufferSize(seqPool, nbSeq * sizeof(rawSeq));
+}
+
+static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
+{
+    /* a buffer pool of SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) slots, whose buffer size starts at 0 sequences */
+    ZSTDMT_seqPool* const pool = ZSTDMT_createBufferPool(SEQ_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
+    if (pool != NULL) ZSTDMT_setNbSeq(pool, 0);
+    return pool;
+}
+
+static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool)
+{   /* a seq pool is released exactly like the buffer pool it is a typedef of */
+    ZSTDMT_freeBufferPool(seqPool);
+}
+
+static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers)
+{   /* @return : whatever ZSTDMT_expandBufferPool() returns ; ZSTDMT_resize() treats NULL as an allocation failure */
+    return ZSTDMT_expandBufferPool(pool, SEQ_POOL_MAX_NB_BUFFERS(nbWorkers));
+}
+
+
+/* =====   CCtx Pool   ===== */
+/* a single CCtx Pool can be invoked from multiple threads in parallel */
+
+typedef struct {
+    ZSTD_pthread_mutex_t poolMutex;   /* taken by get / release / sizeof while they access the fields below */
+    int totalCCtx;                    /* capacity of cctxs[] */
+    int availCCtx;                    /* nb of idle cctx currently stored, at cctxs[0 .. availCCtx-1] */
+    ZSTD_customMem cMem;              /* allocator used for the pool, the array, and extra cctx */
+    ZSTD_CCtx** cctxs;                /* array used as a stack of idle contexts */
+} ZSTDMT_CCtxPool;
+
+/* ZSTDMT_freeCCtxPool() :
+ * note : every CCtx borrowed from the pool must be given back to it _before_ the pool is freed */
+static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
+{
+    if (pool == NULL) return;   /* accepts NULL */
+    ZSTD_pthread_mutex_destroy(&pool->poolMutex);
+    if (pool->cctxs != NULL) {
+        int slot = 0;
+        while (slot < pool->totalCCtx) ZSTD_freeCCtx(pool->cctxs[slot++]);  /* free compatible with NULL */
+        ZSTD_customFree(pool->cctxs, pool->cMem);
+    }
+    ZSTD_customFree(pool, pool->cMem);
+}
+
+/* ZSTDMT_createCCtxPool() :
+ * implies nbWorkers >= 1 , checked by caller ZSTDMT_createCCtx()
+ * @return : a pool of capacity nbWorkers holding 1 ready cctx, or NULL on failure */
+static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers,
+                                              ZSTD_customMem cMem)
+{
+    ZSTDMT_CCtxPool* const cctxPool =
+        (ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem);
+    assert(nbWorkers > 0);
+    if (!cctxPool) return NULL;
+    if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) {
+        ZSTD_customFree(cctxPool, cMem);
+        return NULL;
+    }
+    /* record cMem before any call to ZSTDMT_freeCCtxPool() : it frees through
+     * pool->cMem, and a still-zeroed cMem would bypass the custom free function */
+    cctxPool->cMem = cMem;
+    cctxPool->totalCCtx = nbWorkers;
+    cctxPool->cctxs = (ZSTD_CCtx**)ZSTD_customCalloc(nbWorkers * sizeof(ZSTD_CCtx*), cMem);
+    if (!cctxPool->cctxs) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; }
+    cctxPool->cctxs[0] = ZSTD_createCCtx_advanced(cMem);
+    if (!cctxPool->cctxs[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; }
+    cctxPool->availCCtx = 1;   /* at least one cctx for single-thread mode */
+    DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers);
+    return cctxPool;
+}
+
+static ZSTDMT_CCtxPool* ZSTDMT_expandCCtxPool(ZSTDMT_CCtxPool* srcPool,
+                                              int nbWorkers)
+{
+    ZSTD_customMem allocator;
+    if (srcPool == NULL) return NULL;
+    if (srcPool->totalCCtx >= nbWorkers) return srcPool;   /* current capacity is sufficient */
+    /* too small : save the allocator, discard the pool, build a fresh one (NULL if that fails) */
+    allocator = srcPool->cMem;
+    ZSTDMT_freeCCtxPool(srcPool);
+    return ZSTDMT_createCCtxPool(nbWorkers, allocator);
+}
+
+/* ZSTDMT_sizeof_CCtxPool() :
+ * only works during initialization phase, not during compression */
+static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool)
+{
+    size_t total = sizeof(*cctxPool);   /* starts with the pool structure itself */
+    unsigned slot;
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    {   unsigned const nbSlots = cctxPool->totalCCtx;
+        total += cctxPool->totalCCtx * sizeof(ZSTD_CCtx*);   /* array of pointers */
+        for (slot = 0; slot < nbSlots; slot++) {
+            total += ZSTD_sizeof_CCtx(cctxPool->cctxs[slot]);   /* all totalCCtx slots are visited */
+        }
+        ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+        assert(nbSlots > 0);
+    }
+    return total;
+}
+
+static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool)
+{
+    ZSTD_CCtx* pooled = NULL;
+    int popped;
+    DEBUGLOG(5, "ZSTDMT_getCCtx");
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    popped = (cctxPool->availCCtx != 0);
+    if (popped) pooled = cctxPool->cctxs[--cctxPool->availCCtx];   /* take the top of the stack */
+    ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+    if (popped) return pooled;
+    /* pool is empty : allocate an extra cctx, after the lock has been released */
+    DEBUGLOG(5, "create one more CCtx");
+    return ZSTD_createCCtx_advanced(cctxPool->cMem);   /* note : can be NULL, when creation fails ! */
+}
+
+static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx)
+{
+    if (cctx != NULL) {   /* compatibility with release on NULL */
+        ZSTD_pthread_mutex_lock(&pool->poolMutex);
+        if (pool->availCCtx >= pool->totalCCtx) {   /* pool overflow : should not happen, since totalCCtx==nbWorkers */
+            DEBUGLOG(4, "CCtx pool overflow : free cctx");
+            ZSTD_freeCCtx(cctx);
+        } else {
+            pool->cctxs[pool->availCCtx++] = cctx;   /* push back on top of the stack */
+        }
+        ZSTD_pthread_mutex_unlock(&pool->poolMutex);
+    }
+}
+
+/* ====   Serial State   ==== */
+
+typedef struct {
+    void const* start;   /* first byte of the range ; NULL in kNullRange */
+    size_t size;         /* length of the range ; passed as a byte count to XXH64_update() */
+} range_t;
+
+typedef struct {
+    /* All variables in the struct are protected by mutex. */
+    ZSTD_pthread_mutex_t mutex;
+    ZSTD_pthread_cond_t cond;   /* broadcast whenever nextJobID is advanced */
+    ZSTD_CCtx_params params;   /* copy of the parameters given to ZSTDMT_serialState_reset(), after LDM adjustment */
+    ldmState_t ldmState;
+    XXH64_state_t xxhState;   /* checksum state, fed with each job's source when checksumFlag is set */
+    unsigned nextJobID;   /* ID of the job whose turn it is to run the serial step */
+    /* Protects ldmWindow.
+     * Must be acquired after the main mutex when acquiring both.
+     */
+    ZSTD_pthread_mutex_t ldmWindowMutex;
+    ZSTD_pthread_cond_t ldmWindowCond;  /* Signaled when ldmWindow is updated */
+    ZSTD_window_t ldmWindow;  /* A thread-safe copy of ldmState.window */
+} serialState_t;
+
+static int
+ZSTDMT_serialState_reset(serialState_t* serialState,
+                         ZSTDMT_seqPool* seqPool,
+                         ZSTD_CCtx_params params,
+                         size_t jobSize,
+                         const void* dict, size_t const dictSize,
+                         ZSTD_dictContentType_e dictContentType)
+{   /* @return : 0 on success, 1 when an LDM table cannot be allocated. params is a by-value copy, adjusted locally */
+    /* Adjust parameters */
+    if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
+        DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
+        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+        assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+        assert(params.ldmParams.hashRateLog < 32);
+    } else {
+        ZSTD_memset(&params.ldmParams, 0, sizeof(params.ldmParams));
+    }
+    serialState->nextJobID = 0;
+    if (params.fParams.checksumFlag)
+        XXH64_reset(&serialState->xxhState, 0);
+    if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
+        ZSTD_customMem cMem = params.customMem;
+        unsigned const hashLog = params.ldmParams.hashLog;
+        size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t);
+        unsigned const bucketLog =
+            params.ldmParams.hashLog - params.ldmParams.bucketSizeLog;
+        unsigned const prevBucketLog =   /* computed from the parameters stored by the previous reset */
+            serialState->params.ldmParams.hashLog -
+            serialState->params.ldmParams.bucketSizeLog;
+        size_t const numBuckets = (size_t)1 << bucketLog;
+        /* Size the seq pool tables */
+        ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize));
+        /* Reset the window */
+        ZSTD_window_init(&serialState->ldmState.window);
+        /* Resize tables and output space if necessary. */
+        if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) {   /* tables only grow : a large enough one is kept */
+            ZSTD_customFree(serialState->ldmState.hashTable, cMem);
+            serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_customMalloc(hashSize, cMem);
+        }
+        if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) {
+            ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem);
+            serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(numBuckets, cMem);
+        }
+        if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets)
+            return 1;   /* allocation failure ; serialState->params is left as it was */
+        /* Zero the tables */
+        ZSTD_memset(serialState->ldmState.hashTable, 0, hashSize);
+        ZSTD_memset(serialState->ldmState.bucketOffsets, 0, numBuckets);
+
+        /* Update window state and fill hash table with dict */
+        serialState->ldmState.loadedDictEnd = 0;
+        if (dictSize > 0) {
+            if (dictContentType == ZSTD_dct_rawContent) {
+                BYTE const* const dictEnd = (const BYTE*)dict + dictSize;
+                ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0);
+                ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, &params.ldmParams);
+                serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base);
+            } else {
+                /* don't even load anything */
+            }
+        }
+
+        /* Initialize serialState's copy of ldmWindow. */
+        serialState->ldmWindow = serialState->ldmState.window;
+    }
+
+    serialState->params = params;   /* stores the locally adjusted copy */
+    serialState->params.jobSize = (U32)jobSize;
+    return 0;
+}
+
+static int ZSTDMT_serialState_init(serialState_t* serialState)
+{
+    int status;   /* OR of all init return codes : non-zero if any primitive failed to initialize */
+    ZSTD_memset(serialState, 0, sizeof(*serialState));
+    status  = ZSTD_pthread_mutex_init(&serialState->mutex, NULL);
+    status |= ZSTD_pthread_cond_init(&serialState->cond, NULL);
+    status |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL);
+    status |= ZSTD_pthread_cond_init(&serialState->ldmWindowCond, NULL);
+    return status;
+}
+
+static void ZSTDMT_serialState_free(serialState_t* serialState)
+{   /* releases the LDM tables and the sync primitives ; the serialState_t itself is not freed here */
+    ZSTD_customMem const allocator = serialState->params.customMem;
+    ZSTD_customFree(serialState->ldmState.hashTable, allocator);
+    ZSTD_customFree(serialState->ldmState.bucketOffsets, allocator);
+    ZSTD_pthread_mutex_destroy(&serialState->mutex);
+    ZSTD_pthread_cond_destroy(&serialState->cond);
+    ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex);
+    ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond);
+}
+
+static void ZSTDMT_serialState_update(serialState_t* serialState,
+                                      ZSTD_CCtx* jobCCtx, rawSeqStore_t seqStore,
+                                      range_t src, unsigned jobID)
+{   /* serial step of a job : waits for its turn, runs LDM and checksum on src, then passes the turn on */
+    /* Wait for our turn */
+    ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
+    while (serialState->nextJobID < jobID) {
+        DEBUGLOG(5, "wait for serialState->cond");
+        ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex);
+    }
+    /* A future job may error and skip our job */
+    if (serialState->nextJobID == jobID) {
+        /* It is now our turn, do any processing necessary */
+        if (serialState->params.ldmParams.enableLdm == ZSTD_ps_enable) {
+            size_t error;
+            assert(seqStore.seq != NULL && seqStore.pos == 0 &&
+                   seqStore.size == 0 && seqStore.capacity > 0);
+            assert(src.size <= serialState->params.jobSize);
+            ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0);
+            error = ZSTD_ldm_generateSequences(
+                &serialState->ldmState, &seqStore,
+                &serialState->params.ldmParams, src.start, src.size);
+            /* We provide a large enough buffer to never fail. */
+            assert(!ZSTD_isError(error)); (void)error;
+            /* Update ldmWindow to match the ldmState.window and signal the main
+             * thread if it is waiting for a buffer.
+             */
+            ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);   /* taken while the main mutex is held */
+            serialState->ldmWindow = serialState->ldmState.window;
+            ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+            ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+        }
+        if (serialState->params.fParams.checksumFlag && src.size > 0)
+            XXH64_update(&serialState->xxhState, src.start, src.size);
+    }
+    /* Now it is the next job's turn */
+    serialState->nextJobID++;
+    ZSTD_pthread_cond_broadcast(&serialState->cond);
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+    if (seqStore.size > 0) {   /* done after the mutex is released : seqStore is this call's own by-value copy */
+        ZSTD_referenceExternalSequences(jobCCtx, seqStore.seq, seqStore.size);
+        assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable);
+    }
+}
+
+static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState,
+                                              unsigned jobID, size_t cSize)
+{   /* if job `jobID` has not passed its serial step yet (expected only on error), skip its turn */
+    ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
+    if (serialState->nextJobID <= jobID) {
+        assert(ZSTD_isError(cSize)); (void)cSize;
+        DEBUGLOG(5, "Skipping past job %u because of error", jobID);
+        serialState->nextJobID = jobID + 1;
+        ZSTD_pthread_cond_broadcast(&serialState->cond);   /* wakes jobs blocked in ZSTDMT_serialState_update() */
+
+        ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);
+        ZSTD_window_clear(&serialState->ldmWindow);
+        ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+        ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+    }
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+}
+
+
+/* ------------------------------------------ */
+/* =====          Worker thread         ===== */
+/* ------------------------------------------ */
+
+static const range_t kNullRange = { NULL, 0 };   /* empty range : start==NULL, size==0 */
+
+typedef struct {   /* one entry of the jobs table ; job_mutex is held whenever the worker updates the SHARED fields */
+    size_t   consumed;                   /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */
+    size_t   cSize;                      /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */
+    ZSTD_pthread_mutex_t job_mutex;      /* Thread-safe - used by mtctx and worker */
+    ZSTD_pthread_cond_t job_cond;        /* Thread-safe - used by mtctx and worker */
+    ZSTDMT_CCtxPool* cctxPool;           /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_bufferPool* bufPool;          /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_seqPool* seqPool;             /* Thread-safe - used by mtctx and (all) workers */
+    serialState_t* serial;               /* Thread-safe - used by mtctx and (all) workers */
+    buffer_t dstBuff;                    /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */
+    range_t prefix;                      /* set by mtctx, then read by worker & mtctx => no barrier */
+    range_t src;                         /* set by mtctx, then read by worker & mtctx => no barrier */
+    unsigned jobID;                      /* set by mtctx, then read by worker => no barrier */
+    unsigned firstJob;                   /* set by mtctx, then read by worker => no barrier */
+    unsigned lastJob;                    /* set by mtctx, then read by worker => no barrier */
+    ZSTD_CCtx_params params;             /* set by mtctx, then read by worker => no barrier */
+    const ZSTD_CDict* cdict;             /* set by mtctx, then read by worker => no barrier */
+    unsigned long long fullFrameSize;    /* set by mtctx, then read by worker => no barrier */
+    size_t   dstFlushed;                 /* used only by mtctx */
+    unsigned frameChecksumNeeded;        /* used only by mtctx */
+} ZSTDMT_jobDescription;
+
+#define JOB_ERROR(e)                                \
+    do {                                            \
+        ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);   \
+        job->cSize = e;   /* the error code replaces the compressed size */ \
+        ZSTD_pthread_mutex_unlock(&job->job_mutex); \
+        goto _endJob;     /* needs `job` and label `_endJob` in the enclosing function */ \
+    } while (0)
+
+/* ZSTDMT_compressionJob() is a POOL_function type : compresses job->src into job->dstBuff, reporting through job->consumed and job->cSize */
+static void ZSTDMT_compressionJob(void* jobDescription)
+{
+    ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
+    ZSTD_CCtx_params jobParams = job->params;   /* do not modify job->params ! copy it, modify the copy */
+    ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool);
+    rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool);
+    buffer_t dstBuff = job->dstBuff;
+    size_t lastCBlockSize = 0;   /* size of the final compressed block ; only added to job->cSize in the report step */
+
+    /* resources */
+    if (cctx==NULL) JOB_ERROR(ERROR(memory_allocation));
+    if (dstBuff.start == NULL) {   /* streaming job : doesn't provide a dstBuffer */
+        dstBuff = ZSTDMT_getBuffer(job->bufPool);
+        if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation));
+        job->dstBuff = dstBuff;   /* this value can be read in ZSTDMT_flush, when it copies the whole job */
+    }
+    if (jobParams.ldmParams.enableLdm == ZSTD_ps_enable && rawSeqStore.seq == NULL)
+        JOB_ERROR(ERROR(memory_allocation));
+
+    /* Don't compute the checksum for chunks, since we compute it externally,
+     * but write it in the header.
+     */
+    if (job->jobID != 0) jobParams.fParams.checksumFlag = 0;
+    /* Don't run LDM for the chunks, since we handle it externally */
+    jobParams.ldmParams.enableLdm = ZSTD_ps_disable;
+    /* Correct nbWorkers to 0. */
+    jobParams.nbWorkers = 0;
+
+
+    /* init */
+    if (job->cdict) {
+        size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, &jobParams, job->fullFrameSize);
+        assert(job->firstJob);  /* only allowed for first job */
+        if (ZSTD_isError(initError)) JOB_ERROR(initError);
+    } else {  /* srcStart points at reloaded section */
+        U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
+        {   size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob);
+            if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError);
+        }
+        if (!job->firstJob) {
+            size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0);
+            if (ZSTD_isError(err)) JOB_ERROR(err);
+        }
+        {   size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
+                                        job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
+                                        ZSTD_dtlm_fast,
+                                        NULL, /*cdict*/
+                                        &jobParams, pledgedSrcSize);
+            if (ZSTD_isError(initError)) JOB_ERROR(initError);
+    }   }
+
+    /* Perform serial step as early as possible, but after CCtx initialization */
+    ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID);
+
+    if (!job->firstJob) {  /* flush and overwrite frame header when it's not first job */
+        size_t const hSize = ZSTD_compressContinue_public(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
+        if (ZSTD_isError(hSize)) JOB_ERROR(hSize);
+        DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
+        ZSTD_invalidateRepCodes(cctx);
+    }
+
+    /* compress */
+    {   size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX;
+        int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize);
+        const BYTE* ip = (const BYTE*) job->src.start;
+        BYTE* const ostart = (BYTE*)dstBuff.start;
+        BYTE* op = ostart;
+        BYTE* oend = op + dstBuff.capacity;
+        int chunkNb;
+        if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize);   /* check overflow */
+        DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks);
+        assert(job->cSize == 0);
+        for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) {   /* every chunk except the last one */
+            size_t const cSize = ZSTD_compressContinue_public(cctx, op, oend-op, ip, chunkSize);
+            if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
+            ip += chunkSize;
+            op += cSize; assert(op < oend);
+            /* stats */
+            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+            job->cSize += cSize;
+            job->consumed = chunkSize * chunkNb;
+            DEBUGLOG(5, "ZSTDMT_compressionJob: compress new block : cSize==%u bytes (total: %u)",
+                        (U32)cSize, (U32)job->cSize);
+            ZSTD_pthread_cond_signal(&job->job_cond);   /* warns some more data is ready to be flushed */
+            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+        }
+        /* last block */
+        assert(chunkSize > 0);
+        assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
+        if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
+            size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
+            size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
+            size_t const cSize = (job->lastJob) ?
+                 ZSTD_compressEnd_public(cctx, op, oend-op, ip, lastBlockSize) :
+                 ZSTD_compressContinue_public(cctx, op, oend-op, ip, lastBlockSize);
+            if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
+            lastCBlockSize = cSize;
+    }   }
+    if (!job->firstJob) {
+        /* Double check that we don't have an ext-dict, because then our
+         * repcode invalidation doesn't work.
+         */
+        assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+    }
+    ZSTD_CCtx_trace(cctx, 0);
+
+_endJob:   /* reached on normal completion, and through JOB_ERROR() */
+    ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize);
+    if (job->prefix.size > 0)
+        DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start);
+    DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start);
+    /* release resources */
+    ZSTDMT_releaseSeq(job->seqPool, rawSeqStore);
+    ZSTDMT_releaseCCtx(job->cctxPool, cctx);
+    /* report */
+    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+    if (ZSTD_isError(job->cSize)) assert(lastCBlockSize == 0);
+    job->cSize += lastCBlockSize;
+    job->consumed = job->src.size;  /* when job->consumed == job->src.size , compression job is presumed completed */
+    ZSTD_pthread_cond_signal(&job->job_cond);
+    ZSTD_pthread_mutex_unlock(&job->job_mutex);
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+typedef struct {
+    range_t prefix;         /* read-only non-owned prefix buffer */
+    buffer_t buffer;        /* reset to g_nullBuffer by ZSTDMT_releaseAllJobResources() */
+    size_t filled;          /* counted as ingested, but not yet consumed, by ZSTDMT_getFrameProgression() */
+} inBuff_t;
+
+typedef struct {
+  BYTE* buffer;     /* The round input buffer. All jobs get references
+                     * to pieces of the buffer. ZSTDMT_tryGetInputRange()
+                     * handles handing out job input buffers, and makes
+                     * sure it doesn't overlap with any pieces still in use.
+                     */
+  size_t capacity;  /* The capacity of buffer. */
+  size_t pos;       /* The position of the current inBuff in the round
+                     * buffer. Updated past the end of the inBuff once
+                     * the inBuff is sent to the worker thread.
+                     * pos <= capacity.
+                     */
+} roundBuff_t;
+
+static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};   /* no buffer, capacity 0, pos 0 */
+
+#define RSYNC_LENGTH 32   /* NOTE(review): not used in this section ; presumably the rsync rolling-hash window length - confirm */
+/* Don't create chunks smaller than the zstd block size.
+ * This stops us from regressing compression ratio too much,
+ * and ensures our output fits in ZSTD_compressBound().
+ *
+ * If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then
+ * ZSTD_COMPRESSBOUND() will need to be updated.
+ */
+#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX
+#define RSYNC_MIN_BLOCK_SIZE (1<<RSYNC_MIN_BLOCK_LOG)
+
+typedef struct {   /* NOTE(review): no field is used in this section ; presumably rolling-hash state for rsyncable mode - confirm */
+  U64 hash;
+  U64 hitMask;
+  U64 primePower;
+} rsyncState_t;
+
+struct ZSTDMT_CCtx_s {
+    POOL_ctx* factory;                /* worker thread pool ; created by the mtctx unless providedFactory is set */
+    ZSTDMT_jobDescription* jobs;      /* table of jobIDMask+1 entries, indexed by (jobID & jobIDMask) */
+    ZSTDMT_bufferPool* bufPool;
+    ZSTDMT_CCtxPool* cctxPool;
+    ZSTDMT_seqPool* seqPool;
+    ZSTD_CCtx_params params;
+    size_t targetSectionSize;
+    size_t targetPrefixSize;
+    int jobReady;        /* 1 => one job is already prepared, but pool has shortage of workers. Don't create a new job. */
+    inBuff_t inBuff;
+    roundBuff_t roundBuff;            /* roundBuff.buffer is released by ZSTDMT_freeCCtx() */
+    serialState_t serial;
+    rsyncState_t rsync;
+    unsigned jobIDMask;               /* size of jobs table - 1 ; the table size is a power of 2 */
+    unsigned doneJobID;               /* advanced by ZSTDMT_waitForAllJobsCompleted() once a job has consumed all its input */
+    unsigned nextJobID;               /* ZSTDMT_waitForAllJobsCompleted() waits on jobs in [doneJobID, nextJobID) */
+    unsigned frameEnded;
+    unsigned allJobsCompleted;
+    unsigned long long frameContentSize;
+    unsigned long long consumed;
+    unsigned long long produced;
+    ZSTD_customMem cMem;              /* allocator for the mtctx, its jobs table and round buffer */
+    ZSTD_CDict* cdictLocal;           /* released by ZSTDMT_freeCCtx() */
+    const ZSTD_CDict* cdict;
+    unsigned providedFactory: 1;      /* 1 => factory was supplied by the caller, and is not freed with the mtctx */
+};
+
+static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem)
+{
+    ZSTDMT_jobDescription* job;
+    if (jobTable == NULL) return;   /* accepts NULL */
+    for (job = jobTable; job < jobTable + nbJobs; job++) {   /* each entry carries its own mutex and cond */
+        ZSTD_pthread_mutex_destroy(&job->job_mutex);
+        ZSTD_pthread_cond_destroy(&job->job_cond);
+    }
+    ZSTD_customFree(jobTable, cMem);
+}
+
+/* ZSTDMT_createJobsTable() :
+ * allocate and init a job table, sized to the power of 2 strictly greater than *nbJobsPtr.
+ * *nbJobsPtr receives that size, unless the allocation fails. @return : the table, or NULL on failure */
+static ZSTDMT_jobDescription* ZSTDMT_createJobsTable(U32* nbJobsPtr, ZSTD_customMem cMem)
+{
+    U32 const nbJobsLog2 = ZSTD_highbit32(*nbJobsPtr) + 1;
+    U32 const nbJobs = (U32)1 << nbJobsLog2;   /* unsigned shift : no signed overflow for large tables */
+    U32 jobNb;
+    ZSTDMT_jobDescription* const jobTable = (ZSTDMT_jobDescription*)
+                ZSTD_customCalloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem);
+    int initError = 0;
+    if (jobTable==NULL) return NULL;
+    *nbJobsPtr = nbJobs;
+    for (jobNb=0; jobNb<nbJobs; jobNb++) {
+        initError |= ZSTD_pthread_mutex_init(&jobTable[jobNb].job_mutex, NULL);
+        initError |= ZSTD_pthread_cond_init(&jobTable[jobNb].job_cond, NULL);
+    }
+    if (initError != 0) {   /* at least one primitive failed : the whole table is discarded */
+        ZSTDMT_freeJobsTable(jobTable, nbJobs, cMem);
+        return NULL;
+    }
+    return jobTable;
+}
+
+static size_t ZSTDMT_expandJobsTable (ZSTDMT_CCtx* mtctx, U32 nbWorkers) {
+    U32 const currentCapacity = mtctx->jobIDMask + 1;
+    U32 wantedJobs = nbWorkers + 2;
+    if (wantedJobs <= currentCapacity) return 0;   /* table is already large enough */
+    ZSTDMT_freeJobsTable(mtctx->jobs, currentCapacity, mtctx->cMem);   /* old table is dropped before the new one is allocated */
+    mtctx->jobIDMask = 0;
+    mtctx->jobs = ZSTDMT_createJobsTable(&wantedJobs, mtctx->cMem);
+    if (mtctx->jobs==NULL) return ERROR(memory_allocation);
+    assert((wantedJobs != 0) && ((wantedJobs & (wantedJobs - 1)) == 0));  /* ensure nbJobs is a power of 2 */
+    mtctx->jobIDMask = wantedJobs - 1;
+    return 0;
+}
+
+
+/* ZSTDMT_CCtxParam_setNbWorkers():
+ * Internal use only */
+static size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers)
+{   /* @return : the result of ZSTD_CCtxParams_setParameter() for ZSTD_c_nbWorkers */
+    return ZSTD_CCtxParams_setParameter(params, ZSTD_c_nbWorkers, (int)nbWorkers);
+}
+
+/* ZSTDMT_createCCtx_advanced_internal() :
+ * @return : a new mtctx, or NULL if nbWorkers==0, cMem is half-defined, or any allocation/init fails */
+MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool)
+{
+    ZSTDMT_CCtx* mtctx;
+    U32 nbJobs;
+    int initError;
+    DEBUGLOG(3, "ZSTDMT_createCCtx_advanced (nbWorkers = %u)", nbWorkers);
+
+    if (nbWorkers < 1) return NULL;
+    nbWorkers = MIN(nbWorkers , ZSTDMT_NBWORKERS_MAX);
+    nbJobs = nbWorkers + 2;   /* derived from the clamped value : cannot wrap, nor oversize the table */
+    if ((cMem.customAlloc!=NULL) ^ (cMem.customFree!=NULL)) return NULL;   /* invalid custom allocator */
+
+    mtctx = (ZSTDMT_CCtx*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtx), cMem);
+    if (!mtctx) return NULL;
+    ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);
+    mtctx->cMem = cMem;
+    mtctx->allJobsCompleted = 1;
+    if (pool != NULL) {
+      mtctx->factory = pool;
+      mtctx->providedFactory = 1;
+    } else {
+      mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem);
+      mtctx->providedFactory = 0;
+    }
+    mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem);
+    assert(mtctx->jobs == NULL || ((nbJobs > 0) && ((nbJobs & (nbJobs - 1)) == 0)));  /* on success, nbJobs is a power of 2 */
+    if (mtctx->jobs != NULL) mtctx->jobIDMask = nbJobs - 1;   /* stays 0 (from calloc) when the table is missing */
+    mtctx->bufPool = ZSTDMT_createBufferPool(BUF_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
+    mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
+    mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem);
+    initError = ZSTDMT_serialState_init(&mtctx->serial);
+    mtctx->roundBuff = kNullRoundBuff;
+    if (!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) {
+        ZSTDMT_freeCCtx(mtctx);
+        return NULL;
+    }
+    DEBUGLOG(3, "mt_cctx created, for %u threads", nbWorkers);
+    return mtctx;
+}
+
+ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool)
+{   /* without ZSTD_MULTITHREAD, no mtctx can be created : always returns NULL */
+#ifdef ZSTD_MULTITHREAD
+    return ZSTDMT_createCCtx_advanced_internal(nbWorkers, cMem, pool);
+#else
+    (void)nbWorkers;
+    (void)cMem;
+    (void)pool;
+    return NULL;
+#endif
+}
+
+
+/* ZSTDMT_releaseAllJobResources() :
+ * note : ensure all workers are killed first ! */
+static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx)
+{
+    unsigned jobID;
+    DEBUGLOG(3, "ZSTDMT_releaseAllJobResources");
+    /* mtctx->jobs is NULL when the jobs table could not be allocated : skip the scan then */
+    for (jobID=0; (mtctx->jobs != NULL) && (jobID <= mtctx->jobIDMask); jobID++) {
+        /* Copy the mutex/cond out */
+        ZSTD_pthread_mutex_t const mutex = mtctx->jobs[jobID].job_mutex;
+        ZSTD_pthread_cond_t const cond = mtctx->jobs[jobID].job_cond;
+        DEBUGLOG(4, "job%02u: release dst address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].dstBuff.start);
+        ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff);
+
+        /* Clear the job description, but keep the mutex/cond */
+        ZSTD_memset(&mtctx->jobs[jobID], 0, sizeof(mtctx->jobs[jobID]));
+        mtctx->jobs[jobID].job_mutex = mutex;
+        mtctx->jobs[jobID].job_cond = cond;
+    }
+    mtctx->inBuff.buffer = g_nullBuffer;
+    mtctx->inBuff.filled = 0;
+    mtctx->allJobsCompleted = 1;
+}
+
+static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx)
+{
+    DEBUGLOG(4, "ZSTDMT_waitForAllJobsCompleted");
+    /* block on each outstanding job in turn, starting from doneJobID */
+    for ( ; mtctx->doneJobID < mtctx->nextJobID; mtctx->doneJobID++) {
+        ZSTDMT_jobDescription* const job = &mtctx->jobs[mtctx->doneJobID & mtctx->jobIDMask];
+        ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+        while (job->consumed < job->src.size) {   /* the worker sets consumed to src.size when it ends */
+            DEBUGLOG(4, "waiting for jobCompleted signal from job %u", mtctx->doneJobID);   /* we want to block when waiting for data to flush */
+            ZSTD_pthread_cond_wait(&job->job_cond, &job->job_mutex);
+        }
+        ZSTD_pthread_mutex_unlock(&job->job_mutex);
+    }
+}
+
+size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx)
+{
+    if (mtctx==NULL) return 0;   /* compatible with free on NULL */
+    if (!mtctx->providedFactory)   /* a caller-provided thread pool is left untouched */
+        POOL_free(mtctx->factory);   /* stop and free worker threads */
+    ZSTDMT_releaseAllJobResources(mtctx);  /* release job resources into pools first */
+    ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);
+    ZSTDMT_freeBufferPool(mtctx->bufPool);
+    ZSTDMT_freeCCtxPool(mtctx->cctxPool);
+    ZSTDMT_freeSeqPool(mtctx->seqPool);
+    ZSTDMT_serialState_free(&mtctx->serial);
+    ZSTD_freeCDict(mtctx->cdictLocal);   /* only the local dictionary is freed ; mtctx->cdict is not */
+    if (mtctx->roundBuff.buffer)
+        ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem);
+    ZSTD_customFree(mtctx, mtctx->cMem);
+    return 0;   /* always 0 */
+}
+
+size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx)
+{
+    if (mtctx == NULL) return 0;   /* supports sizeof NULL */
+    /* sum of : the context with its jobs table and round buffer, every pool, and the local dictionary */
+    {   size_t const ownTables = sizeof(*mtctx)
+                               + (mtctx->jobIDMask+1) * sizeof(ZSTDMT_jobDescription)
+                               + mtctx->roundBuff.capacity;
+        size_t const pools = POOL_sizeof(mtctx->factory) + ZSTDMT_sizeof_bufferPool(mtctx->bufPool)
+                           + ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool) + ZSTDMT_sizeof_seqPool(mtctx->seqPool);
+        return ownTables + pools + ZSTD_sizeof_CDict(mtctx->cdictLocal);
+    }
+}
+
+
+/* ZSTDMT_resize() :
+ * @return : error code if fails, 0 on success */
+static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers)
+{   /* steps run in sequence : when one fails, the earlier ones are not rolled back */
+    if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation);
+    FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbWorkers) , "");
+    mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, BUF_POOL_MAX_NB_BUFFERS(nbWorkers));
+    if (mtctx->bufPool == NULL) return ERROR(memory_allocation);
+    mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers);   /* may replace the pool by a new one */
+    if (mtctx->cctxPool == NULL) return ERROR(memory_allocation);
+    mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers);
+    if (mtctx->seqPool == NULL) return ERROR(memory_allocation);
+    ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);   /* recorded last, once every resource has been resized */
+    return 0;
+}
+
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ *  Updates compressionLevel and cParams of mtctx from cctxParams, remaining compatible with currently active frame :
+ *  the current windowLog is preserved. New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams)
+{
+    U32 const saved_wlog = mtctx->params.cParams.windowLog;   /* Do not modify windowLog while compressing */
+    int const compressionLevel = cctxParams->compressionLevel;
+    DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)",
+                compressionLevel);
+    mtctx->params.compressionLevel = compressionLevel;
+    {   ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+        cParams.windowLog = saved_wlog;   /* restore the value captured on entry */
+        mtctx->params.cParams = cParams;
+    }
+}
+
+/* ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * Totals of already completed jobs are combined with the live counters of each pending job.
+ * Note : each pending job's mutex is held while its counters are read. */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
+{
+    ZSTD_frameProgression fps;
+    DEBUGLOG(5, "ZSTDMT_getFrameProgression");
+    fps.ingested = mtctx->consumed + mtctx->inBuff.filled;   /* input not yet turned into a job counts as ingested */
+    fps.consumed = mtctx->consumed;
+    fps.produced = fps.flushed = mtctx->produced;
+    fps.currentJobID = mtctx->nextJobID;
+    fps.nbActiveWorkers = 0;
+    {   unsigned jobNb;
+        unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1);   /* also visit a job that is prepared but not yet posted */
+        DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
+                    mtctx->doneJobID, lastJobNb, mtctx->jobReady);
+        for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) {
+            unsigned const wJobID = jobNb & mtctx->jobIDMask;   /* slot index inside the jobs table */
+            ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID];
+            ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);
+            {   size_t const cResult = jobPtr->cSize;
+                size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;   /* a job in error contributes no output */
+                size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
+                assert(flushed <= produced);
+                fps.ingested += jobPtr->src.size;
+                fps.consumed += jobPtr->consumed;
+                fps.produced += produced;
+                fps.flushed  += flushed;
+                fps.nbActiveWorkers += (jobPtr->consumed < jobPtr->src.size);   /* counted as active while input remains unconsumed */
+            }
+            ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);   /* same mutex as jobPtr->job_mutex */
+        }
+    }
+    return fps;
+}
+
+
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx)
+{   /* @return : nb of bytes produced but not yet flushed by the oldest unfinished job, 0 if none or job in error */
+    size_t toFlush;
+    unsigned const jobID = mtctx->doneJobID;
+    assert(jobID <= mtctx->nextJobID);
+    if (jobID == mtctx->nextJobID) return 0;   /* no active job => nothing to flush */
+
+    /* look into oldest non-fully-flushed job */
+    {   unsigned const wJobID = jobID & mtctx->jobIDMask;
+        ZSTDMT_jobDescription* const jobPtr = &mtctx->jobs[wJobID];
+        ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);
+        {   size_t const cResult = jobPtr->cSize;
+            size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
+            size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
+            assert(flushed <= produced);
+            assert(jobPtr->consumed <= jobPtr->src.size);
+            toFlush = produced - flushed;
+            /* if toFlush==0, nothing is available to flush.
+             * However, jobID is expected to still be active:
+             * if jobID was already completed and fully flushed,
+             * ZSTDMT_flushProduced() should have already moved onto next job.
+             * Therefore, some input has not yet been consumed. */
+            if (toFlush==0) {
+                assert(jobPtr->consumed < jobPtr->src.size);
+            }
+        }
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);   /* same mutex as jobPtr->job_mutex */
+    }
+
+    return toFlush;
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+static unsigned ZSTDMT_computeTargetJobLog(const ZSTD_CCtx_params* params)
+{
+    int const ldmOn = (params->ldmParams.enableLdm == ZSTD_ps_enable);
+    /* In Long Range Mode, the windowLog is typically oversized : the job size is then
+     * derived from cycleLog (floor 21). Otherwise it follows windowLog (floor 20). */
+    unsigned const base = ldmOn
+        ? ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy) + 3
+        : params->cParams.windowLog + 2;
+    unsigned const floorLog = ldmOn ? 21 : 20;
+    unsigned const wanted = MAX(floorLog, base);
+    /* result is capped at ZSTDMT_JOBLOG_MAX */
+    return MIN(wanted, (unsigned)ZSTDMT_JOBLOG_MAX);
+}
+
+static int ZSTDMT_overlapLog_default(ZSTD_strategy strat)
+{
+    /* Default overlap log, selected from the strategy alone :
+     *   btultra2          -> 9
+     *   btultra, btopt    -> 8
+     *   btlazy2, lazy2    -> 7
+     *   anything else     -> 6 */
+    if (strat == ZSTD_btultra2) {
+        return 9;
+    }
+    if (strat == ZSTD_btultra || strat == ZSTD_btopt) {
+        return 8;
+    }
+    if (strat == ZSTD_btlazy2 || strat == ZSTD_lazy2) {
+        return 7;
+    }
+    /* ZSTD_lazy, ZSTD_greedy, ZSTD_dfast, ZSTD_fast,
+     * and any value not listed above */
+    return 6;
+}
+
+static int ZSTDMT_overlapLog(int ovlog, ZSTD_strategy strat)
+{
+    /* 0 selects the default of the strategy; any other value is returned as is */
+    assert(0 <= ovlog && ovlog <= 9);
+    return (ovlog != 0) ? ovlog : ZSTDMT_overlapLog_default(strat);
+}
+
+static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params)
+{   /* @return : overlap size in bytes, as a power of 2 (1 << ovLog), or 0 when ovLog == 0 */
+    int const overlapRLog = 9 - ZSTDMT_overlapLog(params->overlapLog, params->cParams.strategy);
+    int ovLog = (overlapRLog >= 8) ? 0 : (params->cParams.windowLog - overlapRLog);
+    assert(0 <= overlapRLog && overlapRLog <= 8);
+    if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+        /* In Long Range Mode, the windowLog is typically oversized.
+         * In which case, it's preferable to determine the jobSize
+         * based on chainLog instead.
+         * Then, ovLog becomes a fraction of the jobSize, rather than windowSize */
+        ovLog = MIN(params->cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2) - overlapRLog;
+    }
+    assert(0 <= ovLog && ovLog <= ZSTD_WINDOWLOG_MAX);
+    DEBUGLOG(4, "overlapLog : %i", params->overlapLog);
+    /* unsigned shift : ovLog is only bounded by ZSTD_WINDOWLOG_MAX, and `1 << 31` overflows a 32-bit int */
+    DEBUGLOG(4, "overlap size : %u", 1U << ovLog);
+    return (ovLog==0) ? 0 : (size_t)1 << ovLog;
+}
+
+/* ====================================== */
+/* =======      Streaming API     ======= */
+/* ====================================== */
+
+size_t ZSTDMT_initCStream_internal(
+        ZSTDMT_CCtx* mtctx,
+        const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+        const ZSTD_CDict* cdict, ZSTD_CCtx_params params,
+        unsigned long long pledgedSrcSize)
+{   /* Resets mtctx for a new frame : workers, dictionary, job sizes, round buffer. @return : 0, or an error code */
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u)",
+                (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx);
+
+    /* params are expected to be already validated at this point (only checked by the asserts below) */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    /* init */
+    if (params.nbWorkers != mtctx->params.nbWorkers)
+        FORWARD_IF_ERROR( ZSTDMT_resize(mtctx, params.nbWorkers) , "");
+
+    if (params.jobSize != 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN) params.jobSize = ZSTDMT_JOBSIZE_MIN;   /* jobSize==0 is kept : resolved further below */
+    if (params.jobSize > (size_t)ZSTDMT_JOBSIZE_MAX) params.jobSize = (size_t)ZSTDMT_JOBSIZE_MAX;
+
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal: %u workers", params.nbWorkers);
+
+    if (mtctx->allJobsCompleted == 0) {   /* previous compression not correctly finished */
+        ZSTDMT_waitForAllJobsCompleted(mtctx);
+        ZSTDMT_releaseAllJobResources(mtctx);
+        mtctx->allJobsCompleted = 1;
+    }
+
+    mtctx->params = params;
+    mtctx->frameContentSize = pledgedSrcSize;
+    if (dict) {
+        ZSTD_freeCDict(mtctx->cdictLocal);
+        mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
+                                                    ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */
+                                                    params.cParams, mtctx->cMem);
+        mtctx->cdict = mtctx->cdictLocal;
+        if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        ZSTD_freeCDict(mtctx->cdictLocal);
+        mtctx->cdictLocal = NULL;
+        mtctx->cdict = cdict;   /* may be NULL : no dictionary at all */
+    }
+
+    mtctx->targetPrefixSize = ZSTDMT_computeOverlapSize(&params);
+    DEBUGLOG(4, "overlapLog=%i => %u KB", params.overlapLog, (U32)(mtctx->targetPrefixSize>>10));
+    mtctx->targetSectionSize = params.jobSize;
+    if (mtctx->targetSectionSize == 0) {   /* no explicit jobSize : derive one from the parameters */
+        mtctx->targetSectionSize = 1ULL << ZSTDMT_computeTargetJobLog(&params);
+    }
+    assert(mtctx->targetSectionSize <= (size_t)ZSTDMT_JOBSIZE_MAX);
+
+    if (params.rsyncable) {
+        /* Aim for the targetSectionSize as the average job size. */
+        U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10);
+        U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10);
+        /* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our
+         * expected job size is at least 4x larger. */
+        assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2);
+        DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
+        mtctx->rsync.hash = 0;
+        mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
+        mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
+    }
+    if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize;  /* job size must be >= overlap size */
+    DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), (U32)params.jobSize);
+    DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
+    ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
+    {
+        /* If ldm is enabled we need windowSize space. */
+        size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0;
+        /* Two buffers of slack, plus extra space for the overlap
+         * This is the minimum slack that LDM works with. One extra because
+         * flush might waste up to targetSectionSize-1 bytes. Another extra
+         * for the overlap (if > 0), then one to fill which doesn't overlap
+         * with the LDM window.
+         */
+        size_t const nbSlackBuffers = 2 + (mtctx->targetPrefixSize > 0);
+        size_t const slackSize = mtctx->targetSectionSize * nbSlackBuffers;
+        /* Compute the total size, and always have enough slack */
+        size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1);
+        size_t const sectionsSize = mtctx->targetSectionSize * nbWorkers;
+        size_t const capacity = MAX(windowSize, sectionsSize) + slackSize;
+        if (mtctx->roundBuff.capacity < capacity) {   /* the round buffer only ever grows here */
+            if (mtctx->roundBuff.buffer)
+                ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem);
+            mtctx->roundBuff.buffer = (BYTE*)ZSTD_customMalloc(capacity, mtctx->cMem);
+            if (mtctx->roundBuff.buffer == NULL) {
+                mtctx->roundBuff.capacity = 0;   /* keep capacity consistent with the NULL buffer */
+                return ERROR(memory_allocation);
+            }
+            mtctx->roundBuff.capacity = capacity;
+        }
+    }
+    DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity>>10));
+    mtctx->roundBuff.pos = 0;
+    mtctx->inBuff.buffer = g_nullBuffer;
+    mtctx->inBuff.filled = 0;
+    mtctx->inBuff.prefix = kNullRange;
+    mtctx->doneJobID = 0;
+    mtctx->nextJobID = 0;
+    mtctx->frameEnded = 0;
+    mtctx->allJobsCompleted = 0;
+    mtctx->consumed = 0;
+    mtctx->produced = 0;
+    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize,
+                                 dict, dictSize, dictContentType))
+        return ERROR(memory_allocation);   /* any non-zero result is reported as an allocation failure */
+    return 0;
+}
+
+
+/* ZSTDMT_writeLastEmptyBlock()
+ * Write a single empty block with an end-of-frame to finish a frame.
+ * Job must be created from streaming variant.
+ * The outcome is stored in job->cSize : a memory_allocation error when no dstBuff can be obtained.
+ */
+static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job)
+{
+    assert(job->lastJob == 1);
+    assert(job->src.size == 0);   /* last job is empty -> will be simplified into a last empty block */
+    assert(job->firstJob == 0);   /* cannot be first job, as it also needs to create frame header */
+    assert(job->dstBuff.start == NULL);   /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */
+    job->dstBuff = ZSTDMT_getBuffer(job->bufPool);
+    if (job->dstBuff.start == NULL) {
+      job->cSize = ERROR(memory_allocation);   /* error is reported through cSize, as this function returns void */
+      return;
+    }
+    assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize);   /* no buffer should ever be that small */
+    job->src = kNullRange;
+    job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity);
+    assert(!ZSTD_isError(job->cSize));
+    assert(job->consumed == 0);
+}
+
+static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZSTD_EndDirective endOp)
+{   /* Prepares the next job from the first srcSize bytes of inBuff and tries to post it. Always returns 0. */
+    unsigned const jobID = mtctx->nextJobID & mtctx->jobIDMask;
+    int const endFrame = (endOp == ZSTD_e_end);
+
+    if (mtctx->nextJobID > mtctx->doneJobID + mtctx->jobIDMask) {
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: will not create new job : table is full");
+        assert((mtctx->nextJobID & mtctx->jobIDMask) == (mtctx->doneJobID & mtctx->jobIDMask));
+        return 0;
+    }
+
+    if (!mtctx->jobReady) {   /* when jobReady is set, the slot was already prepared by a previous call */
+        BYTE const* src = (BYTE const*)mtctx->inBuff.buffer.start;
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ",
+                    mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size);
+        mtctx->jobs[jobID].src.start = src;
+        mtctx->jobs[jobID].src.size = srcSize;
+        assert(mtctx->inBuff.filled >= srcSize);
+        mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix;
+        mtctx->jobs[jobID].consumed = 0;
+        mtctx->jobs[jobID].cSize = 0;
+        mtctx->jobs[jobID].params = mtctx->params;
+        mtctx->jobs[jobID].cdict = mtctx->nextJobID==0 ? mtctx->cdict : NULL;   /* only the first job receives the dictionary */
+        mtctx->jobs[jobID].fullFrameSize = mtctx->frameContentSize;
+        mtctx->jobs[jobID].dstBuff = g_nullBuffer;
+        mtctx->jobs[jobID].cctxPool = mtctx->cctxPool;
+        mtctx->jobs[jobID].bufPool = mtctx->bufPool;
+        mtctx->jobs[jobID].seqPool = mtctx->seqPool;
+        mtctx->jobs[jobID].serial = &mtctx->serial;
+        mtctx->jobs[jobID].jobID = mtctx->nextJobID;
+        mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0);
+        mtctx->jobs[jobID].lastJob = endFrame;
+        mtctx->jobs[jobID].frameChecksumNeeded = mtctx->params.fParams.checksumFlag && endFrame && (mtctx->nextJobID>0);
+        mtctx->jobs[jobID].dstFlushed = 0;
+
+        /* Update the round buffer pos and clear the input buffer to be reset */
+        mtctx->roundBuff.pos += srcSize;
+        mtctx->inBuff.buffer = g_nullBuffer;
+        mtctx->inBuff.filled = 0;
+        /* Set the prefix */
+        if (!endFrame) {
+            size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize);
+            mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize;   /* tail of this job's input */
+            mtctx->inBuff.prefix.size = newPrefixSize;
+        } else {   /* endFrame==1 => no need for another input buffer */
+            mtctx->inBuff.prefix = kNullRange;
+            mtctx->frameEnded = endFrame;
+            if (mtctx->nextJobID == 0) {
+                /* single job exception : checksum is already calculated directly within worker thread */
+                mtctx->params.fParams.checksumFlag = 0;
+        }   }
+
+        if ( (srcSize == 0)
+          && (mtctx->nextJobID>0)/*single job must also write frame header*/ ) {
+            DEBUGLOG(5, "ZSTDMT_createCompressionJob: creating a last empty block to end frame");
+            assert(endOp == ZSTD_e_end);  /* only possible case : need to end the frame with an empty last block */
+            ZSTDMT_writeLastEmptyBlock(mtctx->jobs + jobID);   /* written directly here : this job is never posted to the pool */
+            mtctx->nextJobID++;
+            return 0;
+        }
+    }
+
+    DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes  (end:%u, jobNb == %u (mod:%u))",
+                mtctx->nextJobID,
+                (U32)mtctx->jobs[jobID].src.size,
+                mtctx->jobs[jobID].lastJob,
+                mtctx->nextJobID,
+                jobID);
+    if (POOL_tryAdd(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[jobID])) {
+        mtctx->nextJobID++;
+        mtctx->jobReady = 0;
+    } else {
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: no worker available for job %u", mtctx->nextJobID);
+        mtctx->jobReady = 1;   /* keep the prepared slot : posting is attempted again on a later call */
+    }
+    return 0;
+}
+
+
+/*! ZSTDMT_flushProduced() :
+ *  flush whatever data has been produced but not yet flushed in current job.
+ *  move to next job if current one is fully flushed.
+ * `output` : `pos` will be updated with amount of data flushed .
+ * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush .
+ * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */
+static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, unsigned blockToFlush, ZSTD_EndDirective end)
+{
+    unsigned const wJobID = mtctx->doneJobID & mtctx->jobIDMask;   /* slot of the oldest job not yet fully flushed */
+    DEBUGLOG(5, "ZSTDMT_flushProduced (blocking:%u , job %u <= %u)",
+                blockToFlush, mtctx->doneJobID, mtctx->nextJobID);
+    assert(output->size >= output->pos);
+
+    ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
+    if (  blockToFlush
+      && (mtctx->doneJobID < mtctx->nextJobID) ) {
+        assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize);
+        while (mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) {  /* nothing to flush */
+            if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) {
+                DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none",
+                            mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size);
+                break;
+            }
+            DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)",
+                        mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
+            ZSTD_pthread_cond_wait(&mtctx->jobs[wJobID].job_cond, &mtctx->jobs[wJobID].job_mutex);  /* block when nothing to flush but some to come */
+    }   }
+
+    /* try to flush something */
+    {   size_t cSize = mtctx->jobs[wJobID].cSize;                  /* shared */
+        size_t const srcConsumed = mtctx->jobs[wJobID].consumed;   /* shared */
+        size_t const srcSize = mtctx->jobs[wJobID].src.size;       /* read-only, could be done after mutex lock, but no-declaration-after-statement */
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);   /* from here on, only the local snapshots above are used for the shared fields */
+        if (ZSTD_isError(cSize)) {
+            DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s",
+                        mtctx->doneJobID, ZSTD_getErrorName(cSize));
+            ZSTDMT_waitForAllJobsCompleted(mtctx);
+            ZSTDMT_releaseAllJobResources(mtctx);
+            return cSize;
+        }
+        /* add frame checksum if necessary (can only happen once) */
+        assert(srcConsumed <= srcSize);
+        if ( (srcConsumed == srcSize)   /* job completed -> worker no longer active */
+          && mtctx->jobs[wJobID].frameChecksumNeeded ) {
+            U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState);
+            DEBUGLOG(4, "ZSTDMT_flushProduced: writing checksum : %08X \n", checksum);
+            MEM_writeLE32((char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].cSize, checksum);   /* appended right after the compressed data */
+            cSize += 4;
+            mtctx->jobs[wJobID].cSize += 4;  /* can write this shared value, as worker is no longer active */
+            mtctx->jobs[wJobID].frameChecksumNeeded = 0;
+        }
+
+        if (cSize > 0) {   /* compression is ongoing or completed */
+            size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos);   /* limited by room left in output */
+            DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)",
+                        (U32)toFlush, mtctx->doneJobID, (U32)srcConsumed, (U32)srcSize, (U32)cSize);
+            assert(mtctx->doneJobID < mtctx->nextJobID);
+            assert(cSize >= mtctx->jobs[wJobID].dstFlushed);
+            assert(mtctx->jobs[wJobID].dstBuff.start != NULL);
+            if (toFlush > 0) {
+                ZSTD_memcpy((char*)output->dst + output->pos,
+                    (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed,
+                    toFlush);
+            }
+            output->pos += toFlush;
+            mtctx->jobs[wJobID].dstFlushed += toFlush;  /* can write : this value is only used by mtctx */
+
+            if ( (srcConsumed == srcSize)    /* job is completed */
+              && (mtctx->jobs[wJobID].dstFlushed == cSize) ) {   /* output buffer fully flushed => free this job position */
+                DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one",
+                        mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
+                ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff);
+                DEBUGLOG(5, "dstBuffer released");
+                mtctx->jobs[wJobID].dstBuff = g_nullBuffer;
+                mtctx->jobs[wJobID].cSize = 0;   /* ensure this job slot is considered "not started" in future check */
+                mtctx->consumed += srcSize;
+                mtctx->produced += cSize;
+                mtctx->doneJobID++;
+        }   }
+
+        /* return value : how many bytes left in buffer ; fake it to 1 when unknown but >0 */
+        if (cSize > mtctx->jobs[wJobID].dstFlushed) return (cSize - mtctx->jobs[wJobID].dstFlushed);
+        if (srcSize > srcConsumed) return 1;   /* current job not completely compressed */
+    }
+    if (mtctx->doneJobID < mtctx->nextJobID) return 1;   /* some more jobs ongoing */
+    if (mtctx->jobReady) return 1;      /* one job is ready to push, just not yet in the list */
+    if (mtctx->inBuff.filled > 0) return 1;   /* input is not empty, and still needs to be converted into a job */
+    mtctx->allJobsCompleted = mtctx->frameEnded;   /* all jobs are entirely flushed => if this one is last one, frame is completed */
+    if (end == ZSTD_e_end) return !mtctx->frameEnded;  /* for ZSTD_e_end, question becomes : is frame completed ? instead of : are internal buffers fully flushed ? */
+    return 0;   /* internal buffers fully flushed */
+}
+
+/**
+ * Returns the range of data used by the earliest job that is not yet complete.
+ * That range is the job's prefix when it has one, otherwise the job's source.
+ * Returns kNullRange when every posted job has consumed all of its input.
+ */
+static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx)
+{
+    unsigned const firstJobID = mtctx->doneJobID;
+    unsigned const lastJobID = mtctx->nextJobID;
+    unsigned jobID;
+
+    for (jobID = firstJobID; jobID < lastJobID; ++jobID) {
+        unsigned const wJobID = jobID & mtctx->jobIDMask;
+        size_t consumed;
+
+        ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
+        consumed = mtctx->jobs[wJobID].consumed;   /* read under the job's mutex */
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
+
+        if (consumed < mtctx->jobs[wJobID].src.size) {   /* first job with input left to consume */
+            range_t range = mtctx->jobs[wJobID].prefix;
+            if (range.size == 0) {
+                /* Empty prefix */
+                range = mtctx->jobs[wJobID].src;
+            }
+            /* Job source in multiple segments not supported yet */
+            assert(range.start <= mtctx->jobs[wJobID].src.start);
+            return range;
+        }
+    }
+    return kNullRange;
+}
+
+/**
+ * Tells whether `buffer` and `range` have at least one byte in common (non-zero if so).
+ */
+static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range)
+{
+    BYTE const* const bufLo = (BYTE const*)buffer.start;
+    BYTE const* const rngLo = (BYTE const*)range.start;
+    BYTE const* bufHi;
+    BYTE const* rngHi;
+
+    /* A NULL start on either side : reported as no overlap */
+    if (bufLo == NULL || rngLo == NULL) return 0;
+
+    bufHi = bufLo + buffer.capacity;
+    rngHi = rngLo + range.size;
+
+    /* A zero-length span cannot overlap anything */
+    if (bufLo == bufHi) return 0;
+    if (rngLo == rngHi) return 0;
+
+    return (bufLo < rngHi) && (rngLo < bufHi);
+}
+
+static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window)
+{   /* @return : non-zero if buffer overlaps either of the two ranges derived from window below */
+    range_t extDict;
+    range_t prefix;
+
+    DEBUGLOG(5, "ZSTDMT_doesOverlapWindow");
+    extDict.start = window.dictBase + window.lowLimit;   /* range 1 : dictBase + [lowLimit, dictLimit) */
+    extDict.size = window.dictLimit - window.lowLimit;
+
+    prefix.start = window.base + window.dictLimit;   /* range 2 : from base + dictLimit up to nextSrc */
+    prefix.size = window.nextSrc - (window.base + window.dictLimit);
+    DEBUGLOG(5, "extDict [0x%zx, 0x%zx)",
+                (size_t)extDict.start,
+                (size_t)extDict.start + extDict.size);
+    DEBUGLOG(5, "prefix  [0x%zx, 0x%zx)",
+                (size_t)prefix.start,
+                (size_t)prefix.start + prefix.size);
+
+    return ZSTDMT_isOverlapped(buffer, extDict)
+        || ZSTDMT_isOverlapped(buffer, prefix);
+}
+
+static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer)
+{   /* When LDM is enabled, blocks until buffer no longer overlaps serial.ldmWindow; no-op otherwise. */
+    if (mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable) {
+        ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
+        DEBUGLOG(5, "ZSTDMT_waitForLdmComplete");
+        DEBUGLOG(5, "source  [0x%zx, 0x%zx)",
+                    (size_t)buffer.start,
+                    (size_t)buffer.start + buffer.capacity);
+        ZSTD_PTHREAD_MUTEX_LOCK(mutex);
+        while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) {   /* re-checked after every wake-up */
+            DEBUGLOG(5, "Waiting for LDM to finish...");
+            ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex);
+        }
+        DEBUGLOG(6, "Done waiting for LDM to finish");
+        ZSTD_pthread_mutex_unlock(mutex);
+    }
+}
+
+/**
+ * Attempts to set the inBuff to the next section to fill.
+ * If any part of the new section is still in use we give up.
+ * Returns 1 if inBuff was assigned a new (still empty) range, 0 if we gave up.
+ */
+static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx)
+{
+    range_t const inUse = ZSTDMT_getInputDataInUse(mtctx);
+    size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos;
+    size_t const target = mtctx->targetSectionSize;
+    buffer_t buffer;
+
+    DEBUGLOG(5, "ZSTDMT_tryGetInputRange");
+    assert(mtctx->inBuff.buffer.start == NULL);
+    assert(mtctx->roundBuff.capacity >= target);
+
+    if (spaceLeft < target) {   /* not enough room at the end : restart from the beginning of the round buffer */
+        /* ZSTD_invalidateRepCodes() doesn't work for extDict variants.
+         * Simply copy the prefix to the beginning in that case.
+         */
+        BYTE* const start = (BYTE*)mtctx->roundBuff.buffer;
+        size_t const prefixSize = mtctx->inBuff.prefix.size;
+
+        buffer.start = start;
+        buffer.capacity = prefixSize;
+        if (ZSTDMT_isOverlapped(buffer, inUse)) {   /* destination of the prefix copy is still in use */
+            DEBUGLOG(5, "Waiting for buffer...");
+            return 0;
+        }
+        ZSTDMT_waitForLdmComplete(mtctx, buffer);
+        ZSTD_memmove(start, mtctx->inBuff.prefix.start, prefixSize);
+        mtctx->inBuff.prefix.start = start;
+        mtctx->roundBuff.pos = prefixSize;   /* the new section starts right after the relocated prefix */
+    }
+    buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos;
+    buffer.capacity = target;
+
+    if (ZSTDMT_isOverlapped(buffer, inUse)) {
+        DEBUGLOG(5, "Waiting for buffer...");
+        return 0;
+    }
+    assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
+
+    ZSTDMT_waitForLdmComplete(mtctx, buffer);
+
+    DEBUGLOG(5, "Using prefix range [%zx, %zx)",
+                (size_t)mtctx->inBuff.prefix.start,
+                (size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size);
+    DEBUGLOG(5, "Using source range [%zx, %zx)",
+                (size_t)buffer.start,
+                (size_t)buffer.start + buffer.capacity);
+
+
+    mtctx->inBuff.buffer = buffer;
+    mtctx->inBuff.filled = 0;
+    assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity);
+    return 1;
+}
+
+typedef struct {   /* result of a synchronization point search over the input */
+  size_t toLoad;  /* The number of bytes to load from the input. */
+  int flush;      /* Boolean : non-zero when a synchronization point was found, so the caller must flush. */
+} syncPoint_t;
+
+/**
+ * Searches through the input for a synchronization point. If one is found, we
+ * will instruct the caller to flush, and return the number of bytes to load.
+ * Otherwise, we will load as many bytes as possible and instruct the caller
+ * to continue as normal.
+ */
+static syncPoint_t
+findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
+{
+    BYTE const* const istart = (BYTE const*)input.src + input.pos;
+    U64 const primePower = mtctx->rsync.primePower;
+    U64 const hitMask = mtctx->rsync.hitMask;
+
+    syncPoint_t syncPoint;
+    U64 hash;
+    BYTE const* prev;
+    size_t pos;
+
+    syncPoint.toLoad = MIN(input.size - input.pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
+    syncPoint.flush = 0;
+    if (!mtctx->params.rsyncable)
+        /* Rsync is disabled. */
+        return syncPoint;
+    if (mtctx->inBuff.filled + input.size - input.pos < RSYNC_MIN_BLOCK_SIZE)
+        /* We don't emit synchronization points if it would produce too small blocks.
+         * We don't have enough input to find a synchronization point, so don't look.
+         */
+        return syncPoint;
+    if (mtctx->inBuff.filled + syncPoint.toLoad < RSYNC_LENGTH)
+        /* Not enough to compute the hash.
+         * We will miss any synchronization points in this RSYNC_LENGTH byte
+         * window. However, since it depends only in the internal buffers, if the
+         * state is already synchronized, we will remain synchronized.
+         * Additionally, the probability that we miss a synchronization point is
+         * low: RSYNC_LENGTH / targetSectionSize.
+         */
+        return syncPoint;
+    /* Initialize the loop variables. */
+    if (mtctx->inBuff.filled < RSYNC_MIN_BLOCK_SIZE) {
+        /* We don't need to scan the first RSYNC_MIN_BLOCK_SIZE positions
+         * because they can't possibly be a sync point. So we can start
+         * part way through the input buffer.
+         */
+        pos = RSYNC_MIN_BLOCK_SIZE - mtctx->inBuff.filled;
+        if (pos >= RSYNC_LENGTH) {
+            prev = istart + pos - RSYNC_LENGTH;
+            hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
+        } else {
+            assert(mtctx->inBuff.filled >= RSYNC_LENGTH);
+            prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
+            hash = ZSTD_rollingHash_compute(prev + pos, (RSYNC_LENGTH - pos));
+            hash = ZSTD_rollingHash_append(hash, istart, pos);
+        }
+    } else {
+        /* We have enough bytes buffered to initialize the hash,
+         * and have processed enough bytes to find a sync point.
+         * Start scanning at the beginning of the input.
+         */
+        assert(mtctx->inBuff.filled >= RSYNC_MIN_BLOCK_SIZE);
+        assert(RSYNC_MIN_BLOCK_SIZE >= RSYNC_LENGTH);
+        pos = 0;
+        prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
+        hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
+        if ((hash & hitMask) == hitMask) {
+            /* We're already at a sync point so don't load any more until
+             * we're able to flush this sync point.
+             * This likely happened because the job table was full so we
+             * couldn't add our job.
+             */
+            syncPoint.toLoad = 0;
+            syncPoint.flush = 1;
+            return syncPoint;
+        }
+    }
+    /* Starting with the hash of the previous RSYNC_LENGTH bytes, roll
+     * through the input. If we hit a synchronization point, then cut the
+     * job off, and tell the compressor to flush the job. Otherwise, load
+     * all the bytes and continue as normal.
+     * If we go too long without a synchronization point (targetSectionSize)
+     * then a block will be emitted anyways, but this is okay, since if we
+     * are already synchronized we will remain synchronized.
+     */
+    assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
+    for (; pos < syncPoint.toLoad; ++pos) {
+        BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH];
+        /* This assert is very expensive, and Debian compiles with asserts enabled.
+         * So disable it for now. We can get similar coverage by checking it at the
+         * beginning & end of the loop.
+         * assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
+         */
+        hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower);
+        assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE);
+        if ((hash & hitMask) == hitMask) {
+            syncPoint.toLoad = pos + 1;
+            syncPoint.flush = 1;
+            ++pos; /* for assert */
+            break;
+        }
+    }
+    assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
+    return syncPoint;
+}
+
+size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx)   /* suggested size of the next input chunk */
+{
+    size_t hintInSize = mtctx->targetSectionSize - mtctx->inBuff.filled;   /* room left before inBuff reaches targetSectionSize */
+    if (hintInSize==0) hintInSize = mtctx->targetSectionSize;   /* inBuff exactly at target : hint a whole new section rather than 0 */
+    return hintInSize;
+}
+
+/** ZSTDMT_compressStream_generic() :
+ *  internal use only - exposed to be invoked from zstd_compress.c
+ *  assumption : output and input are valid (pos <= size)
+ * @return : minimum amount of data remaining to flush, 0 if none, or an error code */
+size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+                                     ZSTD_outBuffer* output,
+                                     ZSTD_inBuffer* input,
+                                     ZSTD_EndDirective endOp)
+{
+    unsigned forwardInputProgress = 0;   /* set to 1 when this call copies input bytes into inBuff */
+    DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)",
+                (U32)endOp, (U32)(input->size - input->pos));
+    assert(output->pos <= output->size);
+    assert(input->pos  <= input->size);
+
+    if ((mtctx->frameEnded) && (endOp==ZSTD_e_continue)) {
+        /* current frame being ended. Only flush/end are allowed */
+        return ERROR(stage_wrong);
+    }
+
+    /* fill input buffer */
+    if ( (!mtctx->jobReady)
+      && (input->size > input->pos) ) {   /* support NULL input */
+        if (mtctx->inBuff.buffer.start == NULL) {
+            assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */
+            if (!ZSTDMT_tryGetInputRange(mtctx)) {
+                /* It is only possible for this operation to fail if there are
+                 * still compression jobs ongoing.
+                 */
+                DEBUGLOG(5, "ZSTDMT_tryGetInputRange failed");
+                assert(mtctx->doneJobID != mtctx->nextJobID);
+            } else
+                DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start);
+        }
+        if (mtctx->inBuff.buffer.start != NULL) {
+            syncPoint_t const syncPoint = findSynchronizationPoint(mtctx, *input);
+            if (syncPoint.flush && endOp == ZSTD_e_continue) {
+                endOp = ZSTD_e_flush;   /* synchronization point found : cut the job at this boundary */
+            }
+            assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize);
+            DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u",
+                        (U32)syncPoint.toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize);
+            ZSTD_memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, syncPoint.toLoad);
+            input->pos += syncPoint.toLoad;
+            mtctx->inBuff.filled += syncPoint.toLoad;
+            forwardInputProgress = syncPoint.toLoad>0;
+        }
+    }
+    if ((input->pos < input->size) && (endOp == ZSTD_e_end)) {
+        /* Can't end yet because the input is not fully consumed.
+         * We are in one of these cases:
+         * - mtctx->inBuff is NULL & empty: we couldn't get an input buffer so don't create a new job.
+         * - We filled the input buffer: flush this job but don't end the frame.
+         * - We hit a synchronization point: flush this job but don't end the frame.
+         */
+        assert(mtctx->inBuff.filled == 0 || mtctx->inBuff.filled == mtctx->targetSectionSize || mtctx->params.rsyncable);
+        endOp = ZSTD_e_flush;
+    }
+
+    if ( (mtctx->jobReady)
+      || (mtctx->inBuff.filled >= mtctx->targetSectionSize)  /* filled enough : let's compress */
+      || ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0))  /* something to flush : let's go */
+      || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) {   /* must finish the frame with a zero-size block */
+        size_t const jobSize = mtctx->inBuff.filled;
+        assert(mtctx->inBuff.filled <= mtctx->targetSectionSize);
+        FORWARD_IF_ERROR( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) , "");
+    }
+
+    /* check for potential compressed data ready to be flushed */
+    {   size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */
+        if (input->pos < input->size) return MAX(remainingToFlush, 1);  /* input not consumed : do not end flush yet */
+        DEBUGLOG(5, "end of ZSTDMT_compressStream_generic: remainingToFlush = %u", (U32)remainingToFlush);
+        return remainingToFlush;
+    }
+}
diff --git a/src/zstd/zstdmt_compress.h b/src/zstd/compress/zstdmt_compress.h
similarity index 60%
rename from src/zstd/zstdmt_compress.h
rename to src/zstd/compress/zstdmt_compress.h
index 467551e6d..ed4dc0e99 100644
--- a/src/zstd/zstdmt_compress.h
+++ b/src/zstd/compress/zstdmt_compress.h
@@ -8,12 +8,13 @@
  * You may select, at your option, one of the above-listed licenses.
  */
 
-#ifndef ZSTDMT_COMPRESS_H
-#define ZSTDMT_COMPRESS_H
+ #ifndef ZSTDMT_COMPRESS_H
+ #define ZSTDMT_COMPRESS_H
+
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
 
-#if defined(__cplusplus)
-extern "C" {
-#endif
 
 /* Note : This is an internal API.
  *        These APIs used to be exposed with ZSTDLIB_API,
@@ -25,19 +26,21 @@ extern "C" {
  */
 
 /* ===   Dependencies   === */
-#include "../common/zstd_deps.h" /* size_t */
-#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */
-#include "../zstd.h"			 /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */
+#include "../common/zstd_deps.h"   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters */
+#include "../zstd.h"            /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */
+
 
 /* ===   Constants   === */
 #ifndef ZSTDMT_NBWORKERS_MAX /* a different value can be selected at compile time */
-#define ZSTDMT_NBWORKERS_MAX ((sizeof(void*) == 4) /*32-bit*/ ? 64 : 256)
+#  define ZSTDMT_NBWORKERS_MAX ((sizeof(void*)==4) /*32-bit*/ ? 64 : 256)
 #endif
-#ifndef ZSTDMT_JOBSIZE_MIN /* a different value can be selected at compile time */
-#define ZSTDMT_JOBSIZE_MIN (512 KB)
+#ifndef ZSTDMT_JOBSIZE_MIN   /* a different value can be selected at compile time */
+#  define ZSTDMT_JOBSIZE_MIN (512 KB)
 #endif
-#define ZSTDMT_JOBLOG_MAX (MEM_32bits() ? 29 : 30)
-#define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? (512 MB) : (1024 MB))
+#define ZSTDMT_JOBLOG_MAX   (MEM_32bits() ? 29 : 30)
+#define ZSTDMT_JOBSIZE_MAX  (MEM_32bits() ? (512 MB) : (1024 MB))
+
 
 /* ========================================================
  * ===  Private interface, for use by ZSTD_compress.c   ===
@@ -47,14 +50,16 @@ extern "C" {
 /* ===   Memory management   === */
 typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
 /* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */
-ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool);
+ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
+                                        ZSTD_customMem cMem,
+					ZSTD_threadPool *pool);
 size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
 
 size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
 
 /* ===   Streaming functions   === */
 
-size_t ZSTDMT_nextInputSizeHint(ZSTDMT_CCtx const* mtctx);
+size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx);
 
 /*! ZSTDMT_initCStream_internal() :
  *  Private use only. Init streaming operation.
@@ -64,7 +69,10 @@ size_t ZSTDMT_nextInputSizeHint(ZSTDMT_CCtx const* mtctx);
  *  If mtctx is reused, memory allocations from the prior compression may not be freed,
  *  even if they are not needed for the current compression.
  *  @return : 0, or an error code */
-size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* mtctx, void const* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_CDict const* cdict, ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
+size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* mtctx,
+                    const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+                    const ZSTD_CDict* cdict,
+                    ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
 
 /*! ZSTDMT_compressStream_generic() :
  *  Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
@@ -73,20 +81,23 @@ size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* mtctx, void const* dict, size_t
  *           0 if fully flushed
  *           or an error code
  *  note : needs to be init using any ZSTD_initCStream*() variant */
-size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input, ZSTD_EndDirective endOp);
-
-/*! ZSTDMT_toFlushNow()
- *  Tell how many bytes are ready to be flushed immediately.
- *  Probe the oldest active job (not yet entirely flushed) and check its output buffer.
- *  If return 0, it means there is no active job,
- *  or, it means oldest job is still active, but everything produced has been flushed so far,
- *  therefore flushing is limited by speed of oldest job. */
+size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+                                     ZSTD_outBuffer* output,
+                                     ZSTD_inBuffer* input,
+                                     ZSTD_EndDirective endOp);
+
+ /*! ZSTDMT_toFlushNow()
+  *  Tell how many bytes are ready to be flushed immediately.
+  *  Probe the oldest active job (not yet entirely flushed) and check its output buffer.
+  *  If return 0, it means there is no active job,
+  *  or, it means oldest job is still active, but everything produced has been flushed so far,
+  *  therefore flushing is limited by speed of oldest job. */
 size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx);
 
 /*! ZSTDMT_updateCParams_whileCompressing() :
  *  Updates only a selected set of compression parameters, to remain compatible with current frame.
  *  New parameters will be applied to next compression job. */
-void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, ZSTD_CCtx_params const* cctxParams);
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
 
 /*! ZSTDMT_getFrameProgression():
  *  tells how much data has been consumed (input) and produced (output) for current frame.
@@ -94,8 +105,9 @@ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, ZSTD_CCtx_params
  */
 ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
 
-#if defined(__cplusplus)
+
+#if defined (__cplusplus)
 }
 #endif
 
-#endif /* ZSTDMT_COMPRESS_H */
+#endif   /* ZSTDMT_COMPRESS_H */
diff --git a/src/zstd/cpu.h b/src/zstd/cpu.h
deleted file mode 100644
index d1b9fb2a9..000000000
--- a/src/zstd/cpu.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_COMMON_CPU_H
-#define ZSTD_COMMON_CPU_H
-
-/**
- * Implementation taken from folly/CpuId.h
- * https://github.com/facebook/folly/blob/master/folly/CpuId.h
- */
-
-#include "mem.h"
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-
-typedef struct {
-	U32 f1c;
-	U32 f1d;
-	U32 f7b;
-	U32 f7c;
-} ZSTD_cpuid_t;
-
-MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
-	U32 f1c = 0;
-	U32 f1d = 0;
-	U32 f7b = 0;
-	U32 f7c = 0;
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
-	int reg[4];
-	__cpuid((int*)reg, 0);
-	{
-		int const n = reg[0];
-		if(n >= 1) {
-			__cpuid((int*)reg, 1);
-			f1c = (U32)reg[2];
-			f1d = (U32)reg[3];
-		}
-		if(n >= 7) {
-			__cpuidex((int*)reg, 7, 0);
-			f7b = (U32)reg[1];
-			f7c = (U32)reg[2];
-		}
-	}
-#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
-	/* The following block like the normal cpuid branch below, but gcc
-	 * reserves ebx for use of its pic register so we must specially
-	 * handle the save and restore to avoid clobbering the register
-	 */
-	U32 n;
-	__asm__(
-	    "pushl %%ebx\n\t"
-	    "cpuid\n\t"
-	    "popl %%ebx\n\t"
-	    : "=a"(n)
-	    : "a"(0)
-	    : "ecx", "edx");
-	if(n >= 1) {
-		U32 f1a;
-		__asm__(
-		    "pushl %%ebx\n\t"
-		    "cpuid\n\t"
-		    "popl %%ebx\n\t"
-		    : "=a"(f1a), "=c"(f1c), "=d"(f1d)
-		    : "a"(1));
-	}
-	if(n >= 7) {
-		__asm__(
-		    "pushl %%ebx\n\t"
-		    "cpuid\n\t"
-		    "movl %%ebx, %%eax\n\t"
-		    "popl %%ebx"
-		    : "=a"(f7b), "=c"(f7c)
-		    : "a"(7), "c"(0)
-		    : "edx");
-	}
-#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
-	U32 n;
-	__asm__("cpuid"
-	        : "=a"(n)
-	        : "a"(0)
-	        : "ebx", "ecx", "edx");
-	if(n >= 1) {
-		U32 f1a;
-		__asm__("cpuid"
-		        : "=a"(f1a), "=c"(f1c), "=d"(f1d)
-		        : "a"(1)
-		        : "ebx");
-	}
-	if(n >= 7) {
-		U32 f7a;
-		__asm__("cpuid"
-		        : "=a"(f7a), "=b"(f7b), "=c"(f7c)
-		        : "a"(7), "c"(0)
-		        : "edx");
-	}
-#endif
-	{
-		ZSTD_cpuid_t cpuid;
-		cpuid.f1c = f1c;
-		cpuid.f1d = f1d;
-		cpuid.f7b = f7b;
-		cpuid.f7c = f7c;
-		return cpuid;
-	}
-}
-
-#define X(name, r, bit)                                          \
-	MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \
-		return ((cpuid.r) & (1U << bit)) != 0;                   \
-	}
-
-/* cpuid(1): Processor Info and Feature Bits. */
-#define C(name, bit) X(name, f1c, bit)
-C(sse3, 0)
-C(pclmuldq, 1)
-C(dtes64, 2)
-C(monitor, 3)
-C(dscpl, 4)
-C(vmx, 5)
-C(smx, 6)
-C(eist, 7)
-C(tm2, 8)
-C(ssse3, 9)
-C(cnxtid, 10)
-C(fma, 12)
-C(cx16, 13)
-C(xtpr, 14)
-C(pdcm, 15)
-C(pcid, 17)
-C(dca, 18)
-C(sse41, 19)
-C(sse42, 20)
-C(x2apic, 21)
-C(movbe, 22)
-C(popcnt, 23)
-C(tscdeadline, 24)
-C(aes, 25)
-C(xsave, 26)
-C(osxsave, 27)
-C(avx, 28)
-C(f16c, 29)
-C(rdrand, 30)
-#undef C
-#define D(name, bit) X(name, f1d, bit)
-D(fpu, 0)
-D(vme, 1)
-D(de, 2)
-D(pse, 3)
-D(tsc, 4)
-D(msr, 5)
-D(pae, 6)
-D(mce, 7)
-D(cx8, 8)
-D(apic, 9)
-D(sep, 11)
-D(mtrr, 12)
-D(pge, 13)
-D(mca, 14)
-D(cmov, 15)
-D(pat, 16)
-D(pse36, 17)
-D(psn, 18)
-D(clfsh, 19)
-D(ds, 21)
-D(acpi, 22)
-D(mmx, 23)
-D(fxsr, 24)
-D(sse, 25)
-D(sse2, 26)
-D(ss, 27)
-D(htt, 28)
-D(tm, 29)
-D(pbe, 31)
-#undef D
-
-/* cpuid(7): Extended Features. */
-#define B(name, bit) X(name, f7b, bit)
-B(bmi1, 3)
-B(hle, 4)
-B(avx2, 5)
-B(smep, 7)
-B(bmi2, 8)
-B(erms, 9)
-B(invpcid, 10)
-B(rtm, 11)
-B(mpx, 14)
-B(avx512f, 16)
-B(avx512dq, 17)
-B(rdseed, 18)
-B(adx, 19)
-B(smap, 20)
-B(avx512ifma, 21)
-B(pcommit, 22)
-B(clflushopt, 23)
-B(clwb, 24)
-B(avx512pf, 26)
-B(avx512er, 27)
-B(avx512cd, 28)
-B(sha, 29)
-B(avx512bw, 30)
-B(avx512vl, 31)
-#undef B
-#define C(name, bit) X(name, f7c, bit)
-C(prefetchwt1, 0)
-C(avx512vbmi, 1)
-#undef C
-
-#undef X
-
-#endif /* ZSTD_COMMON_CPU_H */
diff --git a/src/zstd/decompress/huf_decompress.c b/src/zstd/decompress/huf_decompress.c
new file mode 100644
index 000000000..f85dd0bee
--- /dev/null
+++ b/src/zstd/decompress/huf_decompress.c
@@ -0,0 +1,1944 @@
+/* ******************************************************************
+ * huff0 huffman decoder,
+ * part of Finite State Entropy library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include "../common/zstd_deps.h"  /* ZSTD_memcpy, ZSTD_memset */
+#include "../common/compiler.h"
+#include "../common/bitstream.h"  /* BIT_* */
+#include "../common/fse.h"        /* to compress headers */
+#include "../common/huf.h"
+#include "../common/error_private.h"
+#include "../common/zstd_internal.h"
+#include "../common/bits.h"       /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
+
+/* **************************************************************
+*  Constants
+****************************************************************/
+
+#define HUF_DECODER_FAST_TABLELOG 11
+
+/* **************************************************************
+*  Macros
+****************************************************************/
+
+#ifdef HUF_DISABLE_FAST_DECODE
+# define HUF_ENABLE_FAST_DECODE 0
+#else
+# define HUF_ENABLE_FAST_DECODE 1
+#endif
+
+/* These two optional macros force the use of one or the other of the two
+ * Huffman decompression implementations. You can't force both
+ * at the same time.
+ */
+#if defined(HUF_FORCE_DECOMPRESS_X1) && \
+    defined(HUF_FORCE_DECOMPRESS_X2)
+#error "Cannot force the use of the X1 and X2 decoders at the same time!"
+#endif
+
+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
+ * supported at runtime, so we can add the BMI2 target attribute.
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
+ */
+#if DYNAMIC_BMI2
+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
+#else
+# define HUF_FAST_BMI2_ATTRS
+#endif
+
+#ifdef __cplusplus
+# define HUF_EXTERN_C extern "C"
+#else
+# define HUF_EXTERN_C
+#endif
+#define HUF_ASM_DECL HUF_EXTERN_C
+
+#if DYNAMIC_BMI2
+# define HUF_NEED_BMI2_FUNCTION 1
+#else
+# define HUF_NEED_BMI2_FUNCTION 0
+#endif
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+
+
+/* **************************************************************
+*  Byte alignment for workSpace management
+****************************************************************/
+#define HUF_ALIGN(x, a)         HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
+
+
+/* **************************************************************
+*  BMI2 Variant Wrappers
+****************************************************************/
+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
+                                              const void *cSrc,
+                                              size_t cSrcSize,
+                                              const HUF_DTable *DTable);
+
+#if DYNAMIC_BMI2
+
+#define HUF_DGEN(fn)                                                        \
+                                                                            \
+    static size_t fn##_default(                                             \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2(                          \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
+    {                                                                       \
+        if (flags & HUF_flags_bmi2) {                                       \
+            return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
+        }                                                                   \
+        return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
+    }
+
+#else
+
+#define HUF_DGEN(fn)                                                        \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
+    {                                                                       \
+        (void)flags;                                                        \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }
+
+#endif
+
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)   /* returns a copy of the descriptor stored at the start of the table */
+{
+    DTableDesc dtd;
+    ZSTD_memcpy(&dtd, table, sizeof(dtd));   /* copies the leading sizeof(dtd) bytes of the table into a local */
+    return dtd;
+}
+
+static size_t HUF_initFastDStream(BYTE const* ip) {   /* builds the initial bit container from the 8 bytes at ip */
+    BYTE const lastByte = ip[7];
+    size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;   /* assuming ZSTD_highbit32 gives the index of the highest set bit : counts that bit and everything above it ; 0 when lastByte == 0 */
+    size_t const value = MEM_readLEST(ip) | 1;   /* low bit set as the marker later measured with ZSTD_countTrailingZeros64 */
+    assert(bitsConsumed <= 8);
+    assert(sizeof(size_t) == 8);
+    return value << bitsConsumed;   /* shift the already-consumed top bits out of the container */
+}
+
+
+/**
+ * The input/output arguments to the Huffman fast decoding loop:
+ *
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
+ * dt [in] - The decoding table.
+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
+ *                down to this pointer. It may be below iend[0].
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
+ *             as long as it is above ilowest, but that indicates corruption.
+ */
+typedef struct {
+    BYTE const* ip[4];
+    BYTE* op[4];
+    U64 bits[4];
+    void const* dt;
+    BYTE const* ilowest;
+    BYTE* oend;
+    BYTE const* iend[4];
+} HUF_DecompressFastArgs;
+
+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
+
+/**
+ * Initializes args for the fast decoding loop.
+ * @returns 1 on success
+ *          0 if the fallback implementation should be used.
+ *          Or an error code on failure.
+ */
+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
+{
+    void const* dt = DTable + 1;   /* decoding entries start one HUF_DTable cell past the descriptor */
+    U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
+
+    const BYTE* const istart = (const BYTE*)src;
+
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
+
+    /* The fast decoding loop assumes 64-bit little-endian.
+     * This condition is false on x32.
+     */
+    if (!MEM_isLittleEndian() || MEM_32bits())
+        return 0;
+
+    /* Avoid nullptr addition */
+    if (dstSize == 0)
+        return 0;
+    assert(dst != NULL);
+
+    /* strict minimum : jump table + 1 byte per stream */
+    if (srcSize < 10)
+        return ERROR(corruption_detected);
+
+    /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
+     * If table log is not correct at this point, fall back to the old decoder.
+     * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
+     */
+    if (dtLog != HUF_DECODER_FAST_TABLELOG)
+        return 0;
+
+    /* Read the jump table. */
+    {
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = srcSize - (length1 + length2 + length3 + 6);   /* wraps around when the declared lengths exceed srcSize ; caught by the length4 > srcSize check below */
+        args->iend[0] = istart + 6;  /* jumpTable */
+        args->iend[1] = args->iend[0] + length1;
+        args->iend[2] = args->iend[1] + length2;
+        args->iend[3] = args->iend[2] + length3;
+
+        /* HUF_initFastDStream() requires this, and this small of an input
+         * won't benefit from the ASM loop anyways.
+         */
+        if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
+            return 0;
+        if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
+    }
+    /* ip[] contains the position that is currently loaded into bits[]. */
+    args->ip[0] = args->iend[1] - sizeof(U64);
+    args->ip[1] = args->iend[2] - sizeof(U64);
+    args->ip[2] = args->iend[3] - sizeof(U64);
+    args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
+
+    /* op[] contains the output pointers. */
+    args->op[0] = (BYTE*)dst;
+    args->op[1] = args->op[0] + (dstSize+3)/4;
+    args->op[2] = args->op[1] + (dstSize+3)/4;
+    args->op[3] = args->op[2] + (dstSize+3)/4;
+
+    /* No point to call the ASM loop for tiny outputs. */
+    if (args->op[3] >= oend)
+        return 0;
+
+    /* bits[] is the bit container.
+     * It is read from the MSB down to the LSB.
+     * It is shifted left as it is read, and zeros are
+     * shifted in. After the lowest valid bit a 1 is
+     * set, so that CountTrailingZeros(bits[]) can be used
+     * to count how many bits we've consumed.
+     */
+    args->bits[0] = HUF_initFastDStream(args->ip[0]);
+    args->bits[1] = HUF_initFastDStream(args->ip[1]);
+    args->bits[2] = HUF_initFastDStream(args->ip[2]);
+    args->bits[3] = HUF_initFastDStream(args->ip[3]);
+
+    /* The decoders must be sure to never read beyond ilowest.
+     * This is lower than iend[0], but allowing decoders to read
+     * down to ilowest can allow an extra iteration or two in the
+     * fast loop.
+     */
+    args->ilowest = istart;
+
+    args->oend = oend;
+    args->dt = dt;
+
+    return 1;
+}
+
+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)   /* fills *bit from the fast-loop state of one stream ; returns 0 or an error code */
+{
+    /* Validate that the output pointer has not run past the end of its segment. */
+    if (args->op[stream] > segmentEnd)
+        return ERROR(corruption_detected);
+    /* Validate that we haven't read beyond iend[].
+     * Note that ip[] may be < iend[] because the MSB is
+     * the next bit to read, and we may have consumed 100%
+     * of the stream, so down to iend[i] - 8 is valid.
+     */
+    if (args->ip[stream] < args->iend[stream] - 8)
+        return ERROR(corruption_detected);
+
+    /* Construct the BIT_DStream_t. */
+    assert(sizeof(size_t) == 8);
+    bit->bitContainer = MEM_readLEST(args->ip[stream]);
+    bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);   /* position of the marker bit set by HUF_initFastDStream */
+    bit->start = (const char*)args->ilowest;
+    bit->limitPtr = bit->start + sizeof(size_t);
+    bit->ptr = (const char*)args->ip[stream];
+
+    return 0;
+}
+
+/* Calls X(N) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM(X) \
+    do {                          \
+        X(0);                     \
+        X(1);                     \
+        X(2);                     \
+        X(3);                     \
+    } while (0)
+
+/* Calls X(N, var) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
+    do {                                        \
+        X(0, (var));                            \
+        X(1, (var));                            \
+        X(2, (var));                            \
+        X(3, (var));                            \
+    } while (0)
+
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decoding */
+
+/**
+ * Packs 4 identical HUF_DEltX1 entries (same symbol, same nbBits) into a U64.
+ * This is used to lay down 4 entries at a time.
+ */
+static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
+    U64 D4;
+    if (MEM_isLittleEndian()) {
+        D4 = (U64)((symbol << 8) + nbBits);   /* nbBits in the low byte, i.e. first in memory like the struct's first field */
+    } else {
+        D4 = (U64)(symbol + (nbBits << 8));   /* nbBits in the high byte of the 16-bit entry */
+    }
+    assert(D4 < (1U << 16));
+    D4 *= 0x0001000100010001ULL;   /* replicate the 16-bit entry into all four 16-bit lanes */
+    return D4;
+}
+
+/**
+ * Increases the tableLog to targetTableLog and rescales the stats.
+ * If tableLog > targetTableLog this is a no-op.
+ * @returns New tableLog
+ */
+static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
+{
+    if (tableLog > targetTableLog)
+        return tableLog;
+    if (tableLog < targetTableLog) {
+        U32 const scale = targetTableLog - tableLog;
+        U32 s;
+        /* Increase the weight for all non-zero probability symbols by scale. */
+        for (s = 0; s < nbSymbols; ++s) {
+            huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
+        }
+        /* Update rankVal to reflect the new weights.
+         * All weights except 0 get moved to weight + scale.
+         * Weights [1, scale] are empty.
+         */
+        for (s = targetTableLog; s > scale; --s) {
+            rankVal[s] = rankVal[s - scale];
+        }
+        for (s = scale; s > 0; --s) {
+            rankVal[s] = 0;
+        }
+    }
+    return targetTableLog;   /* also reached when tableLog == targetTableLog, with nothing modified */
+}
+
+typedef struct {   /* scratch layout that HUF_readDTableX1_wksp() overlays on the caller-provided workSpace */
+        U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* number of symbols of each weight */
+        U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* per-weight write offset into symbols[] */
+        U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];   /* scratch handed to HUF_readStats_wksp() */
+        BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];   /* symbols ordered by weight */
+        BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* weight of each symbol, indexed by symbol value */
+} HUF_ReadDTableX1_Workspace;
+
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)   /* builds the single-symbol DTable from the stats read out of src; returns the HUF_readStats_wksp() result or an error code */
+{
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    size_t iSize;
+    void* const dtPtr = DTable + 1;   /* table entries start right after the descriptor cell */
+    HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
+    HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;
+
+    DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
+    if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);   /* caller's workspace cannot hold HUF_ReadDTableX1_Workspace */
+
+    DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+    /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzers complain ... */
+
+    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
+    if (HUF_isError(iSize)) return iSize;
+
+
+    /* Table header */
+    {   DTableDesc dtd = HUF_getDTableDesc(DTable);
+        U32 const maxTableLog = dtd.maxTableLog + 1;
+        U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);   /* grow small tables up to the fast-decoder log when the DTable has room */
+        tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
+        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, Huffman tree cannot fit in */
+        dtd.tableType = 0;
+        dtd.tableLog = (BYTE)tableLog;
+        ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
+    }
+
+    /* Compute symbols and rankStart given rankVal:
+     *
+     * rankVal already contains the number of values of each weight.
+     *
+     * symbols contains the symbols ordered by weight. First are the rankVal[0]
+     * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
+     * symbols[0] is filled (but unused) to avoid a branch.
+     *
+     * rankStart[w] is the offset in symbols where the next weight-w symbol is
+     * written; it starts at the cumulative count of all lower weights
+     * (weight 0 included) and is advanced as symbols are placed.
+     */
+    {   int n;
+        U32 nextRankStart = 0;
+        int const unroll = 4;
+        int const nLimit = (int)nbSymbols - unroll + 1;   /* n < nLimit guarantees a full group of `unroll` symbols remains */
+        for (n=0; n<(int)tableLog+1; n++) {
+            U32 const curr = nextRankStart;
+            nextRankStart += wksp->rankVal[n];
+            wksp->rankStart[n] = curr;
+        }
+        for (n=0; n < nLimit; n += unroll) {
+            int u;
+            for (u=0; u < unroll; ++u) {
+                size_t const w = wksp->huffWeight[n+u];
+                wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u);
+            }
+        }
+        for (; n < (int)nbSymbols; ++n) {   /* leftover symbols that did not fill a whole unrolled group */
+            size_t const w = wksp->huffWeight[n];
+            wksp->symbols[wksp->rankStart[w]++] = (BYTE)n;
+        }
+    }
+
+    /* fill DTable
+     * We fill all entries of each weight in order.
+     * That way length is a constant for each iteration of the outer loop.
+     * We can switch based on the length to a different inner loop which is
+     * optimized for that particular case.
+     */
+    {   U32 w;
+        int symbol = wksp->rankVal[0];   /* index into symbols[]; starts past the weight-0 symbols */
+        int rankStart = 0;   /* DTable offset where the current weight begins (distinct from wksp->rankStart) */
+        for (w=1; w<tableLog+1; ++w) {
+            int const symbolCount = wksp->rankVal[w];
+            int const length = (1 << w) >> 1;   /* each weight-w symbol fills 2^(w-1) consecutive DTable entries */
+            int uStart = rankStart;
+            BYTE const nbBits = (BYTE)(tableLog + 1 - w);
+            int s;
+            int u;
+            switch (length) {
+            case 1:
+                for (s=0; s<symbolCount; ++s) {
+                    HUF_DEltX1 D;
+                    D.byte = wksp->symbols[symbol + s];
+                    D.nbBits = nbBits;
+                    dt[uStart] = D;
+                    uStart += 1;
+                }
+                break;
+            case 2:
+                for (s=0; s<symbolCount; ++s) {
+                    HUF_DEltX1 D;
+                    D.byte = wksp->symbols[symbol + s];
+                    D.nbBits = nbBits;
+                    dt[uStart+0] = D;
+                    dt[uStart+1] = D;
+                    uStart += 2;
+                }
+                break;
+            case 4:
+                for (s=0; s<symbolCount; ++s) {
+                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
+                    MEM_write64(dt + uStart, D4);   /* one 64-bit store covers 4 two-byte entries */
+                    uStart += 4;
+                }
+                break;
+            case 8:
+                for (s=0; s<symbolCount; ++s) {
+                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
+                    MEM_write64(dt + uStart, D4);
+                    MEM_write64(dt + uStart + 4, D4);
+                    uStart += 8;
+                }
+                break;
+            default:   /* length is a power of two >= 16 here, so stepping by 16 lands exactly on length */
+                for (s=0; s<symbolCount; ++s) {
+                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
+                    for (u=0; u < length; u += 16) {
+                        MEM_write64(dt + uStart + u + 0, D4);
+                        MEM_write64(dt + uStart + u + 4, D4);
+                        MEM_write64(dt + uStart + u + 8, D4);
+                        MEM_write64(dt + uStart + u + 12, D4);
+                    }
+                    assert(u == length);
+                    uStart += length;
+                }
+                break;
+            }
+            symbol += symbolCount;
+            rankStart += symbolCount * length;
+        }
+    }
+    return iSize;
+}
+
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)   /* decodes one symbol: table lookup on dtLog peeked bits, then skip the entry's nbBits */
+{
+    size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    BYTE const c = dt[val].byte;
+    BIT_skipBits(Dstream, dt[val].nbBits);   /* skip only the bits recorded for this entry, not all dtLog peeked bits */
+    return c;
+}
+
+#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
+    do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)   /* always decodes one symbol; expects `dt` and `dtLog` to be in scope at the expansion site */
+
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)      \
+    do {                                            \
+        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+    } while (0)   /* decodes only when MEM_64bits() is true or HUF_TABLELOG_MAX <= 12; otherwise does nothing */
+
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr)      \
+    do {                                            \
+        if (MEM_64bits())                           \
+            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+    } while (0)   /* decodes only when MEM_64bits() is true; otherwise does nothing */
+
+HINT_INLINE size_t
+HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)   /* decodes symbols into [p, pEnd); returns pEnd minus the initial p */
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    if ((pEnd - p) > 3) {
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {   /* bitwise &: both operands are always evaluated, so the reload always runs */
+            HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+            HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
+            HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+        }
+    } else {
+        BIT_reloadDStream(bitDPtr);
+    }
+
+    /* [0-3] symbols remaining */
+    if (MEM_32bits())
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+    return (size_t)(pEnd-pStart);   /* the loop above always runs p up to pEnd; stream exhaustion is checked by callers */
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X1_usingDTable_internal_body(   /* single-stream decode of cSrc into dst with a prebuilt X1 DTable; returns dstSize or an error code */
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
+    const void* dtPtr = DTable + 1;   /* entries follow the descriptor cell */
+    const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+    BIT_DStream_t bitD;
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    U32 const dtLog = dtd.tableLog;
+
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog);   /* always fills [op, oend); validity is established by the end-of-stream check below */
+
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+/* HUF_decompress4X1_usingDTable_internal_body():
+ * Decodes 4 interleaved streams; returns dstSize on success.
+ * Requires @dstSize >= 6 and @cSrcSize >= 10, else returns corruption_detected.
+ */
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X1_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        BYTE* const olimit = oend - 3;
+        const void* const dtPtr = DTable + 1;
+        const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);   /* sizes of streams 1-3 come from the 6-byte jump table */
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);   /* stream 4 takes the remainder; unsigned wrap is caught below */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;   /* dstSize / 4 rounded up; the last segment may be shorter */
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+        U32 endSignal = 1;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        assert(dstSize >= 6); /* validated above */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
+        if ((size_t)(oend - op4) >= sizeof(size_t)) {
+            for ( ; (endSignal) & (op4 < olimit) ; ) {   /* runs while every stream reloaded as unfinished and stream 4 has more than 3 bytes left */
+                HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+                HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+                HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+                HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+                HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
+                HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
+                HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
+                HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
+                HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+                HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+                HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+                HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+                HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
+                HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
+                HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
+                HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
+                endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+                endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+                endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+                endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+            }
+        }
+
+        /* check corruption */
+        /* note : should not be necessary : op# advance in lock step, and we control op4.
+         *        but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX1(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);   /* all four streams must be fully consumed */
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+#if HUF_NEED_BMI2_FUNCTION
+static BMI2_TARGET_ATTRIBUTE
+size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
+                    size_t cSrcSize, HUF_DTable const* DTable) {
+    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);   /* same body as the default variant, instantiated under BMI2_TARGET_ATTRIBUTE */
+}
+#endif /* HUF_NEED_BMI2_FUNCTION */
+
+static
+size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
+                    size_t cSrcSize, HUF_DTable const* DTable) {
+    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);   /* plain instantiation of the shared body; used as the fallback by HUF_decompress4X1_usingDTable_internal() */
+}
+
+#if ZSTD_ENABLE_ASM_X86_64_BMI2
+
+HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;   /* declaration only; same signature as the C loop, definition is not in this file section */
+
+#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)   /* C fast loop: decodes 5 symbols per stream per iteration; all state is read from and written back to args */
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    U16 const* const dtable = (U16 const*)args->dt;   /* each 2-byte entry is read as one U16 (hence the little-endian assert below) */
+    BYTE* const oend = args->oend;
+    BYTE const* const ilowest = args->ilowest;
+
+    /* Copy the arguments to local variables */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
+            assert(ip[stream] >= ilowest);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each iteration produces 5 output symbols per stream */
+            size_t const oiters = (size_t)(oend - op[3]) / 5;
+            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
+             * per stream.
+             */
+            size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
+            /* We can safely run iters iterations before running bounds checks */
+            size_t const iters = MIN(oiters, iiters);
+            size_t const symbols = iters * 5;
+
+            /* We can simply check that op[3] < olimit, instead of checking all
+             * of our bounds, since we can't hit the other bounds until we've run
+             * iters iterations, which only happens when op[3] == olimit.
+             */
+            olimit = op[3] + symbols;
+
+            /* Exit fast decoding loop once we reach the end. */
+            if (op[3] == olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)                 \
+    do {                                                        \
+        int const index = (int)(bits[(_stream)] >> 53);         \
+        int const entry = (int)dtable[index];                   \
+        bits[(_stream)] <<= (entry & 0x3F);                     \
+        op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
+    } while (0)
+
+#define HUF_4X1_RELOAD_STREAM(_stream)                              \
+    do {                                                            \
+        int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+        int const nbBits = ctz & 7;                                 \
+        int const nbBytes = ctz >> 3;                               \
+        op[(_stream)] += 5;                                         \
+        ip[(_stream)] -= nbBytes;                                   \
+        bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
+        bits[(_stream)] <<= nbBits;                                 \
+    } while (0)
+
+        /* Manually unroll the loop because compilers don't consistently
+         * unroll the inner loops, which destroys performance.
+         */
+        do {   /* HUF_4X1_DECODE_SYMBOL indexes dtable with the top 11 bits (bits >> 53) of a stream's 64-bit container */
+            /* Decode 5 symbols in each of the 4 streams */
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
+
+            /* Reload each of the 4 bitstreams */
+            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
+        } while (op[3] < olimit);
+
+#undef HUF_4X1_DECODE_SYMBOL
+#undef HUF_4X1_RELOAD_STREAM
+    }
+
+_out:
+
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+/**
+ * Runs @p loopFn over the bulk of the 4 streams, then finishes each stream
+ * with HUF_decodeStreamX1() and checks that it ends exactly on its segment.
+ *
+ * @returns @p dstSize on success (>= 6)
+ *          0 if the fallback implementation should be used
+ *          An error if an error occurred
+ */
+static HUF_FAST_BMI2_ATTRS
+size_t
+HUF_decompress4X1_usingDTable_internal_fast(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn)
+{
+    void const* dt = DTable + 1;
+    BYTE const* const ilowest = (BYTE const*)cSrc;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
+    HUF_DecompressFastArgs args;
+    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
+        if (ret == 0)
+            return 0;   /* init declined the fast path: the caller falls back */
+    }
+
+    assert(args.ip[0] >= args.ilowest);
+    loopFn(&args);
+
+    /* Our loop guarantees that ip[] >= ilowest and that we haven't
+     * overwritten any op[].
+     */
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[0] >= ilowest);   /* NOTE(review): exact duplicate of the previous assert */
+    assert(args.ip[1] >= ilowest);
+    assert(args.ip[2] >= ilowest);
+    assert(args.ip[3] >= ilowest);
+    assert(args.op[3] <= oend);
+
+    assert(ilowest == args.ilowest);
+    assert(ilowest + 6 == args.iend[0]);
+    (void)ilowest;   /* keeps ilowest referenced when the asserts compile out */
+
+    /* finish bit streams one by one. */
+    {   size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* segmentEnd = (BYTE*)dst;
+        int i;
+        for (i = 0; i < 4; ++i) {
+            BIT_DStream_t bit;
+            if (segmentSize <= (size_t)(oend - segmentEnd))
+                segmentEnd += segmentSize;
+            else
+                segmentEnd = oend;   /* clamp: the last segment may be shorter than segmentSize */
+            FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
+            /* Decompress and validate that we've produced exactly the expected length. */
+            args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
+            if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
+        }
+    }
+
+    /* decoded size */
+    assert(dstSize != 0);   /* 0 is reserved as the "use fallback" return value */
+    return dstSize;
+}
+
+HUF_DGEN(HUF_decompress1X1_usingDTable_internal)   /* HUF_DGEN is defined outside this section; presumably generates the dispatcher around the _body function above - TODO confirm */
+
+static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)   /* picks the fast loop (C or asm) and the fallback (default or bmi2) from build options and flags */
+{
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
+
+#if DYNAMIC_BMI2
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
+# if ZSTD_ENABLE_ASM_X86_64_BMI2
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+        }
+# endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);   /* without the bmi2 flag the fast path is never attempted in DYNAMIC_BMI2 builds */
+    }
+#endif
+
+#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+    }
+#endif
+
+    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)   /* 0 means "use the fallback"; any other value is the final result or an error */
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int flags)   /* reads the X1 table from the front of cSrc into dctx, then decodes the remaining bytes as 4 streams */
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing would be left after the table description */
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
+}
+
+#endif /* HUF_FORCE_DECOMPRESS_X2 */
+
+
+#ifndef HUF_FORCE_DECOMPRESS_X1
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2;  /* double-symbols decoding entry: `length` holds the level (1 or 2) passed to HUF_buildDEltX2U32() */
+typedef struct { BYTE symbol; } sortedSymbol_t;   /* one symbol value in the weight-sorted symbol list */
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];   /* one U32 per weight */
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];   /* array of rankValCol_t; HUF_fillDTableX2() indexes it by nbBits */
+
+/**
+ * Constructs a HUF_DEltX2 in a U32, matching the struct's in-memory layout for the host endianness.
+ */
+static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
+{
+    U32 seq;
+    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
+    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
+    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
+    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
+    if (MEM_isLittleEndian()) {
+        seq = level == 1 ? symbol : (baseSeq + (symbol << 8));   /* level 2: baseSeq in the low byte, symbol in the next byte */
+        return seq + (nbBits << 16) + ((U32)level << 24);
+    } else {
+        seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);   /* big-endian mirror of the branch above */
+        return (seq << 16) + (nbBits << 8) + (U32)level;
+    }
+}
+
+/**
+ * Constructs a HUF_DEltX2 by copying the bytes of the U32 built by HUF_buildDEltX2U32().
+ */
+static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
+{
+    HUF_DEltX2 DElt;
+    U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
+    DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
+    ZSTD_memcpy(&DElt, &val, sizeof(val));   /* byte copy instead of a pointer cast */
+    return DElt;
+}
+
+/**
+ * Constructs 2 identical HUF_DEltX2s and packs them into a U64 (same U32 in both halves).
+ */
+static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
+{
+    U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
+    return (U64)DElt + ((U64)DElt << 32);
+}
+
+/**
+ * Fills the DTable rank with all the symbols from [begin, end) that are each
+ * nbBits long.
+ *
+ * @param DTableRank The start of the rank in the DTable.
+ * @param begin The first symbol to fill (inclusive).
+ * @param end The last symbol to fill (exclusive).
+ * @param nbBits Each symbol is nbBits long.
+ * @param tableLog The table log.
+ * @param baseSeq If level == 1 { 0 } else { the first level symbol }
+ * @param level The level in the table. Must be 1 or 2.
+ */
+static void HUF_fillDTableX2ForWeight(
+    HUF_DEltX2* DTableRank,
+    sortedSymbol_t const* begin, sortedSymbol_t const* end,
+    U32 nbBits, U32 tableLog,
+    U16 baseSeq, int const level)
+{
+    U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);   /* number of consecutive entries written per symbol */
+    const sortedSymbol_t* ptr;
+    assert(level >= 1 && level <= 2);
+    switch (length) {
+    case 1:
+        for (ptr = begin; ptr != end; ++ptr) {
+            HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
+            *DTableRank++ = DElt;
+        }
+        break;
+    case 2:
+        for (ptr = begin; ptr != end; ++ptr) {
+            HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
+            DTableRank[0] = DElt;
+            DTableRank[1] = DElt;
+            DTableRank += 2;
+        }
+        break;
+    case 4:
+        for (ptr = begin; ptr != end; ++ptr) {
+            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
+            ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));   /* each 8-byte copy writes two 4-byte entries */
+            ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
+            DTableRank += 4;
+        }
+        break;
+    case 8:
+        for (ptr = begin; ptr != end; ++ptr) {
+            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
+            ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
+            ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
+            ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
+            ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
+            DTableRank += 8;
+        }
+        break;
+    default:   /* length is a power of two >= 16 here, so stepping by 8 lands exactly on DTableRankEnd */
+        for (ptr = begin; ptr != end; ++ptr) {
+            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
+            HUF_DEltX2* const DTableRankEnd = DTableRank + length;
+            for (; DTableRank != DTableRankEnd; DTableRank += 8) {
+                ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
+                ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
+                ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
+                ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
+            }
+        }
+        break;
+    }
+}
+
+/* HUF_fillDTableX2Level2() :
+ * `rankVal` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
+                           const U32* rankVal, const int minWeight, const int maxWeight1,
+                           const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    /* Fill skipped values (all positions up to rankVal[minWeight]).
+     * These positions only get a single symbol because the combined weight
+     * is too large.
+     */
+    if (minWeight>1) {
+        U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
+        U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);   /* level-1 entry: the first symbol (passed in as baseSeq) alone */
+        int const skipSize = rankVal[minWeight];
+        assert(length > 1);
+        assert((U32)skipSize < length);
+        switch (length) {
+        case 2:
+            assert(skipSize == 1);
+            ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));   /* writes both entries; NOTE(review): the one past skipSize is presumably overwritten by the fill below - confirm */
+            break;
+        case 4:
+            assert(skipSize <= 4);
+            ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
+            ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
+            break;
+        default:
+            {
+                int i;
+                for (i = 0; i < skipSize; i += 8) {   /* writes in groups of 8 entries, so it can run past skipSize */
+                    ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
+                    ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
+                    ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
+                    ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
+                }
+            }
+        }
+    }
+
+    /* Fill each of the second level symbols by weight. */
+    {
+        int w;
+        for (w = minWeight; w < maxWeight1; ++w) {
+            int const begin = rankStart[w];
+            int const end = rankStart[w+1];
+            U32 const nbBits = nbBitsBaseline - w;
+            U32 const totalBits = nbBits + consumedBits;   /* bits for the two-symbol sequence: first symbol plus second */
+            HUF_fillDTableX2ForWeight(
+                DTable + rankVal[w],
+                sortedSymbols + begin, sortedSymbols + end,
+                totalBits, targetLog,
+                baseSeq, /* level */ 2);
+        }
+    }
+}
+
+static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList,
+                           const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)   /* fills the whole X2 DTable, weight by weight, with one- or two-symbol entries */
+{
+    U32* const rankVal = rankValOrigin[0];   /* column 0: start offsets used for the first-level symbols */
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;   /* bit length of a symbol of the largest weight */
+    int w;
+    int const wEnd = (int)maxWeight + 1;
+
+    /* Fill DTable in order of weight. */
+    for (w = 1; w < wEnd; ++w) {
+        int const begin = (int)rankStart[w];
+        int const end = (int)rankStart[w+1];
+        U32 const nbBits = nbBitsBaseline - w;
+
+        if (targetLog-nbBits >= minBits) {
+            /* Enough room for a second symbol. */
+            int start = rankVal[w];
+            U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);   /* entries covered by each first-level symbol */
+            int minWeight = nbBits + scaleLog;
+            int s;
+            if (minWeight < 1) minWeight = 1;
+            /* Fill the DTable for every symbol of weight w.
+             * These symbols get at least 1 second symbol.
+             */
+            for (s = begin; s != end; ++s) {
+                HUF_fillDTableX2Level2(
+                    DTable + start, targetLog, nbBits,
+                    rankValOrigin[nbBits], minWeight, wEnd,
+                    sortedList, rankStart,
+                    nbBitsBaseline, sortedList[s].symbol);
+                start += length;
+            }
+        } else {
+            /* Only a single symbol. */
+            HUF_fillDTableX2ForWeight(
+                DTable + rankVal[w],
+                sortedList + begin, sortedList + end,
+                nbBits, targetLog,
+                /* baseSeq */ 0, /* level */ 1);
+        }
+    }
+}
+
+typedef struct {   /* scratch layout that HUF_readDTableX2_wksp() overlays on the caller-provided workSpace */
+    rankValCol_t rankVal[HUF_TABLELOG_MAX];   /* presumably the rankValOrigin columns consumed by HUF_fillDTableX2() - TODO confirm */
+    U32 rankStats[HUF_TABLELOG_MAX + 1];   /* filled by HUF_readStats_wksp(): number of symbols per weight */
+    U32 rankStart0[HUF_TABLELOG_MAX + 3];   /* accessed through rankStart = rankStart0 + 1 */
+    sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
+    BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];   /* filled by HUF_readStats_wksp(): weight of each symbol */
+    U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];   /* scratch handed to HUF_readStats_wksp() */
+} HUF_ReadDTableX2_Workspace;
+
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+                       const void* src, size_t srcSize,
+                             void* workSpace, size_t wkspSize, int flags)
+{   /* Reads the weight header from src and builds an X2 decoding table (tableType 1) right after the DTable descriptor. */
+    U32 tableLog, maxW, nbSymbols;
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    U32 maxTableLog = dtd.maxTableLog;
+    size_t iSize;
+    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+    U32 *rankStart;
+
+    HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;
+
+    if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC);   /* workSpace is too small for HUF_ReadDTableX2_Workspace */
+
+    rankStart = wksp->rankStart0 + 1;   /* shifted by one, so that rankStart0 can be passed as-is to HUF_fillDTableX2() */
+    ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
+    ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));
+
+    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
+    if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    /* note : zeroing wksp->weightList beforehand is not necessary, even though some analyzers complain about it */
+
+    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+    if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;   /* cap the built table at the fast-decoder log when the code depth allows it */
+
+    /* find maxWeight */
+    for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 curr = nextRankStart;
+            nextRankStart += wksp->rankStats[w];
+            rankStart[w] = curr;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list */
+        rankStart[maxW+1] = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = wksp->weightList[s];
+            U32 const r = rankStart[w]++;   /* post-increment : afterwards rankStart[w] points past the last symbol of weight w */
+            wksp->sortedSymbol[r].symbol = (BYTE)s;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = wksp->rankVal[0];
+        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 curr = nextRankVal;
+                nextRankVal += wksp->rankStats[w] << (w+rescale);
+                rankVal0[w] = curr;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = wksp->rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUF_fillDTableX2(dt, maxTableLog,
+                   wksp->sortedSymbol,
+                   wksp->rankStart0, wksp->rankVal, maxW,
+                   tableLog+1);
+
+    dtd.tableLog = (BYTE)maxTableLog;   /* may have been lowered to HUF_DECODER_FAST_TABLELOG by the check above */
+    dtd.tableType = 1;   /* the *_usingDTable() selectors route a non-zero tableType to the X2 decoders */
+    ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;   /* value reported by HUF_readStats_wksp(); callers advance their input pointer by it */
+}
+
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{   /* Decodes one table entry : stores 2 bytes at op and @return the entry's length field */
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    ZSTD_memcpy(op, &dt[val].sequence, 2);   /* always copies 2 bytes, whatever dt[val].length is */
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;   /* the HUF_DECODE_SYMBOLX2_* macros advance their output pointer by this value */
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{   /* Variant used for the final output byte : stores exactly 1 byte at op and always returns 1 */
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    ZSTD_memcpy(op, &dt[val].sequence, 1);   /* only the first byte of the entry's sequence is written */
+    if (dt[val].length==1) {
+        BIT_skipBits(DStream, dt[val].nbBits);
+    } else {   /* entry length is not 1, yet only 1 byte was written (see note below) */
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* clamp to the container width */
+        }
+    }
+    return 1;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) /* always decodes; expects dt and dtLog in scope */ \
+    do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) /* decodes only if MEM_64bits() or HUF_TABLELOG_MAX<=12 */ \
+    do {                                                           \
+        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12))                \
+            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
+    } while (0)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) /* decodes only if MEM_64bits() */ \
+    do {                                                           \
+        if (MEM_64bits())                                          \
+            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
+    } while (0)
+
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+                const HUF_DEltX2* const dt, const U32 dtLog)
+{   /* Decodes from bitDPtr into [p, pEnd). @return : number of bytes written (p - pStart) */
+    BYTE* const pStart = p;
+
+    /* bulk decoding : requires at least sizeof(bitContainer) bytes of room before pEnd */
+    if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
+        if (dtLog <= 11 && MEM_64bits()) {
+            /* 5 table lookups per reload : up to 10 bytes written per iteration, hence the pEnd-9 bound */
+            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
+                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+            }
+        } else {
+            /* up to 4 table lookups per reload (the _1 and _2 variants are conditional) : up to 8 bytes per iteration */
+            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+                HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+                HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+                HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+            }
+        }
+    } else {
+        BIT_reloadDStream(bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    if ((size_t)(pEnd - p) >= 2) {
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+        while (p <= pEnd-2)
+            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+    }
+
+    if (p < pEnd)   /* one byte of room left : use the variant that writes a single byte */
+        p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{   /* Single-stream X2 decoding of cSrc into dst. @return : dstSize, or an error code */
+    BIT_DStream_t bitD;
+
+    /* Init */
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    /* decode */
+    {   BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
+        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog);   /* written size is not inspected here; see check below */
+    }
+
+    /* check : the bitstream must be entirely consumed, otherwise the input is reported as corrupted */
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+/* HUF_decompress4X2_usingDTable_internal_body():
+ * Decodes 4 streams, located through a 6-byte jump table, into 4 consecutive segments of dst.
+ * Conditions: @cSrcSize >= 10 and @dstSize >= 6, otherwise corruption_detected is returned.
+ */
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        BYTE* const olimit = oend - (sizeof(size_t)-1);   /* the main loop below only runs while op4 < olimit */
+        const void* const dtPtr = DTable+1;
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);   /* not stored : whatever remains of cSrc */
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;   /* size of the first 3 output segments; the 4th runs up to oend */
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal = 1;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
+        assert(dstSize >= 6 /* validated above */);
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        if ((size_t)(oend - op4) >= sizeof(size_t)) {
+            for ( ; (endSignal) & (op4 < olimit); ) {
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+                HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+                HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+                endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+                endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+                HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+                HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+                endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+                endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+#else
+                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+                HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+                HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+                HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+                HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+                HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+                HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+                endSignal = (U32)LIKELY((U32)
+                            (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
+                        & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
+                        & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
+                        & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
+#endif
+            }
+        }
+
+        /* check corruption : no stream may have written past the start of the next segment */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check : all 4 bitstreams must be entirely consumed */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+#if HUF_NEED_BMI2_FUNCTION
+static BMI2_TARGET_ATTRIBUTE
+size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
+                    size_t cSrcSize, HUF_DTable const* DTable) {
+    return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);   /* same body, compiled under BMI2_TARGET_ATTRIBUTE */
+}
+#endif /* HUF_NEED_BMI2_FUNCTION */
+
+static
+size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
+                    size_t cSrcSize, HUF_DTable const* DTable) {
+    return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);   /* plain instantiation of the shared body */
+}
+
+#if ZSTD_ENABLE_ASM_X86_64_BMI2
+/* Declaration only : same signature as the C loop below; presumably defined in huf_decompress_amd64.S (TODO confirm). */
+HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
+
+#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{   /* C version of the 4-stream X2 fast loop : works on local copies of args->bits/ip/op and writes them back at _out */
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    BYTE* oend[4];
+    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
+    BYTE const* const ilowest = args->ilowest;
+
+    /* Copy the arguments to local registers. */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    oend[0] = op[1];   /* each stream's output ends where the next stream's output starts */
+    oend[1] = op[2];
+    oend[2] = op[3];
+    oend[3] = args->oend;
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= oend[stream]);
+            assert(ip[stream] >= ilowest);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each loop does 5 table lookups for each of the 4 streams.
+             * Each table lookup consumes up to 11 bits of input, and produces
+             * up to 2 bytes of output.
+             */
+            /* We can consume up to 7 bytes of input per iteration per stream.
+             * We also know that each input pointer is >= ip[0]. So we can run
+             * iters loops before running out of input.
+             */
+            size_t iters = (size_t)(ip[0] - ilowest) / 7;
+            /* Each iteration can produce up to 10 bytes of output per stream.
+             * Each output stream may advance at a different rate. So take the
+             * minimum number of safe iterations among all the output streams.
+             */
+            for (stream = 0; stream < 4; ++stream) {
+                size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
+                iters = MIN(iters, oiters);
+            }
+
+            /* Each iteration produces at least 5 output symbols. So until
+             * op[3] crosses olimit, we know we haven't executed iters
+             * iterations yet. This saves us maintaining an iters counter,
+             * at the expense of computing the remaining # of iterations
+             * more frequently.
+             */
+            olimit = op[3] + (iters * 5);
+
+            /* Exit the fast decoding loop once we reach the end. */
+            if (op[3] == olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)                      \
+    do {                                                              \
+        if ((_decode3) || (_stream) != 3) {                           \
+            int const index = (int)(bits[(_stream)] >> 53);           \
+            HUF_DEltX2 const entry = dtable[index];                   \
+            MEM_write16(op[(_stream)], entry.sequence); \
+            bits[(_stream)] <<= (entry.nbBits) & 0x3F;                \
+            op[(_stream)] += (entry.length);                          \
+        }                                                             \
+    } while (0)
+
+#define HUF_4X2_RELOAD_STREAM(_stream)                                  \
+    do {                                                                \
+        HUF_4X2_DECODE_SYMBOL(3, 1);                                    \
+        {                                                               \
+            int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+            int const nbBits = ctz & 7;                                 \
+            int const nbBytes = ctz >> 3;                               \
+            ip[(_stream)] -= nbBytes;                                   \
+            bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
+            bits[(_stream)] <<= nbBits;                                 \
+        }                                                               \
+    } while (0)
+
+        /* Manually unroll the loop because compilers don't consistently
+         * unroll the inner loops, which destroys performance.
+         */
+        do {
+            /* Decode 5 symbols from each of the first 3 streams.
+             * The final stream will be decoded during the reload phase
+             * to reduce register pressure.
+             */
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+
+            /* Decode one symbol from the final stream */
+            HUF_4X2_DECODE_SYMBOL(3, 1);
+
+            /* Decode 4 symbols from the final stream & reload bitstreams.
+             * The final stream is reloaded last, meaning that all 5 symbols
+             * are decoded from the final stream before it is reloaded.
+             */
+            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
+        } while (op[3] < olimit);
+    }
+
+#undef HUF_4X2_DECODE_SYMBOL
+#undef HUF_4X2_RELOAD_STREAM
+
+_out:   /* reached both through the break (no safe iteration left) and the goto (input pointers crossed) */
+
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+
+static HUF_FAST_BMI2_ATTRS size_t
+HUF_decompress4X2_usingDTable_internal_fast(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn) {   /* @return : dstSize on success, an error code, or 0 when args init returns 0 */
+    void const* dt = DTable + 1;
+    const BYTE* const ilowest = (const BYTE*)cSrc;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
+    HUF_DecompressFastArgs args;
+    {
+        size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        FORWARD_IF_ERROR(ret, "Failed to init asm args");
+        if (ret == 0)   /* not an error : HUF_decompress4X2_usingDTable_internal() then runs its fallback decoder */
+            return 0;
+    }
+
+    assert(args.ip[0] >= args.ilowest);
+    loopFn(&args);   /* either the C loop or the asm loop, as selected by the caller */
+
+    /* note : op4 already verified within main loop */
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[1] >= ilowest);
+    assert(args.ip[2] >= ilowest);
+    assert(args.ip[3] >= ilowest);
+    assert(args.op[3] <= oend);
+
+    assert(ilowest == args.ilowest);
+    assert(ilowest + 6 == args.iend[0]);
+    (void)ilowest;   /* only used by the asserts above */
+
+    /* finish bitStreams one by one */
+    {
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* segmentEnd = (BYTE*)dst;
+        int i;
+        for (i = 0; i < 4; ++i) {
+            BIT_DStream_t bit;
+            if (segmentSize <= (size_t)(oend - segmentEnd))
+                segmentEnd += segmentSize;
+            else   /* fewer than segmentSize bytes remain : this segment stops at oend */
+                segmentEnd = oend;
+            FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
+            args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
+            if (args.op[i] != segmentEnd)   /* the segment must be filled exactly */
+                return ERROR(corruption_detected);
+        }
+    }
+
+    /* decoded size */
+    return dstSize;
+}
+
+static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
+{   /* Dispatcher : picks the fallback decoder and the fast loop from build options and flags, then tries the fast path first */
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
+
+#if DYNAMIC_BMI2
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
+# if ZSTD_ENABLE_ASM_X86_64_BMI2
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+        }
+# endif
+    } else {   /* without HUF_flags_bmi2, a DYNAMIC_BMI2 build never attempts the fast path */
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
+    }
+#endif
+
+#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+    }
+#endif
+
+    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)   /* non-zero is final (decoded size or error); 0 means : continue with fallbackFn */
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+HUF_DGEN(HUF_decompress1X2_usingDTable_internal)   /* presumably generates the flags-taking HUF_decompress1X2_usingDTable_internal() called below; macro defined outside this chunk - TODO confirm */
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int flags)
+{   /* Builds the X2 table from the header at the start of cSrc, then decodes the rest as a single stream */
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
+                                               workSpace, wkspSize, flags);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing would be left to decode after the header */
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
+}
+
+static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int flags)
+{   /* Builds the X2 table from the header at the start of cSrc, then decodes the rest as 4 streams */
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
+                                         workSpace, wkspSize, flags);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing would be left to decode after the header */
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
+}
+
+#endif /* HUF_FORCE_DECOMPRESS_X1 */
+
+
+/* ***********************************/
+/* Universal decompression selectors */
+/* ***********************************/
+
+
+#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;   /* HUF_selectDecoder() computes tableTime + decode256Time * (dstSize >> 8) */
+static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
+{
+    /* single, double */
+    {{0,0}, {1,1}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}},  /* Q==1 : impossible */
+    {{ 150,216}, { 381,119}},   /* Q == 2 : 12-18% */
+    {{ 170,205}, { 514,112}},   /* Q == 3 : 18-25% */
+    {{ 177,199}, { 539,110}},   /* Q == 4 : 25-32% */
+    {{ 197,194}, { 644,107}},   /* Q == 5 : 32-38% */
+    {{ 221,192}, { 735,107}},   /* Q == 6 : 38-44% */
+    {{ 256,189}, { 881,106}},   /* Q == 7 : 44-50% */
+    {{ 359,188}, {1167,109}},   /* Q == 8 : 50-56% */
+    {{ 582,187}, {1570,114}},   /* Q == 9 : 56-62% */
+    {{ 688,187}, {1712,122}},   /* Q ==10 : 62-69% */
+    {{ 825,186}, {1965,136}},   /* Q ==11 : 69-75% */
+    {{ 976,185}, {2131,150}},   /* Q ==12 : 75-81% */
+    {{1180,186}, {2070,175}},   /* Q ==13 : 81-87% */
+    {{1377,185}, {1731,202}},   /* Q ==14 : 87-93% */
+    {{1412,185}, {1695,202}},   /* Q ==15 : 93-99% */
+};
+#endif /* !HUF_FORCE_DECOMPRESS_X1 && !HUF_FORCE_DECOMPRESS_X2 */
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics (the algoTime table).
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+    assert(dstSize > 0);
+    assert(dstSize <= 128*1024);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+    (void)dstSize;
+    (void)cSrcSize;
+    return 0;
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+    (void)dstSize;
+    (void)cSrcSize;
+    return 1;
+#else
+    /* decoder timing evaluation */
+    {   U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 */
+        U32 const D256 = (U32)(dstSize >> 8);   /* output size counted in units of 256 bytes */
+        U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+        U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+        DTime1 += DTime1 >> 5;  /* small advantage to algorithm using less memory, to reduce cache eviction */
+        return DTime1 < DTime0;   /* 1 only when the second estimate, after its penalty, is strictly lower */
+    }
+#endif
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                  const void* cSrc, size_t cSrcSize,
+                                  void* workSpace, size_t wkspSize, int flags)
+{   /* Single-stream entry point : handles the trivial cases itself, otherwise defers to the X1 or X2 decoder */
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);   /* 0 : X1, 1 : X2 */
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+        (void)algoNb;
+        assert(algoNb == 0);
+        return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize, flags);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+        (void)algoNb;
+        assert(algoNb == 1);
+        return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize, flags);
+#else
+        return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize, flags):
+                        HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize, flags);
+#endif
+    }
+}
+
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
+{   /* Single-stream decoding with an already-built DTable : the table's own tableType selects X1 (0) or X2 (non-zero) */
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+    (void)dtd;
+    assert(dtd.tableType == 0);
+    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+    (void)dtd;
+    assert(dtd.tableType == 1);
+    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+#else
+    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+#endif
+}
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
+{   /* Builds the X1 table from the header at the start of cSrc, then decodes the rest as a single stream */
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);   /* nothing would be left to decode after the header */
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
+}
+#endif /* HUF_FORCE_DECOMPRESS_X2 */
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
+{   /* 4-stream decoding with an already-built DTable : the table's own tableType selects X1 (0) or X2 (non-zero) */
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+    (void)dtd;
+    assert(dtd.tableType == 0);
+    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+    (void)dtd;
+    assert(dtd.tableType == 1);
+    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+#else
+    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+#endif
+}
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
+{   /* 4-stream entry point : unlike HUF_decompress1X_DCtx_wksp(), it has no raw-copy or RLE shortcut */
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);   /* 0 : X1, 1 : X2 */
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+        (void)algoNb;
+        assert(algoNb == 0);
+        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+        (void)algoNb;
+        assert(algoNb == 1);
+        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
+#else
+        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
+                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
+#endif
+    }
+}
diff --git a/src/zstd/huf_decompress_amd64.S b/src/zstd/decompress/huf_decompress_amd64.S
similarity index 89%
rename from src/zstd/huf_decompress_amd64.S
rename to src/zstd/decompress/huf_decompress_amd64.S
index e5dc9a6dc..78da291ee 100644
--- a/src/zstd/huf_decompress_amd64.S
+++ b/src/zstd/decompress/huf_decompress_amd64.S
@@ -8,13 +8,34 @@
  * You may select, at your option, one of the above-listed licenses.
  */
 
-#include "portability_macros.h"
+#include "../common/portability_macros.h"
 
+#if defined(__ELF__) && defined(__GNUC__)
 /* Stack marking
  * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
  */
-#if defined(__ELF__) && defined(__GNUC__)
 .section .note.GNU-stack,"",%progbits
+
+#if defined(__aarch64__)
+/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
+ * See: https://github.com/facebook/zstd/issues/3841
+ * See: https://gcc.godbolt.org/z/sqr5T4ffK
+ * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
+ * See: https://reviews.llvm.org/D62609
+ */
+.pushsection .note.gnu.property, "a"
+.p2align 3
+.long 4                 /* size of the name - "GNU\0" */
+.long 0x10              /* size of descriptor */
+.long 0x5               /* NT_GNU_PROPERTY_TYPE_0 */
+.asciz "GNU"
+.long 0xc0000000        /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+.long 4                 /* pr_datasz - 4 bytes */
+.long 3                 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
+.p2align 3              /* pr_padding - bring everything to 8 byte alignment */
+.popsection
+#endif
+
 #endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2
@@ -131,7 +152,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
     movq 88(%rax), %bits3
     movq 96(%rax), %dtable
     push %rax      /* argument */
-    push 104(%rax) /* ilimit */
+    push 104(%rax) /* ilowest */
     push 112(%rax) /* oend */
     push %olimit   /* olimit space */
 
@@ -156,11 +177,11 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
     shrq $2, %r15
 
     movq %ip0,     %rax /* rax = ip0 */
-    movq 40(%rsp), %rdx /* rdx = ilimit */
-    subq %rdx,     %rax /* rax = ip0 - ilimit */
-    movq %rax,     %rbx /* rbx = ip0 - ilimit */
+    movq 40(%rsp), %rdx /* rdx = ilowest */
+    subq %rdx,     %rax /* rax = ip0 - ilowest */
+    movq %rax,     %rbx /* rbx = ip0 - ilowest */
 
-    /* rdx = (ip0 - ilimit) / 7 */
+    /* rdx = (ip0 - ilowest) / 7 */
     movabsq $2635249153387078803, %rdx
     mulq %rdx
     subq %rdx, %rbx
@@ -183,9 +204,8 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
 
     /* If (op3 + 20 > olimit) */
     movq %op3, %rax    /* rax = op3 */
-    addq $20,  %rax    /* rax = op3 + 20 */
-    cmpq %rax, %olimit /* op3 + 20 > olimit */
-    jb .L_4X1_exit
+    cmpq %rax, %olimit /* op3 == olimit */
+    je .L_4X1_exit
 
     /* If (ip1 < ip0) go to exit */
     cmpq %ip0, %ip1
@@ -316,7 +336,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
     /* Restore stack (oend & olimit) */
     pop %rax /* olimit */
     pop %rax /* oend */
-    pop %rax /* ilimit */
+    pop %rax /* ilowest */
     pop %rax /* arg */
 
     /* Save ip / op / bits */
@@ -387,7 +407,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
     movq 96(%rax), %dtable
     push %rax      /* argument */
     push %rax      /* olimit */
-    push 104(%rax) /* ilimit */
+    push 104(%rax) /* ilowest */
 
     movq 112(%rax), %rax
     push %rax /* oend3 */
@@ -414,9 +434,9 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
 
     /* We can consume up to 7 input bytes each iteration. */
     movq %ip0,     %rax  /* rax = ip0 */
-    movq 40(%rsp), %rdx  /* rdx = ilimit */
-    subq %rdx,     %rax  /* rax = ip0 - ilimit */
-    movq %rax,    %r15   /* r15 = ip0 - ilimit */
+    movq 40(%rsp), %rdx  /* rdx = ilowest */
+    subq %rdx,     %rax  /* rax = ip0 - ilowest */
+    movq %rax,    %r15   /* r15 = ip0 - ilowest */
 
     /* rdx = rax / 7 */
     movabsq $2635249153387078803, %rdx
@@ -426,7 +446,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
     addq %r15, %rdx
     shrq $2, %rdx
 
-    /* r15 = (ip0 - ilimit) / 7 */
+    /* r15 = (ip0 - ilowest) / 7 */
     movq %rdx, %r15
 
     /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
@@ -467,9 +487,8 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
 
     /* If (op3 + 10 > olimit) */
     movq %op3, %rax    /* rax = op3 */
-    addq $10,  %rax    /* rax = op3 + 10 */
-    cmpq %rax, %olimit /* op3 + 10 > olimit */
-    jb .L_4X2_exit
+    cmpq %rax, %olimit /* op3 == olimit */
+    je .L_4X2_exit
 
     /* If (ip1 < ip0) go to exit */
     cmpq %ip0, %ip1
@@ -537,7 +556,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
     pop %rax /* oend1 */
     pop %rax /* oend2 */
     pop %rax /* oend3 */
-    pop %rax /* ilimit */
+    pop %rax /* ilowest */
     pop %rax /* olimit */
     pop %rax /* arg */
 
diff --git a/src/zstd/decompress/zstd_ddict.c b/src/zstd/decompress/zstd_ddict.c
new file mode 100644
index 000000000..309ec0d03
--- /dev/null
+++ b/src/zstd/decompress/zstd_ddict.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* zstd_ddict.c :
+ * concentrates all logic that needs to know the internals of ZSTD_DDict object */
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
+#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+#include "../common/cpu.h"         /* bmi2 */
+#include "../common/mem.h"         /* low level memory routines */
+#define FSE_STATIC_LINKING_ONLY
+#include "../common/fse.h"
+#include "../common/huf.h"
+#include "zstd_decompress_internal.h"
+#include "zstd_ddict.h"
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+#  include "../legacy/zstd_legacy.h"
+#endif
+
+
+
+/*-*******************************************************
+*  Types
+*********************************************************/
+struct ZSTD_DDict_s {
+    void* dictBuffer;   /* internally allocated copy of the dictionary ; NULL when the content is only referenced */
+    const void* dictContent;   /* start of the dictionary content : either dictBuffer or the caller's buffer */
+    size_t dictSize;   /* size of dictContent, in bytes */
+    ZSTD_entropyDTables_t entropy;   /* only meaningful when entropyPresent != 0 */
+    U32 dictID;   /* read from the dictionary header ; stays 0 when no header was parsed */
+    U32 entropyPresent;   /* set to 1 once entropy tables were successfully loaded, 0 otherwise */
+    ZSTD_customMem cMem;   /* allocator used for dictBuffer, and to free the DDict itself */
+};  /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict)
+{
+    assert(ddict != NULL);
+    return ddict->dictContent;   /* start of dictionary content : referenced buffer, or the copy made at creation */
+}
+
+size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict)
+{
+    assert(ddict != NULL);
+    return ddict->dictSize;   /* in bytes ; 0 when the DDict was initialized from a NULL dictionary */
+}
+
+void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    DEBUGLOG(4, "ZSTD_copyDDictParameters");
+    assert(dctx != NULL);
+    assert(ddict != NULL);
+    dctx->dictID = ddict->dictID;
+    dctx->prefixStart = ddict->dictContent;
+    dctx->virtualStart = ddict->dictContent;
+    dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+    dctx->previousDstEnd = dctx->dictEnd;   /* history ends exactly where the dictionary content ends */
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    dctx->dictContentBeginForFuzzing = dctx->prefixStart;
+    dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
+#endif
+    if (ddict->entropyPresent) {
+        dctx->litEntropy = 1;
+        dctx->fseEntropy = 1;
+        dctx->LLTptr = ddict->entropy.LLTable;   /* the 4 tables are referenced in place (pointers into ddict), not copied */
+        dctx->MLTptr = ddict->entropy.MLTable;
+        dctx->OFTptr = ddict->entropy.OFTable;
+        dctx->HUFptr = ddict->entropy.hufTable;
+        dctx->entropy.rep[0] = ddict->entropy.rep[0];   /* the 3 rep values are copied by value */
+        dctx->entropy.rep[1] = ddict->entropy.rep[1];
+        dctx->entropy.rep[2] = ddict->entropy.rep[2];
+    } else {
+        dctx->litEntropy = 0;   /* content-only dictionary : no entropy tables to reuse */
+        dctx->fseEntropy = 0;
+    }
+}
+
+
+static size_t
+ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict,
+                           ZSTD_dictContentType_e dictContentType)
+{
+    ddict->dictID = 0;   /* defaults, kept whenever the dictionary ends up used as pure content */
+    ddict->entropyPresent = 0;
+    if (dictContentType == ZSTD_dct_rawContent) return 0;   /* raw content requested : header is never inspected */
+
+    if (ddict->dictSize < 8) {   /* too short to hold the magic number and the dictID read below */
+        if (dictContentType == ZSTD_dct_fullDict)
+            return ERROR(dictionary_corrupted);   /* only accept specified dictionaries */
+        return 0;   /* pure content mode */
+    }
+    {   U32 const magic = MEM_readLE32(ddict->dictContent);
+        if (magic != ZSTD_MAGIC_DICTIONARY) {
+            if (dictContentType == ZSTD_dct_fullDict)
+                return ERROR(dictionary_corrupted);   /* only accept specified dictionaries */
+            return 0;   /* pure content mode */
+        }
+    }
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);   /* dictID follows the magic number */
+
+    /* load entropy tables : any failure is reported as dictionary_corrupted, and entropyPresent stays 0 */
+    RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy(
+            &ddict->entropy, ddict->dictContent, ddict->dictSize)),
+        dictionary_corrupted, "");
+    ddict->entropyPresent = 1;
+    return 0;
+}
+
+
+static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType)
+{
+    if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {   /* nothing to copy : reference mode, or NULL / empty dictionary */
+        ddict->dictBuffer = NULL;
+        ddict->dictContent = dict;
+        if (!dict) dictSize = 0;
+    } else {
+        void* const internalBuffer = ZSTD_customMalloc(dictSize, ddict->cMem);   /* ddict->cMem must already be set by the caller */
+        ddict->dictBuffer = internalBuffer;   /* assigned before the NULL check, so dictBuffer is always defined on return */
+        ddict->dictContent = internalBuffer;
+        if (!internalBuffer) return ERROR(memory_allocation);
+        ZSTD_memcpy(internalBuffer, dict, dictSize);
+    }
+    ddict->dictSize = dictSize;
+    ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
+
+    /* parse dictionary content : sets dictID and entropyPresent, may fail with dictionary_corrupted */
+    FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
+
+    return 0;
+}
+
+ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType,
+                                      ZSTD_customMem customMem)
+{
+    if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;   /* alloc and free must be both provided, or both NULL */
+
+    {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_customMalloc(sizeof(ZSTD_DDict), customMem);
+        if (ddict == NULL) return NULL;
+        ddict->cMem = customMem;   /* must precede init : ZSTD_initDDict_internal() allocates through ddict->cMem */
+        {   size_t const initResult = ZSTD_initDDict_internal(ddict,
+                                            dict, dictSize,
+                                            dictLoadMethod, dictContentType);
+            if (ZSTD_isError(initResult)) {
+                ZSTD_freeDDict(ddict);   /* releases the DDict and its dictBuffer */
+                return NULL;
+        }   }
+        return ddict;
+    }
+}
+
+/*! ZSTD_createDDict() :
+*   Create a digested dictionary, to start decompression without startup delay.
+*   `dict` content is copied inside DDict.
+*   Consequently, `dict` can be released after `ZSTD_DDict` creation. @return is NULL on failure. */
+ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };   /* no custom allocator supplied */
+    return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator);
+}
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, to start decompression without startup delay.
+ *  Dictionary content is simply referenced, and will be accessed during decompression.
+ *  Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer). @return is NULL on failure. */
+ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };   /* no custom allocator supplied */
+    return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator);
+}
+
+
+const ZSTD_DDict* ZSTD_initStaticDDict(
+                                void* sBuffer, size_t sBufferSize,
+                                const void* dict, size_t dictSize,
+                                ZSTD_dictLoadMethod_e dictLoadMethod,
+                                ZSTD_dictContentType_e dictContentType)
+{
+    size_t const neededSpace = sizeof(ZSTD_DDict)   /* the DDict itself, followed by a copy of the dictionary unless referenced */
+                             + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+    ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+    assert(sBuffer != NULL);
+    assert(dict != NULL);
+    if ((size_t)sBuffer & 7) return NULL;   /* sBuffer must be aligned on 8 bytes */
+    if (sBufferSize < neededSpace) return NULL;
+    if (dictLoadMethod == ZSTD_dlm_byCopy) {
+        ZSTD_memcpy(ddict+1, dict, dictSize);  /* local copy, stored right after the DDict inside sBuffer */
+        dict = ddict+1;
+    }
+    if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType) ))   /* always byRef here : any copy was already made above, so nothing is allocated */
+        return NULL;
+    return ddict;
+}
+
+
+size_t ZSTD_freeDDict(ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support free on NULL */
+    {   ZSTD_customMem const cMem = ddict->cMem;   /* copied first : cMem is stored inside the object being freed */
+        ZSTD_customFree(ddict->dictBuffer, cMem);   /* dictBuffer is NULL when the content was only referenced */
+        ZSTD_customFree(ddict, cMem);
+        return 0;
+    }
+}
+
+/*! ZSTD_estimateDDictSize() :
+ *  Estimate the amount of memory that will be needed to create a dictionary for decompression.
+ *  Note : dictionaries created by reference, using ZSTD_dlm_byRef, are smaller (content is not counted) */
+size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+    return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+}
+
+size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ;   /* content is counted only when held in dictBuffer */
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty, or `ddict` is NULL.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;
+    return ddict->dictID;
+}
diff --git a/src/zstd/zstd_ddict.h b/src/zstd/decompress/zstd_ddict.h
similarity index 90%
rename from src/zstd/zstd_ddict.h
rename to src/zstd/decompress/zstd_ddict.h
index 5879e77cf..c4ca8877a 100644
--- a/src/zstd/zstd_ddict.h
+++ b/src/zstd/decompress/zstd_ddict.h
@@ -8,14 +8,16 @@
  * You may select, at your option, one of the above-listed licenses.
  */
 
+
 #ifndef ZSTD_DDICT_H
 #define ZSTD_DDICT_H
 
 /*-*******************************************************
  *  Dependencies
  *********************************************************/
-#include "zstd_deps.h" /* size_t */
-#include "zstd.h"      /* ZSTD_DDict, and several public functions */
+#include "../common/zstd_deps.h"   /* size_t */
+#include "../zstd.h"     /* ZSTD_DDict, and several public functions */
+
 
 /*-*******************************************************
  *  Interface
@@ -37,4 +39,6 @@ size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict);
 
 void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
 
+
+
 #endif /* ZSTD_DDICT_H */
diff --git a/src/zstd/decompress/zstd_decompress.c b/src/zstd/decompress/zstd_decompress.c
new file mode 100644
index 000000000..2f03cf7b0
--- /dev/null
+++ b/src/zstd/decompress/zstd_decompress.c
@@ -0,0 +1,2407 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTD_decompress() allocates its context,
+ * on stack (0), or into heap (1, default; requires malloc()).
+ * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected.
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif
+
+/*!
+*  LEGACY_SUPPORT :
+*  if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 0
+#endif
+
+/*!
+ *  MAXWINDOWSIZE_DEFAULT :
+ *  maximum window size accepted by DStream __by default__.
+ *  Frames requiring more memory will be rejected.
+ *  It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize().
+ */
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+#  define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1)
+#endif
+
+/*!
+ *  NO_FORWARD_PROGRESS_MAX :
+ *  maximum allowed nb of calls to ZSTD_decompressStream()
+ *  without any forward progress
+ *  (defined as: no byte read from input, and no byte flushed to output)
+ *  before triggering an error.
+ */
+#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX
+#  define ZSTD_NO_FORWARD_PROGRESS_MAX 16
+#endif
+
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
+#include "../common/error_private.h"
+#include "../common/zstd_internal.h"  /* blockProperties_t */
+#include "../common/mem.h"         /* low level memory routines */
+#include "../common/bits.h"  /* ZSTD_highbit32 */
+#define FSE_STATIC_LINKING_ONLY
+#include "../common/fse.h"
+#include "../common/huf.h"
+#include "../common/xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */
+#include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
+#include "zstd_ddict.h"  /* ZSTD_DDict_dictContent */
+#include "zstd_decompress_block.h"   /* ZSTD_decompressBlock_internal */
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+#  include "../legacy/zstd_legacy.h"
+#endif
+
+
+
+/*************************************
+ * Multiple DDicts Hashset internals *
+ *************************************/
+
+#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
+#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3  /* These two constants are meant to express a SIZE_MULT/COUNT_MULT (0.75) load factor without using a float.
+                                                    * The test used is : count * COUNT_MULT / size * SIZE_MULT != 0.
+                                                    * NOTE : evaluated left to right with integer division, it is already true once count * COUNT_MULT >= size,
+                                                    * so the ddict hash set is actually expanded from a 0.25 load factor onwards (earlier than 0.75).
+                                                    */
+
+#define DDICT_HASHSET_TABLE_BASE_SIZE 64
+#define DDICT_HASHSET_RESIZE_FACTOR 2
+
+/* Hash function to determine starting position of dict insertion within the table
+ * Returns an index within [0, hashSet->ddictPtrTableSize)
+ */
+static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) {
+    const U64 hash = XXH64(&dictID, sizeof(U32), 0);
+    /* DDict ptr table size is a power of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */
+    return hash & (hashSet->ddictPtrTableSize - 1);
+}
+
+/* Adds DDict to a hashset without resizing it, using linear probing from the hashed index.
+ * If inserting a DDict with a dictID that already exists in the set, replaces the one in the set.
+ * Returns 0 if successful, or a zstd error code if the table has no free slot left.
+ */
+static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) {
+    const U32 dictID = ZSTD_getDictID_fromDDict(ddict);
+    size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID);
+    const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1;
+    RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!");
+    DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx);
+    while (hashSet->ddictPtrTable[idx] != NULL) {
+        /* Replace existing ddict if inserting ddict with same dictID */
+        if (ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) {
+            DEBUGLOG(4, "DictID already exists, replacing rather than adding");
+            hashSet->ddictPtrTable[idx] = ddict;
+            return 0;
+        }
+        idx++;                  /* advance to the next slot first ... */
+        idx &= idxRangeMask;    /* ... then wrap to 0 past the last slot : idx never equals ddictPtrTableSize */
+    }
+    DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx);
+    hashSet->ddictPtrTable[idx] = ddict;
+    hashSet->ddictPtrCount++;
+    return 0;
+}
+
+/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and
+ * rehashes all values, allocates new table, frees old table.
+ * Returns 0 on success, otherwise a zstd error code.
+ */
+static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) {
+    size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR;
+    const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem);
+    const ZSTD_DDict** oldTable = hashSet->ddictPtrTable;
+    size_t oldTableSize = hashSet->ddictPtrTableSize;
+    size_t i;
+
+    DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize);
+    RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!");
+    hashSet->ddictPtrTable = newTable;
+    hashSet->ddictPtrTableSize = newTableSize;
+    hashSet->ddictPtrCount = 0;
+    for (i = 0; i < oldTableSize; ++i) {
+        if (oldTable[i] != NULL) {
+            FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), "");
+        }
+    }
+    ZSTD_customFree((void*)oldTable, customMem);
+    DEBUGLOG(4, "Finished re-hash");
+    return 0;
+}
+
+/* Fetches a DDict with the given dictID, probing in the same order as ZSTD_DDictHashSet_emplaceDDict()
+ * Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL.
+ */
+static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) {
+    size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID);
+    const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1;
+    DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx);
+    for (;;) {
+        size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]);
+        if (currDictID == dictID || currDictID == 0) {
+            /* currDictID == 0 implies a NULL ddict entry */
+            break;
+        } else {
+            idx++;                  /* advance first, then mask : keeps idx within [0, ddictPtrTableSize) */
+            idx &= idxRangeMask;    /* Goes to start of table when we reach the end */
+        }
+    }
+    DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx);
+    return hashSet->ddictPtrTable[idx];
+}
+
+/* Allocates space for and returns a ddict hash set
+ * The hash set's ZSTD_DDict* table is zero-allocated, so all its entries start as NULL.
+ * Returns NULL if allocation failed.
+ */
+static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) {
+    ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem);
+    DEBUGLOG(4, "Allocating new hash set");
+    if (!ret)
+        return NULL;
+    ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem);
+    if (!ret->ddictPtrTable) {
+        ZSTD_customFree(ret, customMem);   /* do not leak the set when its table cannot be allocated */
+        return NULL;
+    }
+    ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE;   /* must be a power of 2 : indexes are derived by masking with size - 1 */
+    ret->ddictPtrCount = 0;
+    return ret;
+}
+
+/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself.
+ * Note: The ZSTD_DDict* within the table are NOT freed. A NULL hashSet is accepted.
+ */
+static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) {
+    DEBUGLOG(4, "Freeing ddict hash set");
+    if (hashSet && hashSet->ddictPtrTable) {
+        ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem);   /* cast drops the const qualifier of the table type */
+    }
+    if (hashSet) {
+        ZSTD_customFree(hashSet, customMem);
+    }
+}
+
+/* Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set beforehand.
+ * Returns 0 on success, or a ZSTD error.
+ */
+static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) {
+    DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize);
+    if (hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) {   /* integer arithmetic, left to right : true as soon as count * COUNT_MULT >= table size */
+        FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), "");
+    }
+    FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), "");
+    return 0;
+}
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx)
+{
+    if (dctx==NULL) return 0;   /* support sizeof NULL */
+    return sizeof(*dctx)
+           + ZSTD_sizeof_DDict(dctx->ddictLocal)   /* only ddictLocal is counted ; dctx->ddict and dctx->ddictSet are not part of this sum */
+           + dctx->inBuffSize + dctx->outBuffSize;
+}
+
+size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }   /* the context structure only */
+
+
+static size_t ZSTD_startingInputLength(ZSTD_format_e format)
+{
+    size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format);   /* depends on format */
+    /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless : any other value is a programming error */
+    assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) );
+    return startingInputLength;
+}
+
+static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
+{   /* assigns a fixed starting value to each of the dctx fields listed below */
+    assert(dctx->streamStage == zdss_init);   /* callers must set streamStage to zdss_init beforehand */
+    dctx->format = ZSTD_f_zstd1;
+    dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+    dctx->outBufferMode = ZSTD_bm_buffered;
+    dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
+    dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
+    dctx->disableHufAsm = 0;
+    dctx->maxBlockSizeParam = 0;
+}
+
+static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
+{
+    dctx->staticSize  = 0;   /* 0 : not a static context (ZSTD_initStaticDCtx() overrides it afterwards) */
+    dctx->ddict       = NULL;
+    dctx->ddictLocal  = NULL;
+    dctx->dictEnd     = NULL;
+    dctx->ddictIsCold = 0;
+    dctx->dictUses = ZSTD_dont_use;
+    dctx->inBuff      = NULL;
+    dctx->inBuffSize  = 0;
+    dctx->outBuffSize = 0;
+    dctx->streamStage = zdss_init;   /* required by the assert() within ZSTD_DCtx_resetParameters(), called below */
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+    dctx->legacyContext = NULL;
+    dctx->previousLegacyVersion = 0;
+#endif
+    dctx->noForwardProgress = 0;
+    dctx->oversizedDuration = 0;
+    dctx->isFrameDecompression = 1;
+#if DYNAMIC_BMI2
+    dctx->bmi2 = ZSTD_cpuSupportsBmi2();
+#endif
+    dctx->ddictSet = NULL;   /* no hash set of DDicts yet */
+    ZSTD_DCtx_resetParameters(dctx);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    dctx->dictContentEndForFuzzing = NULL;
+#endif
+}
+
+ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize)
+{
+    ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace;   /* the context lives at the very start of the workspace */
+
+    if ((size_t)workspace & 7) return NULL;  /* workspace must be aligned on 8 bytes */
+    if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL;  /* minimum size */
+
+    ZSTD_initDCtx_internal(dctx);
+    dctx->staticSize = workspaceSize;   /* non-zero : makes ZSTD_freeDCtx() refuse this context */
+    dctx->inBuff = (char*)(dctx+1);   /* points right after the DCtx, into the rest of the workspace */
+    return dctx;
+}
+
+static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) {
+    if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;   /* alloc and free must be both provided, or both NULL */
+
+    {   ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem);
+        if (!dctx) return NULL;
+        dctx->customMem = customMem;   /* remembered so that ZSTD_freeDCtx() releases with the same allocator */
+        ZSTD_initDCtx_internal(dctx);
+        return dctx;
+    }
+}
+
+ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createDCtx_internal(customMem);   /* NULL on invalid customMem or allocation failure */
+}
+
+ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    DEBUGLOG(3, "ZSTD_createDCtx");
+    return ZSTD_createDCtx_internal(ZSTD_defaultCMem);   /* same as the _advanced variant, with the default allocator descriptor */
+}
+
+static void ZSTD_clearDict(ZSTD_DCtx* dctx)
+{
+    ZSTD_freeDDict(dctx->ddictLocal);   /* only ddictLocal is freed (NULL is accepted) */
+    dctx->ddictLocal = NULL;
+    dctx->ddict = NULL;   /* the ddict pointer itself is merely reset, never freed here */
+    dctx->dictUses = ZSTD_dont_use;
+}
+
+size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    if (dctx==NULL) return 0;   /* support free on NULL */
+    RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx");
+    {   ZSTD_customMem const cMem = dctx->customMem;   /* copied first : customMem is stored inside the object being freed */
+        ZSTD_clearDict(dctx);
+        ZSTD_customFree(dctx->inBuff, cMem);
+        dctx->inBuff = NULL;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (dctx->legacyContext)
+            ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion);
+#endif
+        if (dctx->ddictSet) {
+            ZSTD_freeDDictHashSet(dctx->ddictSet, cMem);   /* frees the table only, not the DDicts it references */
+            dctx->ddictSet = NULL;
+        }
+        ZSTD_customFree(dctx, cMem);
+        return 0;
+    }
+}
+
+/* no longer useful : copies every field of srcDCtx located before `inBuff` into dstDCtx */
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{
+    size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx);   /* byte offset of the inBuff member */
+    ZSTD_memcpy(dstDCtx, srcDCtx, toCopy);  /* no need to copy workspace */
+}
+
+/* Given a dctx whose frame parameters have already been decoded, re-selects the ZSTD_DDict matching
+ * the dict ID requested by the frame. If the hash set holds a reference to that ZSTD_DDict, then
+ * it becomes the ddict used to decompress the frame. Only attempted when a ddict is currently set.
+ *
+ * If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is.
+ *
+ * ZSTD_d_refMultipleDDicts must be enabled for this function to be called.
+ */
+static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) {
+    assert(dctx->refMultipleDDicts && dctx->ddictSet);
+    DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame");
+    if (dctx->ddict) {
+        const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID);
+        if (frameDDict) {
+            DEBUGLOG(4, "DDict found!");
+            ZSTD_clearDict(dctx);   /* resets ddict, and frees ddictLocal if any */
+            dctx->dictID = dctx->fParams.dictID;
+            dctx->ddict = frameDDict;
+            dctx->dictUses = ZSTD_use_indefinitely;
+        }
+    }
+}
+
+
+/*-*************************************************************
+ *   Frame header decoding
+ ***************************************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid.
+ * @return : 1 when a Frame Identifier is recognized, 0 otherwise. */
+unsigned ZSTD_isFrame(const void* buffer, size_t size)
+{
+    if (size < ZSTD_FRAMEIDSIZE) return 0;
+    {   U32 const magic = MEM_readLE32(buffer);
+        if (magic == ZSTD_MAGICNUMBER) return 1;   /* regular zstd frame */
+        if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1;   /* skippable frame */
+    }
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(buffer, size)) return 1;
+#endif
+    return 0;
+}
+
+/*! ZSTD_isSkippableFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ */
+unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size)
+{
+    if (size < ZSTD_FRAMEIDSIZE) return 0;
+    {   U32 const magic = MEM_readLE32(buffer);   /* Frame Identifier is read as little-endian */
+        if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1;   /* compared after masking with ZSTD_MAGIC_SKIPPABLE_MASK */
+    }
+    return 0;
+}
+
+/** ZSTD_frameHeaderSize_internal() :
+ *  srcSize must be large enough to reach header size fields.
+ *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
+ * @return : size of the Frame Header
+ *           or an error code, which can be tested with ZSTD_isError() */
+static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format)
+{
+    size_t const minInputSize = ZSTD_startingInputLength(format);
+    RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, "");
+
+    {   BYTE const descriptor = ((const BYTE*)src)[minInputSize-1];   /* last byte of the starting input */
+        U32 const didCode = descriptor & 3;
+        U32 const fcsCode = descriptor >> 6;
+        /* one extra byte either way : window byte (multi-segment), or 1-byte content size (single segment, fcsCode 0) */
+        size_t const extraByte = ((descriptor >> 5) & 1) ? (fcsCode == 0) : 1;
+        return minInputSize + extraByte
+             + ZSTD_did_fieldSize[didCode] + ZSTD_fcs_fieldSize[fcsCode];
+    }
+}
+
+/** ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_frameHeaderSize_prefix.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+{   /* non-static wrapper : same computation, with the format fixed to ZSTD_f_zstd1 */
+    return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1);
+}
+
+
+/** ZSTD_getFrameHeader_advanced() :
+ *  decode Frame Header, or require larger `srcSize`.
+ *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t const minInputSize = ZSTD_startingInputLength(format);
+
+    DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
+
+    if (srcSize > 0) {
+        /* note : technically could be considered an assert(), since it's an invalid entry */
+        RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
+    }
+    if (srcSize < minInputSize) {
+        if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
+            /* when receiving less than @minInputSize bytes,
+             * control these bytes at least correspond to a supported magic number
+             * in order to error out early if they don't.
+            **/
+            size_t const toCopy = MIN(4, srcSize);
+            unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);   /* pre-filled with the expected magic : the partial copy below only overwrites the bytes actually received */
+            assert(src != NULL);
+            ZSTD_memcpy(hbuf, src, toCopy);
+            if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) {
+                /* not a zstd frame : let's check if it's a skippable frame */
+                MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);   /* same trick, with the skippable identifier as the pre-fill */
+                ZSTD_memcpy(hbuf, src, toCopy);
+                if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
+                    RETURN_ERROR(prefix_unknown,
+                                "first bytes don't correspond to any supported magic number");
+        }   }   }
+        return minInputSize;   /* not enough input yet : request at least the starting input length */
+    }
+
+    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
+    if ( (format != ZSTD_f_zstd1_magicless)
+      && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
+        if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+            /* skippable frame */
+            if (srcSize < ZSTD_SKIPPABLEHEADERSIZE)
+                return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */
+            ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* NOTE(review): looks redundant, zfhPtr was already zeroed just above */
+            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE);   /* frame length field, stored right after the identifier */
+            zfhPtr->frameType = ZSTD_skippableFrame;
+            return 0;
+        }
+        RETURN_ERROR(prefix_unknown, "");
+    }
+
+    /* ensure there is enough `srcSize` to fully read/decode frame header */
+    {   size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format);
+        if (srcSize < fhsize) return fhsize;   /* header extends past the input received so far : report the size needed */
+        zfhPtr->headerSize = (U32)fhsize;
+    }
+
+    {   BYTE const fhdByte = ip[minInputSize-1];   /* descriptor byte : last byte of the starting input */
+        size_t pos = minInputSize;                /* read cursor over the optional fields that follow */
+        U32 const dictIDSizeCode = fhdByte&3;
+        U32 const checksumFlag = (fhdByte>>2)&1;
+        U32 const singleSegment = (fhdByte>>5)&1;
+        U32 const fcsID = fhdByte>>6;
+        U64 windowSize = 0;
+        U32 dictID = 0;
+        U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN;
+        RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported,
+                        "reserved bits, must be zero");
+
+        if (!singleSegment) {
+            BYTE const wlByte = ip[pos++];
+            U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+            RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, "");
+            windowSize = (1ULL << windowLog);                /* base : 2^windowLog */
+            windowSize += (windowSize >> 3) * (wlByte&7);    /* plus (low 3 bits of wlByte) eighths of the base */
+        }
+        switch(dictIDSizeCode)
+        {
+            default:
+                assert(0);  /* impossible */
+                ZSTD_FALLTHROUGH;
+            case 0 : break;
+            case 1 : dictID = ip[pos]; pos++; break;
+            case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+            case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+        }
+        switch(fcsID)
+        {
+            default:
+                assert(0);  /* impossible */
+                ZSTD_FALLTHROUGH;
+            case 0 : if (singleSegment) frameContentSize = ip[pos]; break;   /* otherwise the size stays UNKNOWN */
+            case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;     /* 2-byte field, decoded with a +256 offset */
+            case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+            case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+        }
+        if (singleSegment) windowSize = frameContentSize;   /* no window byte was read in this mode */
+
+        zfhPtr->frameType = ZSTD_frame;
+        zfhPtr->frameContentSize = frameContentSize;
+        zfhPtr->windowSize = windowSize;
+        zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+        zfhPtr->dictID = dictID;
+        zfhPtr->checksumFlag = checksumFlag;
+    }
+    return 0;
+}
+
+/** ZSTD_getFrameHeader() :
+ *  decode Frame Header, or require larger `srcSize`.
+ *  note : this function does not consume input, it only reads it.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize)
+{   /* same as ZSTD_getFrameHeader_advanced(), with the format fixed to ZSTD_f_zstd1 */
+    return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1);
+}
+
+/** ZSTD_getFrameContentSize() :
+ *  compatible with legacy mode
+ * @return : decompressed size of the single frame pointed to by `src` if known, otherwise
+ *         - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *         - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(src, srcSize)) {
+        unsigned long long const legacySize = ZSTD_getDecompressedSize_legacy(src, srcSize);
+        if (legacySize == 0) return ZSTD_CONTENTSIZE_UNKNOWN;   /* a 0 from the legacy helper is mapped to UNKNOWN */
+        return legacySize;
+    }
+#endif
+    {   ZSTD_frameHeader header;
+        size_t const status = ZSTD_getFrameHeader(&header, src, srcSize);
+        if (status != 0) return ZSTD_CONTENTSIZE_ERROR;   /* error code, or header not complete */
+        /* a skippable frame is reported with a content size of 0 */
+        if (header.frameType == ZSTD_skippableFrame) return 0;
+        return header.frameContentSize;
+    }
+}
+
+static size_t readSkippableFrameSize(void const* src, size_t srcSize)
+{   /* @return : total size of the skippable frame (header + content), or an error code */
+    U32 sizeU32;            /* content length, as read from the frame header */
+    size_t skippableSize;   /* header + content */
+
+    RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
+    sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
+
+    /* header + content must not wrap around when computed on 32 bits */
+    RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
+                    frameParameter_unsupported, "");
+    skippableSize = (size_t)ZSTD_SKIPPABLEHEADERSIZE + sizeU32;
+    RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
+    return skippableSize;
+}
+
+/*! ZSTD_readSkippableFrame() :
+ * Retrieves content of a skippable frame, and writes it to dst buffer.
+ *
+ * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written,
+ * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START.  This can be NULL if the caller is not interested
+ * in the magicVariant.
+ *
+ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame.
+ *
+ * @return : number of bytes written or a ZSTD error.
+ */
+size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity,
+                               unsigned* magicVariant,  /* optional, can be NULL */
+                         const void* src, size_t srcSize)
+{
+    RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");   /* the whole header must be present before it is read below */
+
+    {   U32 const magicNumber = MEM_readLE32(src);
+        size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);   /* NOTE(review): may hold an error code; presumably rejected by the range check below - confirm */
+        size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;   /* only meaningful once skippableFrameSize has passed the checks below */
+
+        /* check input validity : frame type first, then frame size against srcSize, then room in dst */
+        RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
+        RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+
+        /* deliver payload (the copy is skipped when dst is NULL or the frame has no content) */
+        if (skippableContentSize > 0  && dst != NULL)
+            ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
+        if (magicVariant != NULL)
+            *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
+        return skippableContentSize;
+    }
+}
+
+/** ZSTD_findDecompressedSize() :
+ *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
+ *      skippable frames
+ *  note: compatible with legacy mode
+ * @return : decompressed size of the frames contained */
+unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+{
+    unsigned long long totalDstSize = 0;   /* running sum of the content sizes of all non-skippable frames */
+
+    while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) {
+        U32 const magicNumber = MEM_readLE32(src);
+
+        if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+            size_t const skippableSize = readSkippableFrameSize(src, srcSize);   /* skippable frames are stepped over, they add nothing to the total */
+            if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR;
+            assert(skippableSize <= srcSize);
+
+            src = (const BYTE *)src + skippableSize;
+            srcSize -= skippableSize;
+            continue;
+        }
+
+        {   unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize);
+            if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs;   /* hands back ZSTD_CONTENTSIZE_ERROR or ZSTD_CONTENTSIZE_UNKNOWN unchanged */
+
+            if (totalDstSize + fcs < totalDstSize)
+                return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */
+            totalDstSize += fcs;
+        }
+        /* skip to next frame */
+        {   size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
+            if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR;
+            assert(frameSrcSize <= srcSize);
+
+            src = (const BYTE *)src + frameSrcSize;
+            srcSize -= frameSrcSize;
+        }
+    }  /* while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) */
+
+    if (srcSize) return ZSTD_CONTENTSIZE_ERROR;   /* leftover bytes, too few to start another frame */
+
+    return totalDstSize;
+}
+
+/** ZSTD_getDecompressedSize() :
+ *  compatible with legacy mode
+ * @return : decompressed size if known, 0 otherwise
+             note : 0 can mean any of the following :
+                   - frame content is empty
+                   - decompressed size field is not present in frame header
+                   - frame header unknown / not supported
+                   - frame header not complete (`srcSize` too small) */
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)
+{
+    unsigned long long const contentSize = ZSTD_getFrameContentSize(src, srcSize);
+    ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN);   /* so one comparison covers both sentinels */
+    return (contentSize < ZSTD_CONTENTSIZE_ERROR) ? contentSize : 0;
+}
+
+
+/** ZSTD_decodeFrameHeader() :
+ * `headerSize` must be the size provided by ZSTD_frameHeaderSize().
+ * If multiple DDict references are enabled, also will choose the correct DDict to use.
+ * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
+{
+    size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format);   /* parsed fields land in dctx->fParams */
+    if (ZSTD_isError(result)) return result;    /* invalid header */
+    RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small");   /* >0 means the parser asked for more input */
+
+    /* Reference DDict requested by frame if dctx references multiple ddicts */
+    if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) {
+        ZSTD_DCtx_selectFrameDDict(dctx);
+    }
+
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    /* Skip the dictID check in fuzzing mode, because it makes the search
+     * harder.
+     */
+    RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID),
+                    dictionary_wrong, "");   /* a frame dictID of 0 is never checked against the context */
+#endif
+    dctx->validateChecksum = (dctx->fParams.checksumFlag && !dctx->forceIgnoreChecksum) ? 1 : 0;
+    if (dctx->validateChecksum) XXH64_reset(&dctx->xxhState, 0);   /* fresh hash state (seed 0) for this frame */
+    dctx->processedCSize += headerSize;   /* the header bytes are added to the processed compressed size */
+    return 0;
+}
+
+static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret)
+{   /* Builds the failure result : @ret (an error code) as compressedSize, ZSTD_CONTENTSIZE_ERROR as bound. */
+    ZSTD_frameSizeInfo frameSizeInfo;
+    ZSTD_memset(&frameSizeInfo, 0, sizeof(frameSizeInfo));   /* nbBlocks must not be returned uninitialized */
+    frameSizeInfo.compressedSize = ret; frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+    return frameSizeInfo;
+}
+
+static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format)
+{
+    ZSTD_frameSizeInfo frameSizeInfo;
+    ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo));   /* all fields start at 0 */
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize))
+        return ZSTD_findFrameSizeInfoLegacy(src, srcSize);
+#endif
+
+    if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
+        && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+        frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize);   /* may be an error code; nbBlocks and decompressedBound keep their 0 from the memset */
+        assert(ZSTD_isError(frameSizeInfo.compressedSize) ||
+               frameSizeInfo.compressedSize <= srcSize);
+        return frameSizeInfo;
+    } else {
+        const BYTE* ip = (const BYTE*)src;
+        const BYTE* const ipstart = ip;
+        size_t remainingSize = srcSize;
+        size_t nbBlocks = 0;
+        ZSTD_frameHeader zfh;
+
+        /* Extract Frame Header */
+        {   size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format);
+            if (ZSTD_isError(ret))
+                return ZSTD_errorFrameSizeInfo(ret);
+            if (ret > 0)
+                return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));   /* header incomplete : reported as srcSize_wrong */
+        }
+
+        ip += zfh.headerSize;
+        remainingSize -= zfh.headerSize;
+
+        /* Iterate over each block : only block headers are read, block content is stepped over */
+        while (1) {
+            blockProperties_t blockProperties;
+            size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+            if (ZSTD_isError(cBlockSize))
+                return ZSTD_errorFrameSizeInfo(cBlockSize);
+
+            if (ZSTD_blockHeaderSize + cBlockSize > remainingSize)
+                return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+
+            ip += ZSTD_blockHeaderSize + cBlockSize;
+            remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
+            nbBlocks++;
+
+            if (blockProperties.lastBlock) break;
+        }
+
+        /* Final frame content checksum : 4 bytes, present only when the header flag is set */
+        if (zfh.checksumFlag) {
+            if (remainingSize < 4)
+                return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+            ip += 4;
+        }
+
+        frameSizeInfo.nbBlocks = nbBlocks;
+        frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
+        frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
+                                        ? zfh.frameContentSize
+                                        : (unsigned long long)nbBlocks * zfh.blockSizeMax;   /* no declared size : assume every block is full-size */
+        return frameSizeInfo;
+    }
+}
+
+static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) {
+    ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format);
+    return frameSizeInfo.compressedSize;   /* compressed size of the first frame, or the error code stored in its place */
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ * See docs in zstd.h
+ * Note: compatible with legacy mode */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{   /* same as ZSTD_findFrameCompressedSize_advanced(), with the format fixed to ZSTD_f_zstd1 */
+    return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1);
+}
+
+/** ZSTD_decompressBound() :
+ *  compatible with legacy mode
+ *  `src` must point to the start of a ZSTD frame or a skippable frame
+ *  `srcSize` must be at least as large as the frame contained
+ *  @return : the maximum decompressed size of the compressed source
+ */
+unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
+{   /* Sums the per-frame bounds; an invalid frame, or a sum that would wrap, yields ZSTD_CONTENTSIZE_ERROR. */
+    unsigned long long bound = 0;
+    while (srcSize > 0) {   /* one iteration per frame */
+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
+        size_t const compressedSize = frameSizeInfo.compressedSize;
+        unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+        if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+            return ZSTD_CONTENTSIZE_ERROR;
+        if (bound + decompressedBound < bound) return ZSTD_CONTENTSIZE_ERROR;   /* check for overflow, as ZSTD_findDecompressedSize() does */
+        assert(srcSize >= compressedSize);
+        src = (const BYTE*)src + compressedSize;
+        srcSize -= compressedSize;
+        bound += decompressedBound;
+    }
+    return bound;
+}
+
+size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
+{
+    size_t margin = 0;           /* accumulated over all frames, returned at the end */
+    unsigned maxBlockSize = 0;   /* largest blockSizeMax seen among the zstd frames */
+
+    /* Iterate over each frame */
+    while (srcSize > 0) {
+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
+        size_t const compressedSize = frameSizeInfo.compressedSize;
+        unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+        ZSTD_frameHeader zfh;
+
+        FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");   /* header errors are forwarded unchanged */
+        if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+            return ERROR(corruption_detected);   /* frame-size errors are all reported as corruption_detected */
+
+        if (zfh.frameType == ZSTD_frame) {
+            /* Add the frame header to our margin */
+            margin += zfh.headerSize;
+            /* Add the checksum to our margin */
+            margin += zfh.checksumFlag ? 4 : 0;
+            /* Add 3 bytes per block */
+            margin += 3 * frameSizeInfo.nbBlocks;
+
+            /* Compute the max block size */
+            maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
+        } else {
+            assert(zfh.frameType == ZSTD_skippableFrame);
+            /* Add the entire skippable frame size to our margin. */
+            margin += compressedSize;
+        }
+
+        assert(srcSize >= compressedSize);
+        src = (const BYTE*)src + compressedSize;
+        srcSize -= compressedSize;
+    }
+
+    /* Add the max block size back to the margin. */
+    margin += maxBlockSize;
+
+    return margin;
+}
+
+/*-*************************************************************
+ *   Frame decoding
+ ***************************************************************/
+
+/** ZSTD_insertBlock() :
+ *  insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
+size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+    DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize);
+    ZSTD_checkContinuity(dctx, blockStart, blockSize);
+    dctx->previousDstEnd = (const char*)blockStart + blockSize;   /* recorded end of output : right after the inserted block */
+    return blockSize;   /* @return : blockSize, unchanged */
+}
+
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
+                          const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_copyRawBlock");
+    RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, "");
+    /* a NULL destination is tolerated only when there is nothing to copy */
+    if ((dst == NULL) && (srcSize == 0)) return 0;
+    if (dst == NULL) { RETURN_ERROR(dstBuffer_null, ""); }
+    /* memmove : the copy is done with the overlap-tolerant primitive */
+    ZSTD_memmove(dst, src, srcSize);
+    return srcSize;
+}
+
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
+                               BYTE b,
+                               size_t regenSize)
+{
+    RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, "");
+    /* a NULL destination is tolerated only when nothing has to be written */
+    if ((dst == NULL) && (regenSize == 0)) return 0;
+    if (dst == NULL) { RETURN_ERROR(dstBuffer_null, ""); }
+    /* regenerate the block : regenSize copies of byte b */
+    ZSTD_memset(dst, b, regenSize);
+    return regenSize;
+}
+
+static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming)
+{
+#if ZSTD_TRACE
+    if (dctx->traceCtx && ZSTD_trace_decompress_end != NULL) {   /* needs both an active trace context and an available hook */
+        ZSTD_Trace trace;
+        ZSTD_memset(&trace, 0, sizeof(trace));   /* fields not set below stay 0 */
+        trace.version = ZSTD_VERSION_NUMBER;
+        trace.streaming = streaming;
+        if (dctx->ddict) {   /* dictionary fields are filled only when a DDict is attached */
+            trace.dictionaryID = ZSTD_getDictID_fromDDict(dctx->ddict);
+            trace.dictionarySize = ZSTD_DDict_dictSize(dctx->ddict);
+            trace.dictionaryIsCold = dctx->ddictIsCold;
+        }
+        trace.uncompressedSize = (size_t)uncompressedSize;
+        trace.compressedSize = (size_t)compressedSize;
+        trace.dctx = dctx;
+        ZSTD_trace_decompress_end(dctx->traceCtx, &trace);
+    }
+#else
+    (void)dctx;   /* tracing compiled out : parameters are only referenced to mark them as used */
+    (void)uncompressedSize;
+    (void)compressedSize;
+    (void)streaming;
+#endif
+}
+
+
+/*! ZSTD_decompressFrame() :
+ * @dctx must be properly initialized
+ *  will update *srcPtr and *srcSizePtr,
+ *  to make *srcPtr progress by one frame. */
+static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+                                   void* dst, size_t dstCapacity,
+                             const void** srcPtr, size_t *srcSizePtr)
+{
+    const BYTE* const istart = (const BYTE*)(*srcPtr);
+    const BYTE* ip = istart;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart;   /* no pointer arithmetic when dstCapacity is 0 */
+    BYTE* op = ostart;
+    size_t remainingSrcSize = *srcSizePtr;
+
+    DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr);
+
+    /* check : input must hold at least the smallest frame header plus one block header */
+    RETURN_ERROR_IF(
+        remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize,
+        srcSize_wrong, "");
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal(
+                ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+        RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize,
+                        srcSize_wrong, "");
+        FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , "");
+        ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize;
+    }
+
+    /* Shrink the blockSizeMax if enabled */
+    if (dctx->maxBlockSizeParam != 0)
+        dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam);
+
+    /* Loop on each block */
+    while (1) {
+        BYTE* oBlockEnd = oend;
+        size_t decodedSize;
+        blockProperties_t blockProperties;
+        size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSrcSize -= ZSTD_blockHeaderSize;
+        RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, "");
+
+        if (ip >= op && ip < oBlockEnd) {
+            /* We are decompressing in-place. Limit the output pointer so that we
+             * don't overwrite the block that we are currently reading. This will
+             * fail decompression if the input & output pointers aren't spaced
+             * far enough apart.
+             *
+             * This is important to set, even when the pointers are far enough
+             * apart, because ZSTD_decompressBlock_internal() can decide to store
+             * literals in the output buffer, after the block it is decompressing.
+             * Since we don't want anything to overwrite our input, we have to tell
+             * ZSTD_decompressBlock_internal to never write past ip.
+             *
+             * See ZSTD_allocateLiteralsBuffer() for reference.
+             */
+            oBlockEnd = op + (ip - op);
+        }
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            assert(dctx->isFrameDecompression == 1);
+            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming);
+            break;
+        case bt_raw :
+            /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */
+            decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize);
+            break;
+        case bt_rle :
+            decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize);   /* *ip : the byte to repeat */
+            break;
+        case bt_reserved :
+        default:
+            RETURN_ERROR(corruption_detected, "invalid block type");
+        }
+        FORWARD_IF_ERROR(decodedSize, "Block decompression failure");
+        DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize);
+        if (dctx->validateChecksum) {
+            XXH64_update(&dctx->xxhState, op, decodedSize);   /* hash the bytes just produced */
+        }
+        if (decodedSize) /* support dst = NULL,0 */ {
+            op += decodedSize;
+        }
+        assert(ip != NULL);
+        ip += cBlockSize;
+        remainingSrcSize -= cBlockSize;
+        if (blockProperties.lastBlock) break;
+    }
+
+    if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+        RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize,
+                        corruption_detected, "");   /* a declared content size must match what was regenerated */
+    }
+    if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */
+        RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, "");
+        if (!dctx->forceIgnoreChecksum) {
+            U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState);   /* only the low 32 bits of the digest are compared */
+            U32 checkRead;
+            checkRead = MEM_readLE32(ip);
+            RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, "");
+        }
+        ip += 4;   /* the checksum bytes are consumed even when verification is skipped */
+        remainingSrcSize -= 4;
+    }
+    ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
+    /* Allow caller to get size read */
+    DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr);
+    *srcPtr = ip;
+    *srcSizePtr = remainingSrcSize;
+    return (size_t)(op-ostart);
+}
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize,
+                                  const void* dict, size_t dictSize,
+                                  const ZSTD_DDict* ddict)
+{
+    void* const dststart = dst;
+    int moreThan1Frame = 0;   /* set to 1 once a first zstd frame has been decoded */
+
+    DEBUGLOG(5, "ZSTD_decompressMultiFrame");
+    assert(dict==NULL || ddict==NULL);  /* either dict or ddict set, not both */
+
+    if (ddict) {
+        dict = ZSTD_DDict_dictContent(ddict);   /* raw view of the DDict, passed to the legacy decoder below */
+        dictSize = ZSTD_DDict_dictSize(ddict);
+    }
+
+    while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (dctx->format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) {
+            size_t decodedSize;
+            size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+            if (ZSTD_isError(frameSize)) return frameSize;
+            RETURN_ERROR_IF(dctx->staticSize, memory_allocation,
+                "legacy support is not compatible with static dctx");
+
+            decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
+            if (ZSTD_isError(decodedSize)) return decodedSize;
+
+            {
+                unsigned long long const expectedSize = ZSTD_getFrameContentSize(src, srcSize);
+                RETURN_ERROR_IF(expectedSize == ZSTD_CONTENTSIZE_ERROR, corruption_detected, "Corrupted frame header!");
+                if (expectedSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+                    RETURN_ERROR_IF(expectedSize != decodedSize, corruption_detected,
+                        "Frame header size does not match decoded size!");
+                }
+            }
+
+            assert(decodedSize <= dstCapacity);
+            dst = (BYTE*)dst + decodedSize;
+            dstCapacity -= decodedSize;
+
+            src = (const BYTE*)src + frameSize;
+            srcSize -= frameSize;
+
+            continue;
+        }
+#endif
+
+        if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) {
+            U32 const magicNumber = MEM_readLE32(src);
+            DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
+            if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+                /* skippable frame detected : skip it */
+                size_t const skippableSize = readSkippableFrameSize(src, srcSize);
+                FORWARD_IF_ERROR(skippableSize, "invalid skippable frame");
+                assert(skippableSize <= srcSize);
+
+                src = (const BYTE *)src + skippableSize;
+                srcSize -= skippableSize;
+                continue; /* check next frame */
+        }   }
+
+        if (ddict) {
+            /* we were called from ZSTD_decompress_usingDDict */
+            FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), "");
+        } else {
+            /* this will initialize correctly with no dict if dict == NULL, so
+             * use this in all cases but ddict */
+            FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), "");
+        }
+        ZSTD_checkContinuity(dctx, dst, dstCapacity);
+
+        {   const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
+                                                    &src, &srcSize);   /* advances src / srcSize past the decoded frame */
+            RETURN_ERROR_IF(
+                (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown)
+             && (moreThan1Frame==1),
+                srcSize_wrong,
+                "At least one frame successfully completed, "
+                "but following bytes are garbage: "
+                "it's more likely to be a srcSize error, "
+                "specifying more input bytes than size of frame(s). "
+                "Note: one could be unlucky, it might be a corruption error instead, "
+                "happening right at the place where we expect zstd magic bytes. "
+                "But this is _much_ less likely than a srcSize field error.");
+            if (ZSTD_isError(res)) return res;
+            assert(res <= dstCapacity);
+            if (res != 0)
+                dst = (BYTE*)dst + res;
+            dstCapacity -= res;
+        }
+        moreThan1Frame = 1;
+    }  /* while (srcSize >= ZSTD_startingInputLength(dctx->format)) */
+
+    RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed");
+
+    return (size_t)((BYTE*)dst - (BYTE*)dststart);   /* total bytes written across all frames */
+}
+
+/* One-shot decompression with a raw dictionary buffer; the DDict slot of the multi-frame decoder is left NULL. */
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize, const void* dict, size_t dictSize)
+{
+    const ZSTD_DDict* const noDDict = NULL;
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, noDDict);
+}
+
+
+static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx)
+{   /* Selects the DDict for the next decompression call, based on dctx->dictUses. */
+    if (dctx->dictUses == ZSTD_use_indefinitely)
+        return dctx->ddict;
+    if (dctx->dictUses == ZSTD_use_once) {
+        /* single-use reference : hand it out now, stop using it afterwards */
+        dctx->dictUses = ZSTD_dont_use;
+        return dctx->ddict;
+    }
+    if (dctx->dictUses != ZSTD_dont_use) {
+        assert(0 /* Impossible */);
+    }
+    /* no dictionary in use */
+    ZSTD_clearDict(dctx);
+    return NULL;
+}
+
+size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{   const ZSTD_DDict* const ddict = ZSTD_getDDict(dctx);   /* dictionary currently selected by dctx, NULL if none */
+    return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ddict);
+}
+
+
+size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1)
+    size_t regenSize;
+    ZSTD_DCtx* const dctx =  ZSTD_createDCtx_internal(ZSTD_defaultCMem);   /* heap mode : context is created for this call only */
+    RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!");
+    regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+    ZSTD_freeDCtx(dctx);   /* released whether decompression succeeded or not */
+    return regenSize;      /* result of ZSTD_decompressDCtx() : a size, or an error code */
+#else   /* stack mode */
+    ZSTD_DCtx dctx;   /* context is a local variable of this function */
+    ZSTD_initDCtx_internal(&dctx);
+    return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
+#endif
+}
+
+
+/*-**************************************
+*   Advanced Streaming Decompression API
+*   Bufferless and synchronous
+****************************************/
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }   /* nb of source bytes the decoder currently expects */
+
+/**
+ * Variant of ZSTD_nextSrcSizeToDecompress() for callers that know how much input
+ * they currently hold : when the block being read can be streamed, a partial block
+ * is accepted as input. Currently only raw uncompressed blocks can be streamed.
+ *
+ * For blocks that can be streamed, this allows us to reduce the latency until we produce
+ * output, and avoid copying the input.
+ *
+ * @param inputSize - The total amount of input that the caller currently has.
+ */
+static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) {
+    int const inBlockStage = (dctx->stage == ZSTDds_decompressBlock)
+                          || (dctx->stage == ZSTDds_decompressLastBlock);
+    if (inBlockStage && (dctx->bType == bt_raw))
+        return BOUNDED(1, inputSize, dctx->expected);   /* raw block : partial input is fine */
+    return dctx->expected;
+}
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
+    /* Reports which kind of input the decoder is waiting for, derived from its stage. */
+    switch(dctx->stage)
+    {
+    case ZSTDds_decodeBlockHeader:
+        return ZSTDnit_blockHeader;
+    case ZSTDds_decompressBlock:
+        return ZSTDnit_block;
+    case ZSTDds_decompressLastBlock:
+        return ZSTDnit_lastBlock;
+    case ZSTDds_checkChecksum:
+        return ZSTDnit_checksum;
+    /* both skippable-frame stages report the same input type */
+    case ZSTDds_decodeSkippableHeader:
+    case ZSTDds_skipFrame:
+        return ZSTDnit_skippableFrame;
+    case ZSTDds_getFrameHeaderSize:
+    case ZSTDds_decodeFrameHeader:
+        return ZSTDnit_frameHeader;
+    default:   /* should not happen */
+        assert(0);
+        return ZSTDnit_frameHeader;
+    }
+}
+
+static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; }   /* 1 while dctx is in the ZSTDds_skipFrame stage, 0 otherwise */
+
+/** ZSTD_decompressContinue() :
+ *  srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress())
+ *  @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`)
+ *            or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize);
+    /* Sanity check */
+    RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed");
+    ZSTD_checkContinuity(dctx, dst, dstCapacity);
+
+    dctx->processedCSize += srcSize;
+
+    switch (dctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        assert(src != NULL);
+        if (dctx->format == ZSTD_f_zstd1) {  /* allows header */
+            assert(srcSize >= ZSTD_FRAMEIDSIZE);  /* to read skippable magic number */
+            if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
+                ZSTD_memcpy(dctx->headerBuffer, src, srcSize);
+                dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize;  /* remaining to load to get full skippable frame header */
+                dctx->stage = ZSTDds_decodeSkippableHeader;
+                return 0;
+        }   }
+        dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format);
+        if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+        ZSTD_memcpy(dctx->headerBuffer, src, srcSize);
+        dctx->expected = dctx->headerSize - srcSize;   /* remaining header bytes to load */
+        dctx->stage = ZSTDds_decodeFrameHeader;
+        return 0;
+
+    case ZSTDds_decodeFrameHeader:
+        assert(src != NULL);
+        ZSTD_memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize);   /* append after the bytes stored during ZSTDds_getFrameHeaderSize */
+        FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), "");
+        dctx->expected = ZSTD_blockHeaderSize;
+        dctx->stage = ZSTDds_decodeBlockHeader;
+        return 0;
+
+    case ZSTDds_decodeBlockHeader:
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+            if (ZSTD_isError(cBlockSize)) return cBlockSize;
+            RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum");
+            dctx->expected = cBlockSize;
+            dctx->bType = bp.blockType;
+            dctx->rleSize = bp.origSize;
+            if (cBlockSize) {
+                dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
+                return 0;
+            }
+            /* empty block */
+            if (bp.lastBlock) {
+                if (dctx->fParams.checksumFlag) {
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    dctx->expected = 0; /* end of frame */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->expected = ZSTD_blockHeaderSize;  /* jump to next header */
+                dctx->stage = ZSTDds_decodeBlockHeader;
+            }
+            return 0;
+        }
+
+    case ZSTDds_decompressLastBlock:
+    case ZSTDds_decompressBlock:
+        DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock");
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
+                assert(dctx->isFrameDecompression == 1);
+                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming);
+                dctx->expected = 0;  /* Streaming not supported */
+                break;
+            case bt_raw :
+                assert(srcSize <= dctx->expected);
+                rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize);
+                FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed");
+                assert(rSize == srcSize);
+                dctx->expected -= rSize;   /* raw blocks may arrive in pieces : count down what is still missing */
+                break;
+            case bt_rle :
+                rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize);
+                dctx->expected = 0;  /* Streaming not supported */
+                break;
+            case bt_reserved :   /* should never happen */
+            default:
+                RETURN_ERROR(corruption_detected, "invalid block type");
+            }
+            FORWARD_IF_ERROR(rSize, "");
+            RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum");
+            DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize);
+            dctx->decodedSize += rSize;
+            if (dctx->validateChecksum) XXH64_update(&dctx->xxhState, dst, rSize);
+            dctx->previousDstEnd = (char*)dst + rSize;
+
+            /* Stay on the same stage until we are finished streaming the block. */
+            if (dctx->expected > 0) {
+                return rSize;
+            }
+
+            if (dctx->stage == ZSTDds_decompressLastBlock) {   /* end of frame */
+                DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize);
+                RETURN_ERROR_IF(
+                    dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+                 && dctx->decodedSize != dctx->fParams.frameContentSize,
+                    corruption_detected, "");
+                if (dctx->fParams.checksumFlag) {  /* another round for frame checksum */
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1);
+                    dctx->expected = 0;   /* ends here */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->stage = ZSTDds_decodeBlockHeader;
+                dctx->expected = ZSTD_blockHeaderSize;
+            }
+            return rSize;
+        }
+
+    case ZSTDds_checkChecksum:
+        assert(srcSize == 4);  /* guaranteed by dctx->expected */
+        {
+            if (dctx->validateChecksum) {
+                U32 const h32 = (U32)XXH64_digest(&dctx->xxhState);   /* only the low 32 bits of the digest are compared */
+                U32 const check32 = MEM_readLE32(src);
+                DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32);
+                RETURN_ERROR_IF(check32 != h32, checksum_wrong, "");
+            }
+            ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1);
+            dctx->expected = 0;
+            dctx->stage = ZSTDds_getFrameHeaderSize;
+            return 0;
+        }
+
+    case ZSTDds_decodeSkippableHeader:
+        assert(src != NULL);
+        assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE);
+        assert(dctx->format != ZSTD_f_zstd1_magicless);
+        ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize);   /* complete skippable header */
+        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+        dctx->stage = ZSTDds_skipFrame;
+        return 0;
+
+    case ZSTDds_skipFrame:
+        dctx->expected = 0;   /* the skipped bytes are not copied anywhere */
+        dctx->stage = ZSTDds_getFrameHeaderSize;
+        return 0;
+
+    default:
+        assert(0);   /* impossible */
+        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
+    }
+}
+
+
+static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    dctx->dictEnd = dctx->previousDstEnd;   /* the previous segment's end is recorded as dictEnd */
+    dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));   /* dict start moved back by the previous segment's length; must read the old prefixStart / previousDstEnd */
+    dctx->prefixStart = dict;   /* only now may prefixStart be overwritten ... */
+    dctx->previousDstEnd = (const char*)dict + dictSize;   /* ... and previousDstEnd moved to the end of dict */
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    dctx->dictContentBeginForFuzzing = dctx->prefixStart;
+    dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
+#endif
+    return 0;   /* no failure path */
+}
+
+/*! ZSTD_loadDEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of entropy tables read, or an error code (dictionary_corrupted) */
+size_t
+ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+                  const void* const dict, size_t const dictSize)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+
+    RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small");
+    assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY);   /* dict must be valid */
+    dictPtr += 8;   /* skip header = magic + dictID */
+
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
+    ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
+    {   void* const workspace = &entropy->LLTable;   /* use fse tables as temporary workspace; implies fse tables are grouped together */
+        size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
+#ifdef HUF_FORCE_DECOMPRESS_X1
+        /* in minimal huffman, we always use X1 variants */
+        size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
+                                                dictPtr, (size_t)(dictEnd - dictPtr),
+                                                workspace, workspaceSize, /* flags */ 0);
+#else
+        size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+                                                dictPtr, (size_t)(dictEnd - dictPtr),
+                                                workspace, workspaceSize, /* flags */ 0);
+#endif
+        RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
+        dictPtr += hSize;
+    }
+
+    {   short offcodeNCount[MaxOff+1];
+        unsigned offcodeMaxValue = MaxOff, offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr));
+        RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
+        RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, "");
+        RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
+        ZSTD_buildFSETable( entropy->OFTable,
+                            offcodeNCount, offcodeMaxValue,
+                            OF_base, OF_bits,
+                            offcodeLog,
+                            entropy->workspace, sizeof(entropy->workspace),
+                            /* bmi2 */0);
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr));
+        RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
+        RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, "");
+        RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
+        ZSTD_buildFSETable( entropy->MLTable,
+                            matchlengthNCount, matchlengthMaxValue,
+                            ML_base, ML_bits,
+                            matchlengthLog,
+                            entropy->workspace, sizeof(entropy->workspace),
+                            /* bmi2 */ 0);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr));
+        RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
+        RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, "");
+        RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
+        ZSTD_buildFSETable( entropy->LLTable,
+                            litlengthNCount, litlengthMaxValue,
+                            LL_base, LL_bits,
+                            litlengthLog,
+                            entropy->workspace, sizeof(entropy->workspace),
+                            /* bmi2 */ 0);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, "");   /* 3 repcodes of 4 bytes each must still fit */
+    {   int i;
+        size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12));   /* what follows the 3 repcodes */
+        for (i=0; i<3; i++) {
+            U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4;
+            RETURN_ERROR_IF(rep==0 || rep > dictContentSize,   /* a repcode must be non-zero and no larger than the content */
+                            dictionary_corrupted, "");
+            entropy->rep[i] = rep;
+    }   }
+
+    return (size_t)(dictPtr - (const BYTE*)dict);   /* bytes consumed, header included */
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    /* Anything too short for a header, or not starting with the dictionary magic,
+     * is referenced as-is : pure content mode. */
+    if ((dictSize < 8) || (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY))
+        return ZSTD_refDictContent(dctx, dict, dictSize);
+
+    /* structured dictionary : the ID follows the magic number */
+    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
+    /* load entropy tables, then step over them */
+    {   size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize);
+        RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, "");
+        dict = (const char*)dict + eSize;
+        dictSize -= eSize;
+    }
+    dctx->litEntropy = 1;
+    dctx->fseEntropy = 1;
+    /* whatever remains after the entropy section is referenced as dictionary content */
+    return ZSTD_refDictContent(dctx, dict, dictSize);
+}
+
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+{   /* Resets the per-frame decoding state of dctx; always returns 0. */
+    assert(dctx != NULL);
+#if ZSTD_TRACE
+    dctx->traceCtx = (ZSTD_trace_decompress_begin != NULL) ? ZSTD_trace_decompress_begin(dctx) : 0;
+#endif
+    dctx->expected = ZSTD_startingInputLength(dctx->format);  /* dctx->format must be properly set */
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->processedCSize = 0;
+    dctx->decodedSize = 0;
+    dctx->previousDstEnd = NULL;
+    dctx->prefixStart = NULL;
+    dctx->virtualStart = NULL;
+    dctx->dictEnd = NULL;
+    dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
+    dctx->litEntropy = dctx->fseEntropy = 0;
+    dctx->dictID = 0;
+    dctx->bType = bt_reserved;   /* placeholder : ZSTD_decompressContinue() rejects bt_reserved as an invalid block type */
+    dctx->isFrameDecompression = 1;
+    ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+    ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue));  /* initial repcodes */
+    dctx->LLTptr = dctx->entropy.LLTable;   /* table pointers start on the context's own entropy tables */
+    dctx->MLTptr = dctx->entropy.MLTable;
+    dctx->OFTptr = dctx->entropy.OFTable;
+    dctx->HUFptr = dctx->entropy.hufTable;
+    return 0;
+}
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , "");
+    if ((dict != NULL) && (dictSize != 0)) {   /* NULL or empty dictionary : nothing to insert */
+        RETURN_ERROR_IF(ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)),
+                        dictionary_corrupted, "");
+    }
+    return 0;
+}
+
+
+/* ======   ZSTD_DDict   ====== */
+
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
+    assert(dctx != NULL);
+    if (ddict) {   /* must run before ZSTD_decompressBegin(), which resets dctx->dictEnd to NULL */
+        const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict);
+        size_t const dictSize = ZSTD_DDict_dictSize(ddict);
+        const void* const dictEnd = dictStart + dictSize;
+        dctx->ddictIsCold = (dctx->dictEnd != dictEnd);   /* cold when dctx->dictEnd does not already match this ddict's content end */
+        DEBUGLOG(4, "DDict is %s",
+                    dctx->ddictIsCold ? "~cold~" : "hot!");
+    }
+    FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , "");
+    if (ddict) {   /* NULL ddict is equivalent to no dictionary */
+        ZSTD_copyDDictParameters(dctx, ddict);
+    }
+    return 0;
+}
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  if @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+{
+    int const hasHeader = (dictSize >= 8) && (MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY);
+    if (!hasHeader) return 0;   /* too short, or no dictionary magic number */
+    return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ *  - The frame does not require a dictionary (most common case).
+ *  - The frame was built with dictID intentionally removed.
+ *    Needed dictionary is a hidden piece of information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, frame header could not be decoded.
+ *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use
+ *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+{
+    ZSTD_frameHeader header = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 };
+    size_t const status = ZSTD_getFrameHeader(&header, src, srcSize);
+    /* a header decoding error is reported the same way as "no dictionary" */
+    return ZSTD_isError(status) ? 0 : header.dictID;
+}
+
+
+/*! ZSTD_decompress_usingDDict() :
+*   Decompression using a pre-digested Dictionary
+*   Use dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const ZSTD_DDict* ddict)
+{
+    /* no raw dictionary buffer on this path (NULL, 0) : only the ddict is handed over */
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize,
+                                     NULL, 0,
+                                     ddict);
+}
+
+
+/*=====================================
+*   Streaming decompression
+*====================================*/
+
+ZSTD_DStream* ZSTD_createDStream(void)
+{
+    DEBUGLOG(3, "ZSTD_createDStream");
+    return ZSTD_createDCtx_internal(ZSTD_defaultCMem);   /* a DStream is created through the DCtx constructor, with ZSTD_defaultCMem */
+}
+
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)
+{
+    return ZSTD_initStaticDCtx(workspace, workspaceSize);   /* plain forward to the DCtx variant */
+}
+
+ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createDCtx_internal(customMem);   /* same constructor as ZSTD_createDStream(), with the caller's customMem */
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream* zds)
+{
+    return ZSTD_freeDCtx(zds);   /* plain forward : returns whatever ZSTD_freeDCtx() returns */
+}
+
+
+/* ***  Initialization  *** */
+
+size_t ZSTD_DStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }   /* one maximum-size block plus one block header */
+size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }   /* one maximum-size block */
+
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+                                   const void* dict, size_t dictSize,
+                                         ZSTD_dictLoadMethod_e dictLoadMethod,
+                                         ZSTD_dictContentType_e dictContentType)
+{
+    RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+    ZSTD_clearDict(dctx);
+    if ((dict == NULL) || (dictSize == 0)) return 0;   /* nothing to load */
+    /* the new DDict is stored in ddictLocal and becomes the active ddict */
+    dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem);
+    RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!");
+    dctx->ddict = dctx->ddictLocal;
+    dctx->dictUses = ZSTD_use_indefinitely;
+    return 0;
+}
+
+size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);   /* load method : ZSTD_dlm_byRef, content type : ZSTD_dct_auto */
+}
+
+size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);   /* load method : ZSTD_dlm_byCopy, content type : ZSTD_dct_auto */
+}
+
+size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+    FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), "");
+    dctx->dictUses = ZSTD_use_once;   /* overrides the ZSTD_use_indefinitely set by the load above : a prefix is single-use */
+    return 0;
+}
+
+size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize)
+{
+    return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent);   /* content type is fixed to ZSTD_dct_rawContent */
+}
+
+
+/* ZSTD_initDStream_usingDict() :
+ * return : expected size, aka ZSTD_startingInputLength(),
+ * or an error code forwarded from ZSTD_DCtx_loadDictionary() (e.g. memory_allocation) */
+size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize)
+{
+    DEBUGLOG(4, "ZSTD_initDStream_usingDict");
+    FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , "");
+    return ZSTD_startingInputLength(zds->format);
+}
+
+/* note : this variant can't fail : after a session reset, referencing a NULL ddict always returns 0 */
+size_t ZSTD_initDStream(ZSTD_DStream* zds)
+{
+    DEBUGLOG(4, "ZSTD_initDStream");
+    FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
+    FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
+    return ZSTD_startingInputLength(zds->format);
+}
+
+/* ZSTD_initDStream_usingDDict() :
+ * ddict will just be referenced, and must outlive decompression session
+ * @return : expected size, or an error code forwarded from ZSTD_DCtx_refDDict() */
+size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+{
+    DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
+    FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
+    FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
+    return ZSTD_startingInputLength(dctx->format);
+}
+
+/* ZSTD_resetDStream() :
+ * return : expected size, aka ZSTD_startingInputLength().
+ * this function cannot fail */
+size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
+{
+    DEBUGLOG(4, "ZSTD_resetDStream");
+    FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");   /* session-only reset : the dictionary and parameters are left untouched */
+    return ZSTD_startingInputLength(dctx->format);
+}
+
+
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+    ZSTD_clearDict(dctx);
+    if (ddict == NULL) return 0;   /* NULL : nothing to reference after the clear */
+
+    dctx->ddict = ddict;
+    dctx->dictUses = ZSTD_use_indefinitely;
+    if (dctx->refMultipleDDicts != ZSTD_rmd_refMultipleDDicts) return 0;
+    /* multiple-DDict mode : also record ddict in the hash set, created on first use */
+    if (dctx->ddictSet == NULL) {
+        dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem);
+        if (!dctx->ddictSet) {
+            RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!");
+        }
+    }
+    assert(!dctx->staticSize);  /* Impossible: ddictSet cannot have been allocated if static dctx */
+    FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), "");
+    return 0;
+}
+
+/* ZSTD_DCtx_setMaxWindowSize() :
+ * note : no direct equivalence in ZSTD_DCtx_setParameter,
+ * since this version sets windowSize, and the other sets windowLog */
+size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
+{
+    ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax);   /* bounds are expressed as logs ... */
+    size_t const min = (size_t)1 << bounds.lowerBound;                       /* ... so convert them to sizes */
+    size_t const max = (size_t)1 << bounds.upperBound;
+    RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+    RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, "");
+    RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, "");
+    dctx->maxWindowSize = maxWindowSize;   /* stored as given : not rounded to a power of 2 */
+    return 0;
+}
+
+size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format)
+{
+    return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, (int)format);   /* shortcut for setting the ZSTD_d_format parameter */
+}
+
+ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
+{
+    ZSTD_bounds bounds = { 0, 0, 0 };
+    switch(dParam) {
+        case ZSTD_d_windowLogMax:
+            bounds.lowerBound = ZSTD_WINDOWLOG_ABSOLUTEMIN;
+            bounds.upperBound = ZSTD_WINDOWLOG_MAX;
+            break;
+        case ZSTD_d_format:
+            ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
+            bounds.lowerBound = (int)ZSTD_f_zstd1;
+            bounds.upperBound = (int)ZSTD_f_zstd1_magicless;
+            break;
+        case ZSTD_d_stableOutBuffer:
+            bounds.lowerBound = (int)ZSTD_bm_buffered;
+            bounds.upperBound = (int)ZSTD_bm_stable;
+            break;
+        case ZSTD_d_forceIgnoreChecksum:
+            bounds.lowerBound = (int)ZSTD_d_validateChecksum;
+            bounds.upperBound = (int)ZSTD_d_ignoreChecksum;
+            break;
+        case ZSTD_d_refMultipleDDicts:
+            bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
+            bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
+            break;
+        case ZSTD_d_disableHuffmanAssembly:
+            bounds.lowerBound = 0;
+            bounds.upperBound = 1;
+            break;
+        case ZSTD_d_maxBlockSize:
+            bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
+            bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
+            break;
+        default:   /* unknown parameter : both bounds stay at 0, only the error field is set */
+            bounds.error = ERROR(parameter_unsupported);
+            break;
+    }
+    return bounds;
+}
+
+/* ZSTD_dParam_withinBounds:
+ * @return 1 if value lies within the bounds of dParam (both ends included),
+ * 0 otherwise, which includes the case where dParam has no known bounds */
+static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value)
+{
+    ZSTD_bounds const limits = ZSTD_dParam_getBounds(dParam);
+    if (ZSTD_isError(limits.error)) return 0;
+    /* inclusive range check */
+    return (limits.lowerBound <= value)
+        && (value <= limits.upperBound);
+}
+
+#define CHECK_DBOUNDS(p,v) {                \
+    RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \
+}
+
+size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value)
+{   /* Writes the current setting of param into *value; unknown parameters are an error. */
+    switch (param) {
+        case ZSTD_d_windowLogMax:
+            *value = (int)ZSTD_highbit32((U32)dctx->maxWindowSize);
+            break;
+        case ZSTD_d_format:
+            *value = (int)dctx->format;
+            break;
+        case ZSTD_d_stableOutBuffer:
+            *value = (int)dctx->outBufferMode;
+            break;
+        case ZSTD_d_forceIgnoreChecksum:
+            *value = (int)dctx->forceIgnoreChecksum;
+            break;
+        case ZSTD_d_refMultipleDDicts:
+            *value = (int)dctx->refMultipleDDicts;
+            break;
+        case ZSTD_d_disableHuffmanAssembly:
+            *value = (int)dctx->disableHufAsm;
+            break;
+        case ZSTD_d_maxBlockSize:
+            *value = dctx->maxBlockSizeParam;
+            break;
+        default: RETURN_ERROR(parameter_unsupported, "");
+    }
+    return 0;
+}
+
+size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value)
+{
+    RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+    switch(dParam) {
+        case ZSTD_d_windowLogMax:
+            if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT;   /* 0 selects the default limit */
+            CHECK_DBOUNDS(ZSTD_d_windowLogMax, value);
+            dctx->maxWindowSize = ((size_t)1) << value;
+            break;
+        case ZSTD_d_format:
+            CHECK_DBOUNDS(ZSTD_d_format, value);
+            dctx->format = (ZSTD_format_e)value;
+            break;
+        case ZSTD_d_stableOutBuffer:
+            CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value);
+            dctx->outBufferMode = (ZSTD_bufferMode_e)value;
+            break;
+        case ZSTD_d_forceIgnoreChecksum:
+            CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value);
+            dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value;
+            break;
+        case ZSTD_d_refMultipleDDicts:
+            CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value);
+            if (dctx->staticSize != 0) {
+                RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!");
+            }
+            dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
+            break;
+        case ZSTD_d_disableHuffmanAssembly:
+            CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
+            dctx->disableHufAsm = value != 0;
+            break;
+        case ZSTD_d_maxBlockSize:
+            if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value);   /* 0 is stored without a bounds check */
+            dctx->maxBlockSizeParam = value;
+            break;
+        default: RETURN_ERROR(parameter_unsupported, "");
+    }
+    return 0;
+}
+
+size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset)
+{   /* Resets the session state, the parameters, or both, as selected by `reset`.
+     * @return : 0, or stage_wrong when a parameter reset is requested outside of zdss_init. */
+    int const both = (reset == ZSTD_reset_session_and_parameters);
+    if (both || (reset == ZSTD_reset_session_only)) {   /* session part */
+        dctx->isFrameDecompression = 1;
+        dctx->noForwardProgress = 0;
+        dctx->streamStage = zdss_init;
+    }
+    if (both || (reset == ZSTD_reset_parameters)) {   /* parameter part : ZSTD_clearDict() + ZSTD_DCtx_resetParameters() */
+        RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+        ZSTD_clearDict(dctx);
+        ZSTD_DCtx_resetParameters(dctx);
+    }
+    return 0;
+}
+
+
+size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
+{   size_t const dctxSize = ZSTD_sizeof_DCtx(dctx);   /* a DStream is measured exactly like a DCtx */
+    return dctxSize;
+}
+
+static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax)
+{   /* Smallest output buffer able to decode a frame with these properties, or
+     * frameParameter_windowTooLarge when that amount does not fit in a size_t.
+     * The round buffer spans the window, 2 blocks and 2 WILDCOPY_OVERLENGTH :
+     * - blockSize + WILDCOPY_OVERLENGTH, so that a block ending at windowSize + WILDCOPY_OVERLENGTH + 1
+     *   bytes can be restarted at the beginning of the output buffer while a full window is maintained;
+     * - another blockSize, so that split literals stored at the end of the block
+     *   do not overwrite the extDict window.
+     * A frame whose content size is smaller than that only needs its content size. */
+    size_t const windowCappedBlock = (size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+    size_t const blockSize = MIN(windowCappedBlock, blockSizeMax);
+    unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2);
+    unsigned long long const neededSize = (frameContentSize < neededRBSize) ? frameContentSize : neededRBSize;
+    size_t const minRBSize = (size_t) neededSize;
+    RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, frameParameter_windowTooLarge, "");
+    return minRBSize;
+}
+
+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+{   size_t const blockSizeMax = ZSTD_BLOCKSIZE_MAX;   /* block size limit handed to the internal sizing routine */
+    return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, blockSizeMax);
+}
+
+size_t ZSTD_estimateDStreamSize(size_t windowSize)
+{   /* Streaming DCtx estimate for `windowSize` : context + input buffer + output buffer; or an error code. */
+    size_t const inBuffSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);  /* one block : no block can be larger */
+    size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN);
+    if (ZSTD_isError(outBuffSize)) { return outBuffSize; }  /* an error code added into the sum would wrap into a bogus size */
+    return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize;
+}
+
+size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize)
+{   /* ZSTD_estimateDStreamSize() applied to the window size read from the frame header found in `src`; or an error code. */
+    ZSTD_frameHeader zfh;
+    /* note : the limit should be user-selectable, but that requires an additional parameter (or a dctx) */
+    U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX;
+    size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize);
+    if (ZSTD_isError(err)) { return err; }
+    RETURN_ERROR_IF(err>0, srcSize_wrong, "");   /* a positive, non-error result is rejected as srcSize_wrong */
+    RETURN_ERROR_IF(zfh.windowSize > windowSizeMax, frameParameter_windowTooLarge, "");
+    return ZSTD_estimateDStreamSize((size_t)zfh.windowSize);
+}
+
+
+/* *****   Decompression   ***** */
+
+static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{   size_t const heldSize = zds->inBuffSize + zds->outBuffSize;   /* combined size of the buffers currently held */
+    return heldSize >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR;   /* non-zero : held >= FACTOR x needed */
+}
+
+static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{   /* Maintains zds->oversizedDuration : incremented by every update that finds the
+     * buffers oversized (ZSTD_DCtx_isOverflow), reset to 0 by any update that does not. */
+    int const oversized = ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize);
+    if (!oversized) { zds->oversizedDuration = 0; return; }
+    zds->oversizedDuration++;
+}
+
+static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds)
+{   /* Non-zero once oversizedDuration has reached ZSTD_WORKSPACETOOLARGE_MAXDURATION. */
+    return (zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION) ? 1 : 0;
+}
+
+/* ZSTD_checkOutBuffer() :
+ * Enforces the ZSTD_bm_stable contract : any buffer is accepted while in zdss_init,
+ * after that every call must present the very same ZSTD_outBuffer (dst, size and pos).
+ * @return : 0 if acceptable (always the case outside ZSTD_bm_stable), dstBuffer_wrong otherwise. */
+static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output)
+{
+    ZSTD_outBuffer const* const expected = &zds->expectedOutBuffer;
+    int const stableMode = (zds->outBufferMode == ZSTD_bm_stable);
+    int const inInitStage = (zds->streamStage == zdss_init);   /* no expectation recorded yet */
+    int const unchanged = (expected->dst == output->dst)
+                       && (expected->size == output->size)
+                       && (expected->pos == output->pos);
+    if (!stableMode || inInitStage || unchanged) {
+        return 0;
+    }
+    RETURN_ERROR(dstBuffer_wrong, "ZSTD_d_stableOutBuffer enabled but output differs!");
+}
+
+/* ZSTD_decompressContinueStream() :
+ * Runs ZSTD_decompressContinue() for ZSTD_decompressStream(), then updates the stream stage and the
+ * output state. Shared by the direct-from-ZSTD_inBuffer path and the buffered-input path.
+ * NOTE: the caller must break right after this call, because streamStage is modified. */
+static size_t ZSTD_decompressContinueStream(
+            ZSTD_DStream* zds, char** op, char* oend,
+            void const* src, size_t srcSize) {
+    int const skipping = ZSTD_isSkipFrame(zds);
+    if (zds->outBufferMode != ZSTD_bm_buffered) {
+        /* Not buffered : decode straight into the caller's output, so there is nothing to flush. */
+        size_t const room = skipping ? 0 : (size_t)(oend - *op);
+        size_t const decodedSize = ZSTD_decompressContinue(zds, *op, room, src, srcSize);
+        FORWARD_IF_ERROR(decodedSize, "");
+        *op += decodedSize;
+        zds->streamStage = zdss_read;
+        assert(*op <= oend);
+        assert(zds->outBufferMode == ZSTD_bm_stable);
+        return 0;
+    }
+    /* Buffered : decode into the internal outBuff, starting at outStart. */
+    {   size_t const room = skipping ? 0 : zds->outBuffSize - zds->outStart;
+        char* const target = zds->outBuff + zds->outStart;
+        size_t const decodedSize = ZSTD_decompressContinue(zds, target, room, src, srcSize);
+        FORWARD_IF_ERROR(decodedSize, "");
+        if (decodedSize || skipping) {
+            zds->outEnd = zds->outStart + decodedSize;
+            zds->streamStage = zdss_flush;   /* output produced, or skippable frame in progress */
+        } else {
+            zds->streamStage = zdss_read;   /* nothing decoded : go back to reading */
+        }
+    }
+    return 0;
+}
+
+size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{   /* One step of streaming decompression : reads input->src from input->pos, writes output->dst from output->pos, and advances both pos. */
+    const char* const src = (const char*)input->src;
+    const char* const istart = input->pos != 0 ? src + input->pos : src;   /* offset added only when non-zero; presumably so a NULL src is never used in arithmetic */
+    const char* const iend = input->size != 0 ? src + input->size : src;
+    const char* ip = istart;
+    char* const dst = (char*)output->dst;
+    char* const ostart = output->pos != 0 ? dst + output->pos : dst;   /* same guard on the output side */
+    char* const oend = output->size != 0 ? dst + output->size : dst;
+    char* op = ostart;
+    U32 someMoreWork = 1;   /* loop flag : set to 0 to leave the stage machine below */
+    /* @return : an error code; 0 once a frame is fully decoded and flushed; otherwise a non-zero hint about the input still expected. */
+    DEBUGLOG(5, "ZSTD_decompressStream");
+    RETURN_ERROR_IF(
+        input->pos > input->size,
+        srcSize_wrong,
+        "forbidden. in: pos: %u   vs size: %u",
+        (U32)input->pos, (U32)input->size);
+    RETURN_ERROR_IF(
+        output->pos > output->size,
+        dstSize_tooSmall,
+        "forbidden. out: pos: %u   vs size: %u",
+        (U32)output->pos, (U32)output->size);
+    DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos));
+    FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), "");   /* ZSTD_bm_stable : output must match the previous call */
+
+    while (someMoreWork) {   /* stage machine : each pass handles the current zds->streamStage */
+        switch(zds->streamStage)
+        {
+        case zdss_init :
+            DEBUGLOG(5, "stage zdss_init => transparent reset ");
+            zds->streamStage = zdss_loadHeader;
+            zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+            zds->legacyVersion = 0;
+#endif
+            zds->hostageByte = 0;
+            zds->expectedOutBuffer = *output;   /* baseline later compared by ZSTD_checkOutBuffer() */
+            ZSTD_FALLTHROUGH;
+
+        case zdss_loadHeader :
+            DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip));
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+            if (zds->legacyVersion) {
+                RETURN_ERROR_IF(zds->staticSize, memory_allocation,
+                    "legacy support is incompatible with static dctx");
+                {   size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
+                    if (hint==0) zds->streamStage = zdss_init;
+                    return hint;
+            }   }
+#endif
+            {   size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);   /* parses what has been loaded into headerBuffer so far */
+                if (zds->refMultipleDDicts && zds->ddictSet) {
+                    ZSTD_DCtx_selectFrameDDict(zds);
+                }
+                if (ZSTD_isError(hSize)) {
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+                    U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart);
+                    if (legacyVersion) {
+                        ZSTD_DDict const* const ddict = ZSTD_getDDict(zds);
+                        const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL;
+                        size_t const dictSize = ddict ? ZSTD_DDict_dictSize(ddict) : 0;
+                        DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion);
+                        RETURN_ERROR_IF(zds->staticSize, memory_allocation,
+                            "legacy support is incompatible with static dctx");
+                        FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext,
+                                    zds->previousLegacyVersion, legacyVersion,
+                                    dict, dictSize), "");
+                        zds->legacyVersion = zds->previousLegacyVersion = legacyVersion;
+                        {   size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input);
+                            if (hint==0) zds->streamStage = zdss_init;   /* or stay in stage zdss_loadHeader */
+                            return hint;
+                    }   }
+#endif
+                    return hSize;   /* error */
+                }
+                if (hSize != 0) {   /* need more input */
+                    size_t const toLoad = hSize - zds->lhSize;   /* if hSize!=0, hSize > zds->lhSize */
+                    size_t const remainingInput = (size_t)(iend-ip);
+                    assert(iend >= ip);
+                    if (toLoad > remainingInput) {   /* not enough input to load full header */
+                        if (remainingInput > 0) {
+                            ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput);
+                            zds->lhSize += remainingInput;
+                        }
+                        input->pos = input->size;
+                        /* check first few bytes */
+                        FORWARD_IF_ERROR(
+                            ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
+                            "First few bytes detected incorrect" );
+                        /* return hint input size */
+                        return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    assert(ip != NULL);
+                    ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad;
+                    break;
+            }   }
+
+            /* check for single-pass mode opportunity */
+            if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+                && zds->fParams.frameType != ZSTD_skippableFrame
+                && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
+                size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format);
+                if (cSize <= (size_t)(iend-istart)) {
+                    /* shortcut : using single-pass mode */
+                    size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
+                    if (ZSTD_isError(decompressedSize)) return decompressedSize;
+                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()");
+                    assert(istart != NULL);
+                    ip = istart + cSize;
+                    op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
+                    zds->expected = 0;
+                    zds->streamStage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+            }   }
+
+            /* Check output buffer is large enough for ZSTD_bm_stable. */
+            if (zds->outBufferMode == ZSTD_bm_stable
+                && zds->fParams.frameType != ZSTD_skippableFrame
+                && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+                && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) {
+                RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small");
+            }
+
+            /* Consume header (see ZSTDds_decodeFrameHeader) */
+            DEBUGLOG(4, "Consume header");
+            FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), "");
+
+            if (zds->format == ZSTD_f_zstd1
+                && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
+                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
+                zds->stage = ZSTDds_skipFrame;
+            } else {
+                FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), "");
+                zds->expected = ZSTD_blockHeaderSize;
+                zds->stage = ZSTDds_decodeBlockHeader;
+            }
+
+            /* control buffer memory usage */
+            DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)",
+                        (U32)(zds->fParams.windowSize >>10),
+                        (U32)(zds->maxWindowSize >> 10) );
+            zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);   /* raise to the absolute minimum window */
+            RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize,
+                            frameParameter_windowTooLarge, "");
+            if (zds->maxBlockSizeParam != 0)   /* 0 : no cap requested through ZSTD_d_maxBlockSize */
+                zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam);
+
+            /* Adapt buffer sizes to frame header instructions */
+            {   size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
+                size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered
+                        ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax)
+                        : 0;
+
+                ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize);
+
+                {   int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize);
+                    int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds);
+
+                    if (tooSmall || tooLarge) {
+                        size_t const bufferSize = neededInBuffSize + neededOutBuffSize;
+                        DEBUGLOG(4, "inBuff  : from %u to %u",
+                                    (U32)zds->inBuffSize, (U32)neededInBuffSize);
+                        DEBUGLOG(4, "outBuff : from %u to %u",
+                                    (U32)zds->outBuffSize, (U32)neededOutBuffSize);
+                        if (zds->staticSize) {  /* static DCtx */
+                            DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize);
+                            assert(zds->staticSize >= sizeof(ZSTD_DCtx));  /* controlled at init */
+                            RETURN_ERROR_IF(
+                                bufferSize > zds->staticSize - sizeof(ZSTD_DCtx),
+                                memory_allocation, "");
+                        } else {
+                            ZSTD_customFree(zds->inBuff, zds->customMem);   /* one allocation backs both buffers (outBuff = inBuff + inBuffSize, below) */
+                            zds->inBuffSize = 0;
+                            zds->outBuffSize = 0;
+                            zds->inBuff = (char*)ZSTD_customMalloc(bufferSize, zds->customMem);
+                            RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, "");
+                        }
+                        zds->inBuffSize = neededInBuffSize;
+                        zds->outBuff = zds->inBuff + zds->inBuffSize;
+                        zds->outBuffSize = neededOutBuffSize;
+            }   }   }
+            zds->streamStage = zdss_read;
+            ZSTD_FALLTHROUGH;
+
+        case zdss_read:
+            DEBUGLOG(5, "stage zdss_read");
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip));
+                DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize);
+                if (neededInSize==0) {  /* end of frame */
+                    zds->streamStage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+                    assert(ip != NULL);
+                    ip += neededInSize;
+                    /* Function modifies the stage so we must break */
+                    break;
+            }   }
+            if (ip==iend) { someMoreWork = 0; break; }   /* no more input */
+            zds->streamStage = zdss_load;
+            ZSTD_FALLTHROUGH;
+
+        case zdss_load:
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
+                size_t const toLoad = neededInSize - zds->inPos;   /* part of this chunk not yet accounted for in inPos */
+                int const isSkipFrame = ZSTD_isSkipFrame(zds);
+                size_t loadedSize;
+                /* At this point we shouldn't be decompressing a block that we can stream. */
+                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
+                if (isSkipFrame) {
+                    loadedSize = MIN(toLoad, (size_t)(iend-ip));   /* skipped bytes are counted, not copied */
+                } else {
+                    RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos,
+                                    corruption_detected,
+                                    "should never happen");
+                    loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
+                }
+                if (loadedSize != 0) {
+                    /* ip may be NULL */
+                    ip += loadedSize;
+                    zds->inPos += loadedSize;
+                }
+                if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                zds->inPos = 0;   /* input is consumed */
+                FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), "");
+                /* Function modifies the stage so we must break */
+                break;
+            }
+        case zdss_flush:   /* copy outBuff[outStart, outEnd) to the caller's output, as far as it fits */
+            {
+                size_t const toFlushSize = zds->outEnd - zds->outStart;
+                size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize);
+
+                op = op ? op + flushedSize : op;
+
+                zds->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {  /* flush completed */
+                    zds->streamStage = zdss_read;
+                    if ( (zds->outBuffSize < zds->fParams.frameContentSize)
+                        && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+                        DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
+                                (int)(zds->outBuffSize - zds->outStart),
+                                (U32)zds->fParams.blockSizeMax);
+                        zds->outStart = zds->outEnd = 0;
+                    }
+                    break;
+            }   }
+            /* cannot complete flush */
+            someMoreWork = 0;
+            break;
+
+        default:
+            assert(0);    /* impossible */
+            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
+    }   }
+
+    /* result */
+    input->pos = (size_t)(ip - (const char*)(input->src));
+    output->pos = (size_t)(op - (char*)(output->dst));
+
+    /* Update the expected output buffer for ZSTD_bm_stable. */
+    zds->expectedOutBuffer = *output;
+
+    if ((ip==istart) && (op==ostart)) {  /* no forward progress */
+        zds->noForwardProgress ++;
+        if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {   /* too many consecutive calls without progress */
+            RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, "");
+            RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, "");
+            assert(0);   /* NOTE(review): reached only when neither a full output nor an empty input explains the stall */
+        }
+    } else {
+        zds->noForwardProgress = 0;
+    }
+    {   size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
+        if (!nextSrcSizeHint) {   /* frame fully decoded */
+            if (zds->outEnd == zds->outStart) {  /* output fully flushed */
+                if (zds->hostageByte) {
+                    if (input->pos >= input->size) {
+                        /* can't release hostage (not present) */
+                        zds->streamStage = zdss_read;
+                        return 1;
+                    }
+                    input->pos++;  /* release hostage */
+                }   /* zds->hostageByte */
+                return 0;   /* frame complete and fully flushed */
+            }  /* zds->outEnd == zds->outStart */
+            if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+                input->pos--;   /* note : pos > 0, otherwise, impossible to finish reading last block */
+                zds->hostageByte=1;
+            }
+            return 1;   /* output still pending : non-zero asks the caller to call again */
+        }  /* nextSrcSizeHint==0 */
+        nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block);   /* preload header of next block */
+        assert(zds->inPos <= nextSrcSizeHint);
+        nextSrcSizeHint -= zds->inPos;   /* part already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+size_t ZSTD_decompressStream_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos)
+{   /* ZSTD_decompressStream() taking both buffers as plain arguments instead of
+     * ZSTD_outBuffer / ZSTD_inBuffer. The positions reached are written back through
+     * `dstPos` and `srcPos` whatever the result. @return : result of ZSTD_decompressStream(). */
+    ZSTD_inBuffer  in;
+    ZSTD_outBuffer out;
+    size_t result;
+    /* wrap the raw arguments into the two descriptors */
+    in.src  = src;   in.size  = srcSize;       in.pos  = *srcPos;
+    out.dst = dst;   out.size = dstCapacity;   out.pos = *dstPos;
+    result = ZSTD_decompressStream(dctx, &out, &in);
+    /* report progress back to the caller, output position first */
+    *dstPos = out.pos;
+    *srcPos = in.pos;
+    return result;
+}
diff --git a/src/zstd/decompress/zstd_decompress_block.c b/src/zstd/decompress/zstd_decompress_block.c
new file mode 100644
index 000000000..76d7332e8
--- /dev/null
+++ b/src/zstd/decompress/zstd_decompress_block.c
@@ -0,0 +1,2215 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* zstd_decompress_block :
+ * this module takes care of decompressing _compressed_ blocks */
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+#include "../common/compiler.h"    /* prefetch */
+#include "../common/cpu.h"         /* bmi2 */
+#include "../common/mem.h"         /* low level memory routines */
+#define FSE_STATIC_LINKING_ONLY
+#include "../common/fse.h"
+#include "../common/huf.h"
+#include "../common/zstd_internal.h"
+#include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
+#include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
+#include "zstd_decompress_block.h"
+#include "../common/bits.h"  /* ZSTD_highbit32 */
+
+/*_*******************************************************
+*  Macros
+**********************************************************/
+
+/* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT and ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+ * are optional macros, each forcing the use of one of the two
+ * ZSTD_decompressSequences implementations. Defining both at once is an error.
+ */
+#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+    defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
+#endif
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }   /* copies exactly 4 bytes from src to dst through ZSTD_memcpy */
+
+
+/*-*************************************************************
+ *   Block decoding
+ ***************************************************************/
+
+static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
+{   /* block size limit for this dctx : dctx->fParams.blockSizeMax when isFrameDecompression is set, ZSTD_BLOCKSIZE_MAX otherwise */
+    size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
+    assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);   /* a frame-provided limit is expected to never exceed the global maximum */
+    return blockSizeMax;
+}
+
+/*! ZSTD_getcBlockSize() :
+ *  Decodes the block header at `src` into `bpPtr`. @return : compressed size of the block (1 for an RLE block), or an error code (srcSize_wrong, corruption_detected) */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr)
+{
+    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");
+
+    {   U32 const cBlockHeader = MEM_readLE24(src);   /* header is read as a 24-bit little-endian value */
+        U32 const cSize = cBlockHeader >> 3;   /* everything above the 3 low bits is the size field */
+        bpPtr->lastBlock = cBlockHeader & 1;   /* bit 0 */
+        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);   /* bits 1-2 */
+        bpPtr->origSize = cSize;   /* only useful for RLE */
+        if (bpPtr->blockType == bt_rle) return 1;   /* RLE : reports 1 whatever cSize is; cSize stays available in origSize */
+        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
+        return cSize;
+    }
+}
+
+/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer. Only sets dctx->litBuffer, dctx->litBufferEnd and dctx->litBufferLocation; no literal data is copied here. */
+static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+    const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+{
+    size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
+    assert(litSize <= blockSizeMax);
+    assert(dctx->isFrameDecompression || streaming == not_streaming);
+    assert(expectedWriteSize <= blockSizeMax);
+    if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
+        /* If we aren't streaming, we can just put the literals after the output
+         * of the current block. We don't need to worry about overwriting the
+         * extDict of our window, because it doesn't exist.
+         * So if we have space after the end of the block, just put it there.
+         */
+        dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
+        dctx->litBufferEnd = dctx->litBuffer + litSize;
+        dctx->litBufferLocation = ZSTD_in_dst;
+    } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
+        /* Literals fit entirely within the extra buffer, put them there to avoid
+         * having to split the literals.
+         */
+        dctx->litBuffer = dctx->litExtraBuffer;
+        dctx->litBufferEnd = dctx->litBuffer + litSize;
+        dctx->litBufferLocation = ZSTD_not_in_dst;
+    } else {
+        assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
+        /* Literals must be split between the output block and the extra lit
+         * buffer. We fill the extra lit buffer with the tail of the literals,
+         * and put the rest of the literals at the end of the block, with
+         * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
+         * This MUST not write more than our maxBlockSize beyond dst, because in
+         * streaming mode, that could overwrite part of our extDict window.
+         */
+        if (splitImmediately) {
+            /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer : [litBuffer, litBufferEnd) covers only the first litSize - ZSTD_LITBUFFEREXTRASIZE bytes and ends WILDCOPY_OVERLENGTH before dst + expectedWriteSize */
+            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+            dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
+        } else {
+            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after : here [litBuffer, litBufferEnd) spans the full litSize bytes and ends exactly at dst + expectedWriteSize */
+            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
+            dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
+        }
+        dctx->litBufferLocation = ZSTD_split;
+        assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
+    }
+}
+
+/*! ZSTD_decodeLiteralsBlock() :
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
+ * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
+ * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
+ * On success, dctx->litPtr and dctx->litSize describe the decoded literals.
+ * @return : nb of bytes read from src (at most srcSize), or an error code
+ *  note : static; fullbench reaches it through ZSTD_decodeLiteralsBlock_wrapper() */
+static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
+                          void* dst, size_t dstCapacity, const streaming_operation streaming)
+{
+    DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
+    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
+
+    {   const BYTE* const istart = (const BYTE*) src;
+        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);   /* low 2 bits of the first byte select the literals encoding */
+        size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
+
+        switch(litEncType)
+        {
+        case set_repeat:
+            DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
+            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
+            ZSTD_FALLTHROUGH;
+
+        case set_compressed:
+            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
+            {   size_t lhSize, litSize, litCSize;
+                U32 singleStream=0;
+                U32 const lhlCode = (istart[0] >> 2) & 3;
+                U32 const lhc = MEM_readLE32(istart);
+                size_t hufSuccess;
+                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
+                int const flags = 0
+                    | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+                    | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
+                switch(lhlCode)
+                {
+                case 0: case 1: default:   /* note : default is impossible, since lhlCode is within [0..3] */
+                    /* 2 - 2 - 10 - 10 */
+                    singleStream = !lhlCode;
+                    lhSize = 3;
+                    litSize  = (lhc >> 4) & 0x3FF;
+                    litCSize = (lhc >> 14) & 0x3FF;
+                    break;
+                case 2:
+                    /* 2 - 2 - 14 - 14 */
+                    lhSize = 4;
+                    litSize  = (lhc >> 4) & 0x3FFF;
+                    litCSize = lhc >> 18;
+                    break;
+                case 3:
+                    /* 2 - 2 - 18 - 18 */
+                    lhSize = 5;
+                    litSize  = (lhc >> 4) & 0x3FFFF;
+                    litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
+                    break;
+                }
+                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                if (!singleStream)
+                    RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+                        "Not enough literals (%zu) for the 4-streams mode (min %u)",
+                        litSize, MIN_LITERALS_FOR_4_STREAMS);
+                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+                RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
+                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);   /* 0 : literals stay contiguous for huffman decoding; a ZSTD_split buffer is rearranged below */
+
+                /* prefetch huffman table if cold */
+                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
+                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+                }
+
+                if (litEncType==set_repeat) {
+                    if (singleStream) {
+                        hufSuccess = HUF_decompress1X_usingDTable(
+                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                            dctx->HUFptr, flags);
+                    } else {
+                        assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+                        hufSuccess = HUF_decompress4X_usingDTable(
+                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                            dctx->HUFptr, flags);
+                    }
+                } else {
+                    if (singleStream) {
+#if defined(HUF_FORCE_DECOMPRESS_X2)
+                        hufSuccess = HUF_decompress1X_DCtx_wksp(
+                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
+                            istart+lhSize, litCSize, dctx->workspace,
+                            sizeof(dctx->workspace), flags);
+#else
+                        hufSuccess = HUF_decompress1X1_DCtx_wksp(
+                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
+                            istart+lhSize, litCSize, dctx->workspace,
+                            sizeof(dctx->workspace), flags);
+#endif
+                    } else {
+                        hufSuccess = HUF_decompress4X_hufOnly_wksp(
+                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
+                            istart+lhSize, litCSize, dctx->workspace,
+                            sizeof(dctx->workspace), flags);
+                    }
+                }
+                if (dctx->litBufferLocation == ZSTD_split)
+                {
+                    assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
+                    ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);   /* last ZSTD_LITBUFFEREXTRASIZE literals go to litExtraBuffer */
+                    ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);   /* remaining head is slid forward; source and destination may overlap, hence memmove */
+                    dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+                    dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
+                    assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
+                }
+
+                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");   /* note : checked only after the split relocation above has run */
+
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                dctx->litEntropy = 1;
+                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;   /* the table just decoded is the one read through HUFptr by a later set_repeat block */
+                return litCSize + lhSize;
+            }
+
+        case set_basic:
+            {   size_t litSize, lhSize;
+                U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode is within [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
+                    litSize = MEM_readLE24(istart) >> 4;
+                    break;
+                }
+
+                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);   /* 1 : a ZSTD_split buffer is laid out in its final split form right away */
+                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
+                    if (dctx->litBufferLocation == ZSTD_split)
+                    {
+                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
+                        ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+                    }
+                    else
+                    {
+                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
+                    }
+                    dctx->litPtr = dctx->litBuffer;
+                    dctx->litSize = litSize;
+                    return lhSize+litSize;
+                }
+                /* direct reference into compressed stream */
+                dctx->litPtr = istart+lhSize;
+                dctx->litSize = litSize;
+                dctx->litBufferEnd = dctx->litPtr + litSize;   /* overrides the end pointer set by ZSTD_allocateLiteralsBuffer() above */
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                return lhSize+litSize;
+            }
+
+        case set_rle:
+            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                size_t litSize, lhSize;
+                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode is within [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
+                    litSize = MEM_readLE24(istart) >> 4;
+                    break;
+                }
+                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+                if (dctx->litBufferLocation == ZSTD_split)
+                {
+                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
+                    ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
+                }
+                else
+                {
+                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
+                }
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                return lhSize+1;   /* header plus the single repeated byte istart[lhSize] */
+            }
+        default:
+            RETURN_ERROR(corruption_detected, "impossible");
+        }
+    }
+}
+
+/* Hidden declaration for fullbench : non-static entry point to ZSTD_decodeLiteralsBlock(), always called with isFrameDecompression = 0 and not_streaming */
+size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity);
+size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity)
+{
+    dctx->isFrameDecompression = 0;   /* side effect : overwrites the caller's dctx field, so ZSTD_blockSizeMax() yields ZSTD_BLOCKSIZE_MAX */
+    return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
+}
+
+/* Default FSE distribution tables.
+ * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
+ * They were generated programmatically with the following method :
+ * - start from default distributions, present in /lib/common/zstd_internal.h
+ * - generate tables normally, using ZSTD_buildFSETable()
+ * - print out the content of the tables
+ * - prettify the output, report it below, and test with the fuzzer to ensure it's correct */
+
+/* Default FSE distribution table for Literal Lengths : 1 header entry followed by (1<<LL_DEFAULTNORMLOG) decoding cells */
+static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
+     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+     /* cells : nextState, nbAddBits, nbBits, baseVal */
+     {  0,  0,  4,    0},  { 16,  0,  4,    0},
+     { 32,  0,  5,    1},  {  0,  0,  5,    3},
+     {  0,  0,  5,    4},  {  0,  0,  5,    6},
+     {  0,  0,  5,    7},  {  0,  0,  5,    9},
+     {  0,  0,  5,   10},  {  0,  0,  5,   12},
+     {  0,  0,  6,   14},  {  0,  1,  5,   16},
+     {  0,  1,  5,   20},  {  0,  1,  5,   22},
+     {  0,  2,  5,   28},  {  0,  3,  5,   32},
+     {  0,  4,  5,   48},  { 32,  6,  5,   64},
+     {  0,  7,  5,  128},  {  0,  8,  6,  256},
+     {  0, 10,  6, 1024},  {  0, 12,  6, 4096},
+     { 32,  0,  4,    0},  {  0,  0,  4,    1},
+     {  0,  0,  5,    2},  { 32,  0,  5,    4},
+     {  0,  0,  5,    5},  { 32,  0,  5,    7},
+     {  0,  0,  5,    8},  { 32,  0,  5,   10},
+     {  0,  0,  5,   11},  {  0,  0,  6,   13},
+     { 32,  1,  5,   16},  {  0,  1,  5,   18},
+     { 32,  1,  5,   22},  {  0,  2,  5,   24},
+     { 32,  3,  5,   32},  {  0,  3,  5,   40},
+     {  0,  6,  4,   64},  { 16,  6,  4,   64},
+     { 32,  7,  5,  128},  {  0,  9,  6,  512},
+     {  0, 11,  6, 2048},  { 48,  0,  4,    0},
+     { 16,  0,  4,    1},  { 32,  0,  5,    2},
+     { 32,  0,  5,    3},  { 32,  0,  5,    5},
+     { 32,  0,  5,    6},  { 32,  0,  5,    8},
+     { 32,  0,  5,    9},  { 32,  0,  5,   11},
+     { 32,  0,  5,   12},  {  0,  0,  6,   15},
+     { 32,  1,  5,   18},  { 32,  1,  5,   20},
+     { 32,  2,  5,   24},  { 32,  2,  5,   28},
+     { 32,  3,  5,   40},  { 32,  4,  5,   48},
+     {  0, 16,  6,65536},  {  0, 15,  6,32768},
+     {  0, 14,  6,16384},  {  0, 13,  6, 8192},
+};   /* LL_defaultDTable */
+
+/* Default FSE distribution table for Offset Codes : 1 header entry followed by (1<<OF_DEFAULTNORMLOG) decoding cells */
+static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
+    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+    /* cells : nextState, nbAddBits, nbBits, baseVal */
+    {  0,  0,  5,    0},     {  0,  6,  4,   61},
+    {  0,  9,  5,  509},     {  0, 15,  5,32765},
+    {  0, 21,  5,2097149},   {  0,  3,  5,    5},
+    {  0,  7,  4,  125},     {  0, 12,  5, 4093},
+    {  0, 18,  5,262141},    {  0, 23,  5,8388605},
+    {  0,  5,  5,   29},     {  0,  8,  4,  253},
+    {  0, 14,  5,16381},     {  0, 20,  5,1048573},
+    {  0,  2,  5,    1},     { 16,  7,  4,  125},
+    {  0, 11,  5, 2045},     {  0, 17,  5,131069},
+    {  0, 22,  5,4194301},   {  0,  4,  5,   13},
+    { 16,  8,  4,  253},     {  0, 13,  5, 8189},
+    {  0, 19,  5,524285},    {  0,  1,  5,    1},
+    { 16,  6,  4,   61},     {  0, 10,  5, 1021},
+    {  0, 16,  5,65533},     {  0, 28,  5,268435453},
+    {  0, 27,  5,134217725}, {  0, 26,  5,67108861},
+    {  0, 25,  5,33554429},  {  0, 24,  5,16777213},
+};   /* OF_defaultDTable */
+
+
+/* Default FSE distribution table for Match Lengths : 1 header entry followed by (1<<ML_DEFAULTNORMLOG) decoding cells */
+static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
+    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+    /* cells : nextState, nbAddBits, nbBits, baseVal */
+    {  0,  0,  6,    3},  {  0,  0,  4,    4},
+    { 32,  0,  5,    5},  {  0,  0,  5,    6},
+    {  0,  0,  5,    8},  {  0,  0,  5,    9},
+    {  0,  0,  5,   11},  {  0,  0,  6,   13},
+    {  0,  0,  6,   16},  {  0,  0,  6,   19},
+    {  0,  0,  6,   22},  {  0,  0,  6,   25},
+    {  0,  0,  6,   28},  {  0,  0,  6,   31},
+    {  0,  0,  6,   34},  {  0,  1,  6,   37},
+    {  0,  1,  6,   41},  {  0,  2,  6,   47},
+    {  0,  3,  6,   59},  {  0,  4,  6,   83},
+    {  0,  7,  6,  131},  {  0,  9,  6,  515},
+    { 16,  0,  4,    4},  {  0,  0,  4,    5},
+    { 32,  0,  5,    6},  {  0,  0,  5,    7},
+    { 32,  0,  5,    9},  {  0,  0,  5,   10},
+    {  0,  0,  6,   12},  {  0,  0,  6,   15},
+    {  0,  0,  6,   18},  {  0,  0,  6,   21},
+    {  0,  0,  6,   24},  {  0,  0,  6,   27},
+    {  0,  0,  6,   30},  {  0,  0,  6,   33},
+    {  0,  1,  6,   35},  {  0,  1,  6,   39},
+    {  0,  2,  6,   43},  {  0,  3,  6,   51},
+    {  0,  4,  6,   67},  {  0,  5,  6,   99},
+    {  0,  8,  6,  259},  { 32,  0,  4,    4},
+    { 48,  0,  4,    4},  { 16,  0,  4,    5},
+    { 32,  0,  5,    7},  { 32,  0,  5,    8},
+    { 32,  0,  5,   10},  { 32,  0,  5,   11},
+    {  0,  0,  6,   14},  {  0,  0,  6,   17},
+    {  0,  0,  6,   20},  {  0,  0,  6,   23},
+    {  0,  0,  6,   26},  {  0,  0,  6,   29},
+    {  0,  0,  6,   32},  {  0, 16,  6,65539},
+    {  0, 15,  6,32771},  {  0, 14,  6,16387},
+    {  0, 13,  6, 8195},  {  0, 12,  6, 4099},
+    {  0, 11,  6, 2051},  {  0, 10,  6, 1027},
+};   /* ML_defaultDTable */
+
+
+static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
+{   /* fills dt with a header (tableLog 0, fastMode 0) followed by one single cell carrying baseValue and nbAddBits */
+    void* ptr = dt;   /* go through void* to view dt[0] as a ZSTD_seqSymbol_header */
+    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
+    ZSTD_seqSymbol* const cell = dt + 1;   /* the only cell, located right after the header */
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->nbBits = 0;
+    cell->nextState = 0;
+    assert(nbAddBits < 255);
+    cell->nbAdditionalBits = nbAddBits;
+    cell->baseValue = baseValue;
+}
+
+
+/* ZSTD_buildFSETable_body() :
+ * generate FSE decoding table for one symbol type (ll, ml or off) into dt : header at dt[0], cells from dt+1.
+ * wksp is used as symbolNext (MaxSeq+1 U16 values) immediately followed by the byte `spread` buffer.
+ * cannot fail if input is valid => all inputs are presumed validated at this stage (only asserts are performed here) */
+FORCE_INLINE_TEMPLATE
+void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize)
+{
+    ZSTD_seqSymbol* const tableDecode = dt+1;
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+
+    U16* symbolNext = (U16*)wksp;
+    BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
+    U32 highThreshold = tableSize - 1;
+
+
+    /* Sanity Checks */
+    assert(maxSymbolValue <= MaxSeq);
+    assert(tableLog <= MaxFSELog);
+    assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
+    (void)wkspSize;   /* otherwise only referenced by the assert above */
+    /* Init, lay down lowprob symbols */
+    {   ZSTD_seqSymbol_header DTableH;
+        DTableH.tableLog = tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].baseValue = s;   /* low-prob symbol : parked at the top of the table; baseValue temporarily holds the symbol */
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    assert(normalizedCounter[s]>=0);
+                    symbolNext[s] = (U16)normalizedCounter[s];
+        }   }   }
+        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    assert(tableSize <= 512);
+    /* Specialized symbol spreading for the case when there are
+     * no low probability (-1 count) symbols. When compressing
+     * small blocks we avoid low probability symbols to hit this
+     * case, since header decoding speed matters more.
+     */
+    if (highThreshold == tableSize - 1) {
+        size_t const tableMask = tableSize-1;
+        size_t const step = FSE_TABLESTEP(tableSize);
+        /* First lay down the symbols in order.
+         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+         * misses since small blocks generally have small table logs, so nearly
+         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+         * our buffer to handle the over-write.
+         */
+        {
+            U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;   /* symbol value replicated in each of the 8 bytes; advanced by `add` per symbol */
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                assert(n>=0);
+                pos += (size_t)n;
+            }
+        }
+        /* Now we spread those positions across the table.
+         * The benefit of doing it in two stages is that we avoid the
+         * variable size inner loop, which caused lots of branch misses.
+         * Now we can run through all the positions without any branch misses.
+         * We unroll the loop twice, since that is what empirically worked best.
+         */
+        {
+            size_t position = 0;
+            size_t s;
+            size_t const unroll = 2;
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableDecode[uPosition].baseValue = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);
+        }
+    } else {
+        U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            int const n = normalizedCounter[s];   /* -1 (low-prob) symbols run zero iterations below : they were already placed */
+            for (i=0; i<n; i++) {
+                tableDecode[position].baseValue = s;
+                position = (position + step) & tableMask;
+                while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {
+        U32 u;
+        for (u=0; u<tableSize; u++) {
+            U32 const symbol = tableDecode[u].baseValue;   /* symbol stored by the init / spreading steps above */
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
+            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+            assert(nbAdditionalBits[symbol] < 255);
+            tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
+            tableDecode[u].baseValue = baseValue[symbol];   /* the temporary symbol is replaced by its final base value */
+        }
+    }
+}
+
+/* Out-of-line instantiation of ZSTD_buildFSETable_body() without any target attribute. Avoids the FORCE_INLINE of the _body() function. */
+static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize)
+{
+    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+
+#if DYNAMIC_BMI2
+BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize)
+{   /* same body as the default variant, compiled under BMI2_TARGET_ATTRIBUTE; only built when DYNAMIC_BMI2 is non-zero */
+    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+#endif
+
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
+{   /* dispatcher : forwards all arguments to the bmi2 or the default instantiation of ZSTD_buildFSETable_body() */
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
+                baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+        return;
+    }
+#endif
+    (void)bmi2;   /* bmi2 is not read at all when DYNAMIC_BMI2 is 0 */
+    ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
+            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+
+
+/*! ZSTD_buildSeqTable() :
+ * selects or builds the decoding table for `type` and stores it into *DTablePtr (left untouched for set_repeat).
+ * @return : nb bytes read from src (1 for set_rle, 0 for set_basic and set_repeat, header size for set_compressed), or an error code if it fails */
+static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
+                                 symbolEncodingType_e type, unsigned max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const U32* baseValue, const U8* nbAdditionalBits,
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+                                 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
+                                 int bmi2)
+{
+    switch(type)
+    {
+    case set_rle :
+        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
+        {   U32 const symbol = *(const BYTE*)src;   /* the single byte read from src is the symbol */
+            U32 const baseline = baseValue[symbol];
+            U8 const nbBits = nbAdditionalBits[symbol];
+            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
+        }
+        *DTablePtr = DTableSpace;
+        return 1;
+    case set_basic :
+        *DTablePtr = defaultTable;   /* predefined table : nothing is read from src */
+        return 0;
+    case set_repeat:
+        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
+        /* prefetch FSE table if used */
+        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+            const void* const pStart = *DTablePtr;
+            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+            PREFETCH_AREA(pStart, pSize);
+        }
+        return 0;
+    case set_compressed :
+        {   unsigned tableLog;
+            S16 norm[MaxSeq+1];
+            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);   /* max and tableLog are passed by address, so the call may update them */
+            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
+            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
+            ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
+            *DTablePtr = DTableSpace;
+            return headerSize;
+        }
+    default :
+        assert(0);
+        RETURN_ERROR(GENERIC, "impossible");
+    }
+}
+
+/*! ZSTD_decodeSeqHeaders() :
+ *  Reads the header of a Sequences section : the number of sequences (stored into *nbSeqPtr),
+ *  the symbol encoding types byte, then the table description of each symbol type
+ *  (literal lengths, offsets, match lengths), setting dctx->LLTptr, OFTptr and MLTptr accordingly.
+ * @return : nb bytes read from src, or an error code if it fails */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+    int nbSeq;
+    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
+    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");   /* check */
+
+    /* SeqHead : 1, 2 or 3 bytes, depending on the value of the first one */
+    nbSeq = *ip++;   /* presumes MIN_SEQUENCES_SIZE >= 1, so that ip <= iend from here on - TODO confirm */
+    if (nbSeq > 0x7F) {
+        if (nbSeq == 0xFF) {
+            RETURN_ERROR_IF((size_t)(iend - ip) < 2, srcSize_wrong, "");   /* size comparison : never forms a pointer beyond iend */
+            nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
+            ip+=2;
+        } else {
+            RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
+            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+        }
+    }
+    *nbSeqPtr = nbSeq;
+
+    if (nbSeq == 0) {
+        /* No sequence : section ends immediately */
+        RETURN_ERROR_IF(ip != iend, corruption_detected, "extraneous data present in the Sequences section");
+        return (size_t)(ip - istart);
+    }
+
+    /* FSE table descriptors */
+    RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
+    RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
+    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+        ip++;
+        /* Build DTables : each call consumes its own description and ip advances by the size it reports */
+        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
+                                                      LLtype, MaxLL, LLFSELog,
+                                                      ip, (size_t)(iend - ip),
+                                                      LL_base, LL_bits,
+                                                      LL_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq,
+                                                      dctx->workspace, sizeof(dctx->workspace),
+                                                      ZSTD_DCtx_get_bmi2(dctx));
+            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+            ip += llhSize;
+        }
+
+        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
+                                                      OFtype, MaxOff, OffFSELog,
+                                                      ip, (size_t)(iend - ip),
+                                                      OF_base, OF_bits,
+                                                      OF_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq,
+                                                      dctx->workspace, sizeof(dctx->workspace),
+                                                      ZSTD_DCtx_get_bmi2(dctx));
+            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+            ip += ofhSize;
+        }
+
+        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
+                                                      MLtype, MaxML, MLFSELog,
+                                                      ip, (size_t)(iend - ip),
+                                                      ML_base, ML_bits,
+                                                      ML_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq,
+                                                      dctx->workspace, sizeof(dctx->workspace),
+                                                      ZSTD_DCtx_get_bmi2(dctx));
+            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+            ip += mlhSize;
+        }
+    }
+    return (size_t)(ip - istart);
+}
+
+
+typedef struct {
+    size_t litLength;     /* number of literal bytes copied before the match */
+    size_t matchLength;   /* number of bytes copied from the match */
+    size_t offset;        /* distance back from the end of the copied literals to the start of the match */
+} seq_t;
+
+typedef struct {
+    size_t state;                  /* index of the current cell within @table */
+    const ZSTD_seqSymbol* table;   /* decoding cells; ZSTD_initFseState() sets it to the cell that follows the table header */
+} ZSTD_fseState;
+
+typedef struct {
+    BIT_DStream_t DStream;             /* bitstream from which sequences are decoded */
+    ZSTD_fseState stateLL;             /* literal lengths decoding state */
+    ZSTD_fseState stateOffb;           /* offsets decoding state */
+    ZSTD_fseState stateML;             /* match lengths decoding state */
+    size_t prevOffset[ZSTD_REP_NUM];   /* repeat offsets history, most recent first */
+} seqState_t;
+
+/*! ZSTD_overlapCopy8() :
+ *  Copies 8 bytes from *ip to *op, then advances both pointers (requires *ip <= *op).
+ *  If the offset is < 8 then the offset is spread to at least 8 bytes.
+ *
+ *  Precondition: *ip <= *op
+ *  Postcondition: *op - *ip >= 8
+ */
+HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
+    assert(*ip <= *op);
+    if (offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[offset];
+        (*op)[0] = (*ip)[0];   /* first 4 bytes go one at a time, in order : source and destination may overlap */
+        (*op)[1] = (*ip)[1];
+        (*op)[2] = (*ip)[2];
+        (*op)[3] = (*ip)[3];
+        *ip += dec32table[offset];   /* reposition the source for the next 4 bytes */
+        ZSTD_copy4(*op+4, *ip);
+        *ip -= sub2;   /* with the += 8 below, *ip moves by dec32table[offset] - dec64table[offset] + 8 in total */
+    } else {
+        ZSTD_copy8(*op, *ip);   /* offset >= 8 : a plain 8-byte copy is used */
+    }
+    *ip += 8;
+    *op += 8;
+    assert(*op - *ip >= 8);
+}
+
+/*! ZSTD_safecopy() :
+ *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
+ *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
+ *  This function is only called in the uncommon case where the sequence is near the end of the block. It
+ *  should be fast for a single long sequence, but can be slow for several short sequences.
+ *  ZSTD_wildcopy() is only asked to copy up to oend_w; what remains up to op+length is copied byte by byte.
+ *  @param ovtype controls the overlap detection
+ *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
+ *           The src buffer must be before the dst buffer.
+ */
+static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+    ptrdiff_t const diff = op - ip;   /* distance from source to destination */
+    BYTE* const oend = op + length;   /* one past the last byte to write */
+
+    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
+           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
+
+    if (length < 8) {
+        /* Handle short lengths. */
+        while (op < oend) *op++ = *ip++;
+        return;
+    }
+    if (ovtype == ZSTD_overlap_src_before_dst) {
+        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
+        assert(length >= 8);
+        ZSTD_overlapCopy8(&op, &ip, diff);
+        length -= 8;   /* the 8 bytes just copied are no longer pending */
+        assert(op - ip >= 8);
+        assert(op <= oend);
+    }
+
+    if (oend <= oend_w) {
+        /* No risk of overwrite. */
+        ZSTD_wildcopy(op, ip, length, ovtype);
+        return;
+    }
+    if (op <= oend_w) {
+        /* Wildcopy until we get close to the end. */
+        assert(oend > oend_w);
+        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
+        ip += oend_w - op;
+        op += oend_w - op;   /* op now sits on oend_w */
+    }
+    /* Handle the leftovers. */
+    while (op < oend) *op++ = *ip++;
+}
+
+/* ZSTD_safecopyDstBeforeSrc():
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src.
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case.
+ * Uses ZSTD_wildcopy() for all but the last WILDCOPY_OVERLENGTH bytes when possible, then copies byte by byte up to op+length. */
+static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
+    ptrdiff_t const diff = op - ip;
+    BYTE* const oend = op + length;
+
+    if (length < 8 || diff > -8) {
+        /* Handle short lengths, close overlaps, and dst not before src. */
+        while (op < oend) *op++ = *ip++;
+        return;
+    }
+    /* Test the length rather than `op <= oend - WILDCOPY_OVERLENGTH` : same condition, but no pointer is formed before op. */
+    if (length >= WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
+        ptrdiff_t const bulk = length - WILDCOPY_OVERLENGTH;   /* == oend - WILDCOPY_OVERLENGTH - op */
+        ZSTD_wildcopy(op, ip, bulk, ZSTD_no_overlap);
+        ip += bulk; op += bulk;
+    }
+    /* Handle the leftovers. */
+    while (op < oend) *op++ = *ip++;
+}
+
+/* ZSTD_execSequenceEnd():
+ * This version handles cases that are near the end of the output buffer. It requires
+ * more careful checks to make sure there is no overflow. By separating out these hard
+ * and unlikely cases, we can speed up the common cases.
+ * @return : litLength + matchLength of the executed sequence, or an error code.
+ * NOTE: This function needs to be fast for a single long sequence, but doesn't need
+ * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
+ */
+FORCE_NOINLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_execSequenceEnd(BYTE* op,
+    BYTE* const oend, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;   /* end of literals == start of match in the output */
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;   /* literal read position once this sequence is done */
+    const BYTE* match = oLitEnd - sequence.offset;   /* match source, assuming it lies within the prefix */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;   /* wildcopy limit handed to ZSTD_safecopy() */
+
+    /* bounds checks : careful of address space overflow in 32-bit mode */
+    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+    assert(op < op + sequenceLength);
+    assert(oLitEnd < op + sequenceLength);
+
+    /* copy literals */
+    ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* literals are consumed for the next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix */
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+        match = dictEnd - (prefixStart - match);   /* same distance below prefixStart, re-based below dictEnd */
+        if (match + sequence.matchLength <= dictEnd) {   /* match ends at or before dictEnd */
+            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;   /* part of the match located before dictEnd */
+        ZSTD_memmove(oLitEnd, match, length1);
+        op = oLitEnd + length1;
+        sequence.matchLength -= length1;
+        match = prefixStart;   /* the remainder is read from the start of the prefix */
+        }
+    }
+    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+    return sequenceLength;
+}
+
+/* ZSTD_execSequenceEndSplitLitBuffer():
+ * This version is intended to be used during instances where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
+ * @return : litLength + matchLength of the executed sequence, or an error code. */
+FORCE_NOINLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
+    BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;   /* end of literals == start of match in the output */
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;   /* literal read position once this sequence is done */
+    const BYTE* match = oLitEnd - sequence.offset;   /* match source, assuming it lies within the prefix */
+
+
+    /* bounds checks : careful of address space overflow in 32-bit mode */
+    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+    assert(op < op + sequenceLength);
+    assert(oLitEnd < op + sequenceLength);
+
+    /* copy literals : refused when op lies strictly inside the literal range about to be read */
+    RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
+    ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* literals are consumed for the next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix */
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+        match = dictEnd - (prefixStart - match);   /* same distance below prefixStart, re-based below dictEnd */
+        if (match + sequence.matchLength <= dictEnd) {   /* match ends at or before dictEnd */
+            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;   /* part of the match located before dictEnd */
+        ZSTD_memmove(oLitEnd, match, length1);
+        op = oLitEnd + length1;
+        sequence.matchLength -= length1;
+        match = prefixStart;   /* the remainder is read from the start of the prefix */
+        }
+    }
+    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+    return sequenceLength;
+}
+
+/* ZSTD_execSequence():
+ * Copies the literals of @sequence from *litPtr to op, then its match; *litPtr is advanced past the literals.
+ * Sequences that end close to oend, or that read past litLimit, are delegated to ZSTD_execSequenceEnd().
+ * @return : litLength + matchLength, or an error code. */
+HINT_INLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_execSequence(BYTE* op,
+    BYTE* const oend, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;   /* risk : address space underflow on oend=NULL */
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+    assert(op != NULL /* Precondition */);
+    assert(oend_w < oend /* No underflow */);
+#if defined(__aarch64__)
+    /* prefetch sequence starting from match that will be used for copy later */
+    PREFETCH_L1(match);
+#endif
+    /* Handle edge cases in a slow path:
+     *   - Read beyond end of literals
+     *   - Match end is within WILDCOPY_OVERLENGTH of oend
+     *   - 32-bit mode and the match length overflows
+     */
+    if (UNLIKELY(
+        iLitEnd > litLimit ||
+        oMatchEnd > oend_w ||
+        (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+    assert(op <= oLitEnd /* No overflow */);
+    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+    assert(oMatchEnd <= oend /* No underflow */);
+    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+    /* Copy Literals:
+     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+     * We likely don't need the full 32-byte wildcopy.
+     */
+    assert(WILDCOPY_OVERLENGTH >= 16);
+    ZSTD_copy16(op, (*litPtr));   /* always 16 bytes, whatever litLength is */
+    if (UNLIKELY(sequence.litLength > 16)) {
+        ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
+    }
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* Copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix -> go into extDict */
+        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+        match = dictEnd + (match - prefixStart);   /* same distance below prefixStart, re-based below dictEnd */
+        if (match + sequence.matchLength <= dictEnd) {   /* match ends at or before dictEnd */
+            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;   /* part of the match located before dictEnd */
+        ZSTD_memmove(oLitEnd, match, length1);
+        op = oLitEnd + length1;
+        sequence.matchLength -= length1;
+        match = prefixStart;   /* the remainder is read from the start of the prefix */
+        }
+    }
+    /* Match within prefix of 1 or more bytes */
+    assert(op <= oMatchEnd);
+    assert(oMatchEnd <= oend_w);
+    assert(match >= prefixStart);
+    assert(sequence.matchLength >= 1);
+    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+     * without overlap checking.
+     */
+    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+        /* We bet on a full wildcopy for matches, since we expect matches to be
+         * longer than literals (in general). In silesia, ~10% of matches are longer
+         * than 16 bytes.
+         */
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+        return sequenceLength;
+    }
+    assert(sequence.offset < WILDCOPY_VECLEN);
+
+    /* Copy 8 bytes and spread the offset to be >= 8. */
+    ZSTD_overlapCopy8(&op, &match, sequence.offset);
+    /* If the match length is > 8 bytes, then continue with the wildcopy. */
+    if (sequence.matchLength > 8) {
+        assert(op < oMatchEnd);
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
+    }
+    return sequenceLength;
+}
+
+/* ZSTD_execSequenceSplitLitBuffer():
+ * Copies the literals of @sequence from *litPtr to op, then its match; *litPtr is advanced past the literals.
+ * The wildcopy limit oend_w is provided by the caller; edge cases are delegated to ZSTD_execSequenceEndSplitLitBuffer().
+ * @return : litLength + matchLength, or an error code. */
+HINT_INLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
+    BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+    assert(op != NULL /* Precondition */);
+    assert(oend_w < oend /* No underflow */);
+    /* Handle edge cases in a slow path:
+     *   - Read beyond end of literals
+     *   - Match end is beyond oend_w
+     *   - 32-bit mode and the match length overflows
+     */
+    if (UNLIKELY(
+            iLitEnd > litLimit ||
+            oMatchEnd > oend_w ||
+            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+        return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+    /* Assumptions (everything else goes into ZSTD_execSequenceEndSplitLitBuffer()) */
+    assert(op <= oLitEnd /* No overflow */);
+    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+    assert(oMatchEnd <= oend /* No underflow */);
+    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+    /* Copy Literals:
+     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+     * We likely don't need the full 32-byte wildcopy.
+     */
+    assert(WILDCOPY_OVERLENGTH >= 16);
+    ZSTD_copy16(op, (*litPtr));   /* always 16 bytes, whatever litLength is */
+    if (UNLIKELY(sequence.litLength > 16)) {
+        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
+    }
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* Copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix -> go into extDict */
+        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+        match = dictEnd + (match - prefixStart);   /* same distance below prefixStart, re-based below dictEnd */
+        if (match + sequence.matchLength <= dictEnd) {   /* match ends at or before dictEnd */
+            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;   /* part of the match located before dictEnd */
+            ZSTD_memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = prefixStart;   /* the remainder is read from the start of the prefix */
+    }   }
+    /* Match within prefix of 1 or more bytes */
+    assert(op <= oMatchEnd);
+    assert(oMatchEnd <= oend_w);
+    assert(match >= prefixStart);
+    assert(sequence.matchLength >= 1);
+    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+     * without overlap checking.
+     */
+    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+        /* We bet on a full wildcopy for matches, since we expect matches to be
+         * longer than literals (in general). In silesia, ~10% of matches are longer
+         * than 16 bytes.
+         */
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+        return sequenceLength;
+    }
+    assert(sequence.offset < WILDCOPY_VECLEN);
+
+    /* Copy 8 bytes and spread the offset to be >= 8. */
+    ZSTD_overlapCopy8(&op, &match, sequence.offset);
+    /* If the match length is > 8 bytes, then continue with the wildcopy. */
+    if (sequence.matchLength > 8) {
+        assert(op < oMatchEnd);
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
+    }
+    return sequenceLength;
+}
+
+
+/* ZSTD_initFseState() : reads the initial state from the bitstream; dt[0] is viewed as the table header, decoding cells start at dt[1]. */
+static void
+ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
+{
+    const void* const headerCell = dt;   /* goes through void* before being viewed as a header */
+    const ZSTD_seqSymbol_header* const header = (const ZSTD_seqSymbol_header*)headerCell;
+    DStatePtr->table = dt + 1;
+    DStatePtr->state = BIT_readBits(bitD, header->tableLog);   /* the initial state is tableLog bits wide */
+    DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", (U32)DStatePtr->state, header->tableLog);
+    BIT_reloadDStream(bitD);
+}
+
+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
+{   /* new state = @nextState base + @nbBits bits taken from the bitstream */
+    size_t const freshBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = freshBits + nextState;
+}
+
+/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
+ * bits before reloading. This value is the maximum number of bits we read
+ * after reloading when we are decoding long offsets.
+ */
+#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
+    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
+        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
+        : 0)
+
+typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;   /* enables the two-step offset read of ZSTD_decodeSequence() (32-bit mode) */
+
+/**
+ * ZSTD_decodeSequence():
+ * @p longOffsets : tells the decoder to reload more bits while decoding large offsets (only used in 32-bit mode)
+ * @p isLastSeq : when non-zero, the FSE states are not updated after the sequence is decoded
+ * @return : Sequence (litL + matchL + offset)
+ */
+FORCE_INLINE_TEMPLATE seq_t
+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
+{
+    seq_t seq;
+    /*
+     * ZSTD_seqSymbol is a 64 bits wide structure.
+     * It can be loaded in one operation
+     * and its fields extracted by simply shifting or bit-extracting on aarch64.
+     * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
+     * operations that cause performance drop. This can be avoided by using this
+     * ZSTD_memcpy hack.
+     */
+#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+    ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
+    ZSTD_seqSymbol* const llDInfo = &llDInfoS;
+    ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
+    ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
+    ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
+#else
+    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
+    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
+    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+#endif
+    seq.matchLength = mlDInfo->baseValue;   /* extra bits, if any, are added further down */
+    seq.litLength = llDInfo->baseValue;
+    {   U32 const ofBase = ofDInfo->baseValue;
+        BYTE const llBits = llDInfo->nbAdditionalBits;
+        BYTE const mlBits = mlDInfo->nbAdditionalBits;
+        BYTE const ofBits = ofDInfo->nbAdditionalBits;
+        BYTE const totalBits = llBits+mlBits+ofBits;
+
+        U16 const llNext = llDInfo->nextState;
+        U16 const mlNext = mlDInfo->nextState;
+        U16 const ofNext = ofDInfo->nextState;
+        U32 const llnbBits = llDInfo->nbBits;
+        U32 const mlnbBits = mlDInfo->nbBits;
+        U32 const ofnbBits = ofDInfo->nbBits;
+
+        assert(llBits <= MaxLLBits);
+        assert(mlBits <= MaxMLBits);
+        assert(ofBits <= MaxOff);
+        /*
+         * As gcc has better branch and block analyzers, sometimes it is only
+         * valuable to mark likeliness for clang, it gives around 3-4% of
+         * performance.
+         */
+
+        /* sequence */
+        {   size_t offset;
+            if (ofBits > 1) {   /* offset read from the bitstream; becomes the most recent repeat offset */
+                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
+                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+                    /* Always read extra bits, this keeps the logic simple,
+                     * avoids branches, and avoids accidentally reading 0 bits.
+                     */
+                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
+                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);   /* high part first */
+                    BIT_reloadDStream(&seqState->DStream);
+                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);   /* then the low extraBits, after the reload */
+                } else {
+                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+                }
+                seqState->prevOffset[2] = seqState->prevOffset[1];   /* history shifts by one slot */
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset;
+            } else {
+                U32 const ll0 = (llDInfo->baseValue == 0);   /* 1 when the literal length base value is 0 */
+                if (LIKELY((ofBits == 0))) {
+                    offset = seqState->prevOffset[ll0];   /* repeat offset : slot 0, or slot 1 when ll0 */
+                    seqState->prevOffset[1] = seqState->prevOffset[!ll0];
+                    seqState->prevOffset[0] = offset;
+                } else {
+                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);   /* here `offset` is a history selector, not yet a distance */
+                    {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                        temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
+                        if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                        seqState->prevOffset[1] = seqState->prevOffset[0];
+                        seqState->prevOffset[0] = offset = temp;
+            }   }   }
+            seq.offset = offset;
+        }
+
+        if (mlBits > 0)
+            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+            BIT_reloadDStream(&seqState->DStream);
+        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+            BIT_reloadDStream(&seqState->DStream);
+        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+        if (llBits > 0)
+            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+        if (MEM_32bits())
+            BIT_reloadDStream(&seqState->DStream);
+
+        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+        if (!isLastSeq) {
+            /* don't update FSE state for last Sequence */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
+            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
+            BIT_reloadDStream(&seqState->DStream);
+        }
+    }
+
+    return seq;
+}
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+#if DEBUGLEVEL >= 1
+/* ZSTD_dictionaryIsActive() : tells whether the dictionary content recorded in @dctx for fuzzing
+ * is considered active for a sequence whose literals end at @oLitEnd. @return : 1 if active, 0 otherwise. */
+static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+{
+    int const dictLoaded   = (dctx->dictContentEndForFuzzing != NULL);
+    int const dictIsPrefix = (prefixStart == dctx->dictContentBeginForFuzzing);
+    if (!dictLoaded) return 0;    /* No dictionary used. */
+    if (dictIsPrefix) return 1;   /* Dictionary is our prefix. */
+    if (dctx->dictEnd != dctx->dictContentEndForFuzzing) {
+        return 0;                 /* Dictionary is not our ext-dict. */
+    }
+    /* As ext-dict, the dictionary is active only while the prefix written so far is shorter than the window size. */
+    return (size_t)(oLitEnd - prefixStart) < dctx->fParams.windowSize;
+}
+#endif
+
+static void ZSTD_assertValidSequence(
+        ZSTD_DCtx const* dctx,
+        BYTE const* op, BYTE const* oend,
+        seq_t const seq,
+        BYTE const* prefixStart, BYTE const* virtualStart)
+{   /* Debug aid : with DEBUGLEVEL >= 1, asserts that @seq fits in [op, oend), in the block size limit, and in the reachable history; no-op otherwise. */
+#if DEBUGLEVEL >= 1
+    if (dctx->isFrameDecompression) {   /* nothing is checked when this flag is not set */
+        size_t const windowSize = dctx->fParams.windowSize;
+        size_t const sequenceSize = seq.litLength + seq.matchLength;
+        BYTE const* const oLitEnd = op + seq.litLength;
+        DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+        assert(op <= oend);
+        assert((size_t)(oend - op) >= sequenceSize);
+        assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
+        if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+            size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+            /* Offset must be within the dictionary. */
+            assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+            assert(seq.offset <= windowSize + dictSize);
+        } else {
+            /* Offset must be within our window. */
+            assert(seq.offset <= windowSize);
+        }
+    }
+#else
+    (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;   /* silence unused-parameter warnings */
+#endif
+}
+#endif
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+
+
+FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset)
+{   /* Executes nbSeq sequences into dst when literals are split between dst and dctx->litExtraBuffer; returns bytes written to dst, or an error code. */
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
+    BYTE* op = ostart;   /* current write position in dst */
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* litBufferEnd = dctx->litBufferEnd;   /* end of the literal segment currently being read */
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);
+
+    /* Literals are split between internal buffer & output buffer */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }   /* start from the repeat offsets stored in dctx */
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected, "");
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+        assert(dst != NULL);
+
+        ZSTD_STATIC_ASSERT(
+                BIT_DStream_unfinished < BIT_DStream_completed &&
+                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+                BIT_DStream_completed < BIT_DStream_overflow);
+
+        /* decompress without overrunning litPtr begins */
+        {   seq_t sequence = {0,0,0};  /* some static analyzers believe that @sequence is not initialized (it necessarily is, since the for(;;) loop has at least one iteration) */
+            /* Align the decompression loop to 32 + 16 bytes.
+                *
+                * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+                * speed swings based on the alignment of the decompression loop. This
+                * performance swing is caused by parts of the decompression loop falling
+                * out of the DSB. The entire decompression loop should fit in the DSB,
+                * when it can't we get much worse performance. You can measure if you've
+                * hit the good case or the bad case with this perf command for some
+                * compressed file test.zst:
+                *
+                *   perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+                *             -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+                *
+                * If you see most cycles served out of the MITE you've hit the bad case.
+                * If you see most cycles served out of the DSB you've hit the good case.
+                * If it is pretty even then you may be in an okay case.
+                *
+                * This issue has been reproduced on the following CPUs:
+                *   - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+                *               Use Instruments->Counters to get DSB/MITE cycles.
+                *               I never got performance swings, but I was able to
+                *               go from the good case of mostly DSB to half of the
+                *               cycles served from MITE.
+                *   - Coffeelake: Intel i9-9900k
+                *   - Coffeelake: Intel i7-9700k
+                *
+                * I haven't been able to reproduce the instability or DSB misses on any
+                * of the following CPUs:
+                *   - Haswell
+                *   - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+                *   - Skylake
+                *
+                * Alignment is done for each of the three major decompression loops:
+                *   - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
+                *   - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
+                *   - ZSTD_decompressSequences_body
+                * Alignment choices are made to minimize large swings on bad cases and influence on performance
+                * from changes external to this code, rather than to overoptimize on the current commit.
+                *
+                * If you are seeing performance instability this script can help test.
+                * It tests on 4 commits in zstd where I saw performance change.
+                *
+                *   https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+                */
+#if defined(__GNUC__) && defined(__x86_64__)
+            __asm__(".p2align 6");
+#  if __GNUC__ >= 7
+            /* good for gcc-7, gcc-9, and gcc-11 */
+            __asm__("nop");
+            __asm__(".p2align 5");
+            __asm__("nop");
+            __asm__(".p2align 4");
+#    if __GNUC__ == 8 || __GNUC__ == 10
+            /* good for gcc-8 and gcc-10 */
+            __asm__("nop");
+            __asm__(".p2align 3");
+#    endif
+#  endif
+#endif
+
+            /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
+            for ( ; nbSeq; nbSeq--) {
+                sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+                if (litPtr + sequence.litLength > dctx->litBufferEnd) break;   /* this sequence's literals run past the first literal segment; handled after the loop */
+                {   size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                    assert(!ZSTD_isError(oneSeqSize));
+                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                        return oneSeqSize;
+                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                    op += oneSeqSize;
+            }   }
+            DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)");
+
+            /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litBufferEnd */
+            if (nbSeq > 0) {
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength);
+                if (leftoverLit) {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequence.litLength -= leftoverLit;   /* the rest of this sequence's literals come from litExtraBuffer */
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                    assert(!ZSTD_isError(oneSeqSize));
+                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                        return oneSeqSize;
+                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                    op += oneSeqSize;
+                }
+                nbSeq--;   /* accounts for the sequence that broke out of the loop above and was just executed */
+            }
+        }
+
+        if (nbSeq > 0) {
+            /* there is remaining lit from extra buffer */
+
+#if defined(__GNUC__) && defined(__x86_64__)
+            __asm__(".p2align 6");
+            __asm__("nop");
+#  if __GNUC__ != 7
+            /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
+            __asm__(".p2align 4");
+            __asm__("nop");
+            __asm__(".p2align 3");
+#  elif __GNUC__ >= 11   /* NOTE(review): never taken, the __GNUC__ != 7 branch above already matches gcc >= 11 */
+            __asm__(".p2align 3");
+#  else
+            __asm__(".p2align 5");
+            __asm__("nop");
+            __asm__(".p2align 3");
+#  endif
+#endif
+
+            for ( ; nbSeq ; nbSeq--) {
+                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                    return oneSeqSize;
+                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                op += oneSeqSize;
+            }
+        }
+
+        /* check if reached exact end */
+        DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
+        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+        DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed);
+        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    if (dctx->litBufferLocation == ZSTD_split) {
+        /* split hasn't been reached yet: first flush the literals still in the first segment, then switch to litExtraBuffer */
+        size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
+        DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize);
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+        litPtr = dctx->litExtraBuffer;
+        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+        dctx->litBufferLocation = ZSTD_not_in_dst;
+    }
+    /* copy last literals from internal buffer */
+    {   size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
+        DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize);
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memcpy(op, litPtr, lastLLSize);
+            op += lastLLSize;
+    }   }
+
+    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
+    return (size_t)(op - ostart);
+}
+
+FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+    void* dst, size_t maxDstSize,
+    const void* seqStart, size_t seqSize, int nbSeq,
+    const ZSTD_longOffset_e isLongOffset)
+{   /* Executes nbSeq sequences into dst, reading literals from a single contiguous range; returns bytes written to dst, or an error code. */
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer;   /* unless literals are ZSTD_not_in_dst, output stops at dctx->litBuffer */
+    BYTE* op = ostart;   /* current write position in dst */
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+    const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+    DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }   /* start from the repeat offsets stored in dctx */
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
+            corruption_detected, "");
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+        assert(dst != NULL);
+
+#if defined(__GNUC__) && defined(__x86_64__)   /* loop alignment, see the rationale in ZSTD_decompressSequences_bodySplitLitBuffer() */
+            __asm__(".p2align 6");
+            __asm__("nop");
+#  if __GNUC__ >= 7
+            __asm__(".p2align 5");
+            __asm__("nop");
+            __asm__(".p2align 3");
+#  else
+            __asm__(".p2align 4");
+            __asm__("nop");
+            __asm__(".p2align 3");
+#  endif
+#endif
+
+        for ( ; nbSeq ; nbSeq--) {
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+            assert(!ZSTD_isError(oneSeqSize));
+            ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+            if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                return oneSeqSize;
+            DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end : all sequences executed and the bitstream fully consumed */
+        assert(nbSeq == 0);
+        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment : literals not consumed by any sequence are appended to the output */
+    {   size_t const lastLLSize = (size_t)(litEnd - litPtr);
+        DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize);
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memcpy(op, litPtr, lastLLSize);
+            op += lastLLSize;
+    }   }
+
+    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
+    return (size_t)(op - ostart);
+}
+
+/* Instantiation of ZSTD_decompressSequences_body() without the BMI2 target attribute. */
+static size_t
+ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const result = ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return result;
+}
+
+/* Instantiation of ZSTD_decompressSequences_bodySplitLitBuffer() without the BMI2 target attribute. */
+static size_t
+ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const result = ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return result;
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+                   const BYTE* const prefixStart, const BYTE* const dictEnd)
+{
+    /* Prefetches the match source of @sequence and returns the position reached
+     * after the sequence, i.e. prefetchPos advanced by litLength + matchLength. */
+    size_t const matchPos = prefetchPos + sequence.litLength;
+    const BYTE* const base = (sequence.offset > matchPos) ? dictEnd : prefixStart;
+    /* note : this computation can wrap when seq.offset is far too large, which only happens on corrupted input.
+     * No consequence though : the address is only used for prefetching, never dereferenced */
+    const BYTE* const target = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(base, matchPos), sequence.offset);
+    PREFETCH_L1(target); PREFETCH_L1(target+CACHELINE_SIZE);   /* note : PREFETCH() is safe on any memory address, including invalid ones */
+    return matchPos + sequence.matchLength;
+}
+
+/* Sequence decoder that prefetches match sources ahead of execution to reduce the latency
+ * impact of cache misses : up to STORED_SEQS sequences are decoded and prefetched before being executed.
+ * It's generally employed when a block contains a significant portion of long-distance matches
+ * or when coupled with a "cold" dictionary. Returns the number of bytes written to dst, or an error code. */
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequencesLong_body(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize);   /* when literals are ZSTD_in_dst, output stops at dctx->litBuffer */
+    BYTE* op = ostart;   /* current write position in dst */
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* litBufferEnd = dctx->litBufferEnd;   /* end of the literal segment currently being read */
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+
+    /* Regen sequences */
+    if (nbSeq) {
+#define STORED_SEQS 8
+#define STORED_SEQS_MASK (STORED_SEQS-1)
+#define ADVANCED_SEQS STORED_SEQS
+        seq_t sequences[STORED_SEQS];   /* ring buffer of decoded sequences awaiting execution, indexed modulo STORED_SEQS */
+        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);   /* number of sequences decoded ahead of execution */
+        seqState_t seqState;
+        int seqNb;
+        size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
+
+        dctx->fseEntropy = 1;
+        { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }   /* start from the repeat offsets stored in dctx */
+        assert(dst != NULL);
+        assert(iend >= ip);
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected, "");
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        /* prepare in advance : decode and prefetch the first seqAdvance sequences without executing them */
+        for (seqNb=0; seqNb<seqAdvance; seqNb++) {
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
+            prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+            sequences[seqNb] = sequence;
+        }
+
+        /* decompress without stomping litBuffer : each iteration decodes one new sequence and executes the oldest stored one */
+        for (; seqNb < nbSeq; seqNb++) {
+            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
+
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) {
+                /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                if (leftoverLit)
+                {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;   /* the rest of these literals come from litExtraBuffer */
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                    assert(!ZSTD_isError(oneSeqSize));
+                    ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+                    prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+                    sequences[seqNb & STORED_SEQS_MASK] = sequence;   /* the freshly decoded sequence takes the slot of the one just executed */
+                    op += oneSeqSize;
+            }   }
+            else
+            {
+                /* lit buffer is either wholly contained in first or second split, or not split at all */
+                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+                    ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+                sequences[seqNb & STORED_SEQS_MASK] = sequence;   /* the freshly decoded sequence takes the slot of the one just executed */
+                op += oneSeqSize;
+            }
+        }
+        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
+
+        /* finish queue : execute the sequences that were decoded but not yet executed */
+        seqNb -= seqAdvance;   /* rewind to the oldest sequence still stored */
+        for ( ; seqNb<nbSeq ; seqNb++) {
+            seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                if (leftoverLit) {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequence->litLength -= leftoverLit;   /* the rest of these literals come from litExtraBuffer */
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                    assert(!ZSTD_isError(oneSeqSize));
+                    ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                    op += oneSeqSize;
+                }
+            }
+            else
+            {
+                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+                    ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+            }
+        }
+
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */
+        size_t const lastLLSize = litBufferEnd - litPtr;
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+        litPtr = dctx->litExtraBuffer;
+        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;   /* NOTE(review): unlike ZSTD_decompressSequences_bodySplitLitBuffer(), litBufferLocation is not reset to ZSTD_not_in_dst here - confirm intended */
+    }
+    {   size_t const lastLLSize = litBufferEnd - litPtr;   /* literals not consumed by any sequence are appended to the output */
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return (size_t)(op - ostart);
+}
+
+/* Instantiation of ZSTD_decompressSequencesLong_body() without the BMI2 target attribute. */
+static size_t
+ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const result = ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return result;
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+
+
+#if DYNAMIC_BMI2
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+/* Instantiation of ZSTD_decompressSequences_body() carrying BMI2_TARGET_ATTRIBUTE. */
+static BMI2_TARGET_ATTRIBUTE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const result = ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return result;
+}
+/* Instantiation of ZSTD_decompressSequences_bodySplitLitBuffer() carrying BMI2_TARGET_ATTRIBUTE. */
+static BMI2_TARGET_ATTRIBUTE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const result = ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return result;
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+/* Instantiation of ZSTD_decompressSequencesLong_body() carrying BMI2_TARGET_ATTRIBUTE. */
+static BMI2_TARGET_ATTRIBUTE size_t
+ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    size_t const result = ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    return result;
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+#endif /* DYNAMIC_BMI2 */
+
+typedef size_t (*ZSTD_decompressSequences_t)(   /* function-pointer type with the same signature as the ZSTD_decompressSequences*() functions in this file */
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t maxDstSize,
+                            const void* seqStart, size_t seqSize, int nbSeq,
+                            const ZSTD_longOffset_e isLongOffset);
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+/* Entry point of the regular sequence decoder : selects the BMI2 instantiation when DYNAMIC_BMI2 is enabled and the context reports bmi2. */
+static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequences");
+#if DYNAMIC_BMI2
+    return ZSTD_DCtx_get_bmi2(dctx)
+         ? ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset)
+         : ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#else
+    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#endif
+}
+/* Entry point of the split-literal-buffer decoder : selects the BMI2 instantiation when DYNAMIC_BMI2 is enabled and the context reports bmi2. */
+static size_t ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
+#if DYNAMIC_BMI2
+    return ZSTD_DCtx_get_bmi2(dctx)
+         ? ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset)
+         : ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#else
+    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#endif
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+/* ZSTD_decompressSequencesLong() :
+ * decompression function triggered when a minimum share of offsets is considered "long",
+ * aka out of cache. It tries to mitigate main memory latency through the use of prefetching.
+ * note : the word "long" is overloaded here : it sometimes means "wider than the bitstream register"
+ * and sometimes "farther than memory cache distance".
+ * Selects the BMI2 instantiation when DYNAMIC_BMI2 is enabled and the context reports bmi2. */
+static size_t
+ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+        const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequencesLong");
+#if DYNAMIC_BMI2
+    return ZSTD_DCtx_get_bmi2(dctx)
+         ? ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset)
+         : ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#else
+    return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+#endif
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+
+/**
+ * @returns The total size of the history that zstd can reference at @p op, counting
+ * both the prefix and the extDict. Any offset larger than this value is invalid.
+ */
+static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+{
+    size_t const historySize = (size_t)(op - virtualStart);
+    return historySize;
+}
+
+typedef struct {   /* summary of an offset decoding table, filled by ZSTD_getOffsetInfo() */
+    unsigned longOffsetShare;       /* number of table entries with nbAdditionalBits > 22, scaled to a table of (1<<OffFSELog) entries */
+    unsigned maxNbAdditionalBits;   /* largest nbAdditionalBits found among the table entries */
+} ZSTD_OffsetInfo;
+
+/* ZSTD_getOffsetInfo() :
+ * condition : offTable must be valid whenever nbSeq != 0
+ * @return : "share" of long offsets (arbitrarily defined as table entries needing more than 22 additional bits),
+ *           scaled to the maximum possible of (1<<OffFSELog),
+ *           as well as the maximum number of additional bits required by any entry.
+ */
+static ZSTD_OffsetInfo
+ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
+{
+    ZSTD_OffsetInfo info = {0, 0};
+    /* If nbSeq == 0, then the offTable is uninitialized, but we have
+     * no sequences, so both values should be 0.
+     */
+    if (nbSeq != 0) {
+        const void* ptr = offTable;   /* the first cell of the table is read as its header */
+        U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+        const ZSTD_seqSymbol* table = offTable + 1;   /* entries start right after the header cell */
+        U32 const max = 1 << tableLog;
+        U32 u;
+        DEBUGLOG(5, "ZSTD_getOffsetInfo: (tableLog=%u)", tableLog);
+
+        assert(max <= (1 << OffFSELog));  /* max not too large */
+        for (u=0; u<max; u++) {
+            info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+            if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+        }
+
+        assert(tableLog <= OffFSELog);
+        info.longOffsetShare <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
+    }
+
+    return info;
+}
+
+/**
+ * @returns The largest offset that can be decoded in a single read of the
+ * bitstream, i.e. without reloading bits part-way through the offset bits.
+ * Offsets above this value must go through the long offset decoder.
+ */
+static size_t ZSTD_maxShortOffset(void)
+{
+    if (!MEM_64bits()) {
+        /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1,
+         * which requires STREAM_ACCUMULATOR_MIN extra bits.
+         * Subtracting ZSTD_REP_NUM gives the maximum possible offset.
+         */
+        size_t const offBaseLimit = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+        size_t const offsetLimit = offBaseLimit - ZSTD_REP_NUM;
+        assert(ZSTD_highbit32((U32)offBaseLimit) == STREAM_ACCUMULATOR_MIN);
+        return offsetLimit;
+    }
+    /* 64-bit case :
+     * any offset can be decoded without reloading bits.
+     * This might change if the max window size grows.
+     */
+    ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+    return (size_t)-1;
+}
+
+size_t
+ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize, const streaming_operation streaming)
+{   /* blockType == blockCompressed : decode the literals section, then the sequence headers, then dispatch to a sequence decoder */
+    const BYTE* ip = (const BYTE*)src;
+    DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
+
+    /* Note : the wording of the specification
+     * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
+     * This generally does not happen, as it makes little sense,
+     * since an uncompressed block would feature same size and have no decompression cost.
+     * Also, note that decoders from reference libzstd before v1.5.4
+     * would consider this edge case as an error.
+     * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
+     * for broader compatibility with the deployed ecosystem of zstd decoders */
+    RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
+
+    /* Decode literals section */
+    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
+        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
+        if (ZSTD_isError(litCSize)) return litCSize;
+        ip += litCSize;       /* advance past the literals section */
+        srcSize -= litCSize;
+    }
+
+    /* Build Decoding Tables */
+    {
+        /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+         * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+         */
+        size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
+        size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
+        /* isLongOffset must be true if there are long offsets.
+         * Offsets are long if they are larger than ZSTD_maxShortOffset().
+         * We don't expect that to be the case in 64-bit mode.
+         *
+         * We check here to see if our history is large enough to allow long offsets.
+         * If it isn't, then we can't possibly have (valid) long offsets. If the offset
+         * is invalid, then it is okay to read it incorrectly.
+         *
+         * If isLongOffset is true, then we will later check our decoding table to see
+         * if it is even possible to generate long offsets.
+         */
+        ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
+        /* These macros control at build-time which decompressor implementation
+         * we use. If neither is defined, we do some inspection and dispatch at
+         * runtime.
+         */
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+        int usePrefetchDecoder = dctx->ddictIsCold;
+#else
+        /* Set to 1 to avoid computing offset info if we don't need to.
+         * Otherwise this value is ignored.
+         */
+        int usePrefetchDecoder = 1;
+#endif
+        int nbSeq;
+        size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
+        if (ZSTD_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;       /* advance past the sequences header */
+        srcSize -= seqHSize;
+
+        RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
+                "invalid dst");
+
+        /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+         * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+         * NOTE: could probably use a larger nbSeq limit
+         */
+        if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+            ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+            if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+                /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+                 * enough, then we know it is impossible to have too long an offset in this block, so we can
+                 * use the regular offset decoder.
+                 */
+                isLongOffset = ZSTD_lo_isRegularOffset;
+            }
+            if (!usePrefetchDecoder) {
+                U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+                usePrefetchDecoder = (info.longOffsetShare >= minShare);
+            }
+        }
+
+        dctx->ddictIsCold = 0;   /* the flag was read above (runtime-dispatch builds); clear it for the following blocks */
+
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+        if (usePrefetchDecoder) {
+#else
+        (void)usePrefetchDecoder;
+        {
+#endif
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+#endif
+        }
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+        /* else : regular decoders, selected by where the literal buffer is stored */
+        if (dctx->litBufferLocation == ZSTD_split)
+            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+        else
+            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+#endif
+    }
+}
+
+
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
+{
+    if (dstSize == 0 || dst == dctx->previousDstEnd) return;   /* nothing to write, or contiguous : keep current segment */
+    /* not contiguous : the previous segment ends at dictEnd, and a new segment starts at dst */
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->virtualStart = (const char*)dst - ((const char*)dctx->previousDstEnd - (const char*)dctx->prefixStart);
+    dctx->prefixStart = dst;
+    dctx->previousDstEnd = dst;
+}
+
+/* ZSTD_decompressBlock_deprecated() : decodes one compressed block with isFrameDecompression cleared; @return : decoded size, or an error code */
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
+{
+    size_t dSize;
+    dctx->isFrameDecompression = 0;
+    ZSTD_checkContinuity(dctx, dst, dstCapacity);   /* starts a new segment when dst does not follow previousDstEnd */
+    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
+    FORWARD_IF_ERROR(dSize, "");
+    dctx->previousDstEnd = (char*)dst + dSize;   /* end of this output, compared against dst by the next continuity check */
+    return dSize;
+}
+
+/* ZSTD_decompressBlock() : forwards every argument unchanged and returns the callee's result. */
+/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
+}
diff --git a/src/zstd/zstd_decompress_block.h b/src/zstd/decompress/zstd_decompress_block.h
similarity index 56%
rename from src/zstd/zstd_decompress_block.h
rename to src/zstd/decompress/zstd_decompress_block.h
index ce24c10cd..ab152404b 100644
--- a/src/zstd/zstd_decompress_block.h
+++ b/src/zstd/decompress/zstd_decompress_block.h
@@ -8,16 +8,18 @@
  * You may select, at your option, one of the above-listed licenses.
  */
 
+
 #ifndef ZSTD_DEC_BLOCK_H
 #define ZSTD_DEC_BLOCK_H
 
 /*-*******************************************************
  *  Dependencies
  *********************************************************/
-#include "zstd_deps.h"				  /* size_t */
-#include "zstd.h"					  /* DCtx, and some public functions */
-#include "zstd_internal.h"			  /* blockProperties_t, and some public functions */
-#include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */
+#include "../common/zstd_deps.h"   /* size_t */
+#include "../zstd.h"    /* DCtx, and some public functions */
+#include "../common/zstd_internal.h"  /* blockProperties_t, and some public functions */
+#include "zstd_decompress_internal.h"  /* ZSTD_seqSymbol */
+
 
 /* ===   Prototypes   === */
 
@@ -30,8 +32,12 @@
  * ZSTD_decodeSeqHeaders()
  */
 
-/* Streaming state is used to inform allocation of the literal buffer */
-typedef enum { not_streaming = 0, is_streaming = 1 } streaming_operation;
+
+ /* Streaming state is used to inform allocation of the literal buffer */
+typedef enum {
+    not_streaming = 0,
+    is_streaming = 1
+} streaming_operation;
 
 /* ZSTD_decompressBlock_internal() :
  * decompress block, starting at `src`,
@@ -39,7 +45,9 @@ typedef enum { not_streaming = 0, is_streaming = 1 } streaming_operation;
  * @return : decompressed block size,
  *           or an error code (which can be tested using ZSTD_isError())
  */
-size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, void const* src, size_t srcSize, int const frame, const streaming_operation streaming);
+size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize, const streaming_operation streaming);
 
 /* ZSTD_buildFSETable() :
  * generate FSE decoding table for one symbol (ll, ml or off)
@@ -50,6 +58,16 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapac
  * defined in zstd_decompress_internal.h.
  * Internal use only.
  */
-void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, short const* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2);
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+             const short* normalizedCounter, unsigned maxSymbolValue,
+             const U32* baseValue, const U8* nbAdditionalBits,
+                   unsigned tableLog, void* wksp, size_t wkspSize,
+                   int bmi2);
+
+/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize);
+
 
 #endif /* ZSTD_DEC_BLOCK_H */
diff --git a/src/zstd/decompress/zstd_decompress_internal.h b/src/zstd/decompress/zstd_decompress_internal.h
new file mode 100644
index 000000000..83a7a0115
--- /dev/null
+++ b/src/zstd/decompress/zstd_decompress_internal.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* zstd_decompress_internal:
+ * objects and definitions shared within lib/decompress modules */
+
+ #ifndef ZSTD_DECOMPRESS_INTERNAL_H
+ #define ZSTD_DECOMPRESS_INTERNAL_H
+
+
+/*-*******************************************************
+ *  Dependencies
+ *********************************************************/
+#include "../common/mem.h"             /* BYTE, U16, U32 */
+#include "../common/zstd_internal.h"   /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. */
+
+
+
+/*-*******************************************************
+ *  Constants
+ *********************************************************/
+static UNUSED_ATTR const U32 LL_base[MaxLL+1] = {
+                 0,    1,    2,     3,     4,     5,     6,      7,
+                 8,    9,   10,    11,    12,    13,    14,     15,
+                16,   18,   20,    22,    24,    28,    32,     40,
+                48,   64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                0x2000, 0x4000, 0x8000, 0x10000 };
+
+static UNUSED_ATTR const U32 OF_base[MaxOff+1] = {
+                 0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                 0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = {
+                     0,  1,  2,  3,  4,  5,  6,  7,
+                     8,  9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31 };
+
+static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
+                     3,  4,  5,    6,     7,     8,     9,    10,
+                    11, 12, 13,   14,    15,    16,    17,    18,
+                    19, 20, 21,   22,    23,    24,    25,    26,
+                    27, 28, 29,   30,    31,    32,    33,    34,
+                    35, 37, 39,   41,    43,    47,    51,    59,
+                    67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                    0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+/*-*******************************************************
+ *  Decompression types
+ *********************************************************/
+ typedef struct {   /* header view of cell 0 of a ZSTD_seqSymbol table (ZSTD_getOffsetInfo() reads it through a cast) */
+     U32 fastMode;
+     U32 tableLog;   /* the table holds (1 << tableLog) symbol cells after the header cell */
+ } ZSTD_seqSymbol_header;
+
+ typedef struct {   /* one cell of a sequence-symbol decoding table */
+     U16  nextState;
+     BYTE nbAdditionalBits;   /* NOTE(review): presumably the nb of extra bits read for this symbol - confirm against the decoder */
+     BYTE nbBits;
+     U32  baseValue;
+ } ZSTD_seqSymbol;
+
+ #define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
+
+#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
+#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
+
+typedef struct {
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)];  /* can accommodate HUF_decompress4X */
+    U32 rep[ZSTD_REP_NUM];
+    U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
+} ZSTD_entropyDTables_t;
+
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+               zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+typedef enum {
+    ZSTD_use_indefinitely = -1,  /* Use the dictionary indefinitely */
+    ZSTD_dont_use = 0,           /* Do not use the dictionary (if one exists free it) */
+    ZSTD_use_once = 1            /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
+/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */
+typedef struct {
+    const ZSTD_DDict** ddictPtrTable;
+    size_t ddictPtrTableSize;
+    size_t ddictPtrCount;
+} ZSTD_DDictHashSet;
+
+#ifndef ZSTD_DECODER_INTERNAL_BUFFER
+#  define ZSTD_DECODER_INTERNAL_BUFFER  (1 << 16)
+#endif
+
+#define ZSTD_LBMIN 64
+#define ZSTD_LBMAX (128 << 10)
+
+/* extra buffer, compensates when dst is not large enough to store litBuffer */
+#define ZSTD_LITBUFFEREXTRASIZE  BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX)
+
+typedef enum {
+    ZSTD_not_in_dst = 0,  /* Stored entirely within litExtraBuffer */
+    ZSTD_in_dst = 1,           /* Stored entirely within dst (in memory after current output write) */
+    ZSTD_split = 2            /* Split between litExtraBuffer and dst */
+} ZSTD_litLocation_e;
+
+struct ZSTD_DCtx_s   /* decompression context : entropy tables, history pointers, frame state, dictionary and streaming buffers */
+{
+    const ZSTD_seqSymbol* LLTptr;
+    const ZSTD_seqSymbol* MLTptr;
+    const ZSTD_seqSymbol* OFTptr;   /* table inspected by ZSTD_getOffsetInfo() when decoding a block */
+    const HUF_DTable* HUFptr;
+    ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
+    const void* previousDstEnd;   /* detect continuity */
+    const void* prefixStart;      /* start of current segment */
+    const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
+    const void* dictEnd;          /* end of previous segment */
+    size_t expected;
+    ZSTD_frameHeader fParams;
+    U64 processedCSize;
+    U64 decodedSize;
+    blockType_e bType;            /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+    ZSTD_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    ZSTD_format_e format;
+    ZSTD_forceIgnoreChecksum_e forceIgnoreChecksum;   /* User specified: if == 1, will ignore checksums in compressed frame. Default == 0 */
+    U32 validateChecksum;         /* if == 1, will validate checksum. Is == 1 if (fParams.checksumFlag == 1) and (forceIgnoreChecksum == 0). */
+    const BYTE* litPtr;
+    ZSTD_customMem customMem;
+    size_t litSize;
+    size_t rleSize;
+    size_t staticSize;
+    int isFrameDecompression;     /* set to 0 by ZSTD_decompressBlock_deprecated() before it decodes a block */
+#if DYNAMIC_BMI2 != 0
+    int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+#endif
+
+    /* dictionary */
+    ZSTD_DDict* ddictLocal;
+    const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+    ZSTD_dictUses_e dictUses;
+    ZSTD_DDictHashSet* ddictSet;                    /* Hash set for multiple ddicts */
+    ZSTD_refMultipleDDicts_e refMultipleDDicts;     /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+    int disableHufAsm;
+    int maxBlockSizeParam;
+
+    /* streaming */
+    ZSTD_dStreamStage streamStage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    size_t maxWindowSize;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t lhSize;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+    void* legacyContext;
+    U32 previousLegacyVersion;
+    U32 legacyVersion;
+#endif
+    U32 hostageByte;
+    int noForwardProgress;
+    ZSTD_bufferMode_e outBufferMode;
+    ZSTD_outBuffer expectedOutBuffer;
+
+    /* workspace */
+    BYTE* litBuffer;
+    const BYTE* litBufferEnd;
+    ZSTD_litLocation_e litBufferLocation;   /* ZSTD_split makes the block decoder pick ZSTD_decompressSequencesSplitLitBuffer() */
+    BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+
+    size_t oversizedDuration;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    void const* dictContentBeginForFuzzing;
+    void const* dictContentEndForFuzzing;
+#endif
+
+    /* Tracing */
+#if ZSTD_TRACE
+    ZSTD_TraceCtx traceCtx;
+#endif
+};  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) {
+#if DYNAMIC_BMI2 == 0
+    (void)dctx;   /* the bmi2 field is not compiled in : always report 0 */
+    return 0;
+#else
+    return dctx->bmi2;
+#endif
+}
+
+/*-*******************************************************
+ *  Shared internal functions
+ *********************************************************/
+
+/*! ZSTD_loadDEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
+size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+                   const void* const dict, size_t const dictSize);
+
+/*! ZSTD_checkContinuity() :
+ *  check if next `dst` follows previous position, where decompression ended.
+ *  If yes, do nothing (continue on current segment).
+ *  If not, classify previous segment as "external dictionary", and start a new segment.
+ *  This function cannot fail. */
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize);
+
+
+#endif /* ZSTD_DECOMPRESS_INTERNAL_H */
diff --git a/src/zstd/dictBuilder/cover.c b/src/zstd/dictBuilder/cover.c
new file mode 100644
index 000000000..44f9029ac
--- /dev/null
+++ b/src/zstd/dictBuilder/cover.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* *****************************************************************************
+ * Constructs a dictionary using a heuristic based on the following paper:
+ *
+ * Liao, Petri, Moffat, Wirth
+ * Effective Construction of Relative Lempel-Ziv Dictionaries
+ * Published in WWW 2016.
+ *
+ * Adapted from code originally written by @ot (Giuseppe Ottaviano).
+ ******************************************************************************/
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+
+#include "../common/mem.h" /* read */
+#include "../common/pool.h" /* POOL_ctx */
+#include "../common/threading.h" /* ZSTD_pthread_mutex_t */
+#include "../common/zstd_internal.h" /* includes zstd.h */
+#include "../common/bits.h" /* ZSTD_highbit32 */
+#include "../zdict.h"
+#include "cover.h"
+
+/*-*************************************
+*  Constants
+***************************************/
+/**
+* There are 32bit indexes used to ref samples, so limit samples size to 4GB
+* on 64bit builds.
+* For 32bit builds we choose 1 GB.
+* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
+* contiguous buffer, so 1GB is already a high limit.
+*/
+#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
+#define COVER_DEFAULT_SPLITPOINT 1.0
+
+/*-*************************************
+*  Console display
+***************************************/
+#ifndef LOCALDISPLAYLEVEL
+static int g_displayLevel = 0;
+#endif
+#undef  DISPLAY
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#undef  LOCALDISPLAYLEVEL
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#undef  DISPLAYLEVEL
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#ifndef LOCALDISPLAYUPDATE
+static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+#endif
+#undef  LOCALDISPLAYUPDATE
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {           \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#undef  DISPLAYUPDATE
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+
+/*-*************************************
+* Hash table
+***************************************
+* A small specialized hash map for storing activeDmers.
+* The map does not resize, so if it becomes full it will loop forever.
+* Thus, the map must be large enough to store every value.
+* The map implements linear probing and keeps its load less than 0.5.
+*/
+
+#define MAP_EMPTY_VALUE ((U32)-1)   /* a slot whose value equals this is treated as unoccupied */
+typedef struct COVER_map_pair_t_s {
+  U32 key;
+  U32 value;   /* MAP_EMPTY_VALUE marks a free slot; COVER_map_at() stores 0 on insertion */
+} COVER_map_pair_t;
+
+typedef struct COVER_map_s {
+  COVER_map_pair_t *data;   /* malloc'ed array of `size` slots; NULL after a failed init or a destroy */
+  U32 sizeLog;              /* COVER_map_init() sets it to ZSTD_highbit32(size requested) + 2 */
+  U32 size;                 /* number of slots : 1 << sizeLog once initialized, 0 otherwise */
+  U32 sizeMask;             /* size - 1, used to wrap probe indices */
+} COVER_map_t;
+
+/** Clear the map : every slot becomes empty again. */
+static void COVER_map_clear(COVER_map_t *map) {
+  size_t const nbBytes = map->size * sizeof(*map->data);
+  /* memset() stores the low byte of MAP_EMPTY_VALUE in every byte of the array */
+  memset(map->data, MAP_EMPTY_VALUE, nbBytes);
+}
+
+/**
+ * Initializes a map that is only guaranteed to be large enough for `size` elements.
+ * Returns 1 on success and 0 on failure (the allocation failed).
+ * The map must be destroyed with COVER_map_destroy().
+ */
+static int COVER_map_init(COVER_map_t *map, U32 size) {
+  U32 const log2Slots = ZSTD_highbit32(size) + 2;
+  map->sizeLog = log2Slots;
+  map->size = (U32)1 << log2Slots;
+  map->sizeMask = map->size - 1;
+  map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
+  if (map->data != NULL) {
+    COVER_map_clear(map);
+    return 1;
+  }
+  map->sizeLog = 0;
+  map->size = 0;
+  return 0;
+}
+
+/** Internal hash function : multiplies key by a constant and keeps the top sizeLog bits. */
+static const U32 COVER_prime4bytes = 2654435761U;
+static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
+  U32 const product = key * COVER_prime4bytes;
+  U32 const shift = 32 - map->sizeLog;
+  return product >> shift;
+}
+
+/**
+ * Helper function that returns the index that a key should be placed into :
+ * the slot already holding key, or else the first empty slot met while
+ * probing linearly (with wrap-around) from the key's hash.
+ */
+static U32 COVER_map_index(COVER_map_t *map, U32 key) {
+  U32 slot = COVER_map_hash(map, key);
+  for (;;) {
+    const COVER_map_pair_t *const entry = &map->data[slot];
+    /* emptiness is tested first, so the key of an empty slot is never examined */
+    if (entry->value == MAP_EMPTY_VALUE || entry->key == key) {
+      return slot;
+    }
+    slot = (slot + 1) & map->sizeMask;
+  }
+}
+
+/**
+ * Returns the pointer to the value for key, inserting key with a value
+ * of 0 when it is not in the map yet. The map must not be full.
+ */
+static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
+  U32 const slot = COVER_map_index(map, key);
+  COVER_map_pair_t *const entry = map->data + slot;
+  if (entry->value == MAP_EMPTY_VALUE) {
+    entry->key = key;
+    entry->value = 0;
+  }
+  return &entry->value;
+}
+
+/**
+ * Deletes key from the map if present.
+ */
+static void COVER_map_remove(COVER_map_t *map, U32 key) {
+  U32 cur = COVER_map_index(map, key);
+  COVER_map_pair_t *hole = map->data + cur;
+  U32 gap = 1;  /* number of probe steps from hole to the slot being examined */
+  if (hole->value == MAP_EMPTY_VALUE) return;  /* key is not in the map */
+  for (;;) {
+    COVER_map_pair_t *candidate;
+    U32 displacement;
+    cur = (cur + 1) & map->sizeMask;
+    candidate = map->data + cur;
+    if (candidate->value == MAP_EMPTY_VALUE) {  /* end of the run : release the hole */
+      hole->value = MAP_EMPTY_VALUE;
+      return;
+    }
+    /* how far candidate sits from the slot its hash designates */
+    displacement = (cur - COVER_map_hash(map, candidate->key)) & map->sizeMask;
+    if (displacement < gap) {
+      ++gap;  /* the hole lies before candidate's hash slot : leave it in place */
+      continue;
+    }
+    *hole = *candidate;
+    hole = candidate;
+    gap = 1;
+  }
+}
+
+/**
+ * Destroys a map that is inited with COVER_map_init().
+ * Releases the slot array, then resets data to NULL and size to 0.
+ */
+static void COVER_map_destroy(COVER_map_t *map) {
+  /* free(NULL) is a no-op, so no guard is needed */
+  free(map->data);
+  map->data = NULL;
+  map->size = 0;
+}
+
+/*-*************************************
+* Context
+***************************************/
+
+typedef struct {
+  const BYTE *samples;          /* base of the sample bytes; a dmer is addressed as samples + position */
+  size_t *offsets;              /* COVER_group() takes offsets[0] as the end of the first sample and searches offsets[0..nbSamples) */
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
+  U32 *suffix;                  /* array of positions; a dmerId is the index in here where its group starts */
+  size_t suffixSize;
+  U32 *freqs;
+  U32 *dmerAt;                  /* dmerAt[position] = dmerId, filled by COVER_group() */
+  unsigned d;                   /* nb of bytes compared per dmer by COVER_cmp() / COVER_cmp8() */
+} COVER_ctx_t;
+
+/* We need a global context for qsort... (read by COVER_strict_cmp() and COVER_strict_cmp8()) */
+static COVER_ctx_t *g_coverCtx = NULL;
+
+/*-*************************************
+*  Helper functions
+***************************************/
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t total = 0;
+  unsigned remaining = nbSamples;
+  while (remaining > 0) {
+    total += samplesSizes[--remaining];
+  }
+  return total;
+}
+
+/**
+ * Compares the dmer at lp with the dmer at rp, as memcmp() over d bytes :
+ * negative if lp's dmer is less, 0 if they are equal, positive if greater.
+ * lp and rp each point to a U32 position within ctx->samples.
+ */
+static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  const BYTE *const lhsDmer = ctx->samples + *(const U32 *)lp;
+  const BYTE *const rhsDmer = ctx->samples + *(const U32 *)rp;
+  return memcmp(lhsDmer, rhsDmer, ctx->d);
+}
+/**
+ * Faster version for d <= 8 : compares the low d bytes of two
+ * little-endian 64-bit loads. Returns -1, 0 or 1.
+ */
+static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  unsigned const nbBits = 8 * ctx->d;
+  U64 const keep = (ctx->d == 8) ? (U64)-1 : (((U64)1 << nbBits) - 1);
+  U64 const left = MEM_readLE64(ctx->samples + *(const U32 *)lp) & keep;
+  U64 const right = MEM_readLE64(ctx->samples + *(const U32 *)rp) & keep;
+  /* (a > b) - (a < b) evaluates to -1, 0 or 1 */
+  return (left > right) - (left < right);
+}
+
+/**
+ * Same ordering as COVER_cmp(), except that equal dmers are ordered by the
+ * addresses lp and rp, so two distinct elements never compare equal.
+ * NOTE: g_coverCtx must be set before calling; qsort offers no opaque pointer.
+ */
+static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
+  int const order = COVER_cmp(g_coverCtx, lp, rp);
+  if (order != 0) {
+    return order;
+  }
+  return (lp < rp) ? -1 : 1;
+}
+/**
+ * Faster version for d <= 8 : COVER_cmp8() with ties broken by address.
+ */
+static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
+  int const order = COVER_cmp8(g_coverCtx, lp, rp);
+  if (order != 0) {
+    return order;
+  }
+  return (lp < rp) ? -1 : 1;
+}
+
+/**
+ * Binary search over [first, last) : returns the first pointer whose element
+ * does not compare less than value, or last when no such element exists.
+ */
+static const size_t *COVER_lower_bound(const size_t* first, const size_t* last,
+                                       size_t value) {
+  size_t remaining = (size_t)(last - first);
+  assert(last >= first);
+  /* the answer always lies within [first, first + remaining] */
+  while (remaining > 0) {
+    size_t const half = remaining >> 1;
+    const size_t *const mid = first + half;
+    if (*mid >= value) {
+      remaining = half;
+    } else {
+      first = mid + 1;
+      remaining -= half + 1;
+    }
+  }
+  return first;
+}
+
+/**
+ * Generic groupBy function.
+ * Walks an array of count elements of size bytes each, already sorted by cmp,
+ * and calls grp(ctx, groupBegin, groupEnd) once per run of equivalent elements.
+ */
+static void
+COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
+              int (*cmp)(COVER_ctx_t *, const void *, const void *),
+              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
+  const BYTE *const base = (const BYTE *)data;
+  size_t begin = 0;
+  while (begin < count) {
+    const BYTE *const head = base + begin * size;
+    size_t end = begin + 1;
+    /* every following element is compared against the head of the group */
+    while (end < count && cmp(ctx, head, base + end * size) == 0) {
+      ++end;
+    }
+    grp(ctx, head, base + end * size);
+    begin = end;
+  }
+}
+
+/*-*************************************
+*  Cover functions
+***************************************/
+
+/**
+ * Group callback for COVER_groupBy(): handles the suffix entries in
+ * [group, groupEnd), all positions sharing one dmer.  Fills `ctx->dmerAt` for
+ * each position and stores the group's frequency in its first suffix slot.
+ */
+static void COVER_group(COVER_ctx_t *ctx, const void *group,
+                        const void *groupEnd) {
+  /* The group consists of all the positions with the same first d bytes. */
+  const U32 *grpPtr = (const U32 *)group;
+  const U32 *grpEnd = (const U32 *)groupEnd;
+  /* The dmerId is how we will reference this dmer.
+   * This allows us to map the whole dmer space to a much smaller space, the
+   * size of the suffix array.
+   */
+  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
+  /* Count the number of samples this dmer shows up in */
+  U32 freq = 0;
+  /* Search window into ctx->offsets for the binary search in the loop */
+  const size_t *curOffsetPtr = ctx->offsets;
+  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
+  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
+   * different sample than the last.
+   */
+  size_t curSampleEnd = ctx->offsets[0];
+  for (; grpPtr != grpEnd; ++grpPtr) {
+    /* Save the dmerId for this position so we can get back to it. */
+    ctx->dmerAt[*grpPtr] = dmerId;
+    /* Dictionaries only help for the first reference to the dmer.
+     * After that zstd can reference the match from the previous reference.
+     * So only count each dmer once for each sample it is in.
+     */
+    if (*grpPtr < curSampleEnd) {
+      continue;
+    }
+    freq += 1;
+    /* Binary search to find the end of the sample *grpPtr is in.
+     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
+     * search because the loop is over.
+     */
+    if (grpPtr + 1 != grpEnd) {
+      const size_t *sampleEndPtr =
+          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
+      curSampleEnd = *sampleEndPtr;
+      curOffsetPtr = sampleEndPtr + 1;
+    }
+  }
+  /* At this point we are never going to look at this segment of the suffix
+   * array again.  We take advantage of this fact to save memory.
+   * We store the frequency of the dmer in the first position of the group,
+   * which is dmerId.
+   */
+  ctx->suffix[dmerId] = freq;
+}
+
+
+/**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of dmer d.
+ * Let S_i be the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer d is in the dictionary we set F(d) = 0.
+ */
+static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
+                                           COVER_map_t *activeDmers, U32 begin,
+                                           U32 end,
+                                           ZDICT_cover_params_t parameters) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 dmersInK = k - d + 1;
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+  /* Reset the activeDmers in the segment */
+  COVER_map_clear(activeDmers);
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* The dmerId for the dmer at the next position */
+    U32 newDmer = ctx->dmerAt[activeSegment.end];
+    /* The entry in activeDmers for this dmerId */
+    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
+    /* If the dmer isn't already present in the segment add its score. */
+    if (*newDmerOcc == 0) {
+      /* The paper suggests using the L-0.5 norm, but experiments show that it
+       * doesn't help.
+       */
+      activeSegment.score += freqs[newDmer];
+    }
+    /* Add the dmer to the segment */
+    activeSegment.end += 1;
+    *newDmerOcc += 1;
+
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      U32 delDmer = ctx->dmerAt[activeSegment.begin];
+      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
+      activeSegment.begin += 1;
+      *delDmerOcc -= 1;
+      /* If this is the last occurrence of the dmer, subtract its score */
+      if (*delDmerOcc == 0) {
+        COVER_map_remove(activeDmers, delDmer);
+        activeSegment.score -= freqs[delDmer];
+      }
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+  {
+    /* Trim off the zero frequency head and tail from the segment. */
+    U32 newBegin = bestSegment.end;
+    U32 newEnd = bestSegment.begin;
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      U32 freq = freqs[ctx->dmerAt[pos]];
+      if (freq != 0) {
+        newBegin = MIN(newBegin, pos);
+        newEnd = pos + 1;
+      }
+    }
+    bestSegment.begin = newBegin;
+    bestSegment.end = newEnd;
+  }
+  {
+    /* Zero out the frequency of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      freqs[ctx->dmerAt[pos]] = 0;
+    }
+  }
+  return bestSegment;
+}
+
+/**
+ * Validate a set of COVER parameters against the dictionary capacity.
+ *
+ * The parameters are accepted when all of the following hold:
+ *  - k and d are both non-zero,
+ *  - k <= maxDictSize,
+ *  - d <= k,
+ *  - splitPoint is neither <= 0 nor > 1.
+ *
+ * Returns 1 if the parameters are valid and 0 otherwise.
+ */
+static int COVER_checkParameters(ZDICT_cover_params_t parameters,
+                                 size_t maxDictSize) {
+  /* k and d are required parameters */
+  const int hasKAndD = (parameters.d != 0) && (parameters.k != 0);
+  /* a segment has to fit in the dictionary */
+  const int kFits = !(parameters.k > maxDictSize);
+  /* a dmer has to fit in a segment */
+  const int dFits = !(parameters.d > parameters.k);
+  /* written as a negation so the comparisons match the rejection test */
+  const int splitOk =
+      !(parameters.splitPoint <= 0 || parameters.splitPoint > 1);
+  return hasKAndD && kFits && dFits && splitOk;
+}
+
+/**
+ * Release the buffers owned by a context set up with `COVER_ctx_init()`.
+ * Every pointer is reset to NULL after being freed, and a NULL `ctx` is
+ * ignored.
+ */
+static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
+  if (ctx == NULL) {
+    return;
+  }
+  /* free(NULL) is a no-op, so buffers that were never allocated need no
+   * special handling.
+   */
+  free(ctx->suffix);
+  ctx->suffix = NULL;
+
+  free(ctx->freqs);
+  ctx->freqs = NULL;
+
+  free(ctx->dmerAt);
+  ctx->dmerAt = NULL;
+
+  free(ctx->offsets);
+  ctx->offsets = NULL;
+}
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can be used multiple
+ * times.
+ * Returns 0 on success, or an error code: srcSize_wrong when the samples fail
+ * a size/count check, memory_allocation when a scratch buffer can't be had.
+ * On success the context must be destroyed with `COVER_ctx_destroy()`.
+ */
+static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples,
+                          unsigned d, double splitPoint)
+{
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+  /* Checks */
+  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
+    return ERROR(srcSize_wrong);
+  }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return ERROR(srcSize_wrong);
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
+    return ERROR(srcSize_wrong);
+  }
+  /* Only the training set is indexed: too few bytes would wrap suffixSize */
+  if (trainingSamplesSize < MAX(d, sizeof(U64))) {
+    DISPLAYLEVEL(1, "Training samples size is too small (%u bytes)\n", (unsigned)trainingSamplesSize);
+    return ERROR(srcSize_wrong);
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, (unsigned)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, (unsigned)testSamplesSize);
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
+  /* Partial suffix array */
+  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* Maps index to the dmerID */
+  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* The offsets of each file */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    COVER_ctx_destroy(ctx);
+    return ERROR(memory_allocation);
+  }
+  ctx->freqs = NULL;
+  ctx->d = d;
+  {
+    U32 i;
+    /* Fill offsets from the samplesSizes */
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+    DISPLAYLEVEL(2, "Constructing partial suffix array\n");
+    /* suffix is a partial suffix array.
+     * It only sorts suffixes by their first parameters.d bytes.
+     * The sort is stable, so each dmer group is sorted by position in input.
+     */
+    for (i = 0; i < ctx->suffixSize; ++i) {
+      ctx->suffix[i] = i;
+    }
+    /* qsort doesn't take an opaque pointer, so pass as a global.
+     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
+     */
+    g_coverCtx = ctx;
+#if defined(__OpenBSD__)
+    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#else
+    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#endif
+  }
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  /* For each dmer group (group of positions with the same first d bytes):
+   * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
+   *    (groupBeginPtr - suffix).  This allows us to go from position to
+   *    dmerID so we can look up values in freq.
+   * 2. We calculate how many samples the dmer occurs in and save it in
+   *    freqs[dmerId].
+   */
+  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
+                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
+  ctx->freqs = ctx->suffix;
+  ctx->suffix = NULL;
+  return 0;
+}
+
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+{
+  /* Warn unless the corpus holds at least 10 dmers per dictionary byte. */
+  const double ratio = (double)nbDmers / (double)maxDictSize;
+  if (!(ratio >= 10)) {
+    LOCALDISPLAYLEVEL(displayLevel, 1,
+                      "WARNING: The maximum dictionary size %u is too large "
+                      "compared to the source size %u! "
+                      "size(source)/size(dictionary) = %f, but it should be >= "
+                      "10! This may lead to a subpar dictionary! We recommend "
+                      "training on sources at least 10x, and preferably 100x "
+                      "the size of the dictionary! \n", (U32)maxDictSize,
+                      (U32)nbDmers, ratio);
+  }
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+                                       U32 nbDmers, U32 k, U32 passes)
+{
+  COVER_epoch_info_t epochs;
+  const U32 minEpochSize = k * 10;
+  /* Initial epoch count: maxDictSize / k / passes, but at least 1. */
+  epochs.num = MAX(1, maxDictSize / k / passes);
+  epochs.size = nbDmers / epochs.num;
+  if (epochs.size < minEpochSize) {
+    /* Epochs too small: grow them to minEpochSize (capped at nbDmers). */
+    epochs.size = MIN(minEpochSize, nbDmers);
+    epochs.num = nbDmers / epochs.size;
+  }
+  assert(epochs.size * epochs.num <= nbDmers);
+  return epochs;
+}
+
+/**
+ * Builds the dictionary content at the end of dictBuffer; returns its offset.
+ */
+static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
+                                    COVER_map_t *activeDmers, void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_cover_params_t parameters) {
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+  size_t zeroScoreRun = 0;
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
+                (U32)epochs.num, (U32)epochs.size);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = COVER_selectSegment(
+        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, so continue for some time.
+     */
+    if (segment.score == 0) {
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
+    }
+    zeroScoreRun = 0;
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+/* COVER training with fixed k and d; returns the dictionary size or an error code */
+ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters)
+{
+  BYTE* const dict = (BYTE*)dictBuffer;
+  COVER_ctx_t ctx;
+  COVER_map_t activeDmers;
+  size_t status;
+  size_t tail;
+  /* This entry point always trains on every sample */
+  parameters.splitPoint = 1.0;
+  /* Initialize global data */
+  g_displayLevel = (int)parameters.zParams.notificationLevel;
+  /* Reject invalid arguments before allocating anything */
+  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
+    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+    return ERROR(parameter_outOfBound);
+  }
+  if (nbSamples == 0) {
+    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
+    return ERROR(srcSize_wrong);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                 ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  /* Initialize context and activeDmers */
+  status = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                          parameters.d, parameters.splitPoint);
+  if (ZSTD_isError(status)) {
+    return status;
+  }
+  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    COVER_ctx_destroy(&ctx);
+    return ERROR(memory_allocation);
+  }
+
+  DISPLAYLEVEL(2, "Building dictionary\n");
+  /* Fill dictBuffer from the back, then let zdict finalize it in place */
+  tail = COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
+                               dictBufferCapacity, parameters);
+  status = ZDICT_finalizeDictionary(
+      dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+      samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
+  if (!ZSTD_isError(status)) {
+    DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                 (unsigned)status);
+  }
+  COVER_ctx_destroy(&ctx);
+  COVER_map_destroy(&activeDmers);
+  return status;
+}
+
+
+
+/* Returns dictBufferCapacity plus the compressed size of each checked sample, or an error code. */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                    const size_t *samplesSizes, const BYTE *samples,
+                                    size_t *offsets,
+                                    size_t nbTrainSamples, size_t nbSamples,
+                                    BYTE *const dict, size_t dictBufferCapacity) {
+  /* With splitPoint < 1.0 only the samples after the training set are
+   * compressed; otherwise every sample is.
+   */
+  const size_t firstSample = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  size_t maxSampleSize = 0;
+  size_t dstCapacity;
+  size_t i;
+  ZSTD_CCtx *cctx;
+  ZSTD_CDict *cdict;
+  void *dst;
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  for (i = firstSample; i < nbSamples; ++i) {
+    maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+  }
+  dstCapacity = ZSTD_compressBound(maxSampleSize);
+  dst = malloc(dstCapacity);
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                           parameters.zParams.compressionLevel);
+  if (dst == NULL || cctx == NULL || cdict == NULL) {
+    goto _compressCleanup;
+  }
+  /* The total starts at dictBufferCapacity and each sample's compressed
+   * size is added to it; the first failure replaces the total with that
+   * error code and stops the loop.
+   */
+  totalCompressedSize = dictBufferCapacity;
+  for (i = firstSample; i < nbSamples; ++i) {
+    const size_t size = ZSTD_compress_usingCDict(
+        cctx, dst, dstCapacity, samples + offsets[i],
+        samplesSizes[i], cdict);
+    if (ZSTD_isError(size)) {
+      totalCompressedSize = size;
+      break;
+    }
+    totalCompressedSize += size;
+  }
+_compressCleanup:
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  free(dst);
+  return totalCompressedSize;
+}
+
+
+/**
+ * Set up a `COVER_best_t`: no live jobs, no dictionary, size (size_t)-1.
+ */
+void COVER_best_init(COVER_best_t *best) {
+  if (!best) return; /* a NULL `best` is accepted and ignored */
+  memset(&best->parameters, 0, sizeof(best->parameters));
+  best->compressedSize = (size_t)-1;
+  best->dictSize = 0;
+  best->dict = NULL;
+  best->liveJobs = 0;
+  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
+  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
+}
+
+/**
+ * Block the caller until every started job has finished (liveJobs == 0).
+ */
+void COVER_best_wait(COVER_best_t *best) {
+  if (best != NULL) {
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    while (best->liveJobs > 0) {
+      /* liveJobs is only read while holding the mutex */
+      ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+
+/**
+ * Wait for all live jobs with COVER_best_wait(), then release the stored
+ * dictionary and destroy the mutex and condition variable.
+ * A NULL `best` is ignored.
+ */
+void COVER_best_destroy(COVER_best_t *best) {
+  if (best == NULL) {
+    return;
+  }
+  COVER_best_wait(best);
+  free(best->dict); /* free(NULL) is a no-op */
+  ZSTD_pthread_mutex_destroy(&best->mutex);
+  ZSTD_pthread_cond_destroy(&best->cond);
+}
+
+/**
+ * Register one more job; call this right before the job is launched.
+ * Increments liveJobs under the mutex.  Each call is expected to be matched
+ * by one COVER_best_finish(), which decrements it.  A NULL `best` is ignored.
+ */
+void COVER_best_start(COVER_best_t *best) {
+  if (best != NULL) {
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    best->liveJobs += 1;
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+
+/**
+ * Called when a thread finishes executing, on both error and success.
+ * Decrements liveJobs and wakes all waiting threads once liveJobs == 0.
+ * If this dictionary is the best so far, saves a copy of it and its parameters.
+ */
+void COVER_best_finish(COVER_best_t* best,
+                      ZDICT_cover_params_t parameters,
+                      COVER_dictSelection_t selection)
+{
+  void* dict = selection.dictContent;
+  size_t compressedSize = selection.totalCompressedSize;
+  size_t dictSize = selection.dictSize;
+  if (!best) {
+    return;
+  }
+  {
+    size_t liveJobs;
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    --best->liveJobs;
+    liveJobs = best->liveJobs;
+    /* If the new dictionary is better (smaller total compressed size) */
+    if (compressedSize < best->compressedSize) {
+      /* (Re)allocate when there is no buffer yet or it is too small */
+      if (!best->dict || best->dictSize < dictSize) {
+        if (best->dict) {
+          free(best->dict);
+        }
+        best->dict = malloc(dictSize);
+        if (!best->dict) {
+          best->compressedSize = ERROR(GENERIC);
+          best->dictSize = 0;
+          ZSTD_pthread_cond_signal(&best->cond);
+          ZSTD_pthread_mutex_unlock(&best->mutex);
+          return;
+        }
+      }
+      /* Save the dictionary, parameters, and size (only if there is one) */
+      if (dict) {
+        memcpy(best->dict, dict, dictSize);
+        best->dictSize = dictSize;
+        best->parameters = parameters;
+        best->compressedSize = compressedSize;
+      }
+    }
+    if (liveJobs == 0) {
+      ZSTD_pthread_cond_broadcast(&best->cond);
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+  }
+}
+/* Bundle a dictionary buffer, its size and its total compressed size. */
+static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz)
+{
+    COVER_dictSelection_t selection;
+    selection.totalCompressedSize = csz;
+    selection.dictSize = s;
+    selection.dictContent = buf;
+    return selection;
+}
+/* Build an error selection: no buffer, size 0, `error` as compressed size. */
+COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
+    return setDictSelection(NULL, 0, error);
+}
+/* Non-zero when the selection has no buffer or carries an error code. */
+unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
+  return (selection.dictContent == NULL || ZSTD_isError(selection.totalCompressedSize));
+}
+/* Free the dictionary buffer held by `selection`. */
+void COVER_dictSelectionFree(COVER_dictSelection_t selection){
+  free(selection.dictContent);
+}
+
+/**
+ * Finalize the selected content into a dictionary and measure how well the
+ * checked samples compress with it.  When params.shrinkDict is set, smaller
+ * dictionaries (from ZDICT_DICTSIZE_MIN, doubling) are tried as well and the
+ * first one within params.shrinkDictMaxRegression percent is returned.
+ * The returned selection owns its buffer (see COVER_dictSelectionFree());
+ * on failure an error selection without a buffer is returned.
+ */
+COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
+        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
+        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
+  size_t largestDict = 0;
+  size_t largestCompressed = 0;
+  BYTE* customDictContentEnd = customDictContent + dictContentSize;
+  BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);
+  BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);
+  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
+
+  if (!largestDictbuffer || !candidateDictBuffer) {
+    free(largestDictbuffer);
+    free(candidateDictBuffer);
+    return COVER_dictSelectionError(ERROR(memory_allocation));
+  }
+
+  /* Initial dictionary size and compressed size */
+  memcpy(largestDictbuffer, customDictContent, dictContentSize);
+  dictContentSize = ZDICT_finalizeDictionary(
+    largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
+    samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
+  if (ZDICT_isError(dictContentSize)) {
+    free(largestDictbuffer);
+    free(candidateDictBuffer);
+    return COVER_dictSelectionError(dictContentSize);
+  }
+
+  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
+                                                       samplesBuffer, offsets,
+                                                       nbCheckSamples, nbSamples,
+                                                       largestDictbuffer, dictContentSize);
+  if (ZSTD_isError(totalCompressedSize)) {
+    free(largestDictbuffer);
+    free(candidateDictBuffer);
+    return COVER_dictSelectionError(totalCompressedSize);
+  }
+
+  if (params.shrinkDict == 0) {
+    free(candidateDictBuffer);
+    return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);
+  }
+
+  largestDict = dictContentSize;
+  largestCompressed = totalCompressedSize;
+  dictContentSize = ZDICT_DICTSIZE_MIN;
+
+  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
+  while (dictContentSize < largestDict) {
+    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
+    dictContentSize = ZDICT_finalizeDictionary(
+      candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
+      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
+    if (ZDICT_isError(dictContentSize)) {
+      free(largestDictbuffer);
+      free(candidateDictBuffer);
+      return COVER_dictSelectionError(dictContentSize);
+    }
+
+    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
+                                                         samplesBuffer, offsets,
+                                                         nbCheckSamples, nbSamples,
+                                                         candidateDictBuffer, dictContentSize);
+    if (ZSTD_isError(totalCompressedSize)) {
+      free(largestDictbuffer);
+      free(candidateDictBuffer);
+      return COVER_dictSelectionError(totalCompressedSize);
+    }
+    if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) {
+      free(largestDictbuffer);
+      return setDictSelection( candidateDictBuffer, dictContentSize, totalCompressedSize );
+    }
+    dictContentSize *= 2;
+  }
+  dictContentSize = largestDict;
+  totalCompressedSize = largestCompressed;
+  free(candidateDictBuffer);
+  return setDictSelection( largestDictbuffer, dictContentSize, totalCompressedSize );
+}
+
+/**
+ * Parameters for COVER_tryParameters().
+ */
+typedef struct COVER_tryParameters_data_s {
+  const COVER_ctx_t *ctx;          /* context shared by the jobs, read-only */
+  COVER_best_t *best;              /* where the job reports its result */
+  size_t dictBufferCapacity;       /* size of the dictionary buffer to build */
+  ZDICT_cover_params_t parameters; /* the parameter set to try */
+} COVER_tryParameters_data_t;
+
+/**
+ * Tries one parameter set: builds a dictionary and reports it through
+ * COVER_best_finish() on every path.  Thread safe if zstd is compiled with
+ * multithreading.  `opaque` is an *OWNING* pointer to the job data, freed here.
+ */
+static void COVER_tryParameters(void *opaque)
+{
+  /* Save parameters as local variables */
+  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque;
+  const COVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Allocate space for hash table, dict, and freqs */
+  COVER_map_t activeDmers;
+  BYTE* const dict = (BYTE*)malloc(dictBufferCapacity);
+  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
+  U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32));
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    goto _cleanup;
+  }
+  if (!dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
+                                              dictBufferCapacity, parameters);
+    selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
+        totalCompressedSize);
+
+    if (COVER_dictSelectionIsError(selection)) {
+      DISPLAYLEVEL(1, "Failed to select dictionary\n");
+      goto _cleanup;
+    }
+  }
+_cleanup:
+  free(dict);
+  COVER_best_finish(data->best, parameters, selection);
+  free(data);
+  COVER_map_destroy(&activeDmers);
+  COVER_dictSelectionFree(selection);
+  free(freqs);
+}
+
+/**
+ * Tries a grid of (d, k) values (a single value for whichever is non-zero in
+ * *parameters) and keeps the dictionary with the smallest compressed size.
+ * On success fills dictBuffer and *parameters and returns the dictionary size.
+ */
+ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,
+    const size_t* samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t* parameters)
+{
+  /* constants */
+  const unsigned nbThreads = parameters->nbThreads;
+  const double splitPoint = parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
+  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+  const unsigned kIterations = (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+  const unsigned shrinkDict = 0;
+  /* Local variables */
+  const int displayLevel = parameters->zParams.notificationLevel;
+  unsigned iteration = 1;
+  unsigned d, k;
+  COVER_best_t best;
+  POOL_ctx *pool = NULL;
+  int warned = 0;
+  /* Checks (g_displayLevel is not set yet, so report against displayLevel) */
+  if (splitPoint <= 0 || splitPoint > 1) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(parameter_outOfBound);
+  }
+  if (kMinK < kMaxD || kMaxK < kMinK) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(parameter_outOfBound);
+  }
+  if (nbSamples == 0) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Cover must have at least one input file\n");
+    return ERROR(srcSize_wrong);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
+                      ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  if (nbThreads > 1) {
+    pool = POOL_create(nbThreads, 1);
+    if (!pool) {
+      return ERROR(memory_allocation);
+    }
+  }
+  /* Initialization */
+  COVER_best_init(&best);
+  /* Turn down global display level to clean up display at level 2 and below */
+  g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+  /* Loop through d first because each new value needs a new context */
+  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", kIterations);
+  for (d = kMinD; d <= kMaxD; d += 2) {
+    /* Initialize the context for this value of d */
+    COVER_ctx_t ctx;
+    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+    {
+      const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
+      if (ZSTD_isError(initVal)) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return initVal;
+      }
+    }
+    if (!warned) {
+      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+      warned = 1;
+    }
+    /* Loop through k reusing the same context */
+    for (k = kMinK; k <= kMaxK; k += kStepSize) {
+      /* Prepare the arguments */
+      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
+          sizeof(COVER_tryParameters_data_t));
+      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+      if (!data) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+        COVER_best_destroy(&best);
+        COVER_ctx_destroy(&ctx);
+        POOL_free(pool);
+        return ERROR(memory_allocation);
+      }
+      data->ctx = &ctx;
+      data->best = &best;
+      data->dictBufferCapacity = dictBufferCapacity;
+      data->parameters = *parameters;
+      data->parameters.k = k;
+      data->parameters.d = d;
+      data->parameters.splitPoint = splitPoint;
+      data->parameters.steps = kSteps;
+      data->parameters.shrinkDict = shrinkDict;
+      data->parameters.zParams.notificationLevel = g_displayLevel;
+      /* Check the parameters */
+      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
+        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+        free(data);
+        continue;
+      }
+      /* Call the function and pass ownership of data to it */
+      COVER_best_start(&best);
+      if (pool) {
+        POOL_add(pool, &COVER_tryParameters, data);
+      } else {
+        COVER_tryParameters(data);
+      }
+      /* Print status */
+      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                         (unsigned)((iteration * 100) / kIterations));
+      ++iteration;
+    }
+    COVER_best_wait(&best);
+    COVER_ctx_destroy(&ctx);
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+  /* Fill the output buffer and parameters with output of the best parameters */
+  {
+    const size_t dictSize = best.dictSize;
+    if (ZSTD_isError(best.compressedSize)) {
+      const size_t compressedSize = best.compressedSize;
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return compressedSize;
+    }
+    *parameters = best.parameters;
+    memcpy(dictBuffer, best.dict, dictSize);
+    COVER_best_destroy(&best);
+    POOL_free(pool);
+    return dictSize;
+  }
+}
diff --git a/src/zstd/dictBuilder/cover.h b/src/zstd/dictBuilder/cover.h
new file mode 100644
index 000000000..a5d7506ef
--- /dev/null
+++ b/src/zstd/dictBuilder/cover.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+
+#include "../common/threading.h" /* ZSTD_pthread_mutex_t */
+#include "../common/mem.h"   /* U32, BYTE */
+#include "../zdict.h"
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;      /* presumably guards the fields below -- confirm in cover.c */
+  ZSTD_pthread_cond_t cond;        /* presumably what waiters block on until liveJobs == 0 -- confirm in cover.c */
+  size_t liveJobs;                 /* jobs announced via COVER_best_start() and not yet finished */
+  void *dict;                      /* best dictionary content saved so far */
+  size_t dictSize;                 /* number of bytes in `dict` */
+  ZDICT_cover_params_t parameters; /* parameters saved together with `dict` */
+  size_t compressedSize;           /* compressed size for `dict`; callers test it with ZSTD_isError() */
+} COVER_best_t;
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;  /* start of the range (NOTE(review): units/inclusiveness are not visible in this header) */
+  U32 end;    /* end of the range (see note on `begin`) */
+  U32 score;  /* score of the segment */
+} COVER_segment_t;
+
+/**
+ * Number of epochs and size of each epoch (the return type of COVER_computeEpochs()).
+ */
+typedef struct {
+  U32 num;   /* number of epochs */
+  U32 size;  /* size of each epoch */
+} COVER_epoch_info_t;
+
+/**
+ * Result of the dictionary selection function COVER_selectDict(); release it with COVER_dictSelectionFree().
+ */
+typedef struct COVER_dictSelection {
+  BYTE* dictContent;           /* content of the selected dictionary */
+  size_t dictSize;             /* presumably the size of dictContent in bytes -- confirm in cover.c */
+  size_t totalCompressedSize;  /* holds a ZSTD error code when the selection failed (see COVER_dictSelectionError()) */
+} COVER_dictSelection_t;
+
+/**
+ * Computes the number of epochs and the size of each epoch.
+ * We will make sure that each epoch gets at least 10 * k bytes.
+ *
+ * The COVER algorithms divide the data up into epochs of equal size and
+ * select one segment from each epoch.
+ *
+ * @param maxDictSize The maximum allowed dictionary size.
+ * @param nbDmers     The number of dmers we are training on.
+ * @param k           The parameter k (segment size).
+ * @param passes      The target number of passes over the dmer corpus.
+ *                    More passes means a better dictionary.
+ */
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
+                                       U32 k, U32 passes);
+
+/**
+ * Warns the user when their corpus is too small.
+ */
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
+
+/**
+ *  Checks total compressed size of a dictionary
+ */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity);
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+void COVER_best_init(COVER_best_t *best);
+
+/**
+ * Wait until liveJobs == 0.
+ */
+void COVER_best_wait(COVER_best_t *best);
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+void COVER_best_destroy(COVER_best_t *best);
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+void COVER_best_start(COVER_best_t *best);
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
+                       COVER_dictSelection_t selection);
+/**
+ * Error function for COVER_selectDict function. Checks if the return
+ * value is an error.
+ */
+unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
+
+/**
+ * Error constructor for the COVER_selectDict function. Returns a struct whose
+ * totalCompressedSize field is a ZSTD error.
+ */
+COVER_dictSelection_t COVER_dictSelectionError(size_t error);
+
+/**
+ * Always call after selectDict is called to free up used memory from
+ * newly created dictionary.
+ */
+void COVER_dictSelectionFree(COVER_dictSelection_t selection);
+
+/**
+ * Called to finalize the dictionary and select one based on whether or not
+ * the shrink-dict flag was enabled. If enabled the dictionary used is the
+ * smallest dictionary within a specified regression of the compressed size
+ * from the largest dictionary.
+ */
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
+                       size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
+                       size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
diff --git a/src/zstd/dictBuilder/divsufsort.c b/src/zstd/dictBuilder/divsufsort.c
new file mode 100644
index 000000000..a2870fb3b
--- /dev/null
+++ b/src/zstd/dictBuilder/divsufsort.c
@@ -0,0 +1,1913 @@
+/*
+ * divsufsort.c for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*- Compiler specifics -*/
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+#endif
+
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4244)
+#  pragma warning(disable : 4127)    /* C4127 : Condition expression is constant */
+#endif
+
+
+/*- Dependencies -*/
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "divsufsort.h"
+
+/*- Constants (the tunables may be predefined by the build; out-of-range values are replaced below) -*/
+#if defined(INLINE)
+# undef INLINE
+#endif
+#if !defined(INLINE)
+# define INLINE __inline /* always taken: any earlier INLINE was undefined just above */
+#endif
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (256) /* default, also used when a predefined value was < 1 */
+#endif
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+#  undef SS_INSERTIONSORT_THRESHOLD
+#  define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8) /* ss_mintrosort hands ranges of at most this many elements to ss_insertionsort */
+#endif
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024) /* block length used by sssort; 0 makes sssort call ss_mintrosort on the whole range */
+#endif
+/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# define SS_MISORT_STACKSIZE (96) /* SS_MISORT_STACKSIZE is the capacity of the explicit stack in ss_mintrosort */
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+#define SS_SMERGE_STACKSIZE (32) /* capacity of the explicit stack in ss_swapmerge */
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#define TR_STACKSIZE (64)
+
+
+/*- Macros (STACK_* use the caller's `stack`, `ssize` and STACK_SIZE; STACK_POP* return from the caller on an empty stack) -*/
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
+#endif /* SWAP: relies on a temporary named `t` of a suitable type in the enclosing scope */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
+#endif /* MIN: evaluates one of its arguments twice */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
+#endif /* MAX: evaluates one of its arguments twice */
+#define STACK_PUSH(_a, _b, _c, _d)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+#define STACK_POP(_a, _b, _c, _d)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+#define BUCKET_A(_c0) bucket_A[(_c0)] /* expects an array named bucket_A in scope */
+#if ALPHABET_SIZE == 256
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) /* same bucket_B array as BUCKET_BSTAR, with _c0/_c1 in swapped roles */
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Functions -*/
+
+static const int lg_table[256]= { /* lg_table[i] == floor(log2(i)) for 1 <= i <= 255; lg_table[0] == -1 */
+ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+int
+ss_ilg(int n) { /* Integer log2 of n via lg_table; how many bits of n are examined depends on SS_BLOCKSIZE. */
+#if SS_BLOCKSIZE == 0
+  return (n & 0xffff0000) ?  /* all 32 bits considered: pick the highest non-zero byte */
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+#elif SS_BLOCKSIZE < 256
+  return lg_table[n];  /* indexes lg_table directly, so n must lie in [0, 255] */
+#else
+  return (n & 0xff00) ?  /* only the low 16 bits of n are examined */
+          8 + lg_table[(n >> 8) & 0xff] :
+          0 + lg_table[(n >> 0) & 0xff];
+#endif
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+#if SS_BLOCKSIZE != 0
+
+static const int sqq_table[256] = { /* square-root seed table for ss_isqrt; spot checks match floor(16 * sqrt(i)) */
+  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
+ 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
+ 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
+110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
+156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
+169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
+181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
+192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
+202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
+212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
+221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
+230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
+239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
+247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
+};
+
+static INLINE
+int
+ss_isqrt(int x) { /* Integer square root of x (table seed plus refinement), capped at SS_BLOCKSIZE. */
+  int y, e;
+
+  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
+  e = (x & 0xffff0000) ?  /* e = lg_table value for the highest non-zero byte of x, plus that byte's bit offset */
+        ((x & 0xff000000) ?
+          24 + lg_table[(x >> 24) & 0xff] :
+          16 + lg_table[(x >> 16) & 0xff]) :
+        ((x & 0x0000ff00) ?
+           8 + lg_table[(x >>  8) & 0xff] :
+           0 + lg_table[(x >>  0) & 0xff]);
+
+  if(e >= 16) {
+    y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);  /* seed from the top bits of x */
+    if(e >= 24) { y = (y + 1 + x / y) >> 1; }  /* extra refinement step for the largest inputs */
+    y = (y + 1 + x / y) >> 1;  /* refinement: rounded average of y and x / y */
+  } else if(e >= 8) {
+    y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
+  } else {
+    return sqq_table[x] >> 4;  /* e < 8: x indexes the table directly */
+  }
+
+  return (x < (y * y)) ? y - 1 : y;  /* step back by one when the estimate overshoots */
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Compares the substrings referenced by p1 and p2, starting `depth` bytes in. Each substring
+   ends at T + p[1] + 2. Returns <0, 0 or >0; a substring that runs out first sorts first. */
+static INLINE
+int
+ss_compare(const unsigned char *T,
+           const int *p1, const int *p2,
+           int depth) {
+  const unsigned char *lhs = T + depth + p1[0];
+  const unsigned char *rhs = T + depth + p2[0];
+  const unsigned char *const lhsEnd = T + p1[1] + 2;
+  const unsigned char *const rhsEnd = T + p2[1] + 2;
+
+  /* Skip the common prefix, stopping as soon as either substring is exhausted. */
+  while((lhs < lhsEnd) && (rhs < rhsEnd) && (*lhs == *rhs)) { ++lhs; ++rhs; }
+
+  /* Decide on exhaustion first, then on the first differing byte. */
+  if(lhs >= lhsEnd) { return (rhs < rhsEnd) ? -1 : 0; }
+  if(rhs >= rhsEnd) { return 1; }
+  return *lhs - *rhs;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
+
+/* Insertion sort of [first, last) using ss_compare() at `depth`; ties are recorded by storing ~value (negative). */
+static
+void
+ss_insertionsort(const unsigned char *T, const int *PA,
+                 int *first, int *last, int depth) {
+  int *i, *j;
+  int t;
+  int r;
+
+  for(i = last - 2; first <= i; --i) {  /* grow the sorted tail [i + 1, last) leftwards */
+    for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
+      do { *(j - 1) = *j; } while((++j < last) && (*j < 0));  /* shift *j left together with the negative entries after it */
+      if(last <= j) { break; }
+    }
+    if(r == 0) { *j = ~*j; }  /* t compares equal to *j: mark *j by complementing it */
+    *(j - 1) = t;
+  }
+}
+
+#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+void
+ss_fixdown(const unsigned char *Td, const int *PA,
+           int *SA, int i, int size) {  /* Sifts SA[i] down a max-heap keyed by Td[PA[SA[x]]]. */
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = Td[PA[SA[k = j++]]];  /* key of the left child; j now indexes the right child */
+    if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }  /* NOTE(review): SA[j] is read without comparing j against size */
+    if(d <= c) { break; }  /* neither child is larger than v: v stays at i */
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort of SA[0..size) into ascending order of the key Td[PA[SA[x]]]. */
+static
+void
+ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {  /* keep the heap size m odd; the extra last element is handled separately */
+    m--;
+    if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }  /* build the heap over SA[0..m) */
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }  /* heap maximum goes to the last slot */
+  for(i = m - 1; 0 < i; --i) {  /* repeatedly move the current maximum to SA[i] */
+    t = SA[0], SA[0] = SA[i];
+    ss_fixdown(Td, PA, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns whichever of v1, v2, v3 references the median key Td[PA[*v]]. */
+static INLINE
+int *
+ss_median3(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3) {
+  int *lo = v1, *mid = v2;
+  /* Order the first two candidates so that key(lo) <= key(mid). */
+  if(Td[PA[*lo]] > Td[PA[*mid]]) { lo = v2; mid = v1; }
+  /* mid is the median unless v3's key is smaller than it. */
+  if(Td[PA[*mid]] <= Td[PA[*v3]]) { return mid; }
+  /* v3 lies below mid: the median is lo when lo's key exceeds v3's, else v3. */
+  return (Td[PA[*lo]] > Td[PA[*v3]]) ? lo : v3;
+}
+
+/* Returns the pointer among v1..v5 whose key Td[PA[*v]] is the median of the five. */
+static INLINE
+int *
+ss_median5(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;  /* temporary used by SWAP */
+  if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }  /* order the pair (v2, v3) */
+  if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }  /* order the pair (v4, v5) */
+  if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }  /* pair with the smaller minimum comes first */
+  if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
+  if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }  /* the median is the smaller of v3 and v4 */
+  return v3;
+}
+
+/* Returns a pointer to the pivot element of [first, last), chosen by sampling keys Td[PA[*x]]. */
+static INLINE
+int *
+ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;  /* number of elements in the range */
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return ss_median3(Td, PA, first, middle, last - 1);  /* small range: median of 3 samples */
+    } else {
+      t >>= 2;
+      return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);  /* medium range: median of 5 samples */
+    }
+  }
+  t >>= 3;  /* large range: median of three medians-of-3 (nine samples) */
+  first  = ss_median3(Td, PA, first, first + t, first + (t << 1));
+  middle = ss_median3(Td, PA, middle - t, middle, middle + t);
+  last   = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return ss_median3(Td, PA, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Binary partition: entries x with PA[x] + depth >= PA[x + 1] + 1 go to the front (complemented); returns the boundary. */
+static INLINE
+int *
+ss_partition(const int *PA,
+                    int *first, int *last, int depth) {
+  int *a, *b;
+  int t;
+  for(a = first - 1, b = last;;) {
+    for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }  /* front-group entries are complemented in place */
+    for(; (a < --b) && ((PA[*b] + depth) <  (PA[*b + 1] + 1));) { }  /* skip entries already in the back group */
+    if(b <= a) { break; }
+    t = ~*b;  /* exchange the misplaced pair, complementing the entry that moves to the front */
+    *b = *a;
+    *a = t;
+  }
+  if(first < a) { *first = ~*first; }  /* front group is non-empty: undo the complement on its first entry */
+  return a;
+}
+
+/* Multikey introsort for medium size groups: sorts [first, last) by the byte at `depth`, using an explicit stack. */
+static
+void
+ss_mintrosort(const unsigned char *T, const int *PA,
+              int *first, int *last,
+              int depth) {
+#define STACK_SIZE SS_MISORT_STACKSIZE
+  struct { int *a, *b, c; int d; } stack[STACK_SIZE];  /* pending ranges: a = first, b = last, c = depth, d = limit */
+  const unsigned char *Td;
+  int *a, *b, *c, *d, *e, *f;
+  int s, t;
+  int ssize;
+  int limit;
+  int v, x = 0;
+
+  for(ssize = 0, limit = ss_ilg(last - first);;) {
+
+    if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {  /* small range: finish it directly, then take the next pending one */
+#if 1 < SS_INSERTIONSORT_THRESHOLD
+      if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
+#endif
+      STACK_POP(first, last, depth, limit);  /* returns from this function when the stack is empty */
+      continue;
+    }
+
+    Td = T + depth;
+    if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }  /* partition budget used up: heapsort on this byte */
+    if(limit < 0) {  /* range is treated as ordered by Td[PA[x]]: split it into runs of equal bytes */
+      for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
+        if((x = Td[PA[*a]]) != v) {
+          if(1 < (a - first)) { break; }  /* [first, a) is a run of more than one element */
+          v = x;
+          first = a;
+        }
+      }
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, a, depth);
+      }
+      if((a - first) <= (last - a)) {  /* continue with the smaller part, push the larger one */
+        if(1 < (a - first)) {
+          STACK_PUSH(a, last, depth, -1);
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        } else {
+          first = a, limit = -1;
+        }
+      } else {
+        if(1 < (last - a)) {
+          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
+          first = a, limit = -1;
+        } else {
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        }
+      }
+      continue;
+    }
+
+    /* choose pivot */
+    a = ss_pivot(Td, PA, first, last);
+    v = Td[PA[*a]];
+    SWAP(*first, *a);
+
+    /* partition: entries equal to v are collected at both ends, then swapped into the middle below */
+    for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
+    if(((a = b) < last) && (x < v)) {
+      for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+    }
+    for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
+    if((b < (d = c)) && (x > v)) {
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+    for(; b < c;) {
+      SWAP(*b, *c);
+      for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+
+    if(a <= d) {  /* at least one entry differs from the pivot byte */
+      c = b - 1;
+
+      if((s = a - first) > (t = b - a)) { s = t; }
+      for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }  /* move the left equal block into the middle */
+      if((s = d - c) > (t = last - d - 1)) { s = t; }
+      for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }  /* move the right equal block into the middle */
+
+      a = first + (b - a), c = last - (d - c);  /* now [first, a) < v, [a, c) == v, [c, last) > v */
+      b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
+
+      /* of [first, a), [b, c) and [c, last): continue with the smallest and push the other two */
+      if((a - first) <= (last - c)) {
+        if((last - c) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(c, last, depth, limit);
+          last = a;
+        } else if((a - first) <= (c - b)) {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          last = a;
+        } else {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(first, a, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      } else {
+        if((a - first) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(first, a, depth, limit);
+          first = c;
+        } else if((last - c) <= (c - b)) {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          first = c;
+        } else {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(c, last, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      }
+    } else {  /* every entry has the pivot byte at this depth: move on to the next byte */
+      limit += 1;  /* give back the budget unit consumed above */
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, last, depth);
+        limit = ss_ilg(last - first);
+      }
+      depth += 1;
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if SS_BLOCKSIZE != 0
+
+static INLINE
+void
+ss_blockswap(int *a, int *b, int n) { /* Exchanges a[0..n) with b[0..n), front to back; n <= 0 does nothing. */
+  int i;
+  for(i = 0; i < n; ++i) {
+    const int held = a[i]; a[i] = b[i]; b[i] = held;
+  }
+}
+
+static INLINE
+void
+ss_rotate(int *first, int *middle, int *last) { /* In-place exchange of the adjacent ranges [first, middle) and [middle, last). */
+  int *a, *b, t;
+  int l, r;
+  l = middle - first, r = last - middle;  /* lengths of the left and right ranges */
+  for(; (0 < l) && (0 < r);) {
+    if(l == r) { ss_blockswap(first, middle, l); break; }  /* equal lengths: a plain block swap finishes the job */
+    if(l < r) {  /* left range is shorter: work backwards from the end */
+      a = last - 1, b = middle - 1;
+      t = *a;
+      do {
+        *a-- = *b, *b-- = *a;
+        if(b < first) {
+          *a = t;
+          last = a;
+          if((r -= l + 1) <= l) { break; }
+          a -= 1, b = middle - 1;
+          t = *a;
+        }
+      } while(1);
+    } else {  /* right range is shorter: work forwards from the start */
+      a = first, b = middle;
+      t = *a;
+      do {
+        *a++ = *b, *b++ = *a;
+        if(last <= b) {
+          *a = t;
+          first = a + 1;
+          if((l -= r + 1) <= r) { break; }
+          a += 1, b = middle;
+          t = *a;
+        }
+      } while(1);
+    }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static  /* Merges [first, middle) and [middle, last) without a buffer: binary search in the left part, then ss_rotate. */
+void
+ss_inplacemerge(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int depth) {
+  const int *p;
+  int *a, *b;
+  int len, half;
+  int q, r;
+  int x;
+
+  for(;;) {
+    if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }  /* x remembers that the last entry is stored complemented */
+    else                { x = 0; p = PA +  *(last - 1); }
+    for(a = first, len = middle - first, half = len >> 1, r = -1;  /* binary search for the last entry's slot in the left part */
+        0 < len;
+        len = half, half >>= 1) {
+      b = a + half;
+      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
+      if(q < 0) {
+        a = b + 1;
+        half -= (len & 1) ^ 1;
+      } else {
+        r = q;
+      }
+    }
+    if(a < middle) {
+      if(r == 0) { *a = ~*a; }  /* an equal entry was found: mark *a by complementing it */
+      ss_rotate(a, middle, last);
+      last -= middle - a;
+      middle = a;
+      if(first == middle) { break; }
+    }
+    --last;
+    if(x != 0) { while(*--last < 0) { } }  /* step back over the complemented entries before it */
+    if(middle == last) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merge-forward with internal buffer: the left part [first, middle) is swapped into buf and merged front to back. */
+static
+void
+ss_mergeforward(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int *buf, int depth) {
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+
+  bufend = buf + (middle - first) - 1;  /* last buffer slot in use */
+  ss_blockswap(buf, first, middle - first);
+
+  for(t = *(a = first), b = buf, c = middle;;) {  /* a = output, b = buffered left part, c = right part */
+    r = ss_compare(T, PA + *b, PA + *c, depth);
+    if(r < 0) {  /* take from the buffer, together with the negative entries that follow */
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+    } else if(r > 0) {  /* take from the right part, together with the negative entries that follow */
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    } else {  /* equal: mark *c by complementing it, then emit the buffer run followed by the right run */
+      *c = ~*c;
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    }
+  }
+}
+
+/* Merge-backward with internal buffer: the right part [middle, last) is swapped into buf and merged back to front. */
+static
+void
+ss_mergebackward(const unsigned char *T, const int *PA,
+                 int *first, int *middle, int *last,
+                 int *buf, int depth) {
+  const int *p1, *p2;
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+  int x;
+
+  bufend = buf + (last - middle) - 1;  /* last buffer slot in use */
+  ss_blockswap(buf, middle, last - middle);
+
+  x = 0;  /* bit 1: the current buffer entry is complemented; bit 2: the current left entry is complemented */
+  if(*bufend < 0)       { p1 = PA + ~*bufend; x |= 1; }
+  else                  { p1 = PA +  *bufend; }
+  if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
+  else                  { p2 = PA +  *(middle - 1); }
+  for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {  /* a = output, b = buffered right part, c = left part */
+    r = ss_compare(T, p1, p2, depth);
+    if(0 < r) {  /* buffer entry is larger: emit it */
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = *b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+    } else if(r < 0) {  /* left entry is larger: emit it */
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    } else {  /* equal: emit the buffer entry complemented, then the left entry */
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = ~*b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    }
+  }
+}
+
+/* D&C based merge of [first, middle) and [middle, last) with a buffer of bufsize elements and an explicit stack. */
+static
+void
+ss_swapmerge(const unsigned char *T, const int *PA,
+             int *first, int *middle, int *last,
+             int *buf, int bufsize, int depth) {
+#define STACK_SIZE SS_SMERGE_STACKSIZE
+#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
+#define MERGE_CHECK(a, b, c)\
+  do {\
+    if(((c) & 1) ||\
+       (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
+      *(a) = ~*(a);\
+    }\
+    if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
+      *(b) = ~*(b);\
+    }\
+  } while(0)
+  struct { int *a, *b, *c; int d; } stack[STACK_SIZE];  /* pending merges: a = first, b = middle, c = last, d = check flags */
+  int *l, *r, *lm, *rm;
+  int m, len, half;
+  int ssize;
+  int check, next;
+
+  for(check = 0, ssize = 0;;) {
+    if((last - middle) <= bufsize) {  /* right part fits in the buffer */
+      if((first < middle) && (middle < last)) {
+        ss_mergebackward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);  /* returns from this function when the stack is empty */
+      continue;
+    }
+
+    if((middle - first) <= bufsize) {  /* left part fits in the buffer */
+      if(first < middle) {
+        ss_mergeforward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;  /* binary search for m, the block length to swap across middle */
+        0 < len;
+        len = half, half >>= 1) {
+      if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
+                       PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
+        m += half + 1;
+        half -= (len & 1) ^ 1;
+      }
+    }
+
+    if(0 < m) {
+      lm = middle - m, rm = middle + m;
+      ss_blockswap(lm, middle, m);  /* exchange the m entries on either side of middle */
+      l = r = middle, next = 0;
+      if(rm < last) {
+        if(*rm < 0) {
+          *rm = ~*rm;
+          if(first < lm) { for(; *--l < 0;) { } next |= 4; }
+          next |= 1;
+        } else if(first < lm) {
+          for(; *r < 0; ++r) { }
+          next |= 2;
+        }
+      }
+
+      if((l - first) <= (last - r)) {  /* continue with the smaller sub-merge, push the larger one */
+        STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
+        middle = lm, last = l, check = (check & 3) | (next & 4);
+      } else {
+        if((next & 2) && (r == middle)) { next ^= 6; }
+        STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
+        first = r, middle = rm, check = (next & 3) | (check & 4);
+      }
+    } else {  /* m == 0: nothing to swap */
+      if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
+        *middle = ~*middle;
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Substring sort of [first, last): sorts SS_BLOCKSIZE-sized blocks and merges them, using buf as merge space. */
+static
+void
+sssort(const unsigned char *T, const int *PA,
+       int *first, int *last,
+       int *buf, int bufsize,
+       int depth, int n, int lastsuffix) {
+  int *a;
+#if SS_BLOCKSIZE != 0
+  int *b, *middle, *curbuf;
+  int j, k, curbufsize, limit;
+#endif
+  int i;
+
+  if(lastsuffix != 0) { ++first; }  /* the first entry is set aside and re-inserted at the end of this function */
+
+#if SS_BLOCKSIZE == 0
+  ss_mintrosort(T, PA, first, last, depth);
+#else
+  if((bufsize < SS_BLOCKSIZE) &&
+      (bufsize < (last - first)) &&
+      (bufsize < (limit = ss_isqrt(last - first)))) {
+    if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
+    buf = middle = last - limit, bufsize = limit;  /* supplied buffer is too small: use the last `limit` entries as buffer */
+  } else {
+    middle = last, limit = 0;
+  }
+  for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {  /* sort each full block */
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#endif
+    curbufsize = last - (a + SS_BLOCKSIZE);  /* the not-yet-sorted remainder may serve as merge buffer if it is larger */
+    curbuf = a + SS_BLOCKSIZE;
+    if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
+    for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {  /* merge equal-sized runs, driven by the low bits of i */
+      ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
+    }
+  }
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+  ss_mintrosort(T, PA, a, middle, depth);  /* sort the final, possibly shorter block */
+#elif 1 < SS_BLOCKSIZE
+  ss_insertionsort(T, PA, a, middle, depth);
+#endif
+  for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {  /* merge the remaining runs into one */
+    if(i & 1) {
+      ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
+      a -= k;
+    }
+  }
+  if(limit != 0) {  /* the tail that served as buffer is sorted separately and merged in place */
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, middle, last, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, middle, last, depth);
+#endif
+    ss_inplacemerge(T, PA, first, middle, last, depth);
+  }
+#endif
+
+  if(lastsuffix != 0) {
+    /* Insert last type B* suffix. */
+    int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;  /* two-entry stand-in for PA, as ss_compare reads p[0] and p[1] */
+    for(a = first, i = *(first - 1);
+        (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
+        ++a) {
+      *(a - 1) = *a;
+    }
+    *(a - 1) = i;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE /* Integer-log helper built on lg_table (defined outside this view; presumably floor(log2) of a byte). */
+int
+tr_ilg(int n) { /* result = bit offset (24/16/8/0) of n's highest non-zero byte + lg_table[that byte] */
+  return (n & 0xffff0000) ? /* any bit set in the upper 16 bits? */
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Simple insertionsort for small size groups: orders the entries of [first, last) by key ISAd[entry]. */
+static
+void
+tr_insertionsort(const int *ISAd, int *first, int *last) {
+  int *a, *b;
+  int t, r; /* t: entry being inserted; r: its key minus the key of *b */
+
+  for(a = first + 1; a < last; ++a) {
+    for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
+      do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); /* negative (complemented) entries are shifted along, never used as indices */
+      if(b < first) { break; }
+    }
+    if(r == 0) { *b = ~*b; } /* equal keys: flag the predecessor by storing its bitwise complement */
+    *(b + 1) = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE /* Sift-down step for a max-heap stored in SA[0..size), ordered by key ISAd[SA[.]]. */
+void
+tr_fixdown(const int *ISAd, int *SA, int i, int size) {
+  int j, k;
+  int v; /* the entry being sifted down from slot i */
+  int c, d, e; /* c: key of v; d, e: keys of the left and right child */
+
+  for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { /* j: left child; the larger child moves up each pass */
+    d = ISAd[SA[k = j++]];
+    if(d < (e = ISAd[SA[j]])) { k = j; d = e; } /* right child SA[2*i+2] is read without comparing its index to size */
+    if(d <= c) { break; } /* no child key exceeds v's key: slot found */
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort: sorts SA[0..size) by key ISAd[SA[.]] using the max-heap maintained by tr_fixdown. */
+static
+void
+tr_heapsort(const int *ISAd, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) { /* even size: heapify only the first size - 1 entries (an odd count) */
+    m--;
+    if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } /* build the heap over SA[0..m) */
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } /* exchange the held-out last entry with the root */
+  for(i = m - 1; 0 < i; --i) { /* repeatedly move the root to slot i and re-sift the entry that was there */
+    t = SA[0], SA[0] = SA[i];
+    tr_fixdown(ISAd, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements: whichever of v1, v2, v3 points at the entry with the middle ISAd key. */
+static INLINE
+int *
+tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
+  int *t; /* not referenced directly here; presumably the scratch variable of the SWAP macro (defined outside this view) */
+  if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } /* applied to the pointer variables, not to the pointed-to entries */
+  if(ISAd[*v2] > ISAd[*v3]) {
+    if(ISAd[*v1] > ISAd[*v3]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements: the pointer among v1..v5 whose entry has the middle ISAd key. */
+static INLINE
+int *
+tr_median5(const int *ISAd,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t; /* not referenced directly here; presumably the scratch variable of the SWAP macro (defined outside this view) */
+  if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } /* all SWAPs below exchange pointer variables, not array entries */
+  if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
+  if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
+  if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(ISAd[*v3] > ISAd[*v4]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element: a pointer into [first, last) picked by sampling, with more samples for longer ranges. */
+static INLINE
+int *
+tr_pivot(const int *ISAd, int *first, int *last) {
+  int *middle;
+  int t; /* range length, later reused as the spacing between samples */
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return tr_median3(ISAd, first, middle, last - 1); /* up to 32 entries: median of first, middle, last */
+    } else {
+      t >>= 2; /* 33..512 entries: median of five samples spaced a quarter of the range apart */
+      return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3; /* longer ranges: three medians of three, sampled an eighth of the range apart */
+  first  = tr_median3(ISAd, first, first + t, first + (t << 1));
+  middle = tr_median3(ISAd, middle - t, middle, middle + t);
+  last   = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return tr_median3(ISAd, first, middle, last); /* median of the three medians */
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+typedef struct _trbudget_t trbudget_t; /* Work budget that tr_introsort charges through trbudget_check. */
+struct _trbudget_t {
+  int chance; /* refills still available */
+  int remain; /* budget left before another refill is needed */
+  int incval; /* amount added to remain on each refill */
+  int count; /* running total of the sizes rejected by trbudget_check */
+};
+
+static INLINE /* Sets chance and seeds both remain and incval from incval. */
+void
+trbudget_init(trbudget_t *budget, int chance, int incval) {
+  budget->chance = chance;
+  budget->remain = budget->incval = incval; /* count is not set here; trsort zeroes it before each tr_introsort call */
+}
+
+static INLINE /* Charges size against the budget; returns 1 if the work is accepted, 0 if it is rejected. */
+int
+trbudget_check(trbudget_t *budget, int size) {
+  if(size <= budget->remain) { budget->remain -= size; return 1; } /* fits in what is left */
+  if(budget->chance == 0) { budget->count += size; return 0; } /* no refill left: record the rejected size */
+  budget->remain += budget->incval - size; /* refill by incval, then charge size */
+  budget->chance -= 1;
+  return 1;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE /* Three-way partition of [first, last) by key ISAd[entry] around the value v. */
+void
+tr_partition(const int *ISAd,
+             int *first, int *middle, int *last, /* scanning starts at middle; [first, middle) is counted as equal to v without being inspected */
+             int **pa, int **pb, int v) { /* out: *pa and *pb receive the start and end of the run of entries whose key equals v */
+  int *a, *b, *c, *d, *e, *f;
+  int t, s;
+  int x = 0;
+
+  for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { } /* skip the leading run of keys equal to v */
+  if(((a = b) < last) && (x < v)) {
+    for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; } /* park equal keys at the left end */
+    }
+  }
+  for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } /* skip the trailing run of keys equal to v */
+  if((b < (d = c)) && (x > v)) {
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; } /* park equal keys at the right end */
+    }
+  }
+  for(; b < c;) { /* exchange misplaced pairs until the two scans meet */
+    SWAP(*b, *c);
+    for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+
+  if(a <= d) { /* move the parked equal keys from both ends into the middle */
+    c = b - 1;
+    if((s = a - first) > (t = b - a)) { s = t; } /* swap only the shorter of the two adjacent runs */
+    for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    if((s = d - c) > (t = last - d - 1)) { s = t; }
+    for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    first += (b - a), last -= (d - c);
+  }
+  *pa = first, *pb = last; /* when a > d nothing was moved and the whole input range is reported */
+}
+
+static /* Fills the middle partition [a, b) with entries derived from [first, a) and [b, last), assigning each a distinct rank in ISA. */
+void
+tr_copy(int *ISA, const int *SA,
+        int *first, int *a, int *b, int *last,
+        int depth) {
+  /* sort suffixes of middle partition
+     by using sorted order of suffixes of left and right partition. */
+  int *c, *d, *e;
+  int s, v;
+
+  v = b - SA - 1; /* rank value tested below to recognise members of the middle partition */
+  for(c = first, d = a - 1; c <= d; ++c) { /* left to right; d grows, so c also visits entries appended by this loop */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      ISA[s] = d - SA; /* new rank = the slot index just written */
+    }
+  }
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) { /* right to left; fills downward from b until the gap [e, d) is closed */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      ISA[s] = d - SA;
+    }
+  }
+}
+
+static /* Like tr_copy, but entries whose ISA[s + depth] ranks coincide share one new rank instead of each getting its own. */
+void
+tr_partialcopy(int *ISA, const int *SA,
+               int *first, int *a, int *b, int *last,
+               int depth) {
+  int *c, *d, *e;
+  int s, v;
+  int rank, lastrank, newrank = -1;
+
+  v = b - SA - 1; /* rank value tested below to recognise members of the middle partition */
+  lastrank = -1;
+  for(c = first, d = a - 1; c <= d; ++c) { /* left to right; d grows, so c also visits entries appended by this loop */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; } /* a change of ISA[s + depth] starts a new rank group */
+      ISA[s] = newrank;
+    }
+  }
+
+  lastrank = -1;
+  for(e = d; first <= e; --e) { /* backward pass over [first, d]: each run of equal ranks takes the index of its last slot */
+    rank = ISA[*e];
+    if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
+    if(newrank != rank) { ISA[*e] = newrank; }
+  }
+
+  lastrank = -1;
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) { /* right to left; fills downward from b until the gap [e, d) is closed */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+}
+
+static /* Refines the order of [first, last) by ISAd keys and updates the ranks in ISA, using an explicit stack instead of recursion. */
+void
+tr_introsort(int *ISA, const int *ISAd,
+             int *SA, int *first, int *last,
+             trbudget_t *budget) { /* budget is charged through trbudget_check before descending into an equal-key group */
+#define STACK_SIZE TR_STACKSIZE
+  struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE]; /* work stack used by STACK_PUSH5 and STACK_POP5 (macros defined outside this view) */
+  int *a, *b, *c;
+  int t; /* not referenced directly here; presumably the SWAP macro's temporary */
+  int v, x = 0;
+  int incr = ISAd - ISA; /* ISAd is advanced by this amount whenever the sort descends one level */
+  int limit, next; /* limit >= 0: remaining partitioning depth; limit < 0: state code (-1, -2, otherwise sorted partition) */
+  int ssize, trlink = -1;
+
+  for(ssize = 0, limit = tr_ilg(last - first);;) {
+
+    if(limit < 0) {
+      if(limit == -1) {
+        /* tandem repeat partition */
+        tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
+
+        /* update ranks */
+        if(a < last) {
+          for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+        }
+        if(b < last) {
+          for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
+        }
+
+        /* push */
+        if(1 < (b - a)) {
+          STACK_PUSH5(NULL, a, b, 0, 0);
+          STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
+          trlink = ssize - 2;
+        }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
+            last = a, limit = tr_ilg(a - first);
+          } else if(1 < (last - b)) {
+            first = b, limit = tr_ilg(last - b);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
+            first = b, limit = tr_ilg(last - b);
+          } else if(1 < (a - first)) {
+            last = a, limit = tr_ilg(a - first);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      } else if(limit == -2) {
+        /* tandem repeat copy */
+        a = stack[--ssize].b, b = stack[ssize].c; /* retrieves the (a, b) pair pushed together with the -2 entry */
+        if(stack[ssize].d == 0) {
+          tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
+        } else {
+          if(0 <= trlink) { stack[trlink].d = -1; }
+          tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
+        }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      } else {
+        /* sorted partition */
+        if(0 <= *first) {
+          a = first;
+          do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); /* non-negative entries each get their own slot index as rank */
+          first = a;
+        }
+        if(first < last) {
+          a = first; do { *a = ~*a; } while(*++a < 0); /* restore a run of complemented (negative) entries */
+          next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
+          if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
+
+          /* push */
+          if(trbudget_check(budget, a - first)) {
+            if((a - first) <= (last - a)) {
+              STACK_PUSH5(ISAd, a, last, -3, trlink);
+              ISAd += incr, last = a, limit = next;
+            } else {
+              if(1 < (last - a)) {
+                STACK_PUSH5(ISAd + incr, first, a, next, trlink);
+                first = a, limit = -3;
+              } else {
+                ISAd += incr, last = a, limit = next;
+              }
+            }
+          } else {
+            if(0 <= trlink) { stack[trlink].d = -1; }
+            if(1 < (last - a)) {
+              first = a, limit = -3;
+            } else {
+              STACK_POP5(ISAd, first, last, limit, trlink);
+            }
+          }
+        } else {
+          STACK_POP5(ISAd, first, last, limit, trlink);
+        }
+      }
+      continue;
+    }
+
+    if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
+      tr_insertionsort(ISAd, first, last);
+      limit = -3; /* continue with the sorted-partition branch above */
+      continue;
+    }
+
+    if(limit-- == 0) { /* depth allowance used up: heapsort this range instead of partitioning further */
+      tr_heapsort(ISAd, first, last - first);
+      for(a = last - 1; first < a; a = b) {
+        for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } /* complement all but the last entry of each equal-key run */
+      }
+      limit = -3;
+      continue;
+    }
+
+    /* choose pivot */
+    a = tr_pivot(ISAd, first, last);
+    SWAP(*first, *a);
+    v = ISAd[*first];
+
+    /* partition */
+    tr_partition(ISAd, first, first + 1, last, &a, &b, v);
+    if((last - first) != (b - a)) { /* the equal-key run [a, b) does not cover the whole range */
+      next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
+
+      /* update ranks */
+      for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+      if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
+
+      /* push */
+      if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
+        if((a - first) <= (last - b)) {
+          if((last - b) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              last = a;
+            } else if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((a - first) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        } else {
+          if((a - first) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              first = b;
+            } else if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((last - b) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        }
+      } else {
+        if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } /* budget refused this group: flag the linked stack entry */
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            last = a;
+          } else if(1 < (last - b)) {
+            first = b;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            first = b;
+          } else if(1 < (a - first)) {
+            last = a;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      }
+    } else { /* tr_partition reported the whole range as one equal-key run */
+      if(trbudget_check(budget, last - first)) {
+        limit = tr_ilg(last - first), ISAd += incr;
+      } else {
+        if(0 <= trlink) { stack[trlink].d = -1; }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      }
+    }
+  }
+#undef STACK_SIZE
+}
+
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Tandem repeat sort: runs tr_introsort over the unsorted groups of SA[0..n), doubling the ISAd offset after every pass. */
+static
+void
+trsort(int *ISA, int *SA, int n, int depth) {
+  int *ISAd;
+  int *first, *last;
+  trbudget_t budget;
+  int t, skip, unsorted; /* skip: negated length of the pending run of entries to step over; unsorted: budget.count total for this pass */
+
+  trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
+/*  trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
+  for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { /* runs while SA[0] is greater than -n */
+    first = SA;
+    skip = 0;
+    unsorted = 0;
+    do {
+      if((t = *first) < 0) { first -= t; skip += t; } /* negative entry: step over -t entries and extend the pending run */
+      else {
+        if(skip != 0) { *(first + skip) = skip; skip = 0; } /* store the negated run length at the start of the run just passed */
+        last = SA + ISA[t] + 1; /* the group's end is derived from the rank of its first entry */
+        if(1 < (last - first)) {
+          budget.count = 0;
+          tr_introsort(ISA, ISAd, SA, first, last, &budget);
+          if(budget.count != 0) { unsorted += budget.count; }
+          else { skip = first - last; } /* nothing was rejected by the budget: this group starts a new run */
+        } else if((last - first) == 1) {
+          skip = -1;
+        }
+        first = last;
+      }
+    } while(first < (SA + n));
+    if(skip != 0) { *(first + skip) = skip; }
+    if(unsorted == 0) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Sorts suffixes of type B*. Fills bucket_A and bucket_B and returns m, the number of type B* suffixes found in T[0..n). */
+static
+int
+sort_typeBstar(const unsigned char *T, int *SA,
+               int *bucket_A, int *bucket_B,
+               int n, int openMP) {
+  int *PAb, *ISAb, *buf;
+#ifdef LIBBSC_OPENMP
+  int *curbuf;
+  int l;
+#endif
+  int i, j, k, t, m, bufsize;
+  int c0, c1;
+#ifdef LIBBSC_OPENMP
+  int d0, d1;
+#endif
+  (void)openMP; /* openMP is only consulted inside the LIBBSC_OPENMP sections below */
+
+  /* Initialize bucket arrays. */
+  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
+  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
+
+  /* Count the number of occurrences of the first one or two characters of each
+     type A, B and B* suffix. Moreover, store the beginning position of all
+     type B* suffixes into the array SA. */
+  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
+    /* type A suffix. */
+    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
+    if(0 <= i) {
+      /* type B* suffix. */
+      ++BUCKET_BSTAR(c0, c1);
+      SA[--m] = i;
+      /* type B suffix. */
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
+        ++BUCKET_B(c0, c1);
+      }
+    }
+  }
+  m = n - m; /* m now counts the positions stored above, which occupy the last m slots of SA */
+/*
+note:
+  A type B* suffix is lexicographically smaller than a type B suffix that
+  begins with the same first two characters.
+*/
+
+  /* Calculate the index of start/end point of each bucket. */
+  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
+    t = i + BUCKET_A(c0);
+    BUCKET_A(c0) = i + j; /* start point */
+    i = t + BUCKET_B(c0, c0);
+    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
+      j += BUCKET_BSTAR(c0, c1);
+      BUCKET_BSTAR(c0, c1) = j; /* end point */
+      i += BUCKET_B(c0, c1);
+    }
+  }
+
+  if(0 < m) {
+    /* Sort the type B* suffixes by their first two characters. */
+    PAb = SA + n - m; ISAb = SA + m; /* PAb: the stored B* positions; ISAb: rank area starting right after the first m slots */
+    for(i = m - 2; 0 <= i; --i) {
+      t = PAb[i], c0 = T[t], c1 = T[t + 1];
+      SA[--BUCKET_BSTAR(c0, c1)] = i;
+    }
+    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
+    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
+
+    /* Sort the type B* substrings using sssort. */
+#ifdef LIBBSC_OPENMP
+    if (openMP)
+    {
+        buf = SA + m;
+        c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
+#pragma omp parallel default(shared) private(bufsize, curbuf, k, l, d0, d1)
+        {
+          bufsize = (n - (2 * m)) / omp_get_num_threads(); /* each thread gets an equal slice of the scratch area */
+          curbuf = buf + omp_get_thread_num() * bufsize;
+          k = 0;
+          for(;;) {
+            #pragma omp critical(sssort_lock)
+            {
+              if(0 < (l = j)) {
+                d0 = c0, d1 = c1;
+                do {
+                  k = BUCKET_BSTAR(d0, d1);
+                  if(--d1 <= d0) {
+                    d1 = ALPHABET_SIZE - 1;
+                    if(--d0 < 0) { break; }
+                  }
+                } while(((l - k) <= 1) && (0 < (l = k)));
+                c0 = d0, c1 = d1, j = k;
+              }
+            }
+            if(l == 0) { break; }
+            sssort(T, PAb, SA + k, SA + l,
+                   curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
+          }
+        }
+    }
+    else
+    {
+        buf = SA + m, bufsize = n - (2 * m);
+        for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+          for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+            i = BUCKET_BSTAR(c0, c1);
+            if(1 < (j - i)) {
+              sssort(T, PAb, SA + i, SA + j,
+                     buf, bufsize, 2, n, *(SA + i) == (m - 1));
+            }
+          }
+        }
+    }
+#else
+    buf = SA + m, bufsize = n - (2 * m); /* scratch for sssort: the n - 2m slots starting at SA + m */
+    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+        i = BUCKET_BSTAR(c0, c1);
+        if(1 < (j - i)) { /* buckets holding fewer than two entries need no sorting */
+          sssort(T, PAb, SA + i, SA + j,
+                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
+        }
+      }
+    }
+#endif
+
+    /* Compute ranks of type B* substrings. */
+    for(i = m - 1; 0 <= i; --i) {
+      if(0 <= SA[i]) {
+        j = i;
+        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
+        SA[i + 1] = i - j; /* negated length of the run of non-negative entries just ranked */
+        if(i <= 0) { break; }
+      }
+      j = i;
+      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); /* complemented entries are restored and all receive the rank j */
+      ISAb[SA[i]] = j;
+    }
+
+    /* Construct the inverse suffix array of type B* suffixes using trsort. */
+    trsort(ISAb, SA, m, 1);
+
+    /* Set the sorted order of type B* suffixes. */
+    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
+      if(0 <= i) {
+        t = i;
+        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
+        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; /* stored complemented unless t == 0 or the scan above moved more than one step */
+      }
+    }
+
+    /* Calculate the index of start/end point of each bucket. */
+    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
+    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
+      i = BUCKET_A(c0 + 1) - 1;
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
+        t = i - BUCKET_B(c0, c1);
+        BUCKET_B(c0, c1) = i; /* end point */
+
+        /* Move all type B* suffixes to the correct position. */
+        for(i = t, j = BUCKET_BSTAR(c0, c1);
+            j <= k;
+            --i, --k) { SA[i] = SA[k]; }
+      }
+      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
+      BUCKET_B(c0, c0) = i; /* end point */
+    }
+  }
+
+  return m;
+}
+
+/* Constructs the suffix array in SA[0..n) by using the sorted order of type B* suffixes; m is the count returned by sort_typeBstar. */
+static
+void
+construct_SA(const unsigned char *T, int *SA,
+             int *bucket_A, int *bucket_B,
+             int n, int m) {
+  int *i, *j, *k;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          *j = ~s; /* complemented here; the left-to-right scan below restores it */
+          c0 = T[--s]; /* s now names the suffix one position to the left */
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) { /* first character changed: save the old bucket cursor and load the new one */
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j); assert(k != NULL);
+          *k-- = s;
+        } else {
+          assert(((s == 0) && (T[s] == c1)) || (s < 0));
+          *j = ~s;
+        }
+      }
+    }
+  }
+
+  /* Construct the suffix array by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); /* seed the scan with the last suffix, n - 1 */
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
+      if(c0 != c2) { /* first character changed: save the old bucket cursor and load the new one */
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else {
+      assert(s < 0);
+      *i = ~s; /* undo the complement so the slot ends up non-negative */
+    }
+  }
+}
+
+/* Constructs the burrows-wheeler transformed string directly in SA by using the sorted order
+   of type B* suffixes. Returns the index of the last slot found holding 0 in the final scan (0 if none). */
+static
+int
+construct_BWT(const unsigned char *T, int *SA,
+              int *bucket_A, int *bucket_B,
+              int n, int m) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          c0 = T[--s];
+          *j = ~((int)c0); /* the slot now holds the complemented preceding character instead of a position */
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j); assert(k != NULL);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      *i = c0; /* output character for this slot */
+      if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else if(s != 0) {
+      *i = ~s; /* restore a value stored complemented earlier */
+    } else {
+      orig = i; /* slot holding 0: remembered for the return value */
+    }
+  }
+
+  return orig - SA;
+}
+
+/* Same construction as construct_BWT, additionally writing *num_indexes and recording in indexes[]
+   the SA slot of every position s with (s & mod) == 0; returns orig - SA. */
+static
+int
+construct_BWT_indexes(const unsigned char *T, int *SA,
+                      int *bucket_A, int *bucket_B,
+                      int n, int m,
+                      unsigned char * num_indexes, int * indexes) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  int mod = n / 8;
+  {
+      mod |= mod >> 1;  mod |= mod >> 2; /* smear the highest set bit of n / 8 into all lower bits ... */
+      mod |= mod >> 4;  mod |= mod >> 8;
+      mod |= mod >> 16; mod >>= 1; /* ... then halve, leaving an all-ones mask (or 0) */
+
+      *num_indexes = (unsigned char)((n - 1) / (mod + 1));
+  }
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = j - SA; /* s is a positive multiple of mod + 1: record its slot */
+
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j); assert(k != NULL);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  if (T[n - 2] < c2) {
+    if (((n - 1) & mod) == 0) indexes[(n - 1) / (mod + 1) - 1] = k - SA; /* recorded here because the slot receives a character, not the position */
+    *k++ = ~((int)T[n - 2]);
+  }
+  else {
+    *k++ = n - 1;
+  }
+
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+
+      if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = i - SA;
+
+      c0 = T[--s];
+      *i = c0;
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      if((0 < s) && (T[s - 1] < c0)) {
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = k - SA; /* recorded now: this slot will hold a complemented character */
+          *k++ = ~((int)T[s - 1]);
+      } else
+        *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i; /* slot holding 0: remembered for the return value */
+    }
+  }
+
+  return orig - SA;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+
+int /* returns 0 on success, -1 on invalid arguments, -2 if a bucket array could not be allocated */
+divsufsort(const unsigned char *T, int *SA, int n, int openMP) {
+  int *bucket_A, *bucket_B;
+  int m;
+  int err = 0;
+
+  /* Check arguments. */
+  if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
+  else if(n == 0) { return 0; }
+  else if(n == 1) { SA[0] = 0; return 0; }
+  else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } /* two suffixes: ordered by one comparison, no buckets needed */
+
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Suffixsort. */
+  if((bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, SA, bucket_A, bucket_B, n, openMP);
+    construct_SA(T, SA, bucket_A, bucket_B, n, m);
+  } else {
+    err = -2;
+  }
+
+  free(bucket_B); /* both frees run on every path; free(NULL) is a no-op */
+  free(bucket_A);
+
+  return err;
+}
+
+int /* returns construct_BWT's result plus one on success, n itself when n <= 1, -1 on invalid arguments, -2 on allocation failure */
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP) {
+  int *B;
+  int *bucket_A, *bucket_B;
+  int m, pidx, i;
+
+  /* Check arguments. */
+  if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
+  else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
+
+  if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); } /* no work array supplied: allocate n + 1 ints, freed below */
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Burrows-Wheeler Transform. */
+  if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, B, bucket_A, bucket_B, n, openMP);
+
+    if (num_indexes == NULL || indexes == NULL) { /* secondary indexes are produced only when both outputs are supplied */
+        pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
+    } else {
+        pidx = construct_BWT_indexes(T, B, bucket_A, bucket_B, n, m, num_indexes, indexes);
+    }
+
+    /* Copy to output string. */
+    U[0] = T[n - 1];
+    for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; } /* slots before pidx shift up by one to make room for U[0] */
+    for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; } /* B[pidx] itself is skipped */
+    pidx += 1;
+  } else {
+    pidx = -2;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+  if(A == NULL) { free(B); } /* release B only when it was allocated here */
+
+  return pidx;
+}
diff --git a/src/zstd/dictBuilder/divsufsort.h b/src/zstd/dictBuilder/divsufsort.h
new file mode 100644
index 000000000..5440994af
--- /dev/null
+++ b/src/zstd/dictBuilder/divsufsort.h
@@ -0,0 +1,67 @@
+/*
+ * divsufsort.h for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/*- Prototypes -*/
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param openMP enables OpenMP optimization.
+ * @return 0 if no error occurred, -1 on invalid arguments (NULL T or SA, or n < 0), -2 on allocation failure.
+ */
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP);
+
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string. (can be T)
+ * @param A [0..n-1] The temporary array. (can be NULL; a buffer is then allocated and freed internally)
+ * @param n The length of the given string.
+ * @param num_indexes The length of secondary indexes array. (can be NULL)
+ * @param indexes The secondary indexes array. (can be NULL)
+ * @param openMP enables OpenMP optimization.
+ * @return The primary index if no error occurred, -1 on invalid arguments (NULL T or U, or n < 0), -2 on allocation failure.
+ */
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */
diff --git a/src/zstd/dictBuilder/fastcover.c b/src/zstd/dictBuilder/fastcover.c
new file mode 100644
index 000000000..a958eb337
--- /dev/null
+++ b/src/zstd/dictBuilder/fastcover.c
@@ -0,0 +1,766 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+
+#include "../common/mem.h" /* read */
+#include "../common/pool.h"
+#include "../common/threading.h"
+#include "../common/zstd_internal.h" /* includes zstd.h */
+#include "../compress/zstd_compress_internal.h" /* ZSTD_hash*() */
+#include "../zdict.h"
+#include "cover.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+/**
+* There are 32bit indexes used to ref samples, so limit samples size to 4GB
+* on 64bit builds.
+* For 32bit builds we choose 1 GB.
+* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
+* contiguous buffer, so 1GB is already a high limit.
+*/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
+#define FASTCOVER_MAX_F 31
+#define FASTCOVER_MAX_ACCEL 10
+#define FASTCOVER_DEFAULT_SPLITPOINT 0.75
+#define DEFAULT_F 20
+#define DEFAULT_ACCEL 1
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#ifndef LOCALDISPLAYLEVEL
+static int g_displayLevel = 0;
+#endif
+#undef  DISPLAY
+#define DISPLAY(...)                                                           \
+  do {                                                                         \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  } while (0)
+#undef  LOCALDISPLAYLEVEL
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  do { if ((displayLevel) >= (l)) {                                            \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } } while (0) /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#undef  DISPLAYLEVEL
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#ifndef LOCALDISPLAYUPDATE
+static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+#endif
+#undef  LOCALDISPLAYUPDATE
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  do { if ((displayLevel) >= (l)) {                                            \
+    if ((clock() - g_time > g_refreshRate) || ((displayLevel) >= 4)) {         \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  } } while (0)
+#undef  DISPLAYUPDATE
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+
+
+/*-*************************************
+* Hash Functions
+***************************************/
+/**
+ * Map the dmer starting at p to a slot of the 2^f-entry frequency table.
+ * d == 6 selects the 6-byte hash; every other value of d falls back to the
+ * 8-byte hash.
+ */
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 f, unsigned d) {
+  return (d == 6) ? ZSTD_hash6Ptr(p, f)
+                  : ZSTD_hash8Ptr(p, f);
+}
+
+
+/*-*************************************
+* Acceleration
+***************************************/
+typedef struct {
+  unsigned finalize;    /* Percentage of training samples used for ZDICT_finalizeDictionary */
+  unsigned skip;        /* Number of dmers skipped between each dmer counted in computeFrequency */
+} FASTCOVER_accel_t;
+
+/* Default settings, indexed by accel (0..FASTCOVER_MAX_ACCEL): { finalize, skip } */
+static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
+  { 100, 0 },   /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
+  { 100, 0 },   /* accel = 1 */
+  { 50, 1 },   /* accel = 2 */
+  { 34, 2 },   /* accel = 3 */
+  { 25, 3 },   /* accel = 4 */
+  { 20, 4 },   /* accel = 5 */
+  { 17, 5 },   /* accel = 6 */
+  { 14, 6 },   /* accel = 7 */
+  { 13, 7 },   /* accel = 8 */
+  { 11, 8 },   /* accel = 9 */
+  { 10, 9 },   /* accel = 10 */
+};
+
+
+/*-*************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;           /* Concatenated sample contents (not freed by the context) */
+  size_t *offsets;               /* nbSamples+1 cumulative start offsets into samples; freed by FASTCOVER_ctx_destroy() */
+  const size_t *samplesSizes;    /* Size of each sample (not freed by the context) */
+  size_t nbSamples;              /* Total number of samples */
+  size_t nbTrainSamples;         /* Number of leading samples used for training */
+  size_t nbTestSamples;          /* Number of samples used for testing */
+  size_t nbDmers;                /* trainingSamplesSize - MAX(d, sizeof(U64)) + 1 */
+  U32 *freqs;                    /* 2^f dmer-hash frequencies; freed by FASTCOVER_ctx_destroy() */
+  unsigned d;                    /* dmer length */
+  unsigned f;                    /* log2 of the number of entries in freqs */
+  FASTCOVER_accel_t accelParams; /* finalize / skip settings in use */
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
+ */
+static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                              U32 *freqs, U32 begin, U32 end,
+                                              ZDICT_cover_params_t parameters,
+                                              U16* segmentFreqs) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 f = ctx->f;
+  const U32 dmersInK = k - d + 1;
+
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+
+  /* segmentFreqs must be all zero on entry; it is back to all zero on return. */
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* Get hash value of current dmer */
+    const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
+
+    /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+    if (segmentFreqs[idx] == 0) {
+      activeSegment.score += freqs[idx];
+    }
+    /* Increment end of segment and segmentFreqs */
+    activeSegment.end += 1;
+    segmentFreqs[idx] += 1;
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      /* Get hash value of the dmer to be eliminated from active segment */
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+      segmentFreqs[delIndex] -= 1;
+      /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+      if (segmentFreqs[delIndex] == 0) {
+        activeSegment.score -= freqs[delIndex];
+      }
+      /* Increment start of segment */
+      activeSegment.begin += 1;
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+
+  /* Zero out rest of segmentFreqs array */
+  while (activeSegment.begin < end) {
+    const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+    segmentFreqs[delIndex] -= 1;
+    activeSegment.begin += 1;
+  }
+
+  {
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
+      freqs[i] = 0;
+    }
+  }
+
+  return bestSegment;
+}
+
+/* Returns 1 when (parameters, maxDictSize, f, accel) are usable by FASTCOVER, 0 otherwise. */
+static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
+                                     size_t maxDictSize, unsigned f,
+                                     unsigned accel) {
+  /* k, d, and f are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* d has to be 6 or 8 */
+  if (parameters.d != 6 && parameters.d != 8) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  /* 0 < f <= FASTCOVER_MAX_F*/
+  if (f > FASTCOVER_MAX_F || f == 0) {
+    return 0;
+  }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+    return 0;
+  }
+  /* 0 < accel <= FASTCOVER_MAX_ACCEL (accel indexes FASTCOVER_defaultAccelParameters) */
+  if (accel > FASTCOVER_MAX_ACCEL || accel == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+
+/**
+ * Release the buffers owned by a context set up by `FASTCOVER_ctx_init()`.
+ */
+static void
+FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx)
+{
+    if (ctx == NULL) {
+        return;
+    }
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+}
+
+
+/**
+ * Count into freqs the hash of the dmers found in each training sample.
+ * Dmers are taken every (skip + 1) bytes while readLength bytes still fit.
+ */
+static void
+FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
+{
+    const unsigned hashLog = ctx->f;
+    const unsigned dmerLen = ctx->d;
+    const size_t stride = (size_t)ctx->accelParams.skip + 1;
+    const size_t readLength = MAX(dmerLen, 8);
+    size_t sampleIdx;
+    assert(ctx->nbTrainSamples >= 5);
+    assert(ctx->nbTrainSamples <= ctx->nbSamples);
+    for (sampleIdx = 0; sampleIdx < ctx->nbTrainSamples; sampleIdx++) {
+        size_t const sampleEnd = ctx->offsets[sampleIdx + 1];
+        size_t pos;  /* start of current dmer */
+        for (pos = ctx->offsets[sampleIdx]; pos + readLength <= sampleEnd; pos += stride) {
+            const size_t slot = FASTCOVER_hashPtrToIndex(ctx->samples + pos, hashLog, dmerLen);
+            freqs[slot]++;
+        }
+    }
+}
+
+
+/**
+ * Prepare a context for dictionary building.
+ * The context only depends on `d` and can be reused for several values of k.
+ * Returns 0 on success or an error code on error.
+ * On success the context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ * On error nothing is left to destroy, and ctx may not even be initialized.
+ */
+static size_t
+FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
+                   const void* samplesBuffer,
+                   const size_t* samplesSizes, unsigned nbSamples,
+                   unsigned d, double splitPoint, unsigned f, FASTCOVER_accel_t accelParams)
+{
+    const BYTE* const samples = (const BYTE*)samplesBuffer;
+    const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+    /* Split samples into testing and training sets */
+    const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+    const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+    const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+    const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+
+    /* Checks: nbDmers is derived from the training size, so that size must not be too small either */
+    if (totalSamplesSize < MAX(d, sizeof(U64)) || trainingSamplesSize < MAX(d, sizeof(U64))) {
+        DISPLAYLEVEL(1, "Samples are too small (total %u bytes, training %u bytes)\n",
+                    (unsigned)totalSamplesSize, (unsigned)trainingSamplesSize);
+        return ERROR(srcSize_wrong);
+    }
+    if (totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+        DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                    (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+        return ERROR(srcSize_wrong);
+    }
+
+    /* Check if there are at least 5 training samples */
+    if (nbTrainSamples < 5) {
+        DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
+        return ERROR(srcSize_wrong);
+    }
+
+    /* Check if there's testing sample */
+    if (nbTestSamples < 1) {
+        DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
+        return ERROR(srcSize_wrong);
+    }
+
+    /* Zero the context */
+    memset(ctx, 0, sizeof(*ctx));
+    DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, (unsigned)trainingSamplesSize);
+    DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, (unsigned)testSamplesSize);
+
+    ctx->samples = samples;
+    ctx->samplesSizes = samplesSizes;
+    ctx->nbSamples = nbSamples;
+    ctx->nbTrainSamples = nbTrainSamples;
+    ctx->nbTestSamples = nbTestSamples;
+    ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+    ctx->d = d;
+    ctx->f = f;
+    ctx->accelParams = accelParams;
+
+    /* The offsets of each file */
+    ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t));
+    if (ctx->offsets == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return ERROR(memory_allocation);
+    }
+
+    /* Fill offsets from the samplesSizes */
+    {   U32 i;
+        ctx->offsets[0] = 0;
+        assert(nbSamples >= 5);
+        for (i = 1; i <= nbSamples; ++i) {
+            ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+        }
+    }
+
+    /* Initialize frequency array of size 2^f */
+    ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32));
+    if (ctx->freqs == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return ERROR(memory_allocation);
+    }
+
+    DISPLAYLEVEL(2, "Computing frequencies\n");
+    FASTCOVER_computeFrequency(ctx->freqs, ctx);
+    return 0;
+}
+
+
+/**
+ * Given the prepared context build the dictionary, filling dictBuffer from the
+ * back. Returns tail: the content occupies [tail, dictBufferCapacity).
+ */
+static size_t
+FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
+                          U32* freqs,
+                          void* dictBuffer, size_t dictBufferCapacity,
+                          ZDICT_cover_params_t parameters,
+                          U16* segmentFreqs)
+{
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
+  const size_t maxZeroScoreRun = 10;
+  size_t zeroScoreRun = 0;
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
+                (U32)epochs.num, (U32)epochs.size);
+  /* Loop through the epochs until there are no more segments or the
+   * dictionary is full. */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
+
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, so continue for some time.
+     */
+    if (segment.score == 0) {
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
+    }
+    zeroScoreRun = 0;
+
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+    const FASTCOVER_ctx_t* ctx;       /* context shared by every job for the same d; only read */
+    COVER_best_t* best;               /* handed the outcome through COVER_best_finish() */
+    size_t dictBufferCapacity;        /* size of the scratch dictionary buffer to allocate */
+    ZDICT_cover_params_t parameters;  /* parameter set to evaluate */
+} FASTCOVER_tryParameters_data_t;
+
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void FASTCOVER_tryParameters(void* opaque)
+{
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t*)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Initialize array to keep track of frequency of dmer within activeSegment */
+  U16* segmentFreqs = (U16*)calloc(((U64)1 << ctx->f), sizeof(U16));
+  /* Allocate space for hash table, dict, and freqs */
+  BYTE *const dict = (BYTE*)malloc(dictBufferCapacity);
+  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
+  U32* freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
+  if (!segmentFreqs || !dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
+  /* Build the dictionary */
+  { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
+                                                    parameters, segmentFreqs);
+
+    const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
+    selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
+         ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
+         totalCompressedSize);
+
+    if (COVER_dictSelectionIsError(selection)) {
+      DISPLAYLEVEL(1, "Failed to select dictionary\n");
+      goto _cleanup;
+    }
+  }
+_cleanup:
+  free(dict);
+  COVER_best_finish(data->best, parameters, selection);  /* reached on every path, including failures */
+  free(data);  /* this function owns the opaque argument */
+  free(segmentFreqs);
+  COVER_dictSelectionFree(selection);
+  free(freqs);
+}
+
+/* Copy the fields both parameter structs share; f and accel are not carried over. */
+static void
+FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
+                               ZDICT_cover_params_t* coverParams)
+{
+    coverParams->zParams = fastCoverParams.zParams;
+    coverParams->shrinkDict = fastCoverParams.shrinkDict;
+    coverParams->splitPoint = fastCoverParams.splitPoint;
+    coverParams->nbThreads = fastCoverParams.nbThreads;
+    coverParams->steps = fastCoverParams.steps;
+    coverParams->d = fastCoverParams.d;
+    coverParams->k = fastCoverParams.k;
+}
+
+/* Fill *fastCoverParams from the shared cover fields plus the explicit f and accel. */
+static void
+FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
+                                   ZDICT_fastCover_params_t* fastCoverParams,
+                                   unsigned f, unsigned accel)
+{
+    fastCoverParams->f = f;
+    fastCoverParams->accel = accel;
+    fastCoverParams->zParams = coverParams.zParams;
+    fastCoverParams->shrinkDict = coverParams.shrinkDict;
+    fastCoverParams->splitPoint = coverParams.splitPoint;
+    fastCoverParams->nbThreads = coverParams.nbThreads;
+    fastCoverParams->steps = coverParams.steps;
+    fastCoverParams->d = coverParams.d;
+    fastCoverParams->k = coverParams.k;
+}
+
+
+/** Builds a dictionary into dictBuffer from a single parameter set; returns its size or an error code. */
+ZDICTLIB_STATIC_API size_t
+ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
+                                const void* samplesBuffer,
+                                const size_t* samplesSizes, unsigned nbSamples,
+                                ZDICT_fastCover_params_t parameters)
+{
+    BYTE* const dict = (BYTE*)dictBuffer;
+    FASTCOVER_ctx_t ctx;
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* Initialize global data */
+    g_displayLevel = (int)parameters.zParams.notificationLevel;
+    /* splitPoint is always forced to 1.0 here; f and accel get defaults when 0 */
+    parameters.splitPoint = 1.0;
+    parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
+    parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
+    /* Convert to cover parameter */
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(parameters, &coverParams);
+    /* Checks */
+    if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f, parameters.accel)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(srcSize_wrong);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];  /* accel was range-checked above */
+    /* Initialize context */
+    {
+      size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                            coverParams.d, parameters.splitPoint, parameters.f, accelParams);
+      if (ZSTD_isError(initVal)) {
+        DISPLAYLEVEL(1, "Failed to initialize context\n");
+        return initVal;
+      }
+    }
+    COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
+    /* Build the dictionary */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      /* Frequency of each dmer within the active segment; FASTCOVER_selectSegment() indexes it unconditionally */
+      U16* const segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
+      size_t dictionarySize = ERROR(memory_allocation);
+      if (segmentFreqs == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+      } else {
+        const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer, dictBufferCapacity, coverParams, segmentFreqs);
+        const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
+        dictionarySize = ZDICT_finalizeDictionary(
+            dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+            samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
+        if (!ZSTD_isError(dictionarySize)) {
+            DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (unsigned)dictionarySize);
+        }
+      }
+      FASTCOVER_ctx_destroy(&ctx);
+      free(segmentFreqs);
+      return dictionarySize;
+    }
+}
+
+
+/**
+ * Tries every candidate (d, k) pair (a non-zero d or k pins that value), copies the dictionary
+ * recorded as best into dictBuffer and its settings into *parameters, and returns its size (or an error code).
+ */
+ZDICTLIB_STATIC_API size_t
+ZDICT_optimizeTrainFromBuffer_fastCover(
+                    void* dictBuffer, size_t dictBufferCapacity,
+                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters)
+{
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint = parameters->splitPoint <= 0.0 ? FASTCOVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations = (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
+    const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+    const unsigned shrinkDict = 0;
+    /* Local variables */
+    const int displayLevel = (int)parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    COVER_best_t best;
+    POOL_ctx *pool = NULL;
+    int warned = 0;
+    /* Checks */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
+      return ERROR(parameter_outOfBound);
+    }
+    /* FASTCOVER_ctx_init() computes (U64)1 << f before any candidate is validated */
+    if (f > FASTCOVER_MAX_F) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect f\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (nbSamples == 0) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+      return ERROR(srcSize_wrong);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n", ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    COVER_best_init(&best);
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(*parameters, &coverParams);
+    accelParams = FASTCOVER_defaultAccelParameters[accel];
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      {
+        size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
+        if (ZSTD_isError(initVal)) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+          COVER_best_destroy(&best);
+          POOL_free(pool);
+          return initVal;
+        }
+      }
+      if (!warned) {
+        COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
+        warned = 1;
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(sizeof(FASTCOVER_tryParameters_data_t));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          COVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(memory_allocation);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = coverParams;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.shrinkDict = shrinkDict;
+        data->parameters.zParams.notificationLevel = (unsigned)g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity, data->ctx->f, accel)) {
+          DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        COVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ", (unsigned)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      COVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    /* Fill the output buffer and parameters with output of the best parameters */
+    {
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
+      memcpy(dictBuffer, best.dict, dictSize);
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+}
diff --git a/src/zstd/dictBuilder/zdict.c b/src/zstd/dictBuilder/zdict.c
new file mode 100644
index 000000000..82e999e80
--- /dev/null
+++ b/src/zstd/dictBuilder/zdict.c
@@ -0,0 +1,1133 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*-**************************************
+*  Tuning parameters
+****************************************/
+#define MINRATIO 4   /* minimum number of occurrences to be selected in dictionary */
+#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
+#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
+
+
+/*-**************************************
+*  Compiler Options
+****************************************/
+/* Unix Large Files support (>4GB) */
+#define _FILE_OFFSET_BITS 64
+#if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
+#  ifndef _LARGEFILE_SOURCE
+#  define _LARGEFILE_SOURCE
+#  endif
+#elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
+#  ifndef _LARGEFILE64_SOURCE
+#  define _LARGEFILE64_SOURCE
+#  endif
+#endif
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>        /* malloc, free */
+#include <string.h>        /* memset */
+#include <stdio.h>         /* fprintf, fopen, ftello64 */
+#include <time.h>          /* clock */
+
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+
+#include "../common/mem.h"           /* read */
+#include "../common/fse.h"           /* FSE_normalizeCount, FSE_writeNCount */
+#include "../common/huf.h"           /* HUF_buildCTable, HUF_writeCTable */
+#include "../common/zstd_internal.h" /* includes zstd.h */
+#include "../common/xxhash.h"        /* XXH64 */
+#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
+#include "../zdict.h"
+#include "divsufsort.h"
+#include "../common/bits.h"          /* ZSTD_NbCommonBytes */
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define DICTLISTSIZE_DEFAULT 10000
+
+#define NOISELENGTH 32
+
+static const U32 g_selectivity_default = 9;
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#undef  DISPLAY
+#define DISPLAY(...)         do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0)
+#undef  DISPLAYLEVEL
+#define DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0)    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+
+static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }   /* difference between the current clock() value and nPrevious */
+
+static void ZDICT_printHex(const void* ptr, size_t length)
+{   /* Sends `length` bytes read from `ptr` to DISPLAY(), one "%c" at a time (characters, not hex digits, despite the name). */
+    const BYTE* const b = (const BYTE*)ptr;
+    size_t u;
+    for (u=0; u<length; u++) {
+        BYTE c = b[u];
+        if (c<32 || c>126) c = '.';   /* bytes outside the printable ASCII range 32..126 are shown as '.' */
+        DISPLAY("%c", c);
+    }
+}
+
+
+/*-********************************************************
+*  Helper functions
+**********************************************************/
+unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }   /* thin wrapper : forwards to ERR_isError() */
+
+const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }   /* thin wrapper : forwards to ERR_getErrorName() */
+
+unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
+{   /* @return : the little-endian 32-bit value stored at offset 4 of dictBuffer, or 0 if the buffer is too small or lacks the dictionary magic number */
+    if (dictSize < 8) return 0;   /* too small to hold the 4-byte magic number followed by the 4-byte ID */
+    if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;   /* magic number mismatch */
+    return MEM_readLE32((const char*)dictBuffer + 4);
+}
+
+size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
+{   /* @return : the value returned by ZSTD_loadCEntropy() on the dictionary, or an error code (dictionary_corrupted, memory_allocation) */
+    size_t headerSize;
+    if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
+
+    {   ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
+        U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
+        if (!bs || !wksp) {
+            headerSize = ERROR(memory_allocation);
+        } else {
+            ZSTD_reset_compressedBlockState(bs);
+            headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
+        }
+
+        free(bs);   /* free(NULL) is a no-op, so both calls are safe on the allocation-failure path */
+        free(wksp);
+    }
+
+    return headerSize;
+}
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+/*! ZDICT_count() :
+    @return : number of leading bytes that are identical at pIn and pMatch, compared one size_t word at a time.
+    Note : there is no length limit, the loop only stops at the first difference; this presumes the end of buffer is followed by a noisy guard band.
+*/
+static size_t ZDICT_count(const void* pIn, const void* pMatch)
+{
+    const char* const pStart = (const char*)pIn;
+    for (;;) {
+        size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);   /* zero only when the two words are identical */
+        if (!diff) {
+            pIn = (const char*)pIn+sizeof(size_t);
+            pMatch = (const char*)pMatch+sizeof(size_t);
+            continue;
+        }
+        pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff);   /* presumably the nb of identical bytes within the differing word - see bits.h */
+        return (size_t)((const char*)pIn - pStart);
+    }
+}
+
+
+typedef struct {
+    U32 pos;       /* position of the segment within the samples buffer (in slot 0 of a table : nb of elts instead) */
+    U32 length;    /* segment length, in bytes */
+    U32 savings;   /* estimated gain; tables are kept ordered by decreasing savings on insertion */
+} dictItem;
+
+static void ZDICT_initDictItem(dictItem* d)
+{   /* Resets *d; the values match what slot 0 of a dictItem table is expected to hold (presumably that is its use - confirm at call site). */
+    d->pos = 1;   /* by convention table[0].pos stores the nb of elts; real entries start at index 1 */
+    d->length = 0;
+    d->savings = (U32)(-1);   /* largest U32 : no entry can compare greater, which stops the backward scan in ZDICT_insertDictItem() */
+}
+
+
+#define LLIMIT 64          /* size of the per-length tables below (lengths are capped at LLIMIT-1); heuristic determined experimentally */
+#define MINMATCHLENGTH 7   /* minimum common length for two suffixes to count as a match; heuristic determined experimentally */
+static dictItem ZDICT_analyzePos(
+                       BYTE* doneMarks,
+                       const int* suffix, U32 start,
+                       const void* buffer, U32 minRatio, U32 notificationLevel)
+{   /* Evaluates the segment at buffer[suffix[start]]. @return : selected dictItem, or an all-zero item (length==0) when nothing is selected. Marks handled positions in doneMarks. */
+    U32 lengthList[LLIMIT] = {0};    /* lengthList[l] : nb of neighbouring suffixes whose common length with `pos` is l (capped at LLIMIT-1) */
+    U32 cumulLength[LLIMIT] = {0};   /* cumulLength[l] : sum of lengthList[l..LLIMIT-1] */
+    U32 savings[LLIMIT] = {0};       /* savings[l] : cumulated gain estimate up to length l */
+    const BYTE* b = (const BYTE*)buffer;
+    size_t maxLength = LLIMIT;
+    size_t pos = (size_t)suffix[start];
+    U32 end = start;
+    dictItem solution;
+
+    /* init */
+    memset(&solution, 0, sizeof(solution));
+    doneMarks[pos] = 1;
+
+    /* trivial repetition cases : a 2-byte pattern repeating at pos, pos+1 or pos+2 */
+    if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
+       ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
+       ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
+        /* skip and mark segment */
+        U16 const pattern16 = MEM_read16(b+pos+4);
+        U32 u, patternEnd = 6;
+        while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;   /* extend while the 2-byte pattern keeps repeating */
+        if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
+        for (u=1; u<patternEnd; u++)
+            doneMarks[pos+u] = 1;
+        return solution;   /* still all-zero : repetitive segments are never selected */
+    }
+
+    /* look forward : advance `end` through the suffix table while entries still share >= MINMATCHLENGTH bytes with pos */
+    {   size_t length;
+        do {
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+        } while (length >= MINMATCHLENGTH);
+    }
+
+    /* look backward : same in the other direction. NOTE(review): unlike the second backward scan below, there is no `start > 0` test here */
+    {   size_t length;
+        do {
+            length = ZDICT_count(b + pos, b + *(suffix+start-1));
+            if (length >=MINMATCHLENGTH) start--;
+        } while(length >= MINMATCHLENGTH);
+    }
+
+    /* exit if not found a minimum nb of repetitions */
+    if (end-start < minRatio) {
+        U32 idx;
+        for(idx=start; idx<end; idx++)
+            doneMarks[suffix[idx]] = 1;
+        return solution;
+    }
+
+    {   int i;
+        U32 mml;
+        U32 refinedStart = start;
+        U32 refinedEnd = end;
+
+        DISPLAYLEVEL(4, "\n");
+        DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u  ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
+        DISPLAYLEVEL(4, "\n");
+
+        for (mml = MINMATCHLENGTH ; ; mml++) {   /* at each offset mml, narrow [refinedStart,refinedEnd) to the longest run of entries sharing the same byte */
+            BYTE currentChar = 0;
+            U32 currentCount = 0;
+            U32 currentID = refinedStart;
+            U32 id;
+            U32 selectedCount = 0;
+            U32 selectedID = currentID;
+            for (id =refinedStart; id < refinedEnd; id++) {
+                if (b[suffix[id] + mml] != currentChar) {   /* a new run starts : keep the previous one if it is the longest so far */
+                    if (currentCount > selectedCount) {
+                        selectedCount = currentCount;
+                        selectedID = currentID;
+                    }
+                    currentID = id;
+                    currentChar = b[ suffix[id] + mml];
+                    currentCount = 0;
+                }
+                currentCount ++;
+            }
+            if (currentCount > selectedCount) {  /* for last */
+                selectedCount = currentCount;
+                selectedID = currentID;
+            }
+
+            if (selectedCount < minRatio)   /* longest run too small : stop refining */
+                break;
+            refinedStart = selectedID;
+            refinedEnd = refinedStart + selectedCount;
+        }
+
+        /* evaluate gain based on new dict */
+        start = refinedStart;
+        pos = suffix[refinedStart];
+        end = start;
+        memset(lengthList, 0, sizeof(lengthList));
+
+        /* look forward */
+        {   size_t length;
+            do {
+                end++;
+                length = ZDICT_count(b + pos, b + suffix[end]);
+                if (length >= LLIMIT) length = LLIMIT-1;   /* cap so that it remains a valid lengthList index */
+                lengthList[length]++;
+            } while (length >=MINMATCHLENGTH);
+        }
+
+        /* look backward */
+        {   size_t length = MINMATCHLENGTH;
+            while ((length >= MINMATCHLENGTH) & (start > 0)) {
+                length = ZDICT_count(b + pos, b + suffix[start - 1]);
+                if (length >= LLIMIT) length = LLIMIT - 1;
+                lengthList[length]++;
+                if (length >= MINMATCHLENGTH) start--;
+            }
+        }
+
+        /* largest useful length : longest l (>= MINMATCHLENGTH) reached by at least minRatio entries */
+        memset(cumulLength, 0, sizeof(cumulLength));
+        cumulLength[maxLength-1] = lengthList[maxLength-1];
+        for (i=(int)(maxLength-2); i>=0; i--)
+            cumulLength[i] = cumulLength[i+1] + lengthList[i];
+
+        for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
+        maxLength = i;   /* equals MINMATCHLENGTH-1 when no length qualifies; rejected by the test a few lines below */
+
+        /* reduce maxLength in case of final into repetitive data */
+        {   U32 l = (U32)maxLength;
+            BYTE const c = b[pos + maxLength-1];
+            while (b[pos+l-2]==c) l--;   /* drop trailing bytes equal to the last one */
+            maxLength = l;
+        }
+        if (maxLength < MINMATCHLENGTH) return solution;   /* skip : no long-enough solution */
+
+        /* calculate savings */
+        savings[5] = 0;   /* NOTE(review): the loop below reads savings[MINMATCHLENGTH-1] (index 6) first, which is zero from the initializer */
+        for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
+            savings[i] = savings[i-1] + (lengthList[i] * (i-3));
+
+        DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f)  \n",
+                     (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);
+
+        solution.pos = (U32)pos;
+        solution.length = (U32)maxLength;
+        solution.savings = savings[maxLength];
+
+        /* mark positions done : for every entry in [start,end), mark the bytes it shares with the solution (at most solution.length) */
+        {   U32 id;
+            for (id=start; id<end; id++) {
+                U32 p, pEnd, length;
+                U32 const testedPos = (U32)suffix[id];
+                if (testedPos == pos)
+                    length = solution.length;
+                else {
+                    length = (U32)ZDICT_count(b+pos, b+testedPos);
+                    if (length > solution.length) length = solution.length;
+                }
+                pEnd = (U32)(testedPos + length);
+                for (p=testedPos; p<pEnd; p++)
+                    doneMarks[p] = 1;
+    }   }   }
+
+    return solution;
+}
+
+
+static int isIncluded(const void* in, const void* container, size_t length)
+{   /* @return : 1 when the first `length` bytes at `in` and at `container` are identical, 0 otherwise */
+    const char* const ip = (const char*) in;
+    const char* const into = (const char*) container;
+    size_t u;
+
+    for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
+        if (ip[u] != into[u]) break;
+    }
+
+    return u==length;   /* the loop ran to completion only if no byte differed */
+}
+
+/*! ZDICT_tryMerge() :
+    check if `elt` overlaps an entry of `table` (entry `eltNbToSkip` is ignored); if so, extend that entry to cover both and improve its rank
+    @return : id of destination elt, 0 if not merged
+*/
+static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
+{
+    const U32 tableSize = table->pos;   /* table[0].pos stores the nb of elts */
+    const U32 eltEnd = elt.pos + elt.length;
+    const char* const buf = (const char*) buffer;
+
+    /* tail overlap */
+    U32 u; for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+        if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
+            /* append : the existing entry is extended backward so that it starts at elt.pos */
+            U32 const addedLength = table[u].pos - elt.pos;
+            table[u].length += addedLength;
+            table[u].pos = elt.pos;
+            table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            elt = table[u];
+            /* sort : improve rank */
+            while ((u>1) && (table[u-1].savings < elt.savings))
+                table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+    }   }
+
+    /* front overlap */
+    for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+
+        if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
+            /* append : the existing entry is extended forward up to eltEnd */
+            int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            if (addedLength > 0) {   /* otherwise, elt fully included into existing */
+                table[u].length += addedLength;
+                table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            }
+            /* sort : improve rank */
+            elt = table[u];
+            while ((u>1) && (table[u-1].savings < elt.savings))
+                table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+        }
+
+        if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {   /* cheap 8-byte test before the full comparison */
+            if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {   /* existing entry's content is also found 1 byte after elt.pos */
+                size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
+                table[u].pos = elt.pos;
+                table[u].savings += (U32)(elt.savings * addedLength / elt.length);
+                table[u].length = MIN(elt.length, table[u].length + 1);
+                return u;   /* note : this path returns without re-ranking the entry */
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static void ZDICT_removeDictItem(dictItem* table, U32 id)
+{   /* Deletes entry `id` by moving every following entry down one slot, then decrements the elt count. */
+    /* convention : table[0].pos stores nb of elts */
+    U32 const max = table[0].pos;
+    U32 u;
+    if (!id) return;   /* protection, should never happen */
+    for (u=id; u<max-1; u++)   /* close the gap left by entry `id` */
+        table[u] = table[u+1];
+    table->pos--;
+}
+
+
+static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
+{   /* Adds `elt` to `table` : merged into an overlapping entry when possible, otherwise inserted at its rank by decreasing savings. */
+    /* merge if possible */
+    U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
+    if (mergeId) {
+        U32 newMerge = 1;
+        while (newMerge) {   /* the enlarged entry may now overlap others : keep merging until nothing merges */
+            newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
+            if (newMerge) ZDICT_removeDictItem(table, mergeId);   /* entry mergeId was absorbed into entry newMerge */
+            mergeId = newMerge;
+        }
+        return;
+    }
+
+    /* insert */
+    {   U32 current;
+        U32 nextElt = table->pos;
+        if (nextElt >= maxSize) nextElt = maxSize-1;   /* table full : the last slot gets overwritten by the shift below */
+        current = nextElt-1;
+        while (table[current].savings < elt.savings) {   /* no index test : presumably relies on table[0].savings being the maximum U32 - confirm */
+            table[current+1] = table[current];
+            current--;
+        }
+        table[current+1] = elt;
+        table->pos = nextElt+1;
+    }
+}
+
+
+static U32 ZDICT_dictSize(const dictItem* dictList)
+{   /* @return : sum of the `length` fields of entries 1 .. dictList[0].pos-1 */
+    U32 u, dictSize = 0;
+    for (u=1; u<dictList[0].pos; u++)   /* slot 0 is skipped : its pos field is used as the loop bound */
+        dictSize += dictList[u].length;
+    return dictSize;
+}
+
+
+static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
+                            const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
+                            const size_t* fileSizes, unsigned nbFiles,
+                            unsigned minRatio, U32 notificationLevel)
+{   /* Sorts the suffixes of `buffer`, then scans it and inserts every selected segment into dictList. @return : 0, or an error code. */
+    int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
+    int* const suffix = suffix0+1;   /* suffix[-1] and suffix[bufferSize] are guard entries set after sorting. NOTE(review): computed before the NULL check below */
+    U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
+    BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks));   /* +16 for overflow security */
+    U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
+    size_t result = 0;
+    clock_t displayClock = 0;
+    clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;   /* minimum clock() span between two DISPLAYUPDATE outputs */
+
+#   undef  DISPLAYUPDATE
+#   define DISPLAYUPDATE(l, ...)                                   \
+        do {                                                       \
+            if (notificationLevel>=l) {                            \
+                if (ZDICT_clockSpan(displayClock) > refreshRate) { \
+                    displayClock = clock();                        \
+                    DISPLAY(__VA_ARGS__);                          \
+                }                                                  \
+                if (notificationLevel>=4) fflush(stderr);          \
+            }                                                      \
+        } while (0)
+
+    /* init */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
+        result = ERROR(memory_allocation);
+        goto _cleanup;
+    }
+    if (minRatio < MINRATIO) minRatio = MINRATIO;
+    memset(doneMarks, 0, bufferSize+16);
+
+    /* limit sample set size (divsufsort limitation)*/
+    if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
+    while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];   /* drop whole samples, last one first */
+
+    /* sort */
+    DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
+    {   int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
+        if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
+    }
+    suffix[bufferSize] = (int)bufferSize;   /* leads into noise */
+    suffix0[0] = (int)bufferSize;           /* leads into noise */
+    /* build reverse suffix sort : reverseSuffix[p] is the index within suffix[] of buffer position p */
+    {   size_t pos;
+        for (pos=0; pos < bufferSize; pos++)
+            reverseSuffix[suffix[pos]] = (U32)pos;
+        /* note filePos tracks borders between samples.
+           It's not used at this stage, but planned to become useful in a later update */
+        filePos[0] = 0;
+        for (pos=1; pos<nbFiles; pos++)
+            filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
+    }
+
+    DISPLAYLEVEL(2, "finding patterns ... \n");
+    DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
+
+    {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
+            dictItem solution;
+            if (doneMarks[cursor]) { cursor++; continue; }   /* position already handled by a previous analysis */
+            solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
+            if (solution.length==0) { cursor++; continue; }   /* nothing selected at this position */
+            ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
+            cursor += solution.length;   /* jump past the selected segment */
+            DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0);
+    }   }
+
+_cleanup:
+    free(suffix0);
+    free(reverseSuffix);
+    free(doneMarks);
+    free(filePos);
+    return result;
+}
+
+
+static void ZDICT_fillNoise(void* buffer, size_t length)
+{   /* Fills `length` bytes of `buffer` with a fixed pseudo-random sequence (same output on every call : no external seed). */
+    unsigned const prime1 = 2654435761U;
+    unsigned const prime2 = 2246822519U;
+    unsigned acc = prime1;
+    size_t p=0;
+    for (p=0; p<length; p++) {
+        acc *= prime2;   /* unsigned multiplication : wraps around */
+        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);   /* output byte = bits 21..28 of the accumulator */
+    }
+}
+
+
+typedef struct   /* resources handed to ZDICT_countEStats(); created and released by ZDICT_analyzeEntropy() */
+{
+    ZSTD_CDict* dict;    /* dictionary */
+    ZSTD_CCtx* zc;     /* working context */
+    void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated : used as destination buffer for the compressed block */
+} EStats_ress_t;
+
+#define MAXREPOFFSET 1024   /* size of the repOffsets[] histogram; larger offsets are folded into slot 0 */
+
+static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
+                              unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
+                              const void* src, size_t srcSize,
+                              U32 notificationLevel)
+{   /* Compresses one sample as a single block with esr.dict and adds its literal / sequence-code statistics to the given histograms. */
+    size_t const blockSizeMax = MIN ((size_t)ZSTD_BLOCKSIZE_MAX, (size_t)1 << params->cParams.windowLog);   /* shift done in size_t : `1 << 31` on a plain int is undefined behavior */
+    size_t cSize;
+
+    if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
+    {   size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
+
+    }
+    cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
+    if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
+
+    if (cSize) {  /* if == 0; block is not compressible */
+        const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
+
+        /* literals stats */
+        {   const BYTE* bytePtr;
+            for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
+                countLit[*bytePtr]++;
+        }
+
+        /* seqStats */
+        {   U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+            ZSTD_seqToCodes(seqStorePtr);   /* presumably fills the ofCode / mlCode / llCode arrays read below - confirm */
+
+            {   const BYTE* codePtr = seqStorePtr->ofCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->mlCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->llCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
+            }
+
+            if (nbSeq >= 2) { /* rep offsets */
+                const seqDef* const seq = seqStorePtr->sequencesStart;
+                U32 offset1 = seq[0].offBase - ZSTD_REP_NUM;
+                U32 offset2 = seq[1].offBase - ZSTD_REP_NUM;
+                if (offset1 >= MAXREPOFFSET) offset1 = 0;   /* also catches the unsigned wrap when offBase < ZSTD_REP_NUM */
+                if (offset2 >= MAXREPOFFSET) offset2 = 0;
+                repOffsets[offset1] += 3;   /* first offset of the block is weighted 3, second one 1 */
+                repOffsets[offset2] += 1;
+    }   }   }
+}
+
+static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
+{   /* @return : sum of the first `nbFiles` entries of `fileSizes` (0 when nbFiles==0) */
+    size_t total=0;
+    unsigned u;
+    for (u=0; u<nbFiles; u++) total += fileSizes[u];
+    return total;
+}
+
+typedef struct { U32 offset; U32 count; } offsetCount_t;   /* an offset value paired with its count; ranked by ZDICT_insertSortCount() */
+
+static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
+{   /* Inserts (val, count) into `table`, ordered by decreasing count; whatever ends up in the extra last slot is overwritten by the next call. */
+    U32 u;
+    table[ZSTD_REP_NUM].offset = val;   /* the candidate starts in the extra last slot */
+    table[ZSTD_REP_NUM].count = count;
+    for (u=ZSTD_REP_NUM; u>0; u--) {   /* move it up while its count is strictly greater than its predecessor's */
+        offsetCount_t tmp;
+        if (table[u-1].count >= table[u].count) break;
+        tmp = table[u-1];
+        table[u-1] = table[u];
+        table[u] = tmp;
+    }
+}
+
+/* ZDICT_flatLit() :
+ * overwrite all 256 entries of `countLit` with a mostly flat but still compressible distribution of literals.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(unsigned* countLit)
+{
+    int u;
+    for (u=1; u<256; u++) countLit[u] = 2;   /* baseline weight for every symbol */
+    countLit[0]   = 4;   /* one symbol weighted above the baseline */
+    countLit[253] = 1;   /* two symbols weighted below the baseline */
+    countLit[254] = 1;
+}
+
+#define OFFCODE_MAX 30  /* only applicable to first block */
+static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
+                                   int compressionLevel,
+                             const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
+                             const void* dictBuffer, size_t  dictBufferSize,
+                                   unsigned notificationLevel)
+{   /* Gathers statistics by compressing each sample with the dictionary content, then writes the Huffman table, 3 FSE NCount tables and 3 rep offsets into dstBuffer. @return : nb of bytes written, or an error code. */
+    unsigned countLit[256];
+    HUF_CREATE_STATIC_CTABLE(hufTable, 255);
+    unsigned offcodeCount[OFFCODE_MAX+1];
+    short offcodeNCount[OFFCODE_MAX+1];
+    U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
+    unsigned matchLengthCount[MaxML+1];
+    short matchLengthNCount[MaxML+1];
+    unsigned litLengthCount[MaxLL+1];
+    short litLengthNCount[MaxLL+1];
+    U32 repOffset[MAXREPOFFSET];
+    offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
+    EStats_ress_t esr = { NULL, NULL, NULL };   /* all NULL so that _cleanup can run safely from any point */
+    ZSTD_parameters params;
+    U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
+    size_t pos = 0, errorCode;
+    size_t eSize = 0;
+    size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
+    size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);   /* `+ !nbFiles` makes the divisor 1 when nbFiles==0 */
+    BYTE* dstPtr = (BYTE*)dstBuffer;
+    U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32];
+
+    /* init */
+    DEBUGLOG(4, "ZDICT_analyzeEntropy");
+    if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
+    for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
+    for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
+    for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
+    for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
+    memset(repOffset, 0, sizeof(repOffset));
+    repOffset[1] = repOffset[4] = repOffset[8] = 1;
+    memset(bestRepOffset, 0, sizeof(bestRepOffset));
+    if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
+    params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
+
+    esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
+    if (!esr.dict || !esr.zc || !esr.workPlace) {
+        eSize = ERROR(memory_allocation);
+        DISPLAYLEVEL(1, "Not enough memory \n");
+        goto _cleanup;
+    }
+
+    /* collect stats on all samples */
+    for (u=0; u<nbFiles; u++) {
+        ZDICT_countEStats(esr, &params,
+                          countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
+                         (const char*)srcBuffer + pos, fileSizes[u],
+                          notificationLevel);
+        pos += fileSizes[u];   /* samples are stored back to back in srcBuffer */
+    }
+
+    if (notificationLevel >= 4) {
+        /* writeStats */
+        DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
+        for (u=0; u<=offcodeMax; u++) {
+            DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
+    }   }
+
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
+        if (HUF_isError(maxNbBits)) {
+            eSize = maxNbBits;
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
+            assert(maxNbBits==9);
+        }
+        huffLog = (U32)maxNbBits;
+    }
+
+    /* looking for most common first offsets */
+    {   U32 offset;
+        for (offset=1; offset<MAXREPOFFSET; offset++)
+            ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
+    }
+    /* note : the result of this phase should be used to better appreciate the impact on statistics */
+
+    total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
+    errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
+    if (FSE_isError(errorCode)) {
+        eSize = errorCode;
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
+        goto _cleanup;
+    }
+    Offlog = (U32)errorCode;   /* on success, the value returned by FSE_normalizeCount() replaces the initial table log */
+
+    total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
+    errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
+    if (FSE_isError(errorCode)) {
+        eSize = errorCode;
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
+        goto _cleanup;
+    }
+    mlLog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
+    errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
+    if (FSE_isError(errorCode)) {
+        eSize = errorCode;
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
+        goto _cleanup;
+    }
+    llLog = (U32)errorCode;
+
+    /* write result to buffer : each section advances dstPtr and shrinks maxDstSize by the nb of bytes written */
+    {   size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp));
+        if (HUF_isError(hhSize)) {
+            eSize = hhSize;
+            DISPLAYLEVEL(1, "HUF_writeCTable error \n");
+            goto _cleanup;
+        }
+        dstPtr += hhSize;
+        maxDstSize -= hhSize;
+        eSize += hhSize;
+    }
+
+    {   size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);   /* NOTE(review): OFFCODE_MAX here vs offcodeMax at normalization - confirm entries above offcodeMax are never read */
+        if (FSE_isError(ohSize)) {
+            eSize = ohSize;
+            DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += ohSize;
+        maxDstSize -= ohSize;
+        eSize += ohSize;
+    }
+
+    {   size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
+        if (FSE_isError(mhSize)) {
+            eSize = mhSize;
+            DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += mhSize;
+        maxDstSize -= mhSize;
+        eSize += mhSize;
+    }
+
+    {   size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
+        if (FSE_isError(lhSize)) {
+            eSize = lhSize;
+            DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += lhSize;
+        maxDstSize -= lhSize;
+        eSize += lhSize;
+    }
+
+    if (maxDstSize<12) {   /* 3 rep offsets of 4 bytes each */
+        eSize = ERROR(dstSize_tooSmall);
+        DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
+        goto _cleanup;
+    }
+# if 0
+    MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
+    MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
+    MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
+#else
+    /* at this stage, we don't use the result of "most common first offset",
+     * as the impact of statistics is not properly evaluated */
+    MEM_writeLE32(dstPtr+0, repStartValue[0]);
+    MEM_writeLE32(dstPtr+4, repStartValue[1]);
+    MEM_writeLE32(dstPtr+8, repStartValue[2]);
+#endif
+    eSize += 12;
+
+_cleanup:
+    ZSTD_freeCDict(esr.dict);
+    ZSTD_freeCCtx(esr.zc);
+    free(esr.workPlace);
+
+    return eSize;
+}
+
+
+/**
+ * @returns the largest of the ZSTD_REP_NUM values stored in `reps`
+ */
+static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
+{
+    U32 maxRep = reps[0];
+    int r;
+    for (r = 1; r < ZSTD_REP_NUM; ++r)   /* reps[0] is already the starting value */
+        maxRep = MAX(maxRep, reps[r]);
+    return maxRep;
+}
+
+size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
+                          const void* customDictContent, size_t dictContentSize,
+                          const void* samplesBuffer, const size_t* samplesSizes,
+                          unsigned nbSamples, ZDICT_params_t params)
+{   /* Writes header (magic, dictID, entropy tables), optional zero padding, then content into dictBuffer; returns total size or an error code */
+    size_t hSize;   /* number of header bytes produced so far */
+#define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
+    BYTE header[HBUFFSIZE];   /* header is staged here and copied into dictBuffer last */
+    int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
+    U32 const notificationLevel = params.notificationLevel;
+    /* The final dictionary content must be at least as large as the largest repcode */
+    size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
+    size_t paddingSize;
+
+    /* check conditions */
+    DEBUGLOG(4, "ZDICT_finalizeDictionary");
+    if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
+
+    /* dictionary header : 4-byte magic number followed by 4-byte dictID (both little-endian) */
+    MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
+    {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);   /* hash of the content, seed 0 */
+        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;   /* maps the hash into [32768, 2^31 - 1] */
+        U32 const dictID = params.dictID ? params.dictID : compliantID;   /* params.dictID == 0 => use the hash-derived ID */
+        MEM_writeLE32(header+4, dictID);
+    }
+    hSize = 8;
+
+    /* entropy tables : appended to the staged header, right after magic + dictID */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  customDictContent, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(eSize)) return eSize;
+        hSize += eSize;
+    }
+
+    /* Shrink the content size if it doesn't fit in the buffer */
+    if (hSize + dictContentSize > dictBufferCapacity) {
+        dictContentSize = dictBufferCapacity - hSize;   /* only the first dictContentSize bytes of customDictContent get copied below */
+    }
+
+    /* Pad the dictionary content with zeros if it is too small */
+    if (dictContentSize < minContentSize) {
+        RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
+                        "dictBufferCapacity too small to fit max repcode");
+        paddingSize = minContentSize - dictContentSize;
+    } else {
+        paddingSize = 0;
+    }
+
+    {
+        size_t const dictSize = hSize + paddingSize + dictContentSize;
+
+        /* The dictionary consists of the header, optional padding, and the content.
+         * The padding comes before the content because the "best" position in the
+         * dictionary is the last byte.
+         */
+        BYTE* const outDictHeader = (BYTE*)dictBuffer;
+        BYTE* const outDictPadding = outDictHeader + hSize;
+        BYTE* const outDictContent = outDictPadding + paddingSize;
+
+        assert(dictSize <= dictBufferCapacity);
+        assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
+
+        /* First copy the customDictContent into its final location.
+         * `customDictContent` and `dictBuffer` may overlap, so we must
+         * do this before any other writes into the output buffer.
+         * Then copy the header & padding into the output buffer.
+         */
+        memmove(outDictContent, customDictContent, dictContentSize);   /* memmove, not memcpy : regions may overlap */
+        memcpy(outDictHeader, header, hSize);
+        memset(outDictPadding, 0, paddingSize);
+
+        return dictSize;
+    }
+}
+
+
+static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
+        void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+        ZDICT_params_t params)
+{   /* Reads the content from the last dictContentSize bytes of dictBuffer and writes magic, dictID and entropy tables at its start */
+    int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
+    U32 const notificationLevel = params.notificationLevel;
+    size_t hSize = 8;   /* room reserved for magic number + dictID, filled in below */
+
+    /* calculate entropy tables, written directly into dictBuffer after the 8 reserved bytes */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,   /* NOTE(review): no visible check that this output cannot overlap the content at the end of dictBuffer - confirm */
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(eSize)) return eSize;
+        hSize += eSize;
+    }
+
+    /* add dictionary header (after entropy tables) */
+    MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
+    {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);   /* hash of the content, seed 0 */
+        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;   /* maps the hash into [32768, 2^31 - 1] */
+        U32 const dictID = params.dictID ? params.dictID : compliantID;   /* params.dictID == 0 => use the hash-derived ID */
+        MEM_writeLE32((char*)dictBuffer+4, dictID);
+    }
+
+    if (hSize + dictContentSize < dictBufferCapacity)   /* gap between header and content : slide the content down to close it */
+        memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
+    return MIN(dictBufferCapacity, hSize+dictContentSize);   /* reported size never exceeds the buffer capacity */
+}
+
+/*! ZDICT_trainFromBuffer_unsafe_legacy() : builds a segment list from the samples, lays the kept segments out at the end of `dictBuffer`, then adds header + entropy tables.
+*   Warning : `samplesBuffer` must be followed by a noisy guard band (ZDICT_trainFromBuffer_legacy() provides one) !!!
+*   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
+*/
+static size_t ZDICT_trainFromBuffer_unsafe_legacy(
+                            void* dictBuffer, size_t maxDictSize,
+                            const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                            ZDICT_legacy_params_t params)
+{
+    U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));   /* largest of the three candidates */
+    dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));   /* freed on every return path below */
+    unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;   /* 0 => default */
+    unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
+    size_t const targetDictSize = maxDictSize;
+    size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    size_t dictSize = 0;
+    U32 const notificationLevel = params.zParams.notificationLevel;
+
+    /* checks */
+    if (!dictList) return ERROR(memory_allocation);
+    if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
+    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
+
+    /* init */
+    ZDICT_initDictItem(dictList);
+
+    /* build dictionary */
+    ZDICT_trainBuffer_legacy(dictList, dictListSize,
+                       samplesBuffer, samplesBuffSize,
+                       samplesSizes, nbSamples,
+                       minRep, notificationLevel);
+
+    /* display best matches (only at notification level 3 or above) */
+    if (params.zParams.notificationLevel>= 3) {
+        unsigned const nb = MIN(25, dictList[0].pos);   /* entries are listed from index 1, so at most 24 are shown */
+        unsigned const dictContentSize = ZDICT_dictSize(dictList);
+        unsigned u;
+        DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
+        DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
+        for (u=1; u<nb; u++) {
+            unsigned const pos = dictList[u].pos;
+            unsigned const length = dictList[u].length;
+            U32 const printedLength = MIN(40, length);   /* print at most 40 bytes per segment */
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
+                free(dictList);
+                return ERROR(GENERIC);   /* should never happen */
+            }
+            DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
+                         u, length, pos, (unsigned)dictList[u].savings);
+            ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
+            DISPLAYLEVEL(3, "| \n");
+    }   }
+
+
+    /* create dictionary */
+    {   unsigned dictContentSize = ZDICT_dictSize(dictList);
+        if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
+        if (dictContentSize < targetDictSize/4) {   /* advisory messages only : no effect on the result */
+            DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
+            if (samplesBuffSize < 10 * targetDictSize)
+                DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
+            if (minRep > MINRATIO) {
+                DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
+                DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
+            }
+        }
+
+        if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {   /* advisory messages only */
+            unsigned proposedSelectivity = selectivity-1;
+            while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }   /* ends at 0 at the latest, since nbSamples > 2*MINRATIO */
+            DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
+            DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
+            DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
+        }
+
+        /* limit dictionary size : keep leading entries while their cumulated length stays <= targetDictSize */
+        {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
+            U32 currentSize = 0;
+            U32 n; for (n=1; n<max; n++) {
+                currentSize += dictList[n].length;
+                if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }   /* entry n does not fit : undo and stop */
+            }
+            dictList->pos = n;   /* entries at index >= n are ignored from here on */
+            dictContentSize = currentSize;
+        }
+
+        /* build dict content : segments are copied back-to-front, so entry 1 ends at the last byte of dictBuffer */
+        {   U32 u;
+            BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
+            for (u=1; u<dictList->pos; u++) {
+                U32 l = dictList[u].length;
+                ptr -= l;
+                if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); }   /* should not happen */
+                memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
+        }   }
+
+        dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,   /* reads the content from the end of dictBuffer */
+                                                             samplesBuffer, samplesSizes, nbSamples,
+                                                             params.zParams);
+    }
+
+    /* clean up */
+    free(dictList);
+    return dictSize;
+}
+
+
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer needs to be followed by a noisy guard band.
+ * workaround : duplicate the buffer, and append NOISELENGTH bytes of noise */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              ZDICT_legacy_params_t params)
+{
+    size_t result;
+    void* newBuff;   /* private copy of the samples, followed by the guard band */
+    size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* not enough content => no dictionary */
+
+    newBuff = malloc(sBuffSize + NOISELENGTH);
+    if (!newBuff) return ERROR(memory_allocation);
+
+    memcpy(newBuff, samplesBuffer, sBuffSize);
+    ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
+
+    result =
+        ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
+                                            samplesSizes, nbSamples, params);
+    free(newBuff);   /* released whether training succeeded or returned an error code */
+    return result;
+}
+
+
+size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{   /* Forwards to ZDICT_optimizeTrainFromBuffer_fastCover() with a fixed parameter set and returns its result */
+    ZDICT_fastCover_params_t params;
+    DEBUGLOG(3, "ZDICT_trainFromBuffer");
+    memset(&params, 0, sizeof(params));   /* every field not set below stays 0 */
+    params.d = 8;
+    params.steps = 4;
+    /* Use default level since no compression level information is available */
+    params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
+    params.zParams.notificationLevel = DEBUGLEVEL;   /* debug builds only : notification level follows DEBUGLEVEL */
+#endif
+    return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
+                                               samplesBuffer, samplesSizes, nbSamples,
+                                               &params);
+}
+
+size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{   /* Convenience wrapper : calls the _advanced variant with all-zero parameters and returns its result */
+    ZDICT_params_t params;
+    memset(&params, 0, sizeof(params));   /* compressionLevel 0 => ZSTD_CLEVEL_DEFAULT, dictID 0 => hash-derived ID (both resolved in the _advanced variant) */
+    return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
+                                                     samplesBuffer, samplesSizes, nbSamples,
+                                                     params);
+}
diff --git a/src/zstd/entropy_common.c b/src/zstd/entropy_common.c
deleted file mode 100644
index e49b85d2a..000000000
--- a/src/zstd/entropy_common.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/* ******************************************************************
- * Common functions of New Generation Entropy library
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
- *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
-
-/* *************************************
- *  Dependencies
- ***************************************/
-#include "mem.h"
-#include "error_private.h"		/* ERR_*, ERROR */
-#define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */
-#include "fse.h"
-#include "huf.h"
-#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */
-
-/*===   Version   ===*/
-unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
-
-/*===   Error Management   ===*/
-// unsigned int FSE_isError(size_t code) { return ERR_isError(code); }
-char const* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
-
-// unsigned int HUF_isError(size_t code) { return ERR_isError(code); }
-char const* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
-
-/*-**************************************************************
- *  FSE NCount encoding-decoding
- ****************************************************************/
-FORCE_INLINE_TEMPLATE
-size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, void const* headerBuffer, size_t hbSize) {
-	const BYTE* const istart = (const BYTE*)headerBuffer;
-	const BYTE* const iend = istart + hbSize;
-	const BYTE* ip = istart;
-	int nbBits;
-	int remaining;
-	int threshold;
-	U32 bitStream;
-	int bitCount;
-	unsigned charnum = 0;
-	unsigned const maxSV1 = *maxSVPtr + 1;
-	int previous0 = 0;
-
-	if(hbSize < 8) {
-		/* This function only works when hbSize >= 8 */
-		char buffer[8] = {0};
-		ZSTD_memcpy(buffer, headerBuffer, hbSize);
-		{
-			size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr, buffer, sizeof(buffer));
-			if(FSE_isError(countSize))
-				return countSize;
-			if(countSize > hbSize)
-				return ERROR(corruption_detected);
-			return countSize;
-		}
-	}
-	assert(hbSize >= 8);
-
-	/* init */
-	ZSTD_memset(normalizedCounter, 0, (*maxSVPtr + 1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */
-	bitStream = MEM_readLE32(ip);
-	nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */
-	if(nbBits > FSE_TABLELOG_ABSOLUTE_MAX)
-		return ERROR(tableLog_tooLarge);
-	bitStream >>= 4;
-	bitCount = 4;
-	*tableLogPtr = nbBits;
-	remaining = (1 << nbBits) + 1;
-	threshold = 1 << nbBits;
-	nbBits++;
-
-	for(;;) {
-		if(previous0) {
-			/* Count the number of repeats. Each time the
-			 * 2-bit repeat code is 0b11 there is another
-			 * repeat.
-			 * Avoid UB by setting the high bit to 1.
-			 */
-			int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
-			while(repeats >= 12) {
-				charnum += 3 * 12;
-				if(LIKELY(ip <= iend - 7)) {
-					ip += 3;
-				} else {
-					bitCount -= (int)(8 * (iend - 7 - ip));
-					bitCount &= 31;
-					ip = iend - 4;
-				}
-				bitStream = MEM_readLE32(ip) >> bitCount;
-				repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
-			}
-			charnum += 3 * repeats;
-			bitStream >>= 2 * repeats;
-			bitCount += 2 * repeats;
-
-			/* Add the final repeat which isn't 0b11. */
-			assert((bitStream & 3) < 3);
-			charnum += bitStream & 3;
-			bitCount += 2;
-
-			/* This is an error, but break and return an error
-			 * at the end, because returning out of a loop makes
-			 * it harder for the compiler to optimize.
-			 */
-			if(charnum >= maxSV1)
-				break;
-
-			/* We don't need to set the normalized count to 0
-			 * because we already memset the whole buffer to 0.
-			 */
-
-			if(LIKELY(ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) {
-				assert((bitCount >> 3) <= 3); /* For first condition to work */
-				ip += bitCount >> 3;
-				bitCount &= 7;
-			} else {
-				bitCount -= (int)(8 * (iend - 4 - ip));
-				bitCount &= 31;
-				ip = iend - 4;
-			}
-			bitStream = MEM_readLE32(ip) >> bitCount;
-		}
-		{
-			int const max = (2 * threshold - 1) - remaining;
-			int count;
-
-			if((bitStream & (threshold - 1)) < (U32)max) {
-				count = bitStream & (threshold - 1);
-				bitCount += nbBits - 1;
-			} else {
-				count = bitStream & (2 * threshold - 1);
-				if(count >= threshold)
-					count -= max;
-				bitCount += nbBits;
-			}
-
-			count--; /* extra accuracy */
-			/* When it matters (small blocks), this is a
-			 * predictable branch, because we don't use -1.
-			 */
-			if(count >= 0) {
-				remaining -= count;
-			} else {
-				assert(count == -1);
-				remaining += count;
-			}
-			normalizedCounter[charnum++] = (short)count;
-			previous0 = !count;
-
-			assert(threshold > 1);
-			if(remaining < threshold) {
-				/* This branch can be folded into the
-				 * threshold update condition because we
-				 * know that threshold > 1.
-				 */
-				if(remaining <= 1)
-					break;
-				nbBits = ZSTD_highbit32(remaining) + 1;
-				threshold = 1 << (nbBits - 1);
-			}
-			if(charnum >= maxSV1)
-				break;
-
-			if(LIKELY(ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) {
-				ip += bitCount >> 3;
-				bitCount &= 7;
-			} else {
-				bitCount -= (int)(8 * (iend - 4 - ip));
-				bitCount &= 31;
-				ip = iend - 4;
-			}
-			bitStream = MEM_readLE32(ip) >> bitCount;
-		}
-	}
-	if(remaining != 1)
-		return ERROR(corruption_detected);
-	/* Only possible when there are too many zeros. */
-	if(charnum > maxSV1)
-		return ERROR(maxSymbolValue_tooSmall);
-	if(bitCount > 32)
-		return ERROR(corruption_detected);
-	*maxSVPtr = charnum - 1;
-
-	ip += (bitCount + 7) >> 3;
-	return ip - istart;
-}
-
-/* Avoids the FORCE_INLINE of the _body() function. */
-static size_t FSE_readNCount_body_default(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, void const* headerBuffer, size_t hbSize) { return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); }
-
-#if DYNAMIC_BMI2
-BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, void const* headerBuffer, size_t hbSize) { return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); }
-#endif
-
-size_t FSE_readNCount_bmi2(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, void const* headerBuffer, size_t hbSize, int bmi2) {
-#if DYNAMIC_BMI2
-	if(bmi2) {
-		return FSE_readNCount_body_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
-	}
-#endif
-	(void)bmi2;
-	return FSE_readNCount_body_default(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
-}
-
-size_t FSE_readNCount(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, void const* headerBuffer, size_t hbSize) { return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0); }
-
-/*! HUF_readStats() :
-	Read compact Huffman tree, saved by HUF_writeCTable().
-	`huffWeight` is destination buffer.
-	`rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
-	@return : size read from `src` , or an error Code .
-	Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
-*/
-size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, void const* src, size_t srcSize) {
-	U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
-	return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
-}
-
-FORCE_INLINE_TEMPLATE size_t HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, void const* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) {
-	U32 weightTotal;
-	const BYTE* ip = (const BYTE*)src;
-	size_t iSize;
-	size_t oSize;
-
-	if(!srcSize)
-		return ERROR(srcSize_wrong);
-	iSize = ip[0];
-	/* ZSTD_memset(huffWeight, 0, hwSize);   */ /* is not necessary, even though some analyzer complain ... */
-
-	if(iSize >= 128) { /* special header */
-		oSize = iSize - 127;
-		iSize = ((oSize + 1) / 2);
-		if(iSize + 1 > srcSize)
-			return ERROR(srcSize_wrong);
-		if(oSize >= hwSize)
-			return ERROR(corruption_detected);
-		ip += 1;
-		{
-			U32 n;
-			for(n = 0; n < oSize; n += 2) {
-				huffWeight[n] = ip[n / 2] >> 4;
-				huffWeight[n + 1] = ip[n / 2] & 15;
-			}
-		}
-	} else { /* header compressed with FSE (normal case) */
-		if(iSize + 1 > srcSize)
-			return ERROR(srcSize_wrong);
-		/* max (hwSize-1) values decoded, as last one is implied */
-		oSize = FSE_decompress_wksp_bmi2(huffWeight, hwSize - 1, ip + 1, iSize, 6, workSpace, wkspSize, bmi2);
-		if(FSE_isError(oSize))
-			return oSize;
-	}
-
-	/* collect weight stats */
-	ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
-	weightTotal = 0;
-	{
-		U32 n;
-		for(n = 0; n < oSize; n++) {
-			if(huffWeight[n] > HUF_TABLELOG_MAX)
-				return ERROR(corruption_detected);
-			rankStats[huffWeight[n]]++;
-			weightTotal += (1 << huffWeight[n]) >> 1;
-		}
-	}
-	if(weightTotal == 0)
-		return ERROR(corruption_detected);
-
-	/* get last non-null symbol weight (implied, total must be 2^n) */
-	{
-		U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
-		if(tableLog > HUF_TABLELOG_MAX)
-			return ERROR(corruption_detected);
-		*tableLogPtr = tableLog;
-		/* determine last weight */
-		{
-			U32 const total = 1 << tableLog;
-			U32 const rest = total - weightTotal;
-			U32 const verif = 1 << ZSTD_highbit32(rest);
-			U32 const lastWeight = ZSTD_highbit32(rest) + 1;
-			if(verif != rest)
-				return ERROR(corruption_detected); /* last value must be a clean power of 2 */
-			huffWeight[oSize] = (BYTE)lastWeight;
-			rankStats[lastWeight]++;
-		}
-	}
-
-	/* check tree construction validity */
-	if((rankStats[1] < 2) || (rankStats[1] & 1))
-		return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */
-
-	/* results */
-	*nbSymbolsPtr = (U32)(oSize + 1);
-	return iSize + 1;
-}
-
-/* Avoids the FORCE_INLINE of the _body() function. */
-static size_t HUF_readStats_body_default(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, void const* src, size_t srcSize, void* workSpace, size_t wkspSize) {
-	return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 0);
-}
-
-#if DYNAMIC_BMI2
-static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, void const* src, size_t srcSize, void* workSpace, size_t wkspSize) {
-	return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 1);
-}
-#endif
-
-size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, void const* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) {
-#if DYNAMIC_BMI2
-	if(flags & HUF_flags_bmi2) {
-		return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
-	}
-#endif
-	(void)flags;
-	return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
-}
diff --git a/src/zstd/error_private.c b/src/zstd/error_private.c
deleted file mode 100644
index 6056fce01..000000000
--- a/src/zstd/error_private.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* The purpose of this file is to have a single list of error strings embedded in binary */
-
-#include "error_private.h"
-
-char const* ERR_getErrorString(ERR_enum code) {
-#ifdef ZSTD_STRIP_ERROR_STRINGS
-	(void)code;
-	return "Error strings stripped";
-#else
-	static char const* const notErrorCode = "Unspecified error code";
-	switch(code) {
-	case PREFIX(no_error):
-		return "No error detected";
-	case PREFIX(GENERIC):
-		return "Error (generic)";
-	case PREFIX(prefix_unknown):
-		return "Unknown frame descriptor";
-	case PREFIX(version_unsupported):
-		return "Version not supported";
-	case PREFIX(frameParameter_unsupported):
-		return "Unsupported frame parameter";
-	case PREFIX(frameParameter_windowTooLarge):
-		return "Frame requires too much memory for decoding";
-	case PREFIX(corruption_detected):
-		return "Data corruption detected";
-	case PREFIX(checksum_wrong):
-		return "Restored data doesn't match checksum";
-	case PREFIX(literals_headerWrong):
-		return "Header of Literals' block doesn't respect format specification";
-	case PREFIX(parameter_unsupported):
-		return "Unsupported parameter";
-	case PREFIX(parameter_combination_unsupported):
-		return "Unsupported combination of parameters";
-	case PREFIX(parameter_outOfBound):
-		return "Parameter is out of bound";
-	case PREFIX(init_missing):
-		return "Context should be init first";
-	case PREFIX(memory_allocation):
-		return "Allocation error : not enough memory";
-	case PREFIX(workSpace_tooSmall):
-		return "workSpace buffer is not large enough";
-	case PREFIX(stage_wrong):
-		return "Operation not authorized at current processing stage";
-	case PREFIX(tableLog_tooLarge):
-		return "tableLog requires too much memory : unsupported";
-	case PREFIX(maxSymbolValue_tooLarge):
-		return "Unsupported max Symbol Value : too large";
-	case PREFIX(maxSymbolValue_tooSmall):
-		return "Specified maxSymbolValue is too small";
-	case PREFIX(stabilityCondition_notRespected):
-		return "pledged buffer stability condition is not respected";
-	case PREFIX(dictionary_corrupted):
-		return "Dictionary is corrupted";
-	case PREFIX(dictionary_wrong):
-		return "Dictionary mismatch";
-	case PREFIX(dictionaryCreation_failed):
-		return "Cannot create Dictionary from provided samples";
-	case PREFIX(dstSize_tooSmall):
-		return "Destination buffer is too small";
-	case PREFIX(srcSize_wrong):
-		return "Src size is incorrect";
-	case PREFIX(dstBuffer_null):
-		return "Operation on NULL destination buffer";
-	case PREFIX(noForwardProgress_destFull):
-		return "Operation made no progress over multiple calls, due to output buffer being full";
-	case PREFIX(noForwardProgress_inputEmpty):
-		return "Operation made no progress over multiple calls, due to input being empty";
-		/* following error codes are not stable and may be removed or changed in a future version */
-	case PREFIX(frameIndex_tooLarge):
-		return "Frame index is too large";
-	case PREFIX(seekableIO):
-		return "An I/O error occurred when reading/seeking";
-	case PREFIX(dstBuffer_wrong):
-		return "Destination buffer is wrong";
-	case PREFIX(srcBuffer_wrong):
-		return "Source buffer is wrong";
-	case PREFIX(sequenceProducer_failed):
-		return "Block-level external sequence producer returned an error code";
-	case PREFIX(externalSequences_invalid):
-		return "External sequences are not valid";
-	case PREFIX(maxCode):
-	default:
-		return notErrorCode;
-	}
-#endif
-}
diff --git a/src/zstd/error_private.h b/src/zstd/error_private.h
deleted file mode 100644
index 77995295a..000000000
--- a/src/zstd/error_private.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* Note : this module is expected to remain private, do not expose it */
-
-#ifndef ERROR_H_MODULE
-#define ERROR_H_MODULE
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* ****************************************
- *  Dependencies
- ******************************************/
-#include "zstd_errors.h" /* enum list */
-#include "compiler.h"
-#include "debug.h"
-#include "zstd_deps.h" /* size_t */
-
-/* ****************************************
- *  Compiler-specific
- ******************************************/
-#if defined(__GNUC__)
-#define ERR_STATIC static __attribute__((unused))
-#elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
-#define ERR_STATIC static inline
-#elif defined(_MSC_VER)
-#define ERR_STATIC static __inline
-#else
-#define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */
-#endif
-
-/*-****************************************
- *  Customization (error_public.h)
- ******************************************/
-typedef ZSTD_ErrorCode ERR_enum;
-#define PREFIX(name) ZSTD_error_##name
-
-/*-****************************************
- *  Error codes handling
- ******************************************/
-#undef ERROR /* already defined on Visual Studio */
-#define ERROR(name) ZSTD_ERROR(name)
-#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))
-
-ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
-
-ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) {
-	if(!ERR_isError(code))
-		return (ERR_enum)0;
-	return (ERR_enum)(0 - code);
-}
-
-/* check and forward error code */
-#define CHECK_V_F(e, f) \
-	size_t const e = f; \
-	if(ERR_isError(e))  \
-	return e
-#define CHECK_F(f) \
-	{ CHECK_V_F(_var_err__, f); }
-
-/*-****************************************
- *  Error Strings
- ******************************************/
-
-const char* ERR_getErrorString(ERR_enum code); /* error_private.c */
-
-ERR_STATIC const char* ERR_getErrorName(size_t code) {
-	return ERR_getErrorString(ERR_getErrorCode(code));
-}
-
-/**
- * Ignore: this is an internal helper.
- *
- * This is a helper function to help force C99-correctness during compilation.
- * Under strict compilation modes, variadic macro arguments can't be empty.
- * However, variadic function arguments can be. Using a function therefore lets
- * us statically check that at least one (string) argument was passed,
- * independent of the compilation flags.
- */
-static INLINE_KEYWORD UNUSED_ATTR void _force_has_format_string(const char* format, ...) {
-	(void)format;
-}
-
-/**
- * Ignore: this is an internal helper.
- *
- * We want to force this function invocation to be syntactically correct, but
- * we don't want to force runtime evaluation of its arguments.
- */
-#define _FORCE_HAS_FORMAT_STRING(...)          \
-	if(0) {                                    \
-		_force_has_format_string(__VA_ARGS__); \
-	}
-
-#define ERR_QUOTE(str) #str
-
-/**
- * Return the specified error if the condition evaluates to true.
- *
- * In debug modes, prints additional information.
- * In order to do that (particularly, printing the conditional that failed),
- * this can't just wrap RETURN_ERROR().
- */
-#define RETURN_ERROR_IF(cond, err, ...)                                     \
-	if(cond) {                                                              \
-		RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s",           \
-		       __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
-		_FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                              \
-		RAWLOG(3, ": " __VA_ARGS__);                                        \
-		RAWLOG(3, "\n");                                                    \
-		return ERROR(err);                                                  \
-	}
-
-/**
- * Unconditionally return the specified error.
- *
- * In debug modes, prints additional information.
- */
-#define RETURN_ERROR(err, ...)                                               \
-	do {                                                                     \
-		RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
-		       __FILE__, __LINE__, ERR_QUOTE(ERROR(err)));                   \
-		_FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                               \
-		RAWLOG(3, ": " __VA_ARGS__);                                         \
-		RAWLOG(3, "\n");                                                     \
-		return ERROR(err);                                                   \
-	} while(0);
-
-/**
- * If the provided expression evaluates to an error code, returns that error code.
- *
- * In debug modes, prints additional information.
- */
-#define FORWARD_IF_ERROR(err, ...)                                                  \
-	do {                                                                            \
-		size_t const err_code = (err);                                              \
-		if(ERR_isError(err_code)) {                                                 \
-			RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s",                  \
-			       __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
-			_FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                                  \
-			RAWLOG(3, ": " __VA_ARGS__);                                            \
-			RAWLOG(3, "\n");                                                        \
-			return err_code;                                                        \
-		}                                                                           \
-	} while(0);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* ERROR_H_MODULE */
diff --git a/src/zstd/fse_compress.c b/src/zstd/fse_compress.c
deleted file mode 100644
index 3223aad9f..000000000
--- a/src/zstd/fse_compress.c
+++ /dev/null
@@ -1,660 +0,0 @@
-/* ******************************************************************
- * FSE : Finite State Entropy encoder
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
- *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
-
-/* **************************************************************
- *  Includes
- ****************************************************************/
-#include "compiler.h"
-#include "mem.h"   /* U32, U16, etc. */
-#include "debug.h" /* assert, DEBUGLOG */
-#include "hist.h"  /* HIST_count_wksp */
-#include "bitstream.h"
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-#include "error_private.h"
-#define ZSTD_DEPS_NEED_MALLOC
-#define ZSTD_DEPS_NEED_MATH64
-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
-#include "bits.h"      /* ZSTD_highbit32 */
-
-/* **************************************************************
- *  Error Management
- ****************************************************************/
-#define FSE_isError ERR_isError
-
-/* **************************************************************
- *  Templates
- ****************************************************************/
-/*
-  designed to be included
-  for type-specific functions (template emulation in C)
-  Objective is to write these functions only once, for improved maintenance
-*/
-
-/* safety checks */
-#ifndef FSE_FUNCTION_EXTENSION
-#error "FSE_FUNCTION_EXTENSION must be defined"
-#endif
-#ifndef FSE_FUNCTION_TYPE
-#error "FSE_FUNCTION_TYPE must be defined"
-#endif
-
-/* Function names */
-#define FSE_CAT(X, Y) X##Y
-#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
-#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
-
-/* Function templates */
-
-/* FSE_buildCTable_wksp() :
- * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
- * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
- * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
- */
-size_t FSE_buildCTable_wksp(FSE_CTable* ct,
-                            const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
-                            void* workSpace, size_t wkspSize) {
-	U32 const tableSize = 1 << tableLog;
-	U32 const tableMask = tableSize - 1;
-	void* const ptr = ct;
-	U16* const tableU16 = ((U16*)ptr) + 2;
-	void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize >> 1 : 1);
-	FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*)(FSCT);
-	U32 const step = FSE_TABLESTEP(tableSize);
-	U32 const maxSV1 = maxSymbolValue + 1;
-
-	U16* cumul = (U16*)workSpace;                                                      /* size = maxSV1 */
-	FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1 + 1)); /* size = tableSize */
-
-	U32 highThreshold = tableSize - 1;
-
-	assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */
-	if(FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize)
-		return ERROR(tableLog_tooLarge);
-	/* CTable header */
-	tableU16[-2] = (U16)tableLog;
-	tableU16[-1] = (U16)maxSymbolValue;
-	assert(tableLog < 16); /* required for threshold strategy to work */
-
-	/* For explanations on how to distribute symbol values over the table :
-	 * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
-
-#ifdef __clang_analyzer__
-	ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */
-#endif
-
-	/* symbol start positions */
-	{
-		U32 u;
-		cumul[0] = 0;
-		for(u = 1; u <= maxSV1; u++) {
-			if(normalizedCounter[u - 1] == -1) { /* Low proba symbol */
-				cumul[u] = cumul[u - 1] + 1;
-				tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u - 1);
-			} else {
-				assert(normalizedCounter[u - 1] >= 0);
-				cumul[u] = cumul[u - 1] + (U16)normalizedCounter[u - 1];
-				assert(cumul[u] >= cumul[u - 1]); /* no overflow */
-			}
-		}
-		cumul[maxSV1] = (U16)(tableSize + 1);
-	}
-
-	/* Spread symbols */
-	if(highThreshold == tableSize - 1) {
-		/* Case for no low prob count symbols. Lay down 8 bytes at a time
-		 * to reduce branch misses since we are operating on a small block
-		 */
-		BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */
-		{
-			U64 const add = 0x0101010101010101ull;
-			size_t pos = 0;
-			U64 sv = 0;
-			U32 s;
-			for(s = 0; s < maxSV1; ++s, sv += add) {
-				int i;
-				int const n = normalizedCounter[s];
-				MEM_write64(spread + pos, sv);
-				for(i = 8; i < n; i += 8) {
-					MEM_write64(spread + pos + i, sv);
-				}
-				assert(n >= 0);
-				pos += (size_t)n;
-			}
-		}
-		/* Spread symbols across the table. Lack of lowprob symbols means that
-		 * we don't need variable sized inner loop, so we can unroll the loop and
-		 * reduce branch misses.
-		 */
-		{
-			size_t position = 0;
-			size_t s;
-			size_t const unroll = 2;         /* Experimentally determined optimal unroll */
-			assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
-			for(s = 0; s < (size_t)tableSize; s += unroll) {
-				size_t u;
-				for(u = 0; u < unroll; ++u) {
-					size_t const uPosition = (position + (u * step)) & tableMask;
-					tableSymbol[uPosition] = spread[s + u];
-				}
-				position = (position + (unroll * step)) & tableMask;
-			}
-			assert(position == 0); /* Must have initialized all positions */
-		}
-	} else {
-		U32 position = 0;
-		U32 symbol;
-		for(symbol = 0; symbol < maxSV1; symbol++) {
-			int nbOccurrences;
-			int const freq = normalizedCounter[symbol];
-			for(nbOccurrences = 0; nbOccurrences < freq; nbOccurrences++) {
-				tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
-				position = (position + step) & tableMask;
-				while(position > highThreshold)
-					position = (position + step) & tableMask; /* Low proba area */
-			}
-		}
-		assert(position == 0); /* Must have initialized all positions */
-	}
-
-	/* Build table */
-	{
-		U32 u;
-		for(u = 0; u < tableSize; u++) {
-			FSE_FUNCTION_TYPE s = tableSymbol[u];        /* note : static analyzer may not understand tableSymbol is properly initialized */
-			tableU16[cumul[s]++] = (U16)(tableSize + u); /* TableU16 : sorted by symbol order; gives next state value */
-		}
-	}
-
-	/* Build Symbol Transformation Table */
-	{
-		unsigned total = 0;
-		unsigned s;
-		for(s = 0; s <= maxSymbolValue; s++) {
-			switch(normalizedCounter[s]) {
-			case 0:
-				/* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
-				symbolTT[s].deltaNbBits = ((tableLog + 1) << 16) - (1 << tableLog);
-				break;
-
-			case -1:
-			case 1:
-				symbolTT[s].deltaNbBits = (tableLog << 16) - (1 << tableLog);
-				assert(total <= INT_MAX);
-				symbolTT[s].deltaFindState = (int)(total - 1);
-				total++;
-				break;
-			default:
-				assert(normalizedCounter[s] > 1);
-				{
-					U32 const maxBitsOut = tableLog - ZSTD_highbit32((U32)normalizedCounter[s] - 1);
-					U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
-					symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
-					symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
-					total += (unsigned)normalizedCounter[s];
-				}
-			}
-		}
-	}
-
-#if 0 /* debug : symbol costs */
-    DEBUGLOG(5, "\n --- table statistics : ");
-    {   U32 symbol;
-        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
-            DEBUGLOG(5, "%3u: w=%3i,   maxBits=%u, fracBits=%.2f",
-                symbol, normalizedCounter[symbol],
-                FSE_getMaxNbBits(symbolTT, symbol),
-                (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
-    }   }
-#endif
-
-	return 0;
-}
-
-#ifndef FSE_COMMONDEFS_ONLY
-
-/*-**************************************************************
- *  FSE NCount encoding
- ****************************************************************/
-size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) {
-	size_t const maxHeaderSize = (((maxSymbolValue + 1) * tableLog + 4 /* bitCount initialized at 4 */
-	                               + 2 /* first two symbols may use one additional bit each */) /
-	                              8) +
-	                             1 /* round up to whole nb bytes */
-	                             + 2 /* additional two bytes for bitstream flush */;
-	return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */
-}
-
-static size_t
-FSE_writeNCount_generic(void* header, size_t headerBufferSize,
-                        const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
-                        unsigned writeIsSafe) {
-	BYTE* const ostart = (BYTE*)header;
-	BYTE* out = ostart;
-	BYTE* const oend = ostart + headerBufferSize;
-	int nbBits;
-	const int tableSize = 1 << tableLog;
-	int remaining;
-	int threshold;
-	U32 bitStream = 0;
-	int bitCount = 0;
-	unsigned symbol = 0;
-	unsigned const alphabetSize = maxSymbolValue + 1;
-	int previousIs0 = 0;
-
-	/* Table Size */
-	bitStream += (tableLog - FSE_MIN_TABLELOG) << bitCount;
-	bitCount += 4;
-
-	/* Init */
-	remaining = tableSize + 1; /* +1 for extra accuracy */
-	threshold = tableSize;
-	nbBits = tableLog + 1;
-
-	while((symbol < alphabetSize) && (remaining > 1)) { /* stops at 1 */
-		if(previousIs0) {
-			unsigned start = symbol;
-			while((symbol < alphabetSize) && !normalizedCounter[symbol])
-				symbol++;
-			if(symbol == alphabetSize)
-				break; /* incorrect distribution */
-			while(symbol >= start + 24) {
-				start += 24;
-				bitStream += 0xFFFFU << bitCount;
-				if((!writeIsSafe) && (out > oend - 2))
-					return ERROR(dstSize_tooSmall); /* Buffer overflow */
-				out[0] = (BYTE)bitStream;
-				out[1] = (BYTE)(bitStream >> 8);
-				out += 2;
-				bitStream >>= 16;
-			}
-			while(symbol >= start + 3) {
-				start += 3;
-				bitStream += 3 << bitCount;
-				bitCount += 2;
-			}
-			bitStream += (symbol - start) << bitCount;
-			bitCount += 2;
-			if(bitCount > 16) {
-				if((!writeIsSafe) && (out > oend - 2))
-					return ERROR(dstSize_tooSmall); /* Buffer overflow */
-				out[0] = (BYTE)bitStream;
-				out[1] = (BYTE)(bitStream >> 8);
-				out += 2;
-				bitStream >>= 16;
-				bitCount -= 16;
-			}
-		}
-		{
-			int count = normalizedCounter[symbol++];
-			int const max = (2 * threshold - 1) - remaining;
-			remaining -= count < 0 ? -count : count;
-			count++; /* +1 for extra accuracy */
-			if(count >= threshold)
-				count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
-			bitStream += count << bitCount;
-			bitCount += nbBits;
-			bitCount -= (count < max);
-			previousIs0 = (count == 1);
-			if(remaining < 1)
-				return ERROR(GENERIC);
-			while(remaining < threshold) {
-				nbBits--;
-				threshold >>= 1;
-			}
-		}
-		if(bitCount > 16) {
-			if((!writeIsSafe) && (out > oend - 2))
-				return ERROR(dstSize_tooSmall); /* Buffer overflow */
-			out[0] = (BYTE)bitStream;
-			out[1] = (BYTE)(bitStream >> 8);
-			out += 2;
-			bitStream >>= 16;
-			bitCount -= 16;
-		}
-	}
-
-	if(remaining != 1)
-		return ERROR(GENERIC); /* incorrect normalized distribution */
-	assert(symbol <= alphabetSize);
-
-	/* flush remaining bitStream */
-	if((!writeIsSafe) && (out > oend - 2))
-		return ERROR(dstSize_tooSmall); /* Buffer overflow */
-	out[0] = (BYTE)bitStream;
-	out[1] = (BYTE)(bitStream >> 8);
-	out += (bitCount + 7) / 8;
-
-	return (out - ostart);
-}
-
-size_t FSE_writeNCount(void* buffer, size_t bufferSize,
-                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) {
-	if(tableLog > FSE_MAX_TABLELOG)
-		return ERROR(tableLog_tooLarge); /* Unsupported */
-	if(tableLog < FSE_MIN_TABLELOG)
-		return ERROR(GENERIC); /* Unsupported */
-
-	if(bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
-		return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
-
-	return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
-}
-
-/*-**************************************************************
- *  FSE Compression Code
- ****************************************************************/
-
-/* provides the minimum logSize to safely represent a distribution */
-static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) {
-	U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
-	U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
-	U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
-	assert(srcSize > 1); /* Not supported, RLE should be used instead */
-	return minBits;
-}
-
-unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) {
-	U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
-	U32 tableLog = maxTableLog;
-	U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
-	assert(srcSize > 1); /* Not supported, RLE should be used instead */
-	if(tableLog == 0)
-		tableLog = FSE_DEFAULT_TABLELOG;
-	if(maxBitsSrc < tableLog)
-		tableLog = maxBitsSrc; /* Accuracy can be reduced */
-	if(minBits > tableLog)
-		tableLog = minBits; /* Need a minimum to safely represent all symbol values */
-	if(tableLog < FSE_MIN_TABLELOG)
-		tableLog = FSE_MIN_TABLELOG;
-	if(tableLog > FSE_MAX_TABLELOG)
-		tableLog = FSE_MAX_TABLELOG;
-	return tableLog;
-}
-
-unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) {
-	return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
-}
-
-/* Secondary normalization method.
-   To be used when primary method fails. */
-
-static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue, short lowProbCount) {
-	short const NOT_YET_ASSIGNED = -2;
-	U32 s;
-	U32 distributed = 0;
-	U32 ToDistribute;
-
-	/* Init */
-	U32 const lowThreshold = (U32)(total >> tableLog);
-	U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
-
-	for(s = 0; s <= maxSymbolValue; s++) {
-		if(count[s] == 0) {
-			norm[s] = 0;
-			continue;
-		}
-		if(count[s] <= lowThreshold) {
-			norm[s] = lowProbCount;
-			distributed++;
-			total -= count[s];
-			continue;
-		}
-		if(count[s] <= lowOne) {
-			norm[s] = 1;
-			distributed++;
-			total -= count[s];
-			continue;
-		}
-
-		norm[s] = NOT_YET_ASSIGNED;
-	}
-	ToDistribute = (1 << tableLog) - distributed;
-
-	if(ToDistribute == 0)
-		return 0;
-
-	if((total / ToDistribute) > lowOne) {
-		/* risk of rounding to zero */
-		lowOne = (U32)((total * 3) / (ToDistribute * 2));
-		for(s = 0; s <= maxSymbolValue; s++) {
-			if((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
-				norm[s] = 1;
-				distributed++;
-				total -= count[s];
-				continue;
-			}
-		}
-		ToDistribute = (1 << tableLog) - distributed;
-	}
-
-	if(distributed == maxSymbolValue + 1) {
-		/* all values are pretty poor;
-		   probably incompressible data (should have already been detected);
-		   find max, then give all remaining points to max */
-		U32 maxV = 0, maxC = 0;
-		for(s = 0; s <= maxSymbolValue; s++)
-			if(count[s] > maxC) {
-				maxV = s;
-				maxC = count[s];
-			}
-		norm[maxV] += (short)ToDistribute;
-		return 0;
-	}
-
-	if(total == 0) {
-		/* all of the symbols were low enough for the lowOne or lowThreshold */
-		for(s = 0; ToDistribute > 0; s = (s + 1) % (maxSymbolValue + 1))
-			if(norm[s] > 0) {
-				ToDistribute--;
-				norm[s]++;
-			}
-		return 0;
-	}
-
-	{
-		U64 const vStepLog = 62 - tableLog;
-		U64 const mid = (1ULL << (vStepLog - 1)) - 1;
-		U64 const rStep = ZSTD_div64((((U64)1 << vStepLog) * ToDistribute) + mid, (U32)total); /* scale on remaining */
-		U64 tmpTotal = mid;
-		for(s = 0; s <= maxSymbolValue; s++) {
-			if(norm[s] == NOT_YET_ASSIGNED) {
-				U64 const end = tmpTotal + (count[s] * rStep);
-				U32 const sStart = (U32)(tmpTotal >> vStepLog);
-				U32 const sEnd = (U32)(end >> vStepLog);
-				U32 const weight = sEnd - sStart;
-				if(weight < 1)
-					return ERROR(GENERIC);
-				norm[s] = (short)weight;
-				tmpTotal = end;
-			}
-		}
-	}
-
-	return 0;
-}
-
-size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
-                          const unsigned* count, size_t total,
-                          unsigned maxSymbolValue, unsigned useLowProbCount) {
-	/* Sanity checks */
-	if(tableLog == 0)
-		tableLog = FSE_DEFAULT_TABLELOG;
-	if(tableLog < FSE_MIN_TABLELOG)
-		return ERROR(GENERIC); /* Unsupported size */
-	if(tableLog > FSE_MAX_TABLELOG)
-		return ERROR(tableLog_tooLarge); /* Unsupported size */
-	if(tableLog < FSE_minTableLog(total, maxSymbolValue))
-		return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
-
-	{
-		static U32 const rtbTable[] = {0, 473195, 504333, 520860, 550000, 700000, 750000, 830000};
-		short const lowProbCount = useLowProbCount ? -1 : 1;
-		U64 const scale = 62 - tableLog;
-		U64 const step = ZSTD_div64((U64)1 << 62, (U32)total); /* <== here, one division ! */
-		U64 const vStep = 1ULL << (scale - 20);
-		int stillToDistribute = 1 << tableLog;
-		unsigned s;
-		unsigned largest = 0;
-		short largestP = 0;
-		U32 lowThreshold = (U32)(total >> tableLog);
-
-		for(s = 0; s <= maxSymbolValue; s++) {
-			if(count[s] == total)
-				return 0; /* rle special case */
-			if(count[s] == 0) {
-				normalizedCounter[s] = 0;
-				continue;
-			}
-			if(count[s] <= lowThreshold) {
-				normalizedCounter[s] = lowProbCount;
-				stillToDistribute--;
-			} else {
-				short proba = (short)((count[s] * step) >> scale);
-				if(proba < 8) {
-					U64 restToBeat = vStep * rtbTable[proba];
-					proba += (count[s] * step) - ((U64)proba << scale) > restToBeat;
-				}
-				if(proba > largestP) {
-					largestP = proba;
-					largest = s;
-				}
-				normalizedCounter[s] = proba;
-				stillToDistribute -= proba;
-			}
-		}
-		if(-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
-			/* corner case, need another normalization method */
-			size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount);
-			if(FSE_isError(errorCode))
-				return errorCode;
-		} else
-			normalizedCounter[largest] += (short)stillToDistribute;
-	}
-
-#if 0
-    {   /* Print Table (debug) */
-        U32 s;
-        U32 nTotal = 0;
-        for (s=0; s<=maxSymbolValue; s++)
-            RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
-        for (s=0; s<=maxSymbolValue; s++)
-            nTotal += abs(normalizedCounter[s]);
-        if (nTotal != (1U<<tableLog))
-            RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
-        getchar();
-    }
-#endif
-
-	return tableLog;
-}
-
-/* fake FSE_CTable, for rle input (always same symbol) */
-size_t FSE_buildCTable_rle(FSE_CTable* ct, BYTE symbolValue) {
-	void* ptr = ct;
-	U16* tableU16 = ((U16*)ptr) + 2;
-	void* FSCTptr = (U32*)ptr + 2;
-	FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*)FSCTptr;
-
-	/* header */
-	tableU16[-2] = (U16)0;
-	tableU16[-1] = (U16)symbolValue;
-
-	/* Build table */
-	tableU16[0] = 0;
-	tableU16[1] = 0; /* just in case */
-
-	/* Build Symbol Transformation Table */
-	symbolTT[symbolValue].deltaNbBits = 0;
-	symbolTT[symbolValue].deltaFindState = 0;
-
-	return 0;
-}
-
-static size_t FSE_compress_usingCTable_generic(void* dst, size_t dstSize,
-                                               const void* src, size_t srcSize,
-                                               const FSE_CTable* ct, const unsigned fast) {
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* ip = iend;
-
-	BIT_CStream_t bitC;
-	FSE_CState_t CState1, CState2;
-
-	/* init */
-	if(srcSize <= 2)
-		return 0;
-	{
-		size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
-		if(FSE_isError(initError))
-			return 0; /* not enough space available to write a bitstream */
-	}
-
-#define FSE_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
-
-	if(srcSize & 1) {
-		FSE_initCState2(&CState1, ct, *--ip);
-		FSE_initCState2(&CState2, ct, *--ip);
-		FSE_encodeSymbol(&bitC, &CState1, *--ip);
-		FSE_FLUSHBITS(&bitC);
-	} else {
-		FSE_initCState2(&CState2, ct, *--ip);
-		FSE_initCState2(&CState1, ct, *--ip);
-	}
-
-	/* join to mod 4 */
-	srcSize -= 2;
-	if((sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) && (srcSize & 2)) { /* test bit 2 */
-		FSE_encodeSymbol(&bitC, &CState2, *--ip);
-		FSE_encodeSymbol(&bitC, &CState1, *--ip);
-		FSE_FLUSHBITS(&bitC);
-	}
-
-	/* 2 or 4 encoding per loop */
-	while(ip > istart) {
-
-		FSE_encodeSymbol(&bitC, &CState2, *--ip);
-
-		if(sizeof(bitC.bitContainer) * 8 < FSE_MAX_TABLELOG * 2 + 7) /* this test must be static */
-			FSE_FLUSHBITS(&bitC);
-
-		FSE_encodeSymbol(&bitC, &CState1, *--ip);
-
-		if(sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) { /* this test must be static */
-			FSE_encodeSymbol(&bitC, &CState2, *--ip);
-			FSE_encodeSymbol(&bitC, &CState1, *--ip);
-		}
-
-		FSE_FLUSHBITS(&bitC);
-	}
-
-	FSE_flushCState(&bitC, &CState2);
-	FSE_flushCState(&bitC, &CState1);
-	return BIT_closeCStream(&bitC);
-}
-
-size_t FSE_compress_usingCTable(void* dst, size_t dstSize,
-                                const void* src, size_t srcSize,
-                                const FSE_CTable* ct) {
-	unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
-
-	if(fast)
-		return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
-	else
-		return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
-}
-
-size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
-
-#endif /* FSE_COMMONDEFS_ONLY */
diff --git a/src/zstd/fse_decompress.c b/src/zstd/fse_decompress.c
deleted file mode 100644
index d72991640..000000000
--- a/src/zstd/fse_decompress.c
+++ /dev/null
@@ -1,325 +0,0 @@
-/* ******************************************************************
- * FSE : Finite State Entropy decoder
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
- *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
-
-/* **************************************************************
- *  Includes
- ****************************************************************/
-#include "debug.h" /* assert */
-#include "bitstream.h"
-#include "compiler.h"
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-#include "error_private.h"
-#define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"
-#include "bits.h" /* ZSTD_highbit32 */
-
-/* **************************************************************
- *  Error Management
- ****************************************************************/
-#define FSE_isError ERR_isError
-#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */
-
-/* **************************************************************
- *  Templates
- ****************************************************************/
-/*
-  designed to be included
-  for type-specific functions (template emulation in C)
-  Objective is to write these functions only once, for improved maintenance
-*/
-
-/* safety checks */
-#ifndef FSE_FUNCTION_EXTENSION
-#error "FSE_FUNCTION_EXTENSION must be defined"
-#endif
-#ifndef FSE_FUNCTION_TYPE
-#error "FSE_FUNCTION_TYPE must be defined"
-#endif
-
-/* Function names */
-#define FSE_CAT(X, Y) X##Y
-#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
-#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
-
-static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) {
-	void* const tdPtr = dt + 1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
-	FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)(tdPtr);
-	U16* symbolNext = (U16*)workSpace;
-	BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1);
-
-	U32 const maxSV1 = maxSymbolValue + 1;
-	U32 const tableSize = 1 << tableLog;
-	U32 highThreshold = tableSize - 1;
-
-	/* Sanity Checks */
-	if(FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize)
-		return ERROR(maxSymbolValue_tooLarge);
-	if(maxSymbolValue > FSE_MAX_SYMBOL_VALUE)
-		return ERROR(maxSymbolValue_tooLarge);
-	if(tableLog > FSE_MAX_TABLELOG)
-		return ERROR(tableLog_tooLarge);
-
-	/* Init, lay down lowprob symbols */
-	{
-		FSE_DTableHeader DTableH;
-		DTableH.tableLog = (U16)tableLog;
-		DTableH.fastMode = 1;
-		{
-			S16 const largeLimit = (S16)(1 << (tableLog - 1));
-			U32 s;
-			for(s = 0; s < maxSV1; s++) {
-				if(normalizedCounter[s] == -1) {
-					tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
-					symbolNext[s] = 1;
-				} else {
-					if(normalizedCounter[s] >= largeLimit)
-						DTableH.fastMode = 0;
-					symbolNext[s] = normalizedCounter[s];
-				}
-			}
-		}
-		ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
-	}
-
-	/* Spread symbols */
-	if(highThreshold == tableSize - 1) {
-		size_t const tableMask = tableSize - 1;
-		size_t const step = FSE_TABLESTEP(tableSize);
-		/* First lay down the symbols in order.
-		 * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
-		 * misses since small blocks generally have small table logs, so nearly
-		 * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
-		 * our buffer to handle the over-write.
-		 */
-		{
-			U64 const add = 0x0101010101010101ull;
-			size_t pos = 0;
-			U64 sv = 0;
-			U32 s;
-			for(s = 0; s < maxSV1; ++s, sv += add) {
-				int i;
-				int const n = normalizedCounter[s];
-				MEM_write64(spread + pos, sv);
-				for(i = 8; i < n; i += 8) {
-					MEM_write64(spread + pos + i, sv);
-				}
-				pos += n;
-			}
-		}
-		/* Now we spread those positions across the table.
-		 * The benefit of doing it in two stages is that we avoid the
-		 * variable size inner loop, which caused lots of branch misses.
-		 * Now we can run through all the positions without any branch misses.
-		 * We unroll the loop twice, since that is what empirically worked best.
-		 */
-		{
-			size_t position = 0;
-			size_t s;
-			size_t const unroll = 2;
-			assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
-			for(s = 0; s < (size_t)tableSize; s += unroll) {
-				size_t u;
-				for(u = 0; u < unroll; ++u) {
-					size_t const uPosition = (position + (u * step)) & tableMask;
-					tableDecode[uPosition].symbol = spread[s + u];
-				}
-				position = (position + (unroll * step)) & tableMask;
-			}
-			assert(position == 0);
-		}
-	} else {
-		U32 const tableMask = tableSize - 1;
-		U32 const step = FSE_TABLESTEP(tableSize);
-		U32 s, position = 0;
-		for(s = 0; s < maxSV1; s++) {
-			int i;
-			for(i = 0; i < normalizedCounter[s]; i++) {
-				tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
-				position = (position + step) & tableMask;
-				while(position > highThreshold)
-					position = (position + step) & tableMask; /* lowprob area */
-			}
-		}
-		if(position != 0)
-			return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
-	}
-
-	/* Build Decoding table */
-	{
-		U32 u;
-		for(u = 0; u < tableSize; u++) {
-			FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
-			U32 const nextState = symbolNext[symbol]++;
-			tableDecode[u].nbBits = (BYTE)(tableLog - ZSTD_highbit32(nextState));
-			tableDecode[u].newState = (U16)((nextState << tableDecode[u].nbBits) - tableSize);
-		}
-	}
-
-	return 0;
-}
-
-size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) {
-	return FSE_buildDTable_internal(dt, normalizedCounter, maxSymbolValue, tableLog, workSpace, wkspSize);
-}
-
-#ifndef FSE_COMMONDEFS_ONLY
-
-/*-*******************************************************
- *  Decompression (Byte symbols)
- *********************************************************/
-
-FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
-    void* dst, size_t maxDstSize,
-    const void* cSrc, size_t cSrcSize,
-    const FSE_DTable* dt, const unsigned fast) {
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* op = ostart;
-	BYTE* const omax = op + maxDstSize;
-	BYTE* const olimit = omax - 3;
-
-	BIT_DStream_t bitD;
-	FSE_DState_t state1;
-	FSE_DState_t state2;
-
-	/* Init */
-	CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
-
-	FSE_initDState(&state1, &bitD, dt);
-	FSE_initDState(&state2, &bitD, dt);
-
-#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
-
-	/* 4 symbols per loop */
-	for(; (BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) & (op < olimit); op += 4) {
-		op[0] = FSE_GETSYMBOL(&state1);
-
-		if(FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
-			BIT_reloadDStream(&bitD);
-
-		op[1] = FSE_GETSYMBOL(&state2);
-
-		if(FSE_MAX_TABLELOG * 4 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
-		{
-			if(BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) {
-				op += 2;
-				break;
-			}
-		}
-
-		op[2] = FSE_GETSYMBOL(&state1);
-
-		if(FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
-			BIT_reloadDStream(&bitD);
-
-		op[3] = FSE_GETSYMBOL(&state2);
-	}
-
-	/* tail */
-	/* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
-	while(1) {
-		if(op > (omax - 2))
-			return ERROR(dstSize_tooSmall);
-		*op++ = FSE_GETSYMBOL(&state1);
-		if(BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
-			*op++ = FSE_GETSYMBOL(&state2);
-			break;
-		}
-
-		if(op > (omax - 2))
-			return ERROR(dstSize_tooSmall);
-		*op++ = FSE_GETSYMBOL(&state2);
-		if(BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
-			*op++ = FSE_GETSYMBOL(&state1);
-			break;
-		}
-	}
-
-	return op - ostart;
-}
-
-typedef struct {
-	short ncount[FSE_MAX_SYMBOL_VALUE + 1];
-	FSE_DTable dtable[1]; /* Dynamically sized */
-} FSE_DecompressWksp;
-
-FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
-    void* dst, size_t dstCapacity,
-    const void* cSrc, size_t cSrcSize,
-    unsigned maxLog, void* workSpace, size_t wkspSize,
-    int bmi2) {
-	const BYTE* const istart = (const BYTE*)cSrc;
-	const BYTE* ip = istart;
-	unsigned tableLog;
-	unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-	FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
-
-	DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
-	if(wkspSize < sizeof(*wksp))
-		return ERROR(GENERIC);
-
-	/* normal FSE decoding mode */
-	{
-		size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
-		if(FSE_isError(NCountLength))
-			return NCountLength;
-		if(tableLog > maxLog)
-			return ERROR(tableLog_tooLarge);
-		assert(NCountLength <= cSrcSize);
-		ip += NCountLength;
-		cSrcSize -= NCountLength;
-	}
-
-	if(FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize)
-		return ERROR(tableLog_tooLarge);
-	assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
-	workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
-	wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
-
-	CHECK_F(FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize));
-
-	{
-		const void* ptr = wksp->dtable;
-		const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
-		const U32 fastMode = DTableH->fastMode;
-
-		/* select fast mode (static) */
-		if(fastMode)
-			return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1);
-		return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0);
-	}
-}
-
-/* Avoids the FORCE_INLINE of the _body() function. */
-static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) {
-	return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0);
-}
-
-#if DYNAMIC_BMI2
-BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) {
-	return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1);
-}
-#endif
-
-size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2) {
-#if DYNAMIC_BMI2
-	if(bmi2) {
-		return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
-	}
-#endif
-	(void)bmi2;
-	return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
-}
-
-#endif /* FSE_COMMONDEFS_ONLY */
diff --git a/src/zstd/hist.c b/src/zstd/hist.c
deleted file mode 100644
index 1dbbea604..000000000
--- a/src/zstd/hist.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/* ******************************************************************
- * hist : Histogram functions
- * part of Finite State Entropy project
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
- *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
-
-/* --- dependencies --- */
-#include "mem.h"           /* U32, BYTE, etc. */
-#include "debug.h"         /* assert, DEBUGLOG */
-#include "error_private.h" /* ERROR */
-#include "hist.h"
-
-/* --- Error management --- */
-unsigned HIST_isError(size_t code) { return ERR_isError(code); }
-
-/*-**************************************************************
- *  Histogram functions
- ****************************************************************/
-unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
-                           const void* src, size_t srcSize) {
-	const BYTE* ip = (const BYTE*)src;
-	const BYTE* const end = ip + srcSize;
-	unsigned maxSymbolValue = *maxSymbolValuePtr;
-	unsigned largestCount = 0;
-
-	ZSTD_memset(count, 0, (maxSymbolValue + 1) * sizeof(*count));
-	if(srcSize == 0) {
-		*maxSymbolValuePtr = 0;
-		return 0;
-	}
-
-	while(ip < end) {
-		assert(*ip <= maxSymbolValue);
-		count[*ip++]++;
-	}
-
-	while(!count[maxSymbolValue])
-		maxSymbolValue--;
-	*maxSymbolValuePtr = maxSymbolValue;
-
-	{
-		U32 s;
-		for(s = 0; s <= maxSymbolValue; s++)
-			if(count[s] > largestCount)
-				largestCount = count[s];
-	}
-
-	return largestCount;
-}
-
-typedef enum { trustInput,
-	           checkMaxSymbolValue } HIST_checkInput_e;
-
-/* HIST_count_parallel_wksp() :
- * store histogram into 4 intermediate tables, recombined at the end.
- * this design makes better use of OoO cpus,
- * and is noticeably faster when some values are heavily repeated.
- * But it needs some additional workspace for intermediate tables.
- * `workSpace` must be a U32 table of size >= HIST_WKSP_SIZE_U32.
- * @return : largest histogram frequency,
- *           or an error code (notably when histogram's alphabet is larger than *maxSymbolValuePtr) */
-static size_t HIST_count_parallel_wksp(
-    unsigned* count, unsigned* maxSymbolValuePtr,
-    const void* source, size_t sourceSize,
-    HIST_checkInput_e check,
-    U32* const workSpace) {
-	const BYTE* ip = (const BYTE*)source;
-	const BYTE* const iend = ip + sourceSize;
-	size_t const countSize = (*maxSymbolValuePtr + 1) * sizeof(*count);
-	unsigned max = 0;
-	U32* const Counting1 = workSpace;
-	U32* const Counting2 = Counting1 + 256;
-	U32* const Counting3 = Counting2 + 256;
-	U32* const Counting4 = Counting3 + 256;
-
-	/* safety checks */
-	assert(*maxSymbolValuePtr <= 255);
-	if(!sourceSize) {
-		ZSTD_memset(count, 0, countSize);
-		*maxSymbolValuePtr = 0;
-		return 0;
-	}
-	ZSTD_memset(workSpace, 0, 4 * 256 * sizeof(unsigned));
-
-	/* by stripes of 16 bytes */
-	{
-		U32 cached = MEM_read32(ip);
-		ip += 4;
-		while(ip < iend - 15) {
-			U32 c = cached;
-			cached = MEM_read32(ip);
-			ip += 4;
-			Counting1[(BYTE)c]++;
-			Counting2[(BYTE)(c >> 8)]++;
-			Counting3[(BYTE)(c >> 16)]++;
-			Counting4[c >> 24]++;
-			c = cached;
-			cached = MEM_read32(ip);
-			ip += 4;
-			Counting1[(BYTE)c]++;
-			Counting2[(BYTE)(c >> 8)]++;
-			Counting3[(BYTE)(c >> 16)]++;
-			Counting4[c >> 24]++;
-			c = cached;
-			cached = MEM_read32(ip);
-			ip += 4;
-			Counting1[(BYTE)c]++;
-			Counting2[(BYTE)(c >> 8)]++;
-			Counting3[(BYTE)(c >> 16)]++;
-			Counting4[c >> 24]++;
-			c = cached;
-			cached = MEM_read32(ip);
-			ip += 4;
-			Counting1[(BYTE)c]++;
-			Counting2[(BYTE)(c >> 8)]++;
-			Counting3[(BYTE)(c >> 16)]++;
-			Counting4[c >> 24]++;
-		}
-		ip -= 4;
-	}
-
-	/* finish last symbols */
-	while(ip < iend)
-		Counting1[*ip++]++;
-
-	{
-		U32 s;
-		for(s = 0; s < 256; s++) {
-			Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
-			if(Counting1[s] > max)
-				max = Counting1[s];
-		}
-	}
-
-	{
-		unsigned maxSymbolValue = 255;
-		while(!Counting1[maxSymbolValue])
-			maxSymbolValue--;
-		if(check && maxSymbolValue > *maxSymbolValuePtr)
-			return ERROR(maxSymbolValue_tooSmall);
-		*maxSymbolValuePtr = maxSymbolValue;
-		ZSTD_memmove(count, Counting1, countSize); /* in case count & Counting1 are overlapping */
-	}
-	return (size_t)max;
-}
-
-/* HIST_countFast_wksp() :
- * Same as HIST_countFast(), but using an externally provided scratch buffer.
- * `workSpace` is a writable buffer which must be 4-bytes aligned,
- * `workSpaceSize` must be >= HIST_WKSP_SIZE
- */
-size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
-                           const void* source, size_t sourceSize,
-                           void* workSpace, size_t workSpaceSize) {
-	if(sourceSize < 1500) /* heuristic threshold */
-		return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
-	if((size_t)workSpace & 3)
-		return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
-	if(workSpaceSize < HIST_WKSP_SIZE)
-		return ERROR(workSpace_tooSmall);
-	return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace);
-}
-
-/* HIST_count_wksp() :
- * Same as HIST_count(), but using an externally provided scratch buffer.
- * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */
-size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
-                       const void* source, size_t sourceSize,
-                       void* workSpace, size_t workSpaceSize) {
-	if((size_t)workSpace & 3)
-		return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
-	if(workSpaceSize < HIST_WKSP_SIZE)
-		return ERROR(workSpace_tooSmall);
-	if(*maxSymbolValuePtr < 255)
-		return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace);
-	*maxSymbolValuePtr = 255;
-	return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize);
-}
-
-#ifndef ZSTD_NO_UNUSED_FUNCTIONS
-/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
-size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
-                      const void* source, size_t sourceSize) {
-	unsigned tmpCounters[HIST_WKSP_SIZE_U32];
-	return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters));
-}
-
-size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
-                  const void* src, size_t srcSize) {
-	unsigned tmpCounters[HIST_WKSP_SIZE_U32];
-	return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters));
-}
-#endif
diff --git a/src/zstd/huf_compress.c b/src/zstd/huf_compress.c
deleted file mode 100644
index 67b9d5818..000000000
--- a/src/zstd/huf_compress.c
+++ /dev/null
@@ -1,1490 +0,0 @@
-/* ******************************************************************
- * Huffman encoder, part of New Generation Entropy library
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
- *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
-
-/* **************************************************************
- *  Compiler specifics
- ****************************************************************/
-#ifdef _MSC_VER                 /* Visual Studio */
-#pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
-#endif
-
-/* **************************************************************
- *  Includes
- ****************************************************************/
-#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */
-#include "compiler.h"
-#include "bitstream.h"
-#include "hist.h"
-#define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */
-#include "fse.h"                /* header compression */
-#include "huf.h"
-#include "error_private.h"
-#include "bits.h" /* ZSTD_highbit32 */
-
-/* **************************************************************
- *  Error Management
- ****************************************************************/
-#define HUF_isError ERR_isError
-#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */
-
-/* **************************************************************
- *  Required declarations
- ****************************************************************/
-typedef struct nodeElt_s {
-	U32 count;
-	U16 parent;
-	BYTE byte;
-	BYTE nbBits;
-} nodeElt;
-
-/* **************************************************************
- *  Debug Traces
- ****************************************************************/
-
-#if DEBUGLEVEL >= 2
-
-static size_t showU32(const U32* arr, size_t size) {
-	size_t u;
-	for(u = 0; u < size; u++) {
-		RAWLOG(6, " %u", arr[u]);
-		(void)arr;
-	}
-	RAWLOG(6, " \n");
-	return size;
-}
-
-static size_t HUF_getNbBits(HUF_CElt elt);
-
-static size_t showCTableBits(const HUF_CElt* ctable, size_t size) {
-	size_t u;
-	for(u = 0; u < size; u++) {
-		RAWLOG(6, " %zu", HUF_getNbBits(ctable[u]));
-		(void)ctable;
-	}
-	RAWLOG(6, " \n");
-	return size;
-}
-
-static size_t showHNodeSymbols(const nodeElt* hnode, size_t size) {
-	size_t u;
-	for(u = 0; u < size; u++) {
-		RAWLOG(6, " %u", hnode[u].byte);
-		(void)hnode;
-	}
-	RAWLOG(6, " \n");
-	return size;
-}
-
-static size_t showHNodeBits(const nodeElt* hnode, size_t size) {
-	size_t u;
-	for(u = 0; u < size; u++) {
-		RAWLOG(6, " %u", hnode[u].nbBits);
-		(void)hnode;
-	}
-	RAWLOG(6, " \n");
-	return size;
-}
-
-#endif
-
-/* *******************************************************
- *  HUF : Huffman block compression
- *********************************************************/
-#define HUF_WORKSPACE_MAX_ALIGNMENT 8
-
-static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align) {
-	size_t const mask = align - 1;
-	size_t const rem = (size_t)workspace & mask;
-	size_t const add = (align - rem) & mask;
-	BYTE* const aligned = (BYTE*)workspace + add;
-	assert((align & (align - 1)) == 0); /* pow 2 */
-	assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
-	if(*workspaceSizePtr >= add) {
-		assert(add < align);
-		assert(((size_t)aligned & mask) == 0);
-		*workspaceSizePtr -= add;
-		return aligned;
-	} else {
-		*workspaceSizePtr = 0;
-		return NULL;
-	}
-}
-
-/* HUF_compressWeights() :
- * Same as FSE_compress(), but dedicated to huff0's weights compression.
- * The use case needs much less stack memory.
- * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
- */
-#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
-
-typedef struct {
-	FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
-	U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)];
-	unsigned count[HUF_TABLELOG_MAX + 1];
-	S16 norm[HUF_TABLELOG_MAX + 1];
-} HUF_CompressWeightsWksp;
-
-static size_t
-HUF_compressWeights(void* dst, size_t dstSize,
-                    const void* weightTable, size_t wtSize,
-                    void* workspace, size_t workspaceSize) {
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* op = ostart;
-	BYTE* const oend = ostart + dstSize;
-
-	unsigned maxSymbolValue = HUF_TABLELOG_MAX;
-	U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
-	HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
-
-	if(workspaceSize < sizeof(HUF_CompressWeightsWksp))
-		return ERROR(GENERIC);
-
-	/* init conditions */
-	if(wtSize <= 1)
-		return 0; /* Not compressible */
-
-	/* Scan input and build symbol stats */
-	{
-		unsigned const maxCount = HIST_count_simple(wksp->count, &maxSymbolValue, weightTable, wtSize); /* never fails */
-		if(maxCount == wtSize)
-			return 1; /* only a single symbol in src : rle */
-		if(maxCount == 1)
-			return 0; /* each symbol present maximum once => not compressible */
-	}
-
-	tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
-	CHECK_F(FSE_normalizeCount(wksp->norm, tableLog, wksp->count, wtSize, maxSymbolValue, /* useLowProbCount */ 0));
-
-	/* Write table description header */
-	{
-		CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, maxSymbolValue, tableLog));
-		op += hSize;
-	}
-
-	/* Compress */
-	CHECK_F(FSE_buildCTable_wksp(wksp->CTable, wksp->norm, maxSymbolValue, tableLog, wksp->scratchBuffer, sizeof(wksp->scratchBuffer)));
-	{
-		CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, wksp->CTable));
-		if(cSize == 0)
-			return 0; /* not enough space for compressed data */
-		op += cSize;
-	}
-
-	return (size_t)(op - ostart);
-}
-
-static size_t HUF_getNbBits(HUF_CElt elt) {
-	return elt & 0xFF;
-}
-
-static size_t HUF_getNbBitsFast(HUF_CElt elt) {
-	return elt;
-}
-
-static size_t HUF_getValue(HUF_CElt elt) {
-	return elt & ~(size_t)0xFF;
-}
-
-static size_t HUF_getValueFast(HUF_CElt elt) {
-	return elt;
-}
-
-static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) {
-	assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
-	*elt = nbBits;
-}
-
-static void HUF_setValue(HUF_CElt* elt, size_t value) {
-	size_t const nbBits = HUF_getNbBits(*elt);
-	if(nbBits > 0) {
-		assert((value >> nbBits) == 0);
-		*elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
-	}
-}
-
-typedef struct {
-	HUF_CompressWeightsWksp wksp;
-	BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */
-	BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
-} HUF_WriteCTableWksp;
-
-size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
-                            const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
-                            void* workspace, size_t workspaceSize) {
-	HUF_CElt const * const ct = CTable + 1;
-	BYTE* op = (BYTE*)dst;
-	U32 n;
-	HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
-
-	HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
-
-	/* check conditions */
-	if(workspaceSize < sizeof(HUF_WriteCTableWksp))
-		return ERROR(GENERIC);
-	if(maxSymbolValue > HUF_SYMBOLVALUE_MAX)
-		return ERROR(maxSymbolValue_tooLarge);
-
-	/* convert to weight */
-	wksp->bitsToWeight[0] = 0;
-	for(n = 1; n < huffLog + 1; n++)
-		wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
-	for(n = 0; n < maxSymbolValue; n++)
-		wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];
-
-	/* attempt weights compression by FSE */
-	if(maxDstSize < 1)
-		return ERROR(dstSize_tooSmall);
-	{
-		CHECK_V_F(hSize, HUF_compressWeights(op + 1, maxDstSize - 1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)));
-		if((hSize > 1) & (hSize < maxSymbolValue / 2)) { /* FSE compressed */
-			op[0] = (BYTE)hSize;
-			return hSize + 1;
-		}
-	}
-
-	/* write raw values as 4-bits (max : 15) */
-	if(maxSymbolValue > (256 - 128))
-		return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */
-	if(((maxSymbolValue + 1) / 2) + 1 > maxDstSize)
-		return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */
-	op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue - 1));
-	wksp->huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */
-	for(n = 0; n < maxSymbolValue; n += 2)
-		op[(n / 2) + 1] = (BYTE)((wksp->huffWeight[n] << 4) + wksp->huffWeight[n + 1]);
-	return ((maxSymbolValue + 1) / 2) + 1;
-}
-
-size_t HUF_readCTable(HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) {
-	BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];  /* init not required, even though some static analyzer may complain */
-	U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
-	U32 tableLog = 0;
-	U32 nbSymbols = 0;
-	HUF_CElt* const ct = CTable + 1;
-
-	/* get symbol weights */
-	CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize));
-	*hasZeroWeights = (rankVal[0] > 0);
-
-	/* check result */
-	if(tableLog > HUF_TABLELOG_MAX)
-		return ERROR(tableLog_tooLarge);
-	if(nbSymbols > *maxSymbolValuePtr + 1)
-		return ERROR(maxSymbolValue_tooSmall);
-
-	CTable[0] = tableLog;
-
-	/* Prepare base value per rank */
-	{
-		U32 n, nextRankStart = 0;
-		for(n = 1; n <= tableLog; n++) {
-			U32 curr = nextRankStart;
-			nextRankStart += (rankVal[n] << (n - 1));
-			rankVal[n] = curr;
-		}
-	}
-
-	/* fill nbBits */
-	{
-		U32 n;
-		for(n = 0; n < nbSymbols; n++) {
-			const U32 w = huffWeight[n];
-			HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
-		}
-	}
-
-	/* fill val */
-	{
-		U16 nbPerRank[HUF_TABLELOG_MAX + 2] = {0}; /* support w=0=>n=tableLog+1 */
-		U16 valPerRank[HUF_TABLELOG_MAX + 2] = {0};
-		{
-			U32 n;
-			for(n = 0; n < nbSymbols; n++)
-				nbPerRank[HUF_getNbBits(ct[n])]++;
-		}
-		/* determine stating value per rank */
-		valPerRank[tableLog + 1] = 0; /* for w==0 */
-		{
-			U16 min = 0;
-			U32 n;
-			for(n = tableLog; n > 0; n--) { /* start at n=tablelog <-> w=1 */
-				valPerRank[n] = min;        /* get starting value within each rank */
-				min += nbPerRank[n];
-				min >>= 1;
-			}
-		}
-		/* assign value within rank, symbol order */
-		{
-			U32 n;
-			for(n = 0; n < nbSymbols; n++)
-				HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++);
-		}
-	}
-
-	*maxSymbolValuePtr = nbSymbols - 1;
-	return readSize;
-}
-
-U32 HUF_getNbBitsFromCTable(HUF_CElt const * CTable, U32 symbolValue) {
-	const HUF_CElt* const ct = CTable + 1;
-	assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
-	return (U32)HUF_getNbBits(ct[symbolValue]);
-}
-
-/**
- * HUF_setMaxHeight():
- * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
- *
- * It attempts to convert all nodes with nbBits > @targetNbBits
- * to employ @targetNbBits instead. Then it adjusts the tree
- * so that it remains a valid canonical Huffman tree.
- *
- * @pre               The sum of the ranks of each symbol == 2^largestBits,
- *                    where largestBits == huffNode[lastNonNull].nbBits.
- * @post              The sum of the ranks of each symbol == 2^largestBits,
- *                    where largestBits is the return value (expected <= targetNbBits).
- *
- * @param huffNode    The Huffman tree modified in place to enforce targetNbBits.
- *                    It's presumed sorted, from most frequent to rarest symbol.
- * @param lastNonNull The symbol with the lowest count in the Huffman tree.
- * @param targetNbBits  The allowed number of bits, which the Huffman tree
- *                    may not respect. After this function the Huffman tree will
- *                    respect targetNbBits.
- * @return            The maximum number of bits of the Huffman tree after adjustment.
- */
-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) {
-	const U32 largestBits = huffNode[lastNonNull].nbBits;
-	/* early exit : no elt > targetNbBits, so the tree is already valid. */
-	if(largestBits <= targetNbBits)
-		return largestBits;
-
-	DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);
-
-	/* there are several too large elements (at least >= 2) */
-	{
-		int totalCost = 0;
-		const U32 baseCost = 1 << (largestBits - targetNbBits);
-		int n = (int)lastNonNull;
-
-		/* Adjust any ranks > targetNbBits to targetNbBits.
-		 * Compute totalCost, which is how far the sum of the ranks is
-		 * we are over 2^largestBits after adjust the offending ranks.
-		 */
-		while(huffNode[n].nbBits > targetNbBits) {
-			totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
-			huffNode[n].nbBits = (BYTE)targetNbBits;
-			n--;
-		}
-		/* n stops at huffNode[n].nbBits <= targetNbBits */
-		assert(huffNode[n].nbBits <= targetNbBits);
-		/* n end at index of smallest symbol using < targetNbBits */
-		while(huffNode[n].nbBits == targetNbBits)
-			--n;
-
-		/* renorm totalCost from 2^largestBits to 2^targetNbBits
-		 * note : totalCost is necessarily a multiple of baseCost */
-		assert(((U32)totalCost & (baseCost - 1)) == 0);
-		totalCost >>= (largestBits - targetNbBits);
-		assert(totalCost > 0);
-
-		/* repay normalized cost */
-		{
-			U32 const noSymbol = 0xF0F0F0F0;
-			U32 rankLast[HUF_TABLELOG_MAX + 2];
-
-			/* Get pos of last (smallest = lowest cum. count) symbol per rank */
-			ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
-			{
-				U32 currentNbBits = targetNbBits;
-				int pos;
-				for(pos = n; pos >= 0; pos--) {
-					if(huffNode[pos].nbBits >= currentNbBits)
-						continue;
-					currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */
-					rankLast[targetNbBits - currentNbBits] = (U32)pos;
-				}
-			}
-
-			while(totalCost > 0) {
-				/* Try to reduce the next power of 2 above totalCost because we
-				 * gain back half the rank.
-				 */
-				U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
-				for(; nBitsToDecrease > 1; nBitsToDecrease--) {
-					U32 const highPos = rankLast[nBitsToDecrease];
-					U32 const lowPos = rankLast[nBitsToDecrease - 1];
-					if(highPos == noSymbol)
-						continue;
-					/* Decrease highPos if no symbols of lowPos or if it is
-					 * not cheaper to remove 2 lowPos than highPos.
-					 */
-					if(lowPos == noSymbol)
-						break;
-					{
-						U32 const highTotal = huffNode[highPos].count;
-						U32 const lowTotal = 2 * huffNode[lowPos].count;
-						if(highTotal <= lowTotal)
-							break;
-					}
-				}
-				/* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
-				assert(rankLast[nBitsToDecrease] != noSymbol || nBitsToDecrease == 1);
-				/* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
-				while((nBitsToDecrease <= HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
-					nBitsToDecrease++;
-				assert(rankLast[nBitsToDecrease] != noSymbol);
-				/* Increase the number of bits to gain back half the rank cost. */
-				totalCost -= 1 << (nBitsToDecrease - 1);
-				huffNode[rankLast[nBitsToDecrease]].nbBits++;
-
-				/* Fix up the new rank.
-				 * If the new rank was empty, this symbol is now its smallest.
-				 * Otherwise, this symbol will be the largest in the new rank so no adjustment.
-				 */
-				if(rankLast[nBitsToDecrease - 1] == noSymbol)
-					rankLast[nBitsToDecrease - 1] = rankLast[nBitsToDecrease];
-				/* Fix up the old rank.
-				 * If the symbol was at position 0, meaning it was the highest weight symbol in the tree,
-				 * it must be the only symbol in its rank, so the old rank now has no symbols.
-				 * Otherwise, since the Huffman nodes are sorted by count, the previous position is now
-				 * the smallest node in the rank. If the previous position belongs to a different rank,
-				 * then the rank is now empty.
-				 */
-				if(rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */
-					rankLast[nBitsToDecrease] = noSymbol;
-				else {
-					rankLast[nBitsToDecrease]--;
-					if(huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits - nBitsToDecrease)
-						rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */
-				}
-			} /* while (totalCost > 0) */
-
-			/* If we've removed too much weight, then we have to add it back.
-			 * To avoid overshooting again, we only adjust the smallest rank.
-			 * We take the largest nodes from the lowest rank 0 and move them
-			 * to rank 1. There's guaranteed to be enough rank 0 symbols because
-			 * TODO.
-			 */
-			while(totalCost < 0) { /* Sometimes, cost correction overshoot */
-				/* special case : no rank 1 symbol (using targetNbBits-1);
-				 * let's create one from largest rank 0 (using targetNbBits).
-				 */
-				if(rankLast[1] == noSymbol) {
-					while(huffNode[n].nbBits == targetNbBits)
-						n--;
-					huffNode[n + 1].nbBits--;
-					assert(n >= 0);
-					rankLast[1] = (U32)(n + 1);
-					totalCost++;
-					continue;
-				}
-				huffNode[rankLast[1] + 1].nbBits--;
-				rankLast[1]++;
-				totalCost++;
-			}
-		} /* repay normalized cost */
-	}     /* there are several too large elements (at least >= 2) */
-
-	return targetNbBits;
-}
-
-typedef struct {
-	U16 base;
-	U16 curr;
-} rankPos;
-
-typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];
-
-/* Number of buckets available for HUF_sort() */
-#define RANK_POSITION_TABLE_SIZE 192
-
-typedef struct {
-	huffNodeTable huffNodeTbl;
-	rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
-} HUF_buildCTable_wksp_tables;
-
-/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
- * Strategy is to use as many buckets as possible for representing distinct
- * counts while using the remainder to represent all "large" counts.
- *
- * To satisfy this requirement for 192 buckets, we can do the following:
- * Let buckets 0-166 represent distinct counts of [0, 166]
- * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
- */
-#define RANK_POSITION_MAX_COUNT_LOG 32
-#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)
-
-/* Return the appropriate bucket index for a given count. See definition of
- * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
- */
-static U32 HUF_getIndex(U32 const count) {
-	return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
-	           ? count
-	           : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
-}
-
-/* Helper swap function for HUF_quickSortPartition() */
-static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
-	nodeElt tmp = *a;
-	*a = *b;
-	*b = tmp;
-}
-
-/* Returns 0 if the huffNode array is not sorted by descending count */
-MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) {
-	U32 i;
-	for(i = 1; i < maxSymbolValue1; ++i) {
-		if(huffNode[i].count > huffNode[i - 1].count) {
-			return 0;
-		}
-	}
-	return 1;
-}
-
-/* Insertion sort by descending order */
-HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) {
-	int i;
-	int const size = high - low + 1;
-	huffNode += low;
-	for(i = 1; i < size; ++i) {
-		nodeElt const key = huffNode[i];
-		int j = i - 1;
-		while(j >= 0 && huffNode[j].count < key.count) {
-			huffNode[j + 1] = huffNode[j];
-			j--;
-		}
-		huffNode[j + 1] = key;
-	}
-}
-
-/* Pivot helper function for quicksort. */
-static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) {
-	/* Simply select rightmost element as pivot. "Better" selectors like
-	 * median-of-three don't experimentally appear to have any benefit.
-	 */
-	U32 const pivot = arr[high].count;
-	int i = low - 1;
-	int j = low;
-	for(; j < high; j++) {
-		if(arr[j].count > pivot) {
-			i++;
-			HUF_swapNodes(&arr[i], &arr[j]);
-		}
-	}
-	HUF_swapNodes(&arr[i + 1], &arr[high]);
-	return i + 1;
-}
-
-/* Classic quicksort by descending with partially iterative calls
- * to reduce worst case callstack size.
- */
-static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
-	int const kInsertionSortThreshold = 8;
-	if(high - low < kInsertionSortThreshold) {
-		HUF_insertionSort(arr, low, high);
-		return;
-	}
-	while(low < high) {
-		int const idx = HUF_quickSortPartition(arr, low, high);
-		if(idx - low < high - idx) {
-			HUF_simpleQuickSort(arr, low, idx - 1);
-			low = idx + 1;
-		} else {
-			HUF_simpleQuickSort(arr, idx + 1, high);
-			high = idx - 1;
-		}
-	}
-}
-
-/**
- * HUF_sort():
- * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
- * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
- *
- * @param[out] huffNode       Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
- *                            Must have (maxSymbolValue + 1) entries.
- * @param[in]  count          Histogram of the symbols.
- * @param[in]  maxSymbolValue Maximum symbol value.
- * @param      rankPosition   This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
- */
-static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
-	U32 n;
-	U32 const maxSymbolValue1 = maxSymbolValue + 1;
-
-	/* Compute base and set curr to base.
-	 * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
-	 * See HUF_getIndex to see bucketing strategy.
-	 * We attribute each symbol to lowerRank's base value, because we want to know where
-	 * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
-	 */
-	ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
-	for(n = 0; n < maxSymbolValue1; ++n) {
-		U32 lowerRank = HUF_getIndex(count[n]);
-		assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1);
-		rankPosition[lowerRank].base++;
-	}
-
-	assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
-	/* Set up the rankPosition table */
-	for(n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {
-		rankPosition[n - 1].base += rankPosition[n].base;
-		rankPosition[n - 1].curr = rankPosition[n - 1].base;
-	}
-
-	/* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
-	for(n = 0; n < maxSymbolValue1; ++n) {
-		U32 const c = count[n];
-		U32 const r = HUF_getIndex(c) + 1;
-		U32 const pos = rankPosition[r].curr++;
-		assert(pos < maxSymbolValue1);
-		huffNode[pos].count = c;
-		huffNode[pos].byte = (BYTE)n;
-	}
-
-	/* Sort each bucket. */
-	for(n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
-		int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
-		U32 const bucketStartIdx = rankPosition[n].base;
-		if(bucketSize > 1) {
-			assert(bucketStartIdx < maxSymbolValue1);
-			HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize - 1);
-		}
-	}
-
-	assert(HUF_isSorted(huffNode, maxSymbolValue1));
-}
-
-/** HUF_buildCTable_wksp() :
- *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
- *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
- */
-#define STARTNODE (HUF_SYMBOLVALUE_MAX + 1)
-
-/* HUF_buildTree():
- * Takes the huffNode array sorted by HUF_sort() and builds an unlimited-depth Huffman tree.
- *
- * @param huffNode        The array sorted by HUF_sort(). Builds the Huffman tree in this array.
- * @param maxSymbolValue  The maximum symbol value.
- * @return                The smallest node in the Huffman tree (by count).
- */
-static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) {
-	nodeElt* const huffNode0 = huffNode - 1;
-	int nonNullRank;
-	int lowS, lowN;
-	int nodeNb = STARTNODE;
-	int n, nodeRoot;
-	DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
-	/* init for parents */
-	nonNullRank = (int)maxSymbolValue;
-	while(huffNode[nonNullRank].count == 0)
-		nonNullRank--;
-	lowS = nonNullRank;
-	nodeRoot = nodeNb + lowS - 1;
-	lowN = nodeNb;
-	huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS - 1].count;
-	huffNode[lowS].parent = huffNode[lowS - 1].parent = (U16)nodeNb;
-	nodeNb++;
-	lowS -= 2;
-	for(n = nodeNb; n <= nodeRoot; n++)
-		huffNode[n].count = (U32)(1U << 30);
-	huffNode0[0].count = (U32)(1U << 31); /* fake entry, strong barrier */
-
-	/* create parents */
-	while(nodeNb <= nodeRoot) {
-		int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
-		int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
-		huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
-		huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb;
-		nodeNb++;
-	}
-
-	/* distribute weights (unlimited tree height) */
-	huffNode[nodeRoot].nbBits = 0;
-	for(n = nodeRoot - 1; n >= STARTNODE; n--)
-		huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1;
-	for(n = 0; n <= nonNullRank; n++)
-		huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1;
-
-	DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue + 1));
-
-	return nonNullRank;
-}
-
-/**
- * HUF_buildCTableFromTree():
- * Build the CTable given the Huffman tree in huffNode.
- *
- * @param[out] CTable         The output Huffman CTable.
- * @param      huffNode       The Huffman tree.
- * @param      nonNullRank    The last and smallest node in the Huffman tree.
- * @param      maxSymbolValue The maximum symbol value.
- * @param      maxNbBits      The exact maximum number of bits used in the Huffman tree.
- */
-static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const * huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) {
-	HUF_CElt* const ct = CTable + 1;
-	/* fill result into ctable (val, nbBits) */
-	int n;
-	U16 nbPerRank[HUF_TABLELOG_MAX + 1] = {0};
-	U16 valPerRank[HUF_TABLELOG_MAX + 1] = {0};
-	int const alphabetSize = (int)(maxSymbolValue + 1);
-	for(n = 0; n <= nonNullRank; n++)
-		nbPerRank[huffNode[n].nbBits]++;
-	/* determine starting value per rank */
-	{
-		U16 min = 0;
-		for(n = (int)maxNbBits; n > 0; n--) {
-			valPerRank[n] = min; /* get starting value within each rank */
-			min += nbPerRank[n];
-			min >>= 1;
-		}
-	}
-	for(n = 0; n < alphabetSize; n++)
-		HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */
-	for(n = 0; n < alphabetSize; n++)
-		HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */
-	CTable[0] = maxNbBits;
-}
-
-size_t
-HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
-                     void* workSpace, size_t wkspSize) {
-	HUF_buildCTable_wksp_tables* const wksp_tables =
-	    (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
-	nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
-	nodeElt* const huffNode = huffNode0 + 1;
-	int nonNullRank;
-
-	HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
-
-	DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue + 1);
-
-	/* safety checks */
-	if(wkspSize < sizeof(HUF_buildCTable_wksp_tables))
-		return ERROR(workSpace_tooSmall);
-	if(maxNbBits == 0)
-		maxNbBits = HUF_TABLELOG_DEFAULT;
-	if(maxSymbolValue > HUF_SYMBOLVALUE_MAX)
-		return ERROR(maxSymbolValue_tooLarge);
-	ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
-
-	/* sort, decreasing order */
-	HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
-	DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue + 1));
-
-	/* build tree */
-	nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
-
-	/* determine and enforce maxTableLog */
-	maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
-	if(maxNbBits > HUF_TABLELOG_MAX)
-		return ERROR(GENERIC); /* check fit into table */
-
-	HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
-
-	return maxNbBits;
-}
-
-size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
-	HUF_CElt const * ct = CTable + 1;
-	size_t nbBits = 0;
-	int s;
-	for(s = 0; s <= (int)maxSymbolValue; ++s) {
-		nbBits += HUF_getNbBits(ct[s]) * count[s];
-	}
-	return nbBits >> 3;
-}
-
-int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
-	HUF_CElt const * ct = CTable + 1;
-	int bad = 0;
-	int s;
-	for(s = 0; s <= (int)maxSymbolValue; ++s) {
-		bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
-	}
-	return !bad;
-}
-
-size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
-
-/** HUF_CStream_t:
- * Huffman uses its own BIT_CStream_t implementation.
- * There are three major differences from BIT_CStream_t:
- *   1. HUF_addBits() takes a HUF_CElt (size_t) which is
- *      the pair (nbBits, value) in the format:
- *      format:
- *        - Bits [0, 4)            = nbBits
- *        - Bits [4, 64 - nbBits)  = 0
- *        - Bits [64 - nbBits, 64) = value
- *   2. The bitContainer is built from the upper bits and
- *      right shifted. E.g. to add a new value of N bits
- *      you right shift the bitContainer by N, then or in
- *      the new value into the N upper bits.
- *   3. The bitstream has two bit containers. You can add
- *      bits to the second container and merge them into
- *      the first container.
- */
-
-#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
-
-typedef struct {
-	size_t bitContainer[2];
-	size_t bitPos[2];
-
-	BYTE* startPtr;
-	BYTE* ptr;
-	BYTE* endPtr;
-} HUF_CStream_t;
-
-/**! HUF_initCStream():
- * Initializes the bitstream.
- * @returns 0 or an error code.
- */
-static size_t HUF_initCStream(HUF_CStream_t* bitC,
-                              void* startPtr, size_t dstCapacity) {
-	ZSTD_memset(bitC, 0, sizeof(*bitC));
-	bitC->startPtr = (BYTE*)startPtr;
-	bitC->ptr = bitC->startPtr;
-	bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);
-	if(dstCapacity <= sizeof(bitC->bitContainer[0]))
-		return ERROR(dstSize_tooSmall);
-	return 0;
-}
-
-/*! HUF_addBits():
- * Adds the symbol stored in HUF_CElt elt to the bitstream.
- *
- * @param elt   The element we're adding. This is a (nbBits, value) pair.
- *              See the HUF_CStream_t docs for the format.
- * @param idx   Insert into the bitstream at this idx.
- * @param kFast This is a template parameter. If the bitstream is guaranteed
- *              to have at least 4 unused bits after this call it may be 1,
- *              otherwise it must be 0. HUF_addBits() is faster when fast is set.
- */
-FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) {
-	assert(idx <= 1);
-	assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
-	/* This is efficient on x86-64 with BMI2 because shrx
-	 * only reads the low 6 bits of the register. The compiler
-	 * knows this and elides the mask. When fast is set,
-	 * every operation can use the same value loaded from elt.
-	 */
-	bitC->bitContainer[idx] >>= HUF_getNbBits(elt);
-	bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
-	/* We only read the low 8 bits of bitC->bitPos[idx] so it
-	 * doesn't matter that the high bits have noise from the value.
-	 */
-	bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
-	assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
-	/* The last 4-bits of elt are dirty if fast is set,
-	 * so we must not be overwriting bits that have already been
-	 * inserted into the bit container.
-	 */
-#if DEBUGLEVEL >= 1
-	{
-		size_t const nbBits = HUF_getNbBits(elt);
-		size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;
-		(void)dirtyBits;
-		/* Middle bits are 0. */
-		assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
-		/* We didn't overwrite any bits in the bit container. */
-		assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
-		(void)dirtyBits;
-	}
-#endif
-}
-
-FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) {
-	bitC->bitContainer[1] = 0;
-	bitC->bitPos[1] = 0;
-}
-
-/*! HUF_mergeIndex1() :
- * Merges the bit container @ index 1 into the bit container @ index 0
- * and zeros the bit container @ index 1.
- */
-FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) {
-	assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
-	bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);
-	bitC->bitContainer[0] |= bitC->bitContainer[1];
-	bitC->bitPos[0] += bitC->bitPos[1];
-	assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
-}
-
-/*! HUF_flushBits() :
- * Flushes the bits in the bit container @ index 0.
- *
- * @post bitPos will be < 8.
- * @param kFast If kFast is set then we must know a-priori that
- *              the bit container will not overflow.
- */
-FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) {
-	/* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
-	size_t const nbBits = bitC->bitPos[0] & 0xFF;
-	size_t const nbBytes = nbBits >> 3;
-	/* The top nbBits bits of bitContainer are the ones we need. */
-	size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
-	/* Mask bitPos to account for the bytes we consumed. */
-	bitC->bitPos[0] &= 7;
-	assert(nbBits > 0);
-	assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
-	assert(bitC->ptr <= bitC->endPtr);
-	MEM_writeLEST(bitC->ptr, bitContainer);
-	bitC->ptr += nbBytes;
-	assert(!kFast || bitC->ptr <= bitC->endPtr);
-	if(!kFast && bitC->ptr > bitC->endPtr)
-		bitC->ptr = bitC->endPtr;
-	/* bitContainer doesn't need to be modified because the leftover
-	 * bits are already the top bitPos bits. And we don't care about
-	 * noise in the lower values.
-	 */
-}
-
-/*! HUF_endMark()
- * @returns The Huffman stream end mark: A 1-bit value = 1.
- */
-static HUF_CElt HUF_endMark(void) {
-	HUF_CElt endMark;
-	HUF_setNbBits(&endMark, 1);
-	HUF_setValue(&endMark, 1);
-	return endMark;
-}
-
-/*! HUF_closeCStream() :
- *  @return Size of CStream, in bytes,
- *          or 0 if it could not fit into dstBuffer */
-static size_t HUF_closeCStream(HUF_CStream_t* bitC) {
-	HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
-	HUF_flushBits(bitC, /* kFast */ 0);
-	{
-		size_t const nbBits = bitC->bitPos[0] & 0xFF;
-		if(bitC->ptr >= bitC->endPtr)
-			return 0; /* overflow detected */
-		return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
-	}
-}
-
-FORCE_INLINE_TEMPLATE void
-HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) {
-	HUF_addBits(bitCPtr, CTable[symbol], idx, fast);
-}
-
-FORCE_INLINE_TEMPLATE void
-HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
-                                              const BYTE* ip, size_t srcSize,
-                                              const HUF_CElt* ct,
-                                              int kUnroll, int kFastFlush, int kLastFast) {
-	/* Join to kUnroll */
-	int n = (int)srcSize;
-	int rem = n % kUnroll;
-	if(rem > 0) {
-		for(; rem > 0; --rem) {
-			HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
-		}
-		HUF_flushBits(bitC, kFastFlush);
-	}
-	assert(n % kUnroll == 0);
-
-	/* Join to 2 * kUnroll */
-	if(n % (2 * kUnroll)) {
-		int u;
-		for(u = 1; u < kUnroll; ++u) {
-			HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
-		}
-		HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);
-		HUF_flushBits(bitC, kFastFlush);
-		n -= kUnroll;
-	}
-	assert(n % (2 * kUnroll) == 0);
-
-	for(; n > 0; n -= 2 * kUnroll) {
-		/* Encode kUnroll symbols into the bitstream @ index 0. */
-		int u;
-		for(u = 1; u < kUnroll; ++u) {
-			HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
-		}
-		HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
-		HUF_flushBits(bitC, kFastFlush);
-		/* Encode kUnroll symbols into the bitstream @ index 1.
-		 * This allows us to start filling the bit container
-		 * without any data dependencies.
-		 */
-		HUF_zeroIndex1(bitC);
-		for(u = 1; u < kUnroll; ++u) {
-			HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
-		}
-		HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
-		/* Merge bitstream @ index 1 into the bitstream @ index 0 */
-		HUF_mergeIndex1(bitC);
-		HUF_flushBits(bitC, kFastFlush);
-	}
-	assert(n == 0);
-}
-
-/**
- * Returns a tight upper bound on the output space needed by Huffman
- * with 8 bytes buffer to handle over-writes. If the output is at least
- * this large we don't need to do bounds checks during Huffman encoding.
- */
-static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) {
-	return ((srcSize * tableLog) >> 3) + 8;
-}
-
-FORCE_INLINE_TEMPLATE size_t
-HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
-                                         const void* src, size_t srcSize,
-                                         const HUF_CElt* CTable) {
-	U32 const tableLog = (U32)CTable[0];
-	HUF_CElt const * ct = CTable + 1;
-	const BYTE* ip = (const BYTE*)src;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstSize;
-	BYTE* op = ostart;
-	HUF_CStream_t bitC;
-
-	/* init */
-	if(dstSize < 8)
-		return 0; /* not enough space to compress */
-	{
-		size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend - op));
-		if(HUF_isError(initErr))
-			return 0;
-	}
-
-	if(dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
-		HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0);
-	else {
-		if(MEM_32bits()) {
-			switch(tableLog) {
-			case 11:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
-				break;
-			case 10:
-				ZSTD_FALLTHROUGH;
-			case 9:
-				ZSTD_FALLTHROUGH;
-			case 8:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
-				break;
-			case 7:
-				ZSTD_FALLTHROUGH;
-			default:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
-				break;
-			}
-		} else {
-			switch(tableLog) {
-			case 11:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
-				break;
-			case 10:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
-				break;
-			case 9:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
-				break;
-			case 8:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
-				break;
-			case 7:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
-				break;
-			case 6:
-				ZSTD_FALLTHROUGH;
-			default:
-				HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
-				break;
-			}
-		}
-	}
-	assert(bitC.ptr <= bitC.endPtr);
-
-	return HUF_closeCStream(&bitC);
-}
-
-#if DYNAMIC_BMI2
-
-static BMI2_TARGET_ATTRIBUTE size_t
-HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
-                                         const void* src, size_t srcSize,
-                                         const HUF_CElt* CTable) {
-	return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
-}
-
-static size_t
-HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
-                                            const void* src, size_t srcSize,
-                                            const HUF_CElt* CTable) {
-	return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
-}
-
-static size_t
-HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
-                                    const void* src, size_t srcSize,
-                                    const HUF_CElt* CTable, const int flags) {
-	if(flags & HUF_flags_bmi2) {
-		return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
-	}
-	return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
-}
-
-#else
-
-static size_t
-HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
-                                    const void* src, size_t srcSize,
-                                    const HUF_CElt* CTable, const int flags) {
-	(void)flags;
-	return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
-}
-
-#endif
-
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) {
-	return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
-}
-
-static size_t
-HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
-                                    const void* src, size_t srcSize,
-                                    const HUF_CElt* CTable, int flags) {
-	size_t const segmentSize = (srcSize + 3) / 4; /* first 3 segments */
-	const BYTE* ip = (const BYTE*)src;
-	const BYTE* const iend = ip + srcSize;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstSize;
-	BYTE* op = ostart;
-
-	if(dstSize < 6 + 1 + 1 + 1 + 8)
-		return 0; /* minimum space to compress successfully */
-	if(srcSize < 12)
-		return 0; /* no saving possible : too small input */
-	op += 6;      /* jumpTable */
-
-	assert(op <= oend);
-	{
-		CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), ip, segmentSize, CTable, flags));
-		if(cSize == 0 || cSize > 65535)
-			return 0;
-		MEM_writeLE16(ostart, (U16)cSize);
-		op += cSize;
-	}
-
-	ip += segmentSize;
-	assert(op <= oend);
-	{
-		CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), ip, segmentSize, CTable, flags));
-		if(cSize == 0 || cSize > 65535)
-			return 0;
-		MEM_writeLE16(ostart + 2, (U16)cSize);
-		op += cSize;
-	}
-
-	ip += segmentSize;
-	assert(op <= oend);
-	{
-		CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), ip, segmentSize, CTable, flags));
-		if(cSize == 0 || cSize > 65535)
-			return 0;
-		MEM_writeLE16(ostart + 4, (U16)cSize);
-		op += cSize;
-	}
-
-	ip += segmentSize;
-	assert(op <= oend);
-	assert(ip <= iend);
-	{
-		CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), ip, (size_t)(iend - ip), CTable, flags));
-		if(cSize == 0 || cSize > 65535)
-			return 0;
-		op += cSize;
-	}
-
-	return (size_t)(op - ostart);
-}
-
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) {
-	return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
-}
-
-typedef enum { HUF_singleStream,
-	           HUF_fourStreams } HUF_nbStreams_e;
-
-static size_t HUF_compressCTable_internal(
-    BYTE* const ostart, BYTE* op, BYTE* const oend,
-    const void* src, size_t srcSize,
-    HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) {
-	size_t const cSize = (nbStreams == HUF_singleStream) ? HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags);
-	if(HUF_isError(cSize)) {
-		return cSize;
-	}
-	if(cSize == 0) {
-		return 0;
-	} /* uncompressible */
-	op += cSize;
-	/* check compressibility */
-	assert(op >= ostart);
-	if((size_t)(op - ostart) >= srcSize - 1) {
-		return 0;
-	}
-	return (size_t)(op - ostart);
-}
-
-typedef struct {
-	unsigned count[HUF_SYMBOLVALUE_MAX + 1];
-	HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];
-	union {
-		HUF_buildCTable_wksp_tables buildCTable_wksp;
-		HUF_WriteCTableWksp writeCTable_wksp;
-		U32 hist_wksp[HIST_WKSP_SIZE_U32];
-	} wksps;
-} HUF_compress_tables_t;
-
-#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
-#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */
-
-unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) {
-	unsigned cardinality = 0;
-	unsigned i;
-
-	for(i = 0; i < maxSymbolValue + 1; i++) {
-		if(count[i] != 0)
-			cardinality += 1;
-	}
-
-	return cardinality;
-}
-
-unsigned HUF_minTableLog(unsigned symbolCardinality) {
-	U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1;
-	return minBitsSymbols;
-}
-
-unsigned HUF_optimalTableLog(
-    unsigned maxTableLog,
-    size_t srcSize,
-    unsigned maxSymbolValue,
-    void* workSpace, size_t wkspSize,
-    HUF_CElt* table,
-    const unsigned* count,
-    int flags) {
-	assert(srcSize > 1); /* Not supported, RLE should be used instead */
-	assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
-
-	if(!(flags & HUF_flags_optimalDepth)) {
-		/* cheap evaluation, based on FSE */
-		return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
-	}
-
-	{
-		BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
-		size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
-		size_t maxBits, hSize, newSize;
-		const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
-		const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
-		size_t optSize = ((size_t)~0) - 1;
-		unsigned optLog = maxTableLog, optLogGuess;
-
-		DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
-
-		/* Search until size increases */
-		for(optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
-			DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
-			maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
-			if(ERR_isError(maxBits))
-				continue;
-
-			if(maxBits < optLogGuess && optLogGuess > minTableLog)
-				break;
-
-			hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
-
-			if(ERR_isError(hSize))
-				continue;
-
-			newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
-
-			if(newSize > optSize + 1) {
-				break;
-			}
-
-			if(newSize < optSize) {
-				optSize = newSize;
-				optLog = optLogGuess;
-			}
-		}
-		assert(optLog <= HUF_TABLELOG_MAX);
-		return optLog;
-	}
-}
-
-/* HUF_compress_internal() :
- * `workSpace_align4` must be aligned on 4-bytes boundaries,
- * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
-static size_t
-HUF_compress_internal(void* dst, size_t dstSize,
-                      const void* src, size_t srcSize,
-                      unsigned maxSymbolValue, unsigned huffLog,
-                      HUF_nbStreams_e nbStreams,
-                      void* workSpace, size_t wkspSize,
-                      HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) {
-	HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstSize;
-	BYTE* op = ostart;
-
-	DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
-	HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
-
-	/* checks & inits */
-	if(wkspSize < sizeof(*table))
-		return ERROR(workSpace_tooSmall);
-	if(!srcSize)
-		return 0; /* Uncompressed */
-	if(!dstSize)
-		return 0; /* cannot fit anything within dst budget */
-	if(srcSize > HUF_BLOCKSIZE_MAX)
-		return ERROR(srcSize_wrong); /* current block size limit */
-	if(huffLog > HUF_TABLELOG_MAX)
-		return ERROR(tableLog_tooLarge);
-	if(maxSymbolValue > HUF_SYMBOLVALUE_MAX)
-		return ERROR(maxSymbolValue_tooLarge);
-	if(!maxSymbolValue)
-		maxSymbolValue = HUF_SYMBOLVALUE_MAX;
-	if(!huffLog)
-		huffLog = HUF_TABLELOG_DEFAULT;
-
-	/* Heuristic : If old table is valid, use it for small inputs */
-	if((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
-		return HUF_compressCTable_internal(ostart, op, oend,
-		                                   src, srcSize,
-		                                   nbStreams, oldHufTable, flags);
-	}
-
-	/* If uncompressible data is suspected, do a smaller sampling first */
-	DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
-	if((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
-		size_t largestTotal = 0;
-		DEBUGLOG(5, "input suspected incompressible : sampling to check");
-		{
-			unsigned maxSymbolValueBegin = maxSymbolValue;
-			CHECK_V_F(largestBegin, HIST_count_simple(table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE));
-			largestTotal += largestBegin;
-		}
-		{
-			unsigned maxSymbolValueEnd = maxSymbolValue;
-			CHECK_V_F(largestEnd, HIST_count_simple(table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE));
-			largestTotal += largestEnd;
-		}
-		if(largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7) + 4)
-			return 0; /* heuristic : probably not compressible enough */
-	}
-
-	/* Scan input and build symbol stats */
-	{
-		CHECK_V_F(largest, HIST_count_wksp(table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)));
-		if(largest == srcSize) {
-			*ostart = ((const BYTE*)src)[0];
-			return 1;
-		} /* single symbol, rle */
-		if(largest <= (srcSize >> 7) + 4)
-			return 0; /* heuristic : probably not compressible enough */
-	}
-	DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue + 1));
-
-	/* Check validity of previous table */
-	if(repeat && *repeat == HUF_repeat_check && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
-		*repeat = HUF_repeat_none;
-	}
-	/* Heuristic : use existing table for small inputs */
-	if((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
-		return HUF_compressCTable_internal(ostart, op, oend,
-		                                   src, srcSize,
-		                                   nbStreams, oldHufTable, flags);
-	}
-
-	/* Build Huffman Tree */
-	huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
-	{
-		size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
-		                                            maxSymbolValue, huffLog,
-		                                            &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
-		CHECK_F(maxBits);
-		huffLog = (U32)maxBits;
-		DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue + 1));
-	}
-	/* Zero unused symbols in CTable, so we can check it for validity */
-	{
-		size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
-		size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
-		ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
-	}
-
-	/* Write table description header */
-	{
-		CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog,
-		                                      &table->wksps.writeCTable_wksp, sizeof(table->wksps.writeCTable_wksp)));
-		/* Check if using previous huffman table is beneficial */
-		if(repeat && *repeat != HUF_repeat_none) {
-			size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
-			size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
-			if(oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
-				return HUF_compressCTable_internal(ostart, op, oend,
-				                                   src, srcSize,
-				                                   nbStreams, oldHufTable, flags);
-			}
-		}
-
-		/* Use the new huffman table */
-		if(hSize + 12ul >= srcSize) {
-			return 0;
-		}
-		op += hSize;
-		if(repeat) {
-			*repeat = HUF_repeat_none;
-		}
-		if(oldHufTable)
-			ZSTD_memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */
-	}
-	return HUF_compressCTable_internal(ostart, op, oend,
-	                                   src, srcSize,
-	                                   nbStreams, table->CTable, flags);
-}
-
-size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
-                             const void* src, size_t srcSize,
-                             unsigned maxSymbolValue, unsigned huffLog,
-                             void* workSpace, size_t wkspSize,
-                             HUF_CElt* hufTable, HUF_repeat* repeat, int flags) {
-	DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
-	return HUF_compress_internal(dst, dstSize, src, srcSize,
-	                             maxSymbolValue, huffLog, HUF_singleStream,
-	                             workSpace, wkspSize, hufTable,
-	                             repeat, flags);
-}
-
-/* HUF_compress4X_repeat():
- * compress input using 4 streams.
- * consider skipping quickly
- * re-use an existing huffman compression table */
-size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
-                             const void* src, size_t srcSize,
-                             unsigned maxSymbolValue, unsigned huffLog,
-                             void* workSpace, size_t wkspSize,
-                             HUF_CElt* hufTable, HUF_repeat* repeat, int flags) {
-	DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
-	return HUF_compress_internal(dst, dstSize, src, srcSize,
-	                             maxSymbolValue, huffLog, HUF_fourStreams,
-	                             workSpace, wkspSize,
-	                             hufTable, repeat, flags);
-}
diff --git a/src/zstd/huf_decompress.c b/src/zstd/huf_decompress.c
deleted file mode 100644
index 118214556..000000000
--- a/src/zstd/huf_decompress.c
+++ /dev/null
@@ -1,1914 +0,0 @@
-/* ******************************************************************
- * huff0 huffman decoder,
- * part of Finite State Entropy library
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- ****************************************************************** */
-
-/* **************************************************************
- *  Dependencies
- ****************************************************************/
-#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */
-#include "compiler.h"
-#include "bitstream.h" /* BIT_* */
-#include "fse.h"       /* to compress headers */
-#include "huf.h"
-#include "error_private.h"
-#include "zstd_internal.h"
-#include "bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
-
-/* **************************************************************
- *  Constants
- ****************************************************************/
-
-#define HUF_DECODER_FAST_TABLELOG 11
-
-/* **************************************************************
- *  Macros
- ****************************************************************/
-
-/* These two optional macros force the use one way or another of the two
- * Huffman decompression implementations. You can't force in both directions
- * at the same time.
- */
-#if defined(HUF_FORCE_DECOMPRESS_X1) && \
-    defined(HUF_FORCE_DECOMPRESS_X2)
-#error "Cannot force the use of the X1 and X2 decoders at the same time!"
-#endif
-
-/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
- * supported at runtime, so we can add the BMI2 target attribute.
- * When it is disabled, we will still get BMI2 if it is enabled statically.
- */
-#if DYNAMIC_BMI2
-#define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
-#else
-#define HUF_FAST_BMI2_ATTRS
-#endif
-
-#ifdef __cplusplus
-#define HUF_EXTERN_C extern "C"
-#else
-#define HUF_EXTERN_C
-#endif
-#define HUF_ASM_DECL HUF_EXTERN_C
-
-#if DYNAMIC_BMI2
-#define HUF_NEED_BMI2_FUNCTION 1
-#else
-#define HUF_NEED_BMI2_FUNCTION 0
-#endif
-
-/* **************************************************************
- *  Error Management
- ****************************************************************/
-#define HUF_isError ERR_isError
-
-/* **************************************************************
- *  Byte alignment for workSpace management
- ****************************************************************/
-#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a)-1)
-#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
-
-/* **************************************************************
- *  BMI2 Variant Wrappers
- ****************************************************************/
-typedef size_t (*HUF_DecompressUsingDTableFn)(void* dst, size_t dstSize,
-                                              const void* cSrc,
-                                              size_t cSrcSize,
-                                              const HUF_DTable* DTable);
-
-#if DYNAMIC_BMI2
-
-#define HUF_DGEN(fn)                                                          \
-                                                                              \
-	static size_t fn##_default(                                               \
-	    void* dst, size_t dstSize,                                            \
-	    const void* cSrc, size_t cSrcSize,                                    \
-	    const HUF_DTable* DTable) {                                           \
-		return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);               \
-	}                                                                         \
-                                                                              \
-	static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2(                            \
-	    void* dst, size_t dstSize,                                            \
-	    const void* cSrc, size_t cSrcSize,                                    \
-	    const HUF_DTable* DTable) {                                           \
-		return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);               \
-	}                                                                         \
-                                                                              \
-	static size_t fn(void* dst, size_t dstSize, void const * cSrc,            \
-	                 size_t cSrcSize, HUF_DTable const * DTable, int flags) { \
-		if(flags & HUF_flags_bmi2) {                                          \
-			return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);           \
-		}                                                                     \
-		return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);            \
-	}
-
-#else
-
-#define HUF_DGEN(fn)                                                          \
-	static size_t fn(void* dst, size_t dstSize, void const * cSrc,            \
-	                 size_t cSrcSize, HUF_DTable const * DTable, int flags) { \
-		(void)flags;                                                          \
-		return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);               \
-	}
-
-#endif
-
-/*-***************************/
-/*  generic DTableDesc       */
-/*-***************************/
-typedef struct {
-	BYTE maxTableLog;
-	BYTE tableType;
-	BYTE tableLog;
-	BYTE reserved;
-} DTableDesc;
-
-static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) {
-	DTableDesc dtd;
-	ZSTD_memcpy(&dtd, table, sizeof(dtd));
-	return dtd;
-}
-
-static size_t HUF_initFastDStream(BYTE const * ip) {
-	BYTE const lastByte = ip[7];
-	size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
-	size_t const value = MEM_readLEST(ip) | 1;
-	assert(bitsConsumed <= 8);
-	assert(sizeof(size_t) == 8);
-	return value << bitsConsumed;
-}
-
-/**
- * The input/output arguments to the Huffman fast decoding loop:
- *
- * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
- * op [in/out] - The output pointers, must be updated to reflect what is written.
- * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
- * dt [in] - The decoding table.
- * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
- * oend [in] - The end of the output stream. op[3] must not cross oend.
- * iend [in] - The end of each input stream. ip[i] may cross iend[i],
- *             as long as it is above ilimit, but that indicates corruption.
- */
-typedef struct {
-	BYTE const * ip[4];
-	BYTE* op[4];
-	U64 bits[4];
-	void const * dt;
-	BYTE const * ilimit;
-	BYTE* oend;
-	BYTE const * iend[4];
-} HUF_DecompressFastArgs;
-
-typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
-
-/**
- * Initializes args for the fast decoding loop.
- * @returns 1 on success
- *          0 if the fallback implementation should be used.
- *          Or an error code on failure.
- */
-static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const * src, size_t srcSize, const HUF_DTable* DTable) {
-	void const * dt = DTable + 1;
-	U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
-
-	const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
-
-	BYTE* const oend = (BYTE*)dst + dstSize;
-
-	/* The fast decoding loop assumes 64-bit little-endian.
-	 * This condition is false on x32.
-	 */
-	if(!MEM_isLittleEndian() || MEM_32bits())
-		return 0;
-
-	/* strict minimum : jump table + 1 byte per stream */
-	if(srcSize < 10)
-		return ERROR(corruption_detected);
-
-	/* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
-	 * If table log is not correct at this point, fallback to the old decoder.
-	 * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
-	 */
-	if(dtLog != HUF_DECODER_FAST_TABLELOG)
-		return 0;
-
-	/* Read the jump table. */
-	{
-		const BYTE* const istart = (const BYTE*)src;
-		size_t const length1 = MEM_readLE16(istart);
-		size_t const length2 = MEM_readLE16(istart + 2);
-		size_t const length3 = MEM_readLE16(istart + 4);
-		size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
-		args->iend[0] = istart + 6; /* jumpTable */
-		args->iend[1] = args->iend[0] + length1;
-		args->iend[2] = args->iend[1] + length2;
-		args->iend[3] = args->iend[2] + length3;
-
-		/* HUF_initFastDStream() requires this, and this small of an input
-		 * won't benefit from the ASM loop anyways.
-		 * length1 must be >= 16 so that ip[0] >= ilimit before the loop
-		 * starts.
-		 */
-		if(length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
-			return 0;
-		if(length4 > srcSize)
-			return ERROR(corruption_detected); /* overflow */
-	}
-	/* ip[] contains the position that is currently loaded into bits[]. */
-	args->ip[0] = args->iend[1] - sizeof(U64);
-	args->ip[1] = args->iend[2] - sizeof(U64);
-	args->ip[2] = args->iend[3] - sizeof(U64);
-	args->ip[3] = (BYTE const *)src + srcSize - sizeof(U64);
-
-	/* op[] contains the output pointers. */
-	args->op[0] = (BYTE*)dst;
-	args->op[1] = args->op[0] + (dstSize + 3) / 4;
-	args->op[2] = args->op[1] + (dstSize + 3) / 4;
-	args->op[3] = args->op[2] + (dstSize + 3) / 4;
-
-	/* No point to call the ASM loop for tiny outputs. */
-	if(args->op[3] >= oend)
-		return 0;
-
-	/* bits[] is the bit container.
-	 * It is read from the MSB down to the LSB.
-	 * It is shifted left as it is read, and zeros are
-	 * shifted in. After the lowest valid bit a 1 is
-	 * set, so that CountTrailingZeros(bits[]) can be used
-	 * to count how many bits we've consumed.
-	 */
-	args->bits[0] = HUF_initFastDStream(args->ip[0]);
-	args->bits[1] = HUF_initFastDStream(args->ip[1]);
-	args->bits[2] = HUF_initFastDStream(args->ip[2]);
-	args->bits[3] = HUF_initFastDStream(args->ip[3]);
-
-	/* If ip[] >= ilimit, it is guaranteed to be safe to
-	 * reload bits[]. It may be beyond its section, but is
-	 * guaranteed to be valid (>= istart).
-	 */
-	args->ilimit = ilimit;
-
-	args->oend = oend;
-	args->dt = dt;
-
-	return 1;
-}
-
-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const * args, int stream, BYTE* segmentEnd) {
-	/* Validate that we haven't overwritten. */
-	if(args->op[stream] > segmentEnd)
-		return ERROR(corruption_detected);
-	/* Validate that we haven't read beyond iend[].
-	 * Note that ip[] may be < iend[] because the MSB is
-	 * the next bit to read, and we may have consumed 100%
-	 * of the stream, so down to iend[i] - 8 is valid.
-	 */
-	if(args->ip[stream] < args->iend[stream] - 8)
-		return ERROR(corruption_detected);
-
-	/* Construct the BIT_DStream_t. */
-	assert(sizeof(size_t) == 8);
-	bit->bitContainer = MEM_readLEST(args->ip[stream]);
-	bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
-	bit->start = (const char*)args->iend[0];
-	bit->limitPtr = bit->start + sizeof(size_t);
-	bit->ptr = (const char*)args->ip[stream];
-
-	return 0;
-}
-
-#ifndef HUF_FORCE_DECOMPRESS_X2
-
-/*-***************************/
-/*  single-symbol decoding   */
-/*-***************************/
-typedef struct {
-	BYTE nbBits;
-	BYTE byte;
-} HUF_DEltX1; /* single-symbol decoding */
-
-/**
- * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
- * a time.
- */
-static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
-	U64 D4;
-	if(MEM_isLittleEndian()) {
-		D4 = (U64)((symbol << 8) + nbBits);
-	} else {
-		D4 = (U64)(symbol + (nbBits << 8));
-	}
-	assert(D4 < (1U << 16));
-	D4 *= 0x0001000100010001ULL;
-	return D4;
-}
-
-/**
- * Increase the tableLog to targetTableLog and rescales the stats.
- * If tableLog > targetTableLog this is a no-op.
- * @returns New tableLog
- */
-static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) {
-	if(tableLog > targetTableLog)
-		return tableLog;
-	if(tableLog < targetTableLog) {
-		U32 const scale = targetTableLog - tableLog;
-		U32 s;
-		/* Increase the weight for all non-zero probability symbols by scale. */
-		for(s = 0; s < nbSymbols; ++s) {
-			huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
-		}
-		/* Update rankVal to reflect the new weights.
-		 * All weights except 0 get moved to weight + scale.
-		 * Weights [1, scale] are empty.
-		 */
-		for(s = targetTableLog; s > scale; --s) {
-			rankVal[s] = rankVal[s - scale];
-		}
-		for(s = scale; s > 0; --s) {
-			rankVal[s] = 0;
-		}
-	}
-	return targetTableLog;
-}
-
-typedef struct {
-	U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
-	U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
-	U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
-	BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];
-	BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
-} HUF_ReadDTableX1_Workspace;
-
-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) {
-	U32 tableLog = 0;
-	U32 nbSymbols = 0;
-	size_t iSize;
-	void* const dtPtr = DTable + 1;
-	HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
-	HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;
-
-	DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
-	if(sizeof(*wksp) > wkspSize)
-		return ERROR(tableLog_tooLarge);
-
-	DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
-	/* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
-
-	iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
-	if(HUF_isError(iSize))
-		return iSize;
-
-	/* Table header */
-	{
-		DTableDesc dtd = HUF_getDTableDesc(DTable);
-		U32 const maxTableLog = dtd.maxTableLog + 1;
-		U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
-		tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
-		if(tableLog > (U32)(dtd.maxTableLog + 1))
-			return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
-		dtd.tableType = 0;
-		dtd.tableLog = (BYTE)tableLog;
-		ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
-	}
-
-	/* Compute symbols and rankStart given rankVal:
-	 *
-	 * rankVal already contains the number of values of each weight.
-	 *
-	 * symbols contains the symbols ordered by weight. First are the rankVal[0]
-	 * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
-	 * symbols[0] is filled (but unused) to avoid a branch.
-	 *
-	 * rankStart contains the offset where each rank belongs in the DTable.
-	 * rankStart[0] is not filled because there are no entries in the table for
-	 * weight 0.
-	 */
-	{
-		int n;
-		U32 nextRankStart = 0;
-		int const unroll = 4;
-		int const nLimit = (int)nbSymbols - unroll + 1;
-		for(n = 0; n < (int)tableLog + 1; n++) {
-			U32 const curr = nextRankStart;
-			nextRankStart += wksp->rankVal[n];
-			wksp->rankStart[n] = curr;
-		}
-		for(n = 0; n < nLimit; n += unroll) {
-			int u;
-			for(u = 0; u < unroll; ++u) {
-				size_t const w = wksp->huffWeight[n + u];
-				wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n + u);
-			}
-		}
-		for(; n < (int)nbSymbols; ++n) {
-			size_t const w = wksp->huffWeight[n];
-			wksp->symbols[wksp->rankStart[w]++] = (BYTE)n;
-		}
-	}
-
-	/* fill DTable
-	 * We fill all entries of each weight in order.
-	 * That way length is a constant for each iteration of the outer loop.
-	 * We can switch based on the length to a different inner loop which is
-	 * optimized for that particular case.
-	 */
-	{
-		U32 w;
-		int symbol = wksp->rankVal[0];
-		int rankStart = 0;
-		for(w = 1; w < tableLog + 1; ++w) {
-			int const symbolCount = wksp->rankVal[w];
-			int const length = (1 << w) >> 1;
-			int uStart = rankStart;
-			BYTE const nbBits = (BYTE)(tableLog + 1 - w);
-			int s;
-			int u;
-			switch(length) {
-			case 1:
-				for(s = 0; s < symbolCount; ++s) {
-					HUF_DEltX1 D;
-					D.byte = wksp->symbols[symbol + s];
-					D.nbBits = nbBits;
-					dt[uStart] = D;
-					uStart += 1;
-				}
-				break;
-			case 2:
-				for(s = 0; s < symbolCount; ++s) {
-					HUF_DEltX1 D;
-					D.byte = wksp->symbols[symbol + s];
-					D.nbBits = nbBits;
-					dt[uStart + 0] = D;
-					dt[uStart + 1] = D;
-					uStart += 2;
-				}
-				break;
-			case 4:
-				for(s = 0; s < symbolCount; ++s) {
-					U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
-					MEM_write64(dt + uStart, D4);
-					uStart += 4;
-				}
-				break;
-			case 8:
-				for(s = 0; s < symbolCount; ++s) {
-					U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
-					MEM_write64(dt + uStart, D4);
-					MEM_write64(dt + uStart + 4, D4);
-					uStart += 8;
-				}
-				break;
-			default:
-				for(s = 0; s < symbolCount; ++s) {
-					U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
-					for(u = 0; u < length; u += 16) {
-						MEM_write64(dt + uStart + u + 0, D4);
-						MEM_write64(dt + uStart + u + 4, D4);
-						MEM_write64(dt + uStart + u + 8, D4);
-						MEM_write64(dt + uStart + u + 12, D4);
-					}
-					assert(u == length);
-					uStart += length;
-				}
-				break;
-			}
-			symbol += symbolCount;
-			rankStart += symbolCount * length;
-		}
-	}
-	return iSize;
-}
-
-FORCE_INLINE_TEMPLATE BYTE
-HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) {
-	size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
-	BYTE const c = dt[val].byte;
-	BIT_skipBits(Dstream, dt[val].nbBits);
-	return c;
-}
-
-#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
-	*ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
-
-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)   \
-	if(MEM_64bits() || (HUF_TABLELOG_MAX <= 12)) \
-	HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
-
-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
-	if(MEM_64bits())                           \
-	HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
-
-HINT_INLINE size_t
-HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) {
-	BYTE* const pStart = p;
-
-	/* up to 4 symbols at a time */
-	if((pEnd - p) > 3) {
-		while((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd - 3)) {
-			HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
-			HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
-			HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
-			HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
-		}
-	} else {
-		BIT_reloadDStream(bitDPtr);
-	}
-
-	/* [0-3] symbols remaining */
-	if(MEM_32bits())
-		while((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
-			HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
-
-	/* no more data to retrieve from bitstream, no need to reload */
-	while(p < pEnd)
-		HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
-
-	return (size_t)(pEnd - pStart);
-}
-
-FORCE_INLINE_TEMPLATE size_t
-HUF_decompress1X1_usingDTable_internal_body(
-    void* dst, size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
-	BYTE* op = (BYTE*)dst;
-	BYTE* const oend = op + dstSize;
-	const void* dtPtr = DTable + 1;
-	const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
-	BIT_DStream_t bitD;
-	DTableDesc const dtd = HUF_getDTableDesc(DTable);
-	U32 const dtLog = dtd.tableLog;
-
-	CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
-
-	HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog);
-
-	if(!BIT_endOfDStream(&bitD))
-		return ERROR(corruption_detected);
-
-	return dstSize;
-}
-
-/* HUF_decompress4X1_usingDTable_internal_body():
- * Conditions :
- * @dstSize >= 6
- */
-FORCE_INLINE_TEMPLATE size_t
-HUF_decompress4X1_usingDTable_internal_body(
-    void* dst, size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
-	/* Check */
-	if(cSrcSize < 10)
-		return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
-
-	{
-		const BYTE* const istart = (const BYTE*)cSrc;
-		BYTE* const ostart = (BYTE*)dst;
-		BYTE* const oend = ostart + dstSize;
-		BYTE* const olimit = oend - 3;
-		const void* const dtPtr = DTable + 1;
-		const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
-
-		/* Init */
-		BIT_DStream_t bitD1;
-		BIT_DStream_t bitD2;
-		BIT_DStream_t bitD3;
-		BIT_DStream_t bitD4;
-		size_t const length1 = MEM_readLE16(istart);
-		size_t const length2 = MEM_readLE16(istart + 2);
-		size_t const length3 = MEM_readLE16(istart + 4);
-		size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
-		const BYTE* const istart1 = istart + 6; /* jumpTable */
-		const BYTE* const istart2 = istart1 + length1;
-		const BYTE* const istart3 = istart2 + length2;
-		const BYTE* const istart4 = istart3 + length3;
-		const size_t segmentSize = (dstSize + 3) / 4;
-		BYTE* const opStart2 = ostart + segmentSize;
-		BYTE* const opStart3 = opStart2 + segmentSize;
-		BYTE* const opStart4 = opStart3 + segmentSize;
-		BYTE* op1 = ostart;
-		BYTE* op2 = opStart2;
-		BYTE* op3 = opStart3;
-		BYTE* op4 = opStart4;
-		DTableDesc const dtd = HUF_getDTableDesc(DTable);
-		U32 const dtLog = dtd.tableLog;
-		U32 endSignal = 1;
-
-		if(length4 > cSrcSize)
-			return ERROR(corruption_detected); /* overflow */
-		if(opStart4 > oend)
-			return ERROR(corruption_detected); /* overflow */
-		if(dstSize < 6)
-			return ERROR(corruption_detected); /* stream 4-split doesn't work */
-		CHECK_F(BIT_initDStream(&bitD1, istart1, length1));
-		CHECK_F(BIT_initDStream(&bitD2, istart2, length2));
-		CHECK_F(BIT_initDStream(&bitD3, istart3, length3));
-		CHECK_F(BIT_initDStream(&bitD4, istart4, length4));
-
-		/* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
-		if((size_t)(oend - op4) >= sizeof(size_t)) {
-			for(; (endSignal) & (op4 < olimit);) {
-				HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
-				HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
-				HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
-				HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
-				HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
-				HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
-				HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
-				HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
-				HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
-				HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
-				HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
-				HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
-				HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
-				HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
-				HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
-				HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
-				endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
-				endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
-				endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
-				endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
-			}
-		}
-
-		/* check corruption */
-		/* note : should not be necessary : op# advance in lock step, and we control op4.
-		 *        but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
-		if(op1 > opStart2)
-			return ERROR(corruption_detected);
-		if(op2 > opStart3)
-			return ERROR(corruption_detected);
-		if(op3 > opStart4)
-			return ERROR(corruption_detected);
-		/* note : op4 supposed already verified within main loop */
-
-		/* finish bitStreams one by one */
-		HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
-		HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
-		HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
-		HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog);
-
-		/* check */
-		{
-			U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
-			if(!endCheck)
-				return ERROR(corruption_detected);
-		}
-
-		/* decoded size */
-		return dstSize;
-	}
-}
-
-#if HUF_NEED_BMI2_FUNCTION
-static BMI2_TARGET_ATTRIBUTE
-    size_t
-    HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const * cSrc,
-                                                size_t cSrcSize, HUF_DTable const * DTable) {
-	return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
-}
-#endif
-
-static size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const * cSrc,
-                                                             size_t cSrcSize, HUF_DTable const * DTable) {
-	return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
-}
-
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-
-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
-
-#endif
-
-static HUF_FAST_BMI2_ATTRS void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) {
-	U64 bits[4];
-	BYTE const * ip[4];
-	BYTE* op[4];
-	U16 const * const dtable = (U16 const *)args->dt;
-	BYTE* const oend = args->oend;
-	BYTE const * const ilimit = args->ilimit;
-
-	/* Copy the arguments to local variables */
-	ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
-	ZSTD_memcpy(&ip, &args->ip, sizeof(ip));
-	ZSTD_memcpy(&op, &args->op, sizeof(op));
-
-	assert(MEM_isLittleEndian());
-	assert(!MEM_32bits());
-
-	for(;;) {
-		BYTE* olimit;
-		int stream;
-		int symbol;
-
-		/* Assert loop preconditions */
-#ifndef NDEBUG
-		for(stream = 0; stream < 4; ++stream) {
-			assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
-			assert(ip[stream] >= ilimit);
-		}
-#endif
-		/* Compute olimit */
-		{
-			/* Each iteration produces 5 output symbols per stream */
-			size_t const oiters = (size_t)(oend - op[3]) / 5;
-			/* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
-			 * per stream.
-			 */
-			size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
-			/* We can safely run iters iterations before running bounds checks */
-			size_t const iters = MIN(oiters, iiters);
-			size_t const symbols = iters * 5;
-
-			/* We can simply check that op[3] < olimit, instead of checking all
-			 * of our bounds, since we can't hit the other bounds until we've run
-			 * iters iterations, which only happens when op[3] == olimit.
-			 */
-			olimit = op[3] + symbols;
-
-			/* Exit fast decoding loop once we get close to the end. */
-			if(op[3] + 20 > olimit)
-				break;
-
-			/* Exit the decoding loop if any input pointer has crossed the
-			 * previous one. This indicates corruption, and a precondition
-			 * to our loop is that ip[i] >= ip[0].
-			 */
-			for(stream = 1; stream < 4; ++stream) {
-				if(ip[stream] < ip[stream - 1])
-					goto _out;
-			}
-		}
-
-#ifndef NDEBUG
-		for(stream = 1; stream < 4; ++stream) {
-			assert(ip[stream] >= ip[stream - 1]);
-		}
-#endif
-
-		do {
-			/* Decode 5 symbols in each of the 4 streams */
-			for(symbol = 0; symbol < 5; ++symbol) {
-				for(stream = 0; stream < 4; ++stream) {
-					int const index = (int)(bits[stream] >> 53);
-					int const entry = (int)dtable[index];
-					bits[stream] <<= (entry & 63);
-					op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
-				}
-			}
-			/* Reload the bitstreams */
-			for(stream = 0; stream < 4; ++stream) {
-				int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
-				int const nbBits = ctz & 7;
-				int const nbBytes = ctz >> 3;
-				op[stream] += 5;
-				ip[stream] -= nbBytes;
-				bits[stream] = MEM_read64(ip[stream]) | 1;
-				bits[stream] <<= nbBits;
-			}
-		} while(op[3] < olimit);
-	}
-
-_out:
-
-	/* Save the final values of each of the state variables back to args. */
-	ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
-	ZSTD_memcpy(&args->ip, &ip, sizeof(ip));
-	ZSTD_memcpy(&args->op, &op, sizeof(op));
-}
-
-/**
- * @returns @p dstSize on success (>= 6)
- *          0 if the fallback implementation should be used
- *          An error if an error occurred
- */
-static HUF_FAST_BMI2_ATTRS
-    size_t
-    HUF_decompress4X1_usingDTable_internal_fast(
-        void* dst, size_t dstSize,
-        const void* cSrc, size_t cSrcSize,
-        const HUF_DTable* DTable,
-        HUF_DecompressFastLoopFn loopFn) {
-	void const * dt = DTable + 1;
-	const BYTE* const iend = (const BYTE*)cSrc + 6;
-	BYTE* const oend = (BYTE*)dst + dstSize;
-	HUF_DecompressFastArgs args;
-	{
-		size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
-		FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
-		if(ret == 0)
-			return 0;
-	}
-
-	assert(args.ip[0] >= args.ilimit);
-	loopFn(&args);
-
-	/* Our loop guarantees that ip[] >= ilimit and that we haven't
-	 * overwritten any op[].
-	 */
-	assert(args.ip[0] >= iend);
-	assert(args.ip[1] >= iend);
-	assert(args.ip[2] >= iend);
-	assert(args.ip[3] >= iend);
-	assert(args.op[3] <= oend);
-	(void)iend;
-
-	/* finish bit streams one by one. */
-	{
-		size_t const segmentSize = (dstSize + 3) / 4;
-		BYTE* segmentEnd = (BYTE*)dst;
-		int i;
-		for(i = 0; i < 4; ++i) {
-			BIT_DStream_t bit;
-			if(segmentSize <= (size_t)(oend - segmentEnd))
-				segmentEnd += segmentSize;
-			else
-				segmentEnd = oend;
-			FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
-			/* Decompress and validate that we've produced exactly the expected length. */
-			args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const *)dt, HUF_DECODER_FAST_TABLELOG);
-			if(args.op[i] != segmentEnd)
-				return ERROR(corruption_detected);
-		}
-	}
-
-	/* decoded size */
-	assert(dstSize != 0);
-	return dstSize;
-}
-
-HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
-
-static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const * cSrc,
-                                                     size_t cSrcSize, HUF_DTable const * DTable, int flags) {
-	HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
-	HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
-
-#if DYNAMIC_BMI2
-	if(flags & HUF_flags_bmi2) {
-		fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-		if(!(flags & HUF_flags_disableAsm)) {
-			loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
-		}
-#endif
-	} else {
-		return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
-	}
-#endif
-
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-	if(!(flags & HUF_flags_disableAsm)) {
-		loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
-	}
-#endif
-
-	if(!(flags & HUF_flags_disableFast)) {
-		size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
-		if(ret != 0)
-			return ret;
-	}
-	return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
-}
-
-static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                          const void* cSrc, size_t cSrcSize,
-                                          void* workSpace, size_t wkspSize, int flags) {
-	const BYTE* ip = (const BYTE*)cSrc;
-
-	size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
-	if(HUF_isError(hSize))
-		return hSize;
-	if(hSize >= cSrcSize)
-		return ERROR(srcSize_wrong);
-	ip += hSize;
-	cSrcSize -= hSize;
-
-	return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
-}
-
-#endif /* HUF_FORCE_DECOMPRESS_X2 */
-
-#ifndef HUF_FORCE_DECOMPRESS_X1
-
-/* *************************/
-/* double-symbols decoding */
-/* *************************/
-
-typedef struct {
-	U16 sequence;
-	BYTE nbBits;
-	BYTE length;
-} HUF_DEltX2; /* double-symbols decoding */
-typedef struct {
-	BYTE symbol;
-} sortedSymbol_t;
-typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
-typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
-
-/**
- * Constructs a HUF_DEltX2 in a U32.
- */
-static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) {
-	U32 seq;
-	DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
-	DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
-	DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
-	DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
-	if(MEM_isLittleEndian()) {
-		seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
-		return seq + (nbBits << 16) + ((U32)level << 24);
-	} else {
-		seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
-		return (seq << 16) + (nbBits << 8) + (U32)level;
-	}
-}
-
-/**
- * Constructs a HUF_DEltX2.
- */
-static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) {
-	HUF_DEltX2 DElt;
-	U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
-	DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
-	ZSTD_memcpy(&DElt, &val, sizeof(val));
-	return DElt;
-}
-
-/**
- * Constructs 2 HUF_DEltX2s and packs them into a U64.
- */
-static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) {
-	U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
-	return (U64)DElt + ((U64)DElt << 32);
-}
-
-/**
- * Fills the DTable rank with all the symbols from [begin, end) that are each
- * nbBits long.
- *
- * @param DTableRank The start of the rank in the DTable.
- * @param begin The first symbol to fill (inclusive).
- * @param end The last symbol to fill (exclusive).
- * @param nbBits Each symbol is nbBits long.
- * @param tableLog The table log.
- * @param baseSeq If level == 1 { 0 } else { the first level symbol }
- * @param level The level in the table. Must be 1 or 2.
- */
-static void HUF_fillDTableX2ForWeight(
-    HUF_DEltX2* DTableRank,
-    sortedSymbol_t const * begin, sortedSymbol_t const * end,
-    U32 nbBits, U32 tableLog,
-    U16 baseSeq, int const level) {
-	U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
-	const sortedSymbol_t* ptr;
-	assert(level >= 1 && level <= 2);
-	switch(length) {
-	case 1:
-		for(ptr = begin; ptr != end; ++ptr) {
-			HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
-			*DTableRank++ = DElt;
-		}
-		break;
-	case 2:
-		for(ptr = begin; ptr != end; ++ptr) {
-			HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
-			DTableRank[0] = DElt;
-			DTableRank[1] = DElt;
-			DTableRank += 2;
-		}
-		break;
-	case 4:
-		for(ptr = begin; ptr != end; ++ptr) {
-			U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
-			ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
-			ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
-			DTableRank += 4;
-		}
-		break;
-	case 8:
-		for(ptr = begin; ptr != end; ++ptr) {
-			U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
-			ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
-			ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
-			ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
-			ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
-			DTableRank += 8;
-		}
-		break;
-	default:
-		for(ptr = begin; ptr != end; ++ptr) {
-			U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
-			HUF_DEltX2* const DTableRankEnd = DTableRank + length;
-			for(; DTableRank != DTableRankEnd; DTableRank += 8) {
-				ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
-				ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
-				ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
-				ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
-			}
-		}
-		break;
-	}
-}
-
-/* HUF_fillDTableX2Level2() :
- * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
-static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
-                                   const U32* rankVal, const int minWeight, const int maxWeight1,
-                                   const sortedSymbol_t* sortedSymbols, U32 const * rankStart,
-                                   U32 nbBitsBaseline, U16 baseSeq) {
-	/* Fill skipped values (all positions up to rankVal[minWeight]).
-	 * These are positions only get a single symbol because the combined weight
-	 * is too large.
-	 */
-	if(minWeight > 1) {
-		U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
-		U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
-		int const skipSize = rankVal[minWeight];
-		assert(length > 1);
-		assert((U32)skipSize < length);
-		switch(length) {
-		case 2:
-			assert(skipSize == 1);
-			ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
-			break;
-		case 4:
-			assert(skipSize <= 4);
-			ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
-			ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
-			break;
-		default: {
-			int i;
-			for(i = 0; i < skipSize; i += 8) {
-				ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
-				ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
-				ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
-				ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
-			}
-		}
-		}
-	}
-
-	/* Fill each of the second level symbols by weight. */
-	{
-		int w;
-		for(w = minWeight; w < maxWeight1; ++w) {
-			int const begin = rankStart[w];
-			int const end = rankStart[w + 1];
-			U32 const nbBits = nbBitsBaseline - w;
-			U32 const totalBits = nbBits + consumedBits;
-			HUF_fillDTableX2ForWeight(
-			    DTable + rankVal[w],
-			    sortedSymbols + begin, sortedSymbols + end,
-			    totalBits, targetLog,
-			    baseSeq, /* level */ 2);
-		}
-	}
-}
-
-static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
-                             const sortedSymbol_t* sortedList,
-                             const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
-                             const U32 nbBitsBaseline) {
-	U32* const rankVal = rankValOrigin[0];
-	const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
-	const U32 minBits = nbBitsBaseline - maxWeight;
-	int w;
-	int const wEnd = (int)maxWeight + 1;
-
-	/* Fill DTable in order of weight. */
-	for(w = 1; w < wEnd; ++w) {
-		int const begin = (int)rankStart[w];
-		int const end = (int)rankStart[w + 1];
-		U32 const nbBits = nbBitsBaseline - w;
-
-		if(targetLog - nbBits >= minBits) {
-			/* Enough room for a second symbol. */
-			int start = rankVal[w];
-			U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
-			int minWeight = nbBits + scaleLog;
-			int s;
-			if(minWeight < 1)
-				minWeight = 1;
-			/* Fill the DTable for every symbol of weight w.
-			 * These symbols get at least 1 second symbol.
-			 */
-			for(s = begin; s != end; ++s) {
-				HUF_fillDTableX2Level2(
-				    DTable + start, targetLog, nbBits,
-				    rankValOrigin[nbBits], minWeight, wEnd,
-				    sortedList, rankStart,
-				    nbBitsBaseline, sortedList[s].symbol);
-				start += length;
-			}
-		} else {
-			/* Only a single symbol. */
-			HUF_fillDTableX2ForWeight(
-			    DTable + rankVal[w],
-			    sortedList + begin, sortedList + end,
-			    nbBits, targetLog,
-			    /* baseSeq */ 0, /* level */ 1);
-		}
-	}
-}
-
-typedef struct {
-	rankValCol_t rankVal[HUF_TABLELOG_MAX];
-	U32 rankStats[HUF_TABLELOG_MAX + 1];
-	U32 rankStart0[HUF_TABLELOG_MAX + 3];
-	sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
-	BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
-	U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
-} HUF_ReadDTableX2_Workspace;
-
-size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
-                             const void* src, size_t srcSize,
-                             void* workSpace, size_t wkspSize, int flags) {
-	U32 tableLog, maxW, nbSymbols;
-	DTableDesc dtd = HUF_getDTableDesc(DTable);
-	U32 maxTableLog = dtd.maxTableLog;
-	size_t iSize;
-	void* dtPtr = DTable + 1; /* force compiler to avoid strict-aliasing */
-	HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
-	U32* rankStart;
-
-	HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;
-
-	if(sizeof(*wksp) > wkspSize)
-		return ERROR(GENERIC);
-
-	rankStart = wksp->rankStart0 + 1;
-	ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
-	ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));
-
-	DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
-	if(maxTableLog > HUF_TABLELOG_MAX)
-		return ERROR(tableLog_tooLarge);
-	/* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
-
-	iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
-	if(HUF_isError(iSize))
-		return iSize;
-
-	/* check result */
-	if(tableLog > maxTableLog)
-		return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
-	if(tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG)
-		maxTableLog = HUF_DECODER_FAST_TABLELOG;
-
-	/* find maxWeight */
-	for(maxW = tableLog; wksp->rankStats[maxW] == 0; maxW--) {
-	} /* necessarily finds a solution before 0 */
-
-	/* Get start index of each weight */
-	{
-		U32 w, nextRankStart = 0;
-		for(w = 1; w < maxW + 1; w++) {
-			U32 curr = nextRankStart;
-			nextRankStart += wksp->rankStats[w];
-			rankStart[w] = curr;
-		}
-		rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
-		rankStart[maxW + 1] = nextRankStart;
-	}
-
-	/* sort symbols by weight */
-	{
-		U32 s;
-		for(s = 0; s < nbSymbols; s++) {
-			U32 const w = wksp->weightList[s];
-			U32 const r = rankStart[w]++;
-			wksp->sortedSymbol[r].symbol = (BYTE)s;
-		}
-		rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
-	}
-
-	/* Build rankVal */
-	{
-		U32* const rankVal0 = wksp->rankVal[0];
-		{
-			int const rescale = (maxTableLog - tableLog) - 1; /* tableLog <= maxTableLog */
-			U32 nextRankVal = 0;
-			U32 w;
-			for(w = 1; w < maxW + 1; w++) {
-				U32 curr = nextRankVal;
-				nextRankVal += wksp->rankStats[w] << (w + rescale);
-				rankVal0[w] = curr;
-			}
-		}
-		{
-			U32 const minBits = tableLog + 1 - maxW;
-			U32 consumed;
-			for(consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
-				U32* const rankValPtr = wksp->rankVal[consumed];
-				U32 w;
-				for(w = 1; w < maxW + 1; w++) {
-					rankValPtr[w] = rankVal0[w] >> consumed;
-				}
-			}
-		}
-	}
-
-	HUF_fillDTableX2(dt, maxTableLog,
-	                 wksp->sortedSymbol,
-	                 wksp->rankStart0, wksp->rankVal, maxW,
-	                 tableLog + 1);
-
-	dtd.tableLog = (BYTE)maxTableLog;
-	dtd.tableType = 1;
-	ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
-	return iSize;
-}
-
-FORCE_INLINE_TEMPLATE U32
-HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) {
-	size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
-	ZSTD_memcpy(op, &dt[val].sequence, 2);
-	BIT_skipBits(DStream, dt[val].nbBits);
-	return dt[val].length;
-}
-
-FORCE_INLINE_TEMPLATE U32
-HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) {
-	size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
-	ZSTD_memcpy(op, &dt[val].sequence, 1);
-	if(dt[val].length == 1) {
-		BIT_skipBits(DStream, dt[val].nbBits);
-	} else {
-		if(DStream->bitsConsumed < (sizeof(DStream->bitContainer) * 8)) {
-			BIT_skipBits(DStream, dt[val].nbBits);
-			if(DStream->bitsConsumed > (sizeof(DStream->bitContainer) * 8))
-				/* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
-				DStream->bitsConsumed = (sizeof(DStream->bitContainer) * 8);
-		}
-	}
-	return 1;
-}
-
-#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
-	ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
-
-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)   \
-	if(MEM_64bits() || (HUF_TABLELOG_MAX <= 12)) \
-	ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
-
-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
-	if(MEM_64bits())                           \
-	ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
-
-HINT_INLINE size_t
-HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
-                   const HUF_DEltX2* const dt, const U32 dtLog) {
-	BYTE* const pStart = p;
-
-	/* up to 8 symbols at a time */
-	if((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
-		if(dtLog <= 11 && MEM_64bits()) {
-			/* up to 10 symbols at a time */
-			while((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd - 9)) {
-				HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
-				HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
-				HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
-				HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
-				HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
-			}
-		} else {
-			/* up to 8 symbols at a time */
-			while((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd - (sizeof(bitDPtr->bitContainer) - 1))) {
-				HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
-				HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
-				HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
-				HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
-			}
-		}
-	} else {
-		BIT_reloadDStream(bitDPtr);
-	}
-
-	/* closer to end : up to 2 symbols at a time */
-	if((size_t)(pEnd - p) >= 2) {
-		while((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd - 2))
-			HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
-
-		while(p <= pEnd - 2)
-			HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
-	}
-
-	if(p < pEnd)
-		p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
-
-	return p - pStart;
-}
-
-FORCE_INLINE_TEMPLATE size_t
-HUF_decompress1X2_usingDTable_internal_body(
-    void* dst, size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
-	BIT_DStream_t bitD;
-
-	/* Init */
-	CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
-
-	/* decode */
-	{
-		BYTE* const ostart = (BYTE*)dst;
-		BYTE* const oend = ostart + dstSize;
-		const void* const dtPtr = DTable + 1; /* force compiler to not use strict-aliasing */
-		const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
-		DTableDesc const dtd = HUF_getDTableDesc(DTable);
-		HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog);
-	}
-
-	/* check */
-	if(!BIT_endOfDStream(&bitD))
-		return ERROR(corruption_detected);
-
-	/* decoded size */
-	return dstSize;
-}
-
-/* HUF_decompress4X2_usingDTable_internal_body():
- * Conditions:
- * @dstSize >= 6
- */
-FORCE_INLINE_TEMPLATE size_t
-HUF_decompress4X2_usingDTable_internal_body(
-    void* dst, size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
-	if(cSrcSize < 10)
-		return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
-
-	{
-		const BYTE* const istart = (const BYTE*)cSrc;
-		BYTE* const ostart = (BYTE*)dst;
-		BYTE* const oend = ostart + dstSize;
-		BYTE* const olimit = oend - (sizeof(size_t) - 1);
-		const void* const dtPtr = DTable + 1;
-		const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
-
-		/* Init */
-		BIT_DStream_t bitD1;
-		BIT_DStream_t bitD2;
-		BIT_DStream_t bitD3;
-		BIT_DStream_t bitD4;
-		size_t const length1 = MEM_readLE16(istart);
-		size_t const length2 = MEM_readLE16(istart + 2);
-		size_t const length3 = MEM_readLE16(istart + 4);
-		size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
-		const BYTE* const istart1 = istart + 6; /* jumpTable */
-		const BYTE* const istart2 = istart1 + length1;
-		const BYTE* const istart3 = istart2 + length2;
-		const BYTE* const istart4 = istart3 + length3;
-		size_t const segmentSize = (dstSize + 3) / 4;
-		BYTE* const opStart2 = ostart + segmentSize;
-		BYTE* const opStart3 = opStart2 + segmentSize;
-		BYTE* const opStart4 = opStart3 + segmentSize;
-		BYTE* op1 = ostart;
-		BYTE* op2 = opStart2;
-		BYTE* op3 = opStart3;
-		BYTE* op4 = opStart4;
-		U32 endSignal = 1;
-		DTableDesc const dtd = HUF_getDTableDesc(DTable);
-		U32 const dtLog = dtd.tableLog;
-
-		if(length4 > cSrcSize)
-			return ERROR(corruption_detected); /* overflow */
-		if(opStart4 > oend)
-			return ERROR(corruption_detected); /* overflow */
-		if(dstSize < 6)
-			return ERROR(corruption_detected); /* stream 4-split doesn't work */
-		CHECK_F(BIT_initDStream(&bitD1, istart1, length1));
-		CHECK_F(BIT_initDStream(&bitD2, istart2, length2));
-		CHECK_F(BIT_initDStream(&bitD3, istart3, length3));
-		CHECK_F(BIT_initDStream(&bitD4, istart4, length4));
-
-		/* 16-32 symbols per loop (4-8 symbols per stream) */
-		if((size_t)(oend - op4) >= sizeof(size_t)) {
-			for(; (endSignal) & (op4 < olimit);) {
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-				HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-				HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
-				HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-				HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
-				endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
-				endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
-				HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-				HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
-				HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-				HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
-				endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
-				endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
-#else
-				HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-				HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-				HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
-				HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
-				HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-				HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-				HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
-				HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
-				HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
-				HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
-				endSignal = (U32)LIKELY((U32)(BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
-#endif
-			}
-		}
-
-		/* check corruption */
-		if(op1 > opStart2)
-			return ERROR(corruption_detected);
-		if(op2 > opStart3)
-			return ERROR(corruption_detected);
-		if(op3 > opStart4)
-			return ERROR(corruption_detected);
-		/* note : op4 already verified within main loop */
-
-		/* finish bitStreams one by one */
-		HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
-		HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
-		HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
-		HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog);
-
-		/* check */
-		{
-			U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
-			if(!endCheck)
-				return ERROR(corruption_detected);
-		}
-
-		/* decoded size */
-		return dstSize;
-	}
-}
-
-#if HUF_NEED_BMI2_FUNCTION
-static BMI2_TARGET_ATTRIBUTE
-    size_t
-    HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const * cSrc,
-                                                size_t cSrcSize, HUF_DTable const * DTable) {
-	return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
-}
-#endif
-
-static size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const * cSrc,
-                                                             size_t cSrcSize, HUF_DTable const * DTable) {
-	return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
-}
-
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-
-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
-
-#endif
-
-static HUF_FAST_BMI2_ATTRS void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) {
-	U64 bits[4];
-	BYTE const * ip[4];
-	BYTE* op[4];
-	BYTE* oend[4];
-	HUF_DEltX2 const * const dtable = (HUF_DEltX2 const *)args->dt;
-	BYTE const * const ilimit = args->ilimit;
-
-	/* Copy the arguments to local registers. */
-	ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
-	ZSTD_memcpy(&ip, &args->ip, sizeof(ip));
-	ZSTD_memcpy(&op, &args->op, sizeof(op));
-
-	oend[0] = op[1];
-	oend[1] = op[2];
-	oend[2] = op[3];
-	oend[3] = args->oend;
-
-	assert(MEM_isLittleEndian());
-	assert(!MEM_32bits());
-
-	for(;;) {
-		BYTE* olimit;
-		int stream;
-		int symbol;
-
-		/* Assert loop preconditions */
-#ifndef NDEBUG
-		for(stream = 0; stream < 4; ++stream) {
-			assert(op[stream] <= oend[stream]);
-			assert(ip[stream] >= ilimit);
-		}
-#endif
-		/* Compute olimit */
-		{
-			/* Each loop does 5 table lookups for each of the 4 streams.
-			 * Each table lookup consumes up to 11 bits of input, and produces
-			 * up to 2 bytes of output.
-			 */
-			/* We can consume up to 7 bytes of input per iteration per stream.
-			 * We also know that each input pointer is >= ip[0]. So we can run
-			 * iters loops before running out of input.
-			 */
-			size_t iters = (size_t)(ip[0] - ilimit) / 7;
-			/* Each iteration can produce up to 10 bytes of output per stream.
-			 * Each output stream my advance at different rates. So take the
-			 * minimum number of safe iterations among all the output streams.
-			 */
-			for(stream = 0; stream < 4; ++stream) {
-				size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
-				iters = MIN(iters, oiters);
-			}
-
-			/* Each iteration produces at least 5 output symbols. So until
-			 * op[3] crosses olimit, we know we haven't executed iters
-			 * iterations yet. This saves us maintaining an iters counter,
-			 * at the expense of computing the remaining # of iterations
-			 * more frequently.
-			 */
-			olimit = op[3] + (iters * 5);
-
-			/* Exit the fast decoding loop if we are too close to the end. */
-			if(op[3] + 10 > olimit)
-				break;
-
-			/* Exit the decoding loop if any input pointer has crossed the
-			 * previous one. This indicates corruption, and a precondition
-			 * to our loop is that ip[i] >= ip[0].
-			 */
-			for(stream = 1; stream < 4; ++stream) {
-				if(ip[stream] < ip[stream - 1])
-					goto _out;
-			}
-		}
-
-#ifndef NDEBUG
-		for(stream = 1; stream < 4; ++stream) {
-			assert(ip[stream] >= ip[stream - 1]);
-		}
-#endif
-
-		do {
-			/* Do 5 table lookups for each of the first 3 streams */
-			for(symbol = 0; symbol < 5; ++symbol) {
-				for(stream = 0; stream < 3; ++stream) {
-					int const index = (int)(bits[stream] >> 53);
-					HUF_DEltX2 const entry = dtable[index];
-					MEM_write16(op[stream], entry.sequence);
-					bits[stream] <<= (entry.nbBits);
-					op[stream] += (entry.length);
-				}
-			}
-			/* Do 1 table lookup from the final stream */
-			{
-				int const index = (int)(bits[3] >> 53);
-				HUF_DEltX2 const entry = dtable[index];
-				MEM_write16(op[3], entry.sequence);
-				bits[3] <<= (entry.nbBits);
-				op[3] += (entry.length);
-			}
-			/* Do 4 table lookups from the final stream & reload bitstreams */
-			for(stream = 0; stream < 4; ++stream) {
-				/* Do a table lookup from the final stream.
-				 * This is interleaved with the reloading to reduce register
-				 * pressure. This shouldn't be necessary, but compilers can
-				 * struggle with codegen with high register pressure.
-				 */
-				{
-					int const index = (int)(bits[3] >> 53);
-					HUF_DEltX2 const entry = dtable[index];
-					MEM_write16(op[3], entry.sequence);
-					bits[3] <<= (entry.nbBits);
-					op[3] += (entry.length);
-				}
-				/* Reload the bistreams. The final bitstream must be reloaded
-				 * after the 5th symbol was decoded.
-				 */
-				{
-					int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
-					int const nbBits = ctz & 7;
-					int const nbBytes = ctz >> 3;
-					ip[stream] -= nbBytes;
-					bits[stream] = MEM_read64(ip[stream]) | 1;
-					bits[stream] <<= nbBits;
-				}
-			}
-		} while(op[3] < olimit);
-	}
-
-_out:
-
-	/* Save the final values of each of the state variables back to args. */
-	ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
-	ZSTD_memcpy(&args->ip, &ip, sizeof(ip));
-	ZSTD_memcpy(&args->op, &op, sizeof(op));
-}
-
-static HUF_FAST_BMI2_ATTRS size_t
-HUF_decompress4X2_usingDTable_internal_fast(
-    void* dst, size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable,
-    HUF_DecompressFastLoopFn loopFn) {
-	void const * dt = DTable + 1;
-	const BYTE* const iend = (const BYTE*)cSrc + 6;
-	BYTE* const oend = (BYTE*)dst + dstSize;
-	HUF_DecompressFastArgs args;
-	{
-		size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
-		FORWARD_IF_ERROR(ret, "Failed to init asm args");
-		if(ret == 0)
-			return 0;
-	}
-
-	assert(args.ip[0] >= args.ilimit);
-	loopFn(&args);
-
-	/* note : op4 already verified within main loop */
-	assert(args.ip[0] >= iend);
-	assert(args.ip[1] >= iend);
-	assert(args.ip[2] >= iend);
-	assert(args.ip[3] >= iend);
-	assert(args.op[3] <= oend);
-	(void)iend;
-
-	/* finish bitStreams one by one */
-	{
-		size_t const segmentSize = (dstSize + 3) / 4;
-		BYTE* segmentEnd = (BYTE*)dst;
-		int i;
-		for(i = 0; i < 4; ++i) {
-			BIT_DStream_t bit;
-			if(segmentSize <= (size_t)(oend - segmentEnd))
-				segmentEnd += segmentSize;
-			else
-				segmentEnd = oend;
-			FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
-			args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const *)dt, HUF_DECODER_FAST_TABLELOG);
-			if(args.op[i] != segmentEnd)
-				return ERROR(corruption_detected);
-		}
-	}
-
-	/* decoded size */
-	return dstSize;
-}
-
-static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const * cSrc,
-                                                     size_t cSrcSize, HUF_DTable const * DTable, int flags) {
-	HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
-	HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
-
-#if DYNAMIC_BMI2
-	if(flags & HUF_flags_bmi2) {
-		fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-		if(!(flags & HUF_flags_disableAsm)) {
-			loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
-		}
-#endif
-	} else {
-		return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
-	}
-#endif
-
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-	if(!(flags & HUF_flags_disableAsm)) {
-		loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
-	}
-#endif
-
-	if(!(flags & HUF_flags_disableFast)) {
-		size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
-		if(ret != 0)
-			return ret;
-	}
-	return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
-}
-
-HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
-
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int flags) {
-	const BYTE* ip = (const BYTE*)cSrc;
-
-	size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
-	                                           workSpace, wkspSize, flags);
-	if(HUF_isError(hSize))
-		return hSize;
-	if(hSize >= cSrcSize)
-		return ERROR(srcSize_wrong);
-	ip += hSize;
-	cSrcSize -= hSize;
-
-	return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
-}
-
-static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                          const void* cSrc, size_t cSrcSize,
-                                          void* workSpace, size_t wkspSize, int flags) {
-	const BYTE* ip = (const BYTE*)cSrc;
-
-	size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
-	                                     workSpace, wkspSize, flags);
-	if(HUF_isError(hSize))
-		return hSize;
-	if(hSize >= cSrcSize)
-		return ERROR(srcSize_wrong);
-	ip += hSize;
-	cSrcSize -= hSize;
-
-	return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
-}
-
-#endif /* HUF_FORCE_DECOMPRESS_X1 */
-
-/* ***********************************/
-/* Universal decompression selectors */
-/* ***********************************/
-
-#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
-typedef struct {
-	U32 tableTime;
-	U32 decode256Time;
-} algo_time_t;
-static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
-    {
-        /* single, double, quad */
-        {{0, 0}, {1, 1}},           /* Q==0 : impossible */
-        {{0, 0}, {1, 1}},           /* Q==1 : impossible */
-        {{150, 216}, {381, 119}},   /* Q == 2 : 12-18% */
-        {{170, 205}, {514, 112}},   /* Q == 3 : 18-25% */
-        {{177, 199}, {539, 110}},   /* Q == 4 : 25-32% */
-        {{197, 194}, {644, 107}},   /* Q == 5 : 32-38% */
-        {{221, 192}, {735, 107}},   /* Q == 6 : 38-44% */
-        {{256, 189}, {881, 106}},   /* Q == 7 : 44-50% */
-        {{359, 188}, {1167, 109}},  /* Q == 8 : 50-56% */
-        {{582, 187}, {1570, 114}},  /* Q == 9 : 56-62% */
-        {{688, 187}, {1712, 122}},  /* Q ==10 : 62-69% */
-        {{825, 186}, {1965, 136}},  /* Q ==11 : 69-75% */
-        {{976, 185}, {2131, 150}},  /* Q ==12 : 75-81% */
-        {{1180, 186}, {2070, 175}}, /* Q ==13 : 81-87% */
-        {{1377, 185}, {1731, 202}}, /* Q ==14 : 87-93% */
-        {{1412, 185}, {1695, 202}}, /* Q ==15 : 93-99% */
-};
-#endif
-
-/** HUF_selectDecoder() :
- *  Tells which decoder is likely to decode faster,
- *  based on a set of pre-computed metrics.
- * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
- *  Assumption : 0 < dstSize <= 128 KB */
-U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize) {
-	assert(dstSize > 0);
-	assert(dstSize <= 128 * 1024);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-	(void)dstSize;
-	(void)cSrcSize;
-	return 0;
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-	(void)dstSize;
-	(void)cSrcSize;
-	return 1;
-#else
-	/* decoder timing evaluation */
-	{
-		U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */
-		U32 const D256 = (U32)(dstSize >> 8);
-		U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
-		U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
-		DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
-		return DTime1 < DTime0;
-	}
-#endif
-}
-
-size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                  const void* cSrc, size_t cSrcSize,
-                                  void* workSpace, size_t wkspSize, int flags) {
-	/* validation checks */
-	if(dstSize == 0)
-		return ERROR(dstSize_tooSmall);
-	if(cSrcSize > dstSize)
-		return ERROR(corruption_detected); /* invalid */
-	if(cSrcSize == dstSize) {
-		ZSTD_memcpy(dst, cSrc, dstSize);
-		return dstSize;
-	} /* not compressed */
-	if(cSrcSize == 1) {
-		ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize);
-		return dstSize;
-	} /* RLE */
-
-	{
-		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-		(void)algoNb;
-		assert(algoNb == 0);
-		return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-		                                   cSrcSize, workSpace, wkspSize, flags);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-		(void)algoNb;
-		assert(algoNb == 1);
-		return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-		                                   cSrcSize, workSpace, wkspSize, flags);
-#else
-		return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-		                                            cSrcSize, workSpace, wkspSize, flags)
-		              : HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-		                                            cSrcSize, workSpace, wkspSize, flags);
-#endif
-	}
-}
-
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) {
-	DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-	(void)dtd;
-	assert(dtd.tableType == 0);
-	return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-	(void)dtd;
-	assert(dtd.tableType == 1);
-	return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
-#else
-	return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
-#endif
-}
-
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) {
-	const BYTE* ip = (const BYTE*)cSrc;
-
-	size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
-	if(HUF_isError(hSize))
-		return hSize;
-	if(hSize >= cSrcSize)
-		return ERROR(srcSize_wrong);
-	ip += hSize;
-	cSrcSize -= hSize;
-
-	return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
-}
-#endif
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) {
-	DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-	(void)dtd;
-	assert(dtd.tableType == 0);
-	return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-	(void)dtd;
-	assert(dtd.tableType == 1);
-	return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
-#else
-	return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
-#endif
-}
-
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) {
-	/* validation checks */
-	if(dstSize == 0)
-		return ERROR(dstSize_tooSmall);
-	if(cSrcSize == 0)
-		return ERROR(corruption_detected);
-
-	{
-		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-		(void)algoNb;
-		assert(algoNb == 0);
-		return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-		(void)algoNb;
-		assert(algoNb == 1);
-		return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
-#else
-		return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
-#endif
-	}
-}
diff --git a/src/zstd/mem.h b/src/zstd/mem.h
deleted file mode 100644
index 13a9b3509..000000000
--- a/src/zstd/mem.h
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef MEM_H_MODULE
-#define MEM_H_MODULE
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/*-****************************************
- *  Dependencies
- ******************************************/
-#include <stddef.h>	   /* size_t, ptrdiff_t */
-#include "compiler.h"  /* __has_builtin */
-#include "debug.h"	   /* DEBUG_STATIC_ASSERT */
-#include "zstd_deps.h" /* ZSTD_memcpy */
-
-/*-****************************************
- *  Compiler specifics
- ******************************************/
-#if defined(_MSC_VER) /* Visual Studio */
-#include <stdlib.h>	  /* _byteswap_ulong */
-#include <intrin.h>	  /* _byteswap_* */
-#endif
-#if defined(__GNUC__)
-#define MEM_STATIC static __inline __attribute__((unused))
-#elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
-#define MEM_STATIC static inline
-#elif defined(_MSC_VER)
-#define MEM_STATIC static __inline
-#else
-#define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */
-#endif
-
-/*-**************************************************************
- *  Basic Types
- *****************************************************************/
-#if !defined(__VMS) && (defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
-#if defined(_AIX)
-#include <inttypes.h>
-#else
-#include <stdint.h> /* intptr_t */
-#endif
-typedef uint8_t BYTE;
-typedef uint8_t U8;
-typedef int8_t S8;
-typedef uint16_t U16;
-typedef int16_t S16;
-typedef uint32_t U32;
-typedef int32_t S32;
-typedef uint64_t U64;
-typedef int64_t S64;
-#else
-#include <limits.h>
-#if CHAR_BIT != 8
-#error "this implementation requires char to be exactly 8-bit type"
-#endif
-typedef unsigned char BYTE;
-typedef unsigned char U8;
-typedef signed char S8;
-#if USHRT_MAX != 65535
-#error "this implementation requires short to be exactly 16-bit type"
-#endif
-typedef unsigned short U16;
-typedef signed short S16;
-#if UINT_MAX != 4294967295
-#error "this implementation requires int to be exactly 32-bit type"
-#endif
-typedef unsigned int U32;
-typedef signed int S32;
-/* note : there are no limits defined for long long type in C90.
- * limits exist in C99, however, in such case, <stdint.h> is preferred */
-typedef unsigned long long U64;
-typedef signed long long S64;
-#endif
-
-/*-**************************************************************
- *  Memory I/O API
- *****************************************************************/
-/*=== Static platform detection ===*/
-MEM_STATIC unsigned MEM_32bits(void);
-MEM_STATIC unsigned MEM_64bits(void);
-MEM_STATIC unsigned MEM_isLittleEndian(void);
-
-/*=== Native unaligned read/write ===*/
-MEM_STATIC U16 MEM_read16(void const* memPtr);
-MEM_STATIC U32 MEM_read32(void const* memPtr);
-MEM_STATIC U64 MEM_read64(void const* memPtr);
-MEM_STATIC size_t MEM_readST(void const* memPtr);
-
-MEM_STATIC void MEM_write16(void* memPtr, U16 value);
-MEM_STATIC void MEM_write32(void* memPtr, U32 value);
-MEM_STATIC void MEM_write64(void* memPtr, U64 value);
-
-/*=== Little endian unaligned read/write ===*/
-MEM_STATIC U16 MEM_readLE16(void const* memPtr);
-MEM_STATIC U32 MEM_readLE24(void const* memPtr);
-MEM_STATIC U32 MEM_readLE32(void const* memPtr);
-MEM_STATIC U64 MEM_readLE64(void const* memPtr);
-MEM_STATIC size_t MEM_readLEST(void const* memPtr);
-
-MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val);
-MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val);
-MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32);
-MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64);
-MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val);
-
-/*=== Big endian unaligned read/write ===*/
-MEM_STATIC U32 MEM_readBE32(void const* memPtr);
-MEM_STATIC U64 MEM_readBE64(void const* memPtr);
-MEM_STATIC size_t MEM_readBEST(void const* memPtr);
-
-MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32);
-MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64);
-MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val);
-
-/*=== Byteswap ===*/
-MEM_STATIC U32 MEM_swap32(U32 in);
-MEM_STATIC U64 MEM_swap64(U64 in);
-MEM_STATIC size_t MEM_swapST(size_t in);
-
-/*-**************************************************************
- *  Memory I/O Implementation
- *****************************************************************/
-/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
- * Method 0 : always use `memcpy()`. Safe and portable.
- * Method 1 : Use compiler extension to set unaligned access.
- * Method 2 : direct access. This method is portable but violate C standard.
- *            It can generate buggy code on targets depending on alignment.
- * Default  : method 1 if supported, else method 0
- */
-#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
-#ifdef __GNUC__
-#define MEM_FORCE_MEMORY_ACCESS 1
-#endif
-#endif
-
-MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t) == 4; }
-MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t) == 8; }
-
-MEM_STATIC unsigned MEM_isLittleEndian(void) {
-#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-	return 1;
-#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-	return 0;
-#elif defined(__clang__) && __LITTLE_ENDIAN__
-	return 1;
-#elif defined(__clang__) && __BIG_ENDIAN__
-	return 0;
-#elif defined(_MSC_VER) && (_M_AMD64 || _M_IX86)
-	return 1;
-#elif defined(__DMC__) && defined(_M_IX86)
-	return 1;
-#else
-	const union {
-		U32 u;
-		BYTE c[4];
-	} one = {1}; /* don't use static : performance detrimental  */
-	return one.c[0];
-#endif
-}
-
-#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS == 2)
-
-/* violates C standard, by lying on structure alignment.
-Only use if no other choice to achieve best performance on target platform */
-MEM_STATIC U16 MEM_read16(void const* memPtr) { return *(const U16*)memPtr; }
-MEM_STATIC U32 MEM_read32(void const* memPtr) { return *(const U32*)memPtr; }
-MEM_STATIC U64 MEM_read64(void const* memPtr) { return *(const U64*)memPtr; }
-MEM_STATIC size_t MEM_readST(void const* memPtr) { return *(size_t const*)memPtr; }
-
-MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
-MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
-MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
-
-#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS == 1)
-
-typedef __attribute__((aligned(1))) U16 unalign16;
-typedef __attribute__((aligned(1))) U32 unalign32;
-typedef __attribute__((aligned(1))) U64 unalign64;
-typedef __attribute__((aligned(1))) size_t unalignArch;
-
-MEM_STATIC U16 MEM_read16(void const* ptr) { return *(unalign16 const*)ptr; }
-MEM_STATIC U32 MEM_read32(void const* ptr) { return *(unalign32 const*)ptr; }
-MEM_STATIC U64 MEM_read64(void const* ptr) { return *(unalign64 const*)ptr; }
-MEM_STATIC size_t MEM_readST(void const* ptr) { return *(unalignArch const*)ptr; }
-
-MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
-MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; }
-MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; }
-
-#else
-
-/* default method, safe and standard.
-   can sometimes prove slower */
-
-MEM_STATIC U16 MEM_read16(void const* memPtr) {
-	U16 val;
-	ZSTD_memcpy(&val, memPtr, sizeof(val));
-	return val;
-}
-
-MEM_STATIC U32 MEM_read32(void const* memPtr) {
-	U32 val;
-	ZSTD_memcpy(&val, memPtr, sizeof(val));
-	return val;
-}
-
-MEM_STATIC U64 MEM_read64(void const* memPtr) {
-	U64 val;
-	ZSTD_memcpy(&val, memPtr, sizeof(val));
-	return val;
-}
-
-MEM_STATIC size_t MEM_readST(void const* memPtr) {
-	size_t val;
-	ZSTD_memcpy(&val, memPtr, sizeof(val));
-	return val;
-}
-
-MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ZSTD_memcpy(memPtr, &value, sizeof(value)); }
-
-MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ZSTD_memcpy(memPtr, &value, sizeof(value)); }
-
-MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ZSTD_memcpy(memPtr, &value, sizeof(value)); }
-
-#endif /* MEM_FORCE_MEMORY_ACCESS */
-
-MEM_STATIC U32 MEM_swap32_fallback(U32 in) { return ((in << 24) & 0xff000000) | ((in << 8) & 0x00ff0000) | ((in >> 8) & 0x0000ff00) | ((in >> 24) & 0x000000ff); }
-
-MEM_STATIC U32 MEM_swap32(U32 in) {
-#if defined(_MSC_VER) /* Visual Studio */
-	return _byteswap_ulong(in);
-#elif(defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) || (defined(__clang__) && __has_builtin(__builtin_bswap32))
-	return __builtin_bswap32(in);
-#else
-	return MEM_swap32_fallback(in);
-#endif
-}
-
-MEM_STATIC U64 MEM_swap64_fallback(U64 in) {
-	return ((in << 56) & 0xff00000000000000ULL) | ((in << 40) & 0x00ff000000000000ULL) | ((in << 24) & 0x0000ff0000000000ULL) | ((in << 8) & 0x000000ff00000000ULL) | ((in >> 8) & 0x00000000ff000000ULL) | ((in >> 24) & 0x0000000000ff0000ULL) | ((in >> 40) & 0x000000000000ff00ULL) |
-		   ((in >> 56) & 0x00000000000000ffULL);
-}
-
-MEM_STATIC U64 MEM_swap64(U64 in) {
-#if defined(_MSC_VER) /* Visual Studio */
-	return _byteswap_uint64(in);
-#elif(defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) || (defined(__clang__) && __has_builtin(__builtin_bswap64))
-	return __builtin_bswap64(in);
-#else
-	return MEM_swap64_fallback(in);
-#endif
-}
-
-MEM_STATIC size_t MEM_swapST(size_t in) {
-	if(MEM_32bits())
-		return (size_t)MEM_swap32((U32)in);
-	else
-		return (size_t)MEM_swap64((U64)in);
-}
-
-/*=== Little endian r/w ===*/
-
-MEM_STATIC U16 MEM_readLE16(void const* memPtr) {
-	if(MEM_isLittleEndian())
-		return MEM_read16(memPtr);
-	else {
-		const BYTE* p = (const BYTE*)memPtr;
-		return (U16)(p[0] + (p[1] << 8));
-	}
-}
-
-MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) {
-	if(MEM_isLittleEndian()) {
-		MEM_write16(memPtr, val);
-	} else {
-		BYTE* p = (BYTE*)memPtr;
-		p[0] = (BYTE)val;
-		p[1] = (BYTE)(val >> 8);
-	}
-}
-
-MEM_STATIC U32 MEM_readLE24(void const* memPtr) { return (U32)MEM_readLE16(memPtr) + ((U32)(((const BYTE*)memPtr)[2]) << 16); }
-
-MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val) {
-	MEM_writeLE16(memPtr, (U16)val);
-	((BYTE*)memPtr)[2] = (BYTE)(val >> 16);
-}
-
-MEM_STATIC U32 MEM_readLE32(void const* memPtr) {
-	if(MEM_isLittleEndian())
-		return MEM_read32(memPtr);
-	else
-		return MEM_swap32(MEM_read32(memPtr));
-}
-
-MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32) {
-	if(MEM_isLittleEndian())
-		MEM_write32(memPtr, val32);
-	else
-		MEM_write32(memPtr, MEM_swap32(val32));
-}
-
-MEM_STATIC U64 MEM_readLE64(void const* memPtr) {
-	if(MEM_isLittleEndian())
-		return MEM_read64(memPtr);
-	else
-		return MEM_swap64(MEM_read64(memPtr));
-}
-
-MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64) {
-	if(MEM_isLittleEndian())
-		MEM_write64(memPtr, val64);
-	else
-		MEM_write64(memPtr, MEM_swap64(val64));
-}
-
-MEM_STATIC size_t MEM_readLEST(void const* memPtr) {
-	if(MEM_32bits())
-		return (size_t)MEM_readLE32(memPtr);
-	else
-		return (size_t)MEM_readLE64(memPtr);
-}
-
-MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val) {
-	if(MEM_32bits())
-		MEM_writeLE32(memPtr, (U32)val);
-	else
-		MEM_writeLE64(memPtr, (U64)val);
-}
-
-/*=== Big endian r/w ===*/
-
-MEM_STATIC U32 MEM_readBE32(void const* memPtr) {
-	if(MEM_isLittleEndian())
-		return MEM_swap32(MEM_read32(memPtr));
-	else
-		return MEM_read32(memPtr);
-}
-
-MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32) {
-	if(MEM_isLittleEndian())
-		MEM_write32(memPtr, MEM_swap32(val32));
-	else
-		MEM_write32(memPtr, val32);
-}
-
-MEM_STATIC U64 MEM_readBE64(void const* memPtr) {
-	if(MEM_isLittleEndian())
-		return MEM_swap64(MEM_read64(memPtr));
-	else
-		return MEM_read64(memPtr);
-}
-
-MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64) {
-	if(MEM_isLittleEndian())
-		MEM_write64(memPtr, MEM_swap64(val64));
-	else
-		MEM_write64(memPtr, val64);
-}
-
-MEM_STATIC size_t MEM_readBEST(void const* memPtr) {
-	if(MEM_32bits())
-		return (size_t)MEM_readBE32(memPtr);
-	else
-		return (size_t)MEM_readBE64(memPtr);
-}
-
-MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val) {
-	if(MEM_32bits())
-		MEM_writeBE32(memPtr, (U32)val);
-	else
-		MEM_writeBE64(memPtr, (U64)val);
-}
-
-/* code only tested on 32 and 64 bits systems */
-MEM_STATIC void MEM_check(void) { DEBUG_STATIC_ASSERT((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); }
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* MEM_H_MODULE */
diff --git a/src/zstd/pool.c b/src/zstd/pool.c
deleted file mode 100644
index 81477d7b4..000000000
--- a/src/zstd/pool.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* ======   Dependencies   ======= */
-#include "zstd_deps.h"     /* size_t */
-#include "debug.h"         /* assert */
-#include "zstd_internal.h" /* ZSTD_customCalloc, ZSTD_customFree */
-#include "pool.h"
-
-/* ======   Compiler specifics   ====== */
-#if defined(_MSC_VER)
-#pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
-#endif
-
-#ifdef ZSTD_MULTITHREAD
-
-#include "threading.h" /* pthread adaptation */
-
-/* A job is a function and an opaque argument */
-typedef struct POOL_job_s {
-	POOL_function function;
-	void* opaque;
-} POOL_job;
-
-struct POOL_ctx_s {
-	ZSTD_customMem customMem;
-	/* Keep track of the threads */
-	ZSTD_pthread_t* threads;
-	size_t threadCapacity;
-	size_t threadLimit;
-
-	/* The queue is a circular buffer */
-	POOL_job* queue;
-	size_t queueHead;
-	size_t queueTail;
-	size_t queueSize;
-
-	/* The number of threads working on jobs */
-	size_t numThreadsBusy;
-	/* Indicates if the queue is empty */
-	int queueEmpty;
-
-	/* The mutex protects the queue */
-	ZSTD_pthread_mutex_t queueMutex;
-	/* Condition variable for pushers to wait on when the queue is full */
-	ZSTD_pthread_cond_t queuePushCond;
-	/* Condition variables for poppers to wait on when the queue is empty */
-	ZSTD_pthread_cond_t queuePopCond;
-	/* Indicates if the queue is shutting down */
-	int shutdown;
-};
-
-/* POOL_thread() :
- * Work thread for the thread pool.
- * Waits for jobs and executes them.
- * @returns : NULL on failure else non-null.
- */
-static void* POOL_thread(void* opaque) {
-	POOL_ctx* const ctx = (POOL_ctx*)opaque;
-	if(!ctx) {
-		return NULL;
-	}
-	for(;;) {
-		/* Lock the mutex and wait for a non-empty queue or until shutdown */
-		ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-
-		while(ctx->queueEmpty || (ctx->numThreadsBusy >= ctx->threadLimit)) {
-			if(ctx->shutdown) {
-				/* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit),
-				 * a few threads will be shutdown while !queueEmpty,
-				 * but enough threads will remain active to finish the queue */
-				ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-				return opaque;
-			}
-			ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
-		}
-		/* Pop a job off the queue */
-		{
-			POOL_job const job = ctx->queue[ctx->queueHead];
-			ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
-			ctx->numThreadsBusy++;
-			ctx->queueEmpty = (ctx->queueHead == ctx->queueTail);
-			/* Unlock the mutex, signal a pusher, and run the job */
-			ZSTD_pthread_cond_signal(&ctx->queuePushCond);
-			ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-
-			job.function(job.opaque);
-
-			/* If the intended queue size was 0, signal after finishing job */
-			ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-			ctx->numThreadsBusy--;
-			ZSTD_pthread_cond_signal(&ctx->queuePushCond);
-			ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-		}
-	}          /* for (;;) */
-	assert(0); /* Unreachable */
-}
-
-/* ZSTD_createThreadPool() : public access point */
-POOL_ctx* ZSTD_createThreadPool(size_t numThreads) {
-	return POOL_create(numThreads, 0);
-}
-
-POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
-	return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
-}
-
-POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
-                               ZSTD_customMem customMem) {
-	POOL_ctx* ctx;
-	/* Check parameters */
-	if(!numThreads) {
-		return NULL;
-	}
-	/* Allocate the context and zero initialize */
-	ctx = (POOL_ctx*)ZSTD_customCalloc(sizeof(POOL_ctx), customMem);
-	if(!ctx) {
-		return NULL;
-	}
-	/* Initialize the job queue.
-	 * It needs one extra space since one space is wasted to differentiate
-	 * empty and full queues.
-	 */
-	ctx->queueSize = queueSize + 1;
-	ctx->queue = (POOL_job*)ZSTD_customCalloc(ctx->queueSize * sizeof(POOL_job), customMem);
-	ctx->queueHead = 0;
-	ctx->queueTail = 0;
-	ctx->numThreadsBusy = 0;
-	ctx->queueEmpty = 1;
-	{
-		int error = 0;
-		error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);
-		error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
-		error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
-		if(error) {
-			POOL_free(ctx);
-			return NULL;
-		}
-	}
-	ctx->shutdown = 0;
-	/* Allocate space for the thread handles */
-	ctx->threads = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
-	ctx->threadCapacity = 0;
-	ctx->customMem = customMem;
-	/* Check for errors */
-	if(!ctx->threads || !ctx->queue) {
-		POOL_free(ctx);
-		return NULL;
-	}
-	/* Initialize the threads */
-	{
-		size_t i;
-		for(i = 0; i < numThreads; ++i) {
-			if(ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
-				ctx->threadCapacity = i;
-				POOL_free(ctx);
-				return NULL;
-			}
-		}
-		ctx->threadCapacity = numThreads;
-		ctx->threadLimit = numThreads;
-	}
-	return ctx;
-}
-
-/*! POOL_join() :
-    Shutdown the queue, wake any sleeping threads, and join all of the threads.
-*/
-static void POOL_join(POOL_ctx* ctx) {
-	/* Shut down the queue */
-	ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-	ctx->shutdown = 1;
-	ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-	/* Wake up sleeping threads */
-	ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
-	ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
-	/* Join all of the threads */
-	{
-		size_t i;
-		for(i = 0; i < ctx->threadCapacity; ++i) {
-			ZSTD_pthread_join(ctx->threads[i]); /* note : could fail */
-		}
-	}
-}
-
-void POOL_free(POOL_ctx* ctx) {
-	if(!ctx) {
-		return;
-	}
-	POOL_join(ctx);
-	ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
-	ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
-	ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
-	ZSTD_customFree(ctx->queue, ctx->customMem);
-	ZSTD_customFree(ctx->threads, ctx->customMem);
-	ZSTD_customFree(ctx, ctx->customMem);
-}
-
-/*! POOL_joinJobs() :
- *  Waits for all queued jobs to finish executing.
- */
-void POOL_joinJobs(POOL_ctx* ctx) {
-	ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-	while(!ctx->queueEmpty || ctx->numThreadsBusy > 0) {
-		ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
-	}
-	ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-}
-
-void ZSTD_freeThreadPool(ZSTD_threadPool* pool) {
-	POOL_free(pool);
-}
-
-size_t POOL_sizeof(const POOL_ctx* ctx) {
-	if(ctx == NULL)
-		return 0; /* supports sizeof NULL */
-	return sizeof(*ctx) + ctx->queueSize * sizeof(POOL_job) + ctx->threadCapacity * sizeof(ZSTD_pthread_t);
-}
-
-/* @return : 0 on success, 1 on error */
-static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads) {
-	if(numThreads <= ctx->threadCapacity) {
-		if(!numThreads)
-			return 1;
-		ctx->threadLimit = numThreads;
-		return 0;
-	}
-	/* numThreads > threadCapacity */
-	{
-		ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
-		if(!threadPool)
-			return 1;
-		/* replace existing thread pool */
-		ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
-		ZSTD_customFree(ctx->threads, ctx->customMem);
-		ctx->threads = threadPool;
-		/* Initialize additional threads */
-		{
-			size_t threadId;
-			for(threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) {
-				if(ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) {
-					ctx->threadCapacity = threadId;
-					return 1;
-				}
-			}
-		}
-	}
-	/* successfully expanded */
-	ctx->threadCapacity = numThreads;
-	ctx->threadLimit = numThreads;
-	return 0;
-}
-
-/* @return : 0 on success, 1 on error */
-int POOL_resize(POOL_ctx* ctx, size_t numThreads) {
-	int result;
-	if(ctx == NULL)
-		return 1;
-	ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-	result = POOL_resize_internal(ctx, numThreads);
-	ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
-	ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-	return result;
-}
-
-/**
- * Returns 1 if the queue is full and 0 otherwise.
- *
- * When queueSize is 1 (pool was created with an intended queueSize of 0),
- * then a queue is empty if there is a thread free _and_ no job is waiting.
- */
-static int isQueueFull(POOL_ctx const * ctx) {
-	if(ctx->queueSize > 1) {
-		return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);
-	} else {
-		return (ctx->numThreadsBusy == ctx->threadLimit) ||
-		       !ctx->queueEmpty;
-	}
-}
-
-static void
-POOL_add_internal(POOL_ctx* ctx, POOL_function function, void* opaque) {
-	POOL_job job;
-	job.function = function;
-	job.opaque = opaque;
-	assert(ctx != NULL);
-	if(ctx->shutdown)
-		return;
-
-	ctx->queueEmpty = 0;
-	ctx->queue[ctx->queueTail] = job;
-	ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
-	ZSTD_pthread_cond_signal(&ctx->queuePopCond);
-}
-
-void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
-	assert(ctx != NULL);
-	ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-	/* Wait until there is space in the queue for the new job */
-	while(isQueueFull(ctx) && (!ctx->shutdown)) {
-		ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
-	}
-	POOL_add_internal(ctx, function, opaque);
-	ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-}
-
-int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
-	assert(ctx != NULL);
-	ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-	if(isQueueFull(ctx)) {
-		ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-		return 0;
-	}
-	POOL_add_internal(ctx, function, opaque);
-	ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-	return 1;
-}
-
-#else /* ZSTD_MULTITHREAD  not defined */
-
-/* ========================== */
-/* No multi-threading support */
-/* ========================== */
-
-/* We don't need any data, but if it is empty, malloc() might return NULL. */
-struct POOL_ctx_s {
-	int dummy;
-};
-static POOL_ctx g_poolCtx;
-
-POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
-	return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
-}
-
-POOL_ctx*
-POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
-	(void)numThreads;
-	(void)queueSize;
-	(void)customMem;
-	return &g_poolCtx;
-}
-
-void POOL_free(POOL_ctx* ctx) {
-	assert(!ctx || ctx == &g_poolCtx);
-	(void)ctx;
-}
-
-void POOL_joinJobs(POOL_ctx* ctx) {
-	assert(!ctx || ctx == &g_poolCtx);
-	(void)ctx;
-}
-
-int POOL_resize(POOL_ctx* ctx, size_t numThreads) {
-	(void)ctx;
-	(void)numThreads;
-	return 0;
-}
-
-void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
-	(void)ctx;
-	function(opaque);
-}
-
-int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
-	(void)ctx;
-	function(opaque);
-	return 1;
-}
-
-size_t POOL_sizeof(const POOL_ctx* ctx) {
-	if(ctx == NULL)
-		return 0; /* supports sizeof NULL */
-	assert(ctx == &g_poolCtx);
-	return sizeof(*ctx);
-}
-
-#endif /* ZSTD_MULTITHREAD */
diff --git a/src/zstd/threading.c b/src/zstd/threading.c
deleted file mode 100644
index 890c314bf..000000000
--- a/src/zstd/threading.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/**
- * Copyright (c) 2016 Tino Reichardt
- * All rights reserved.
- *
- * You can contact the author at:
- * - zstdmt source repository: https://github.com/mcmilk/zstdmt
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/**
- * This file will hold wrapper for systems, which do not support pthreads
- */
-
-#include "threading.h"
-
-/* create fake symbol to avoid empty translation unit warning */
-int g_ZSTD_threading_useless_symbol;
-
-#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
-
-/**
- * Windows minimalist Pthread Wrapper
- */
-
-/* ===  Dependencies  === */
-#include <process.h>
-#include <errno.h>
-
-/* ===  Implementation  === */
-
-typedef struct {
-	void* (*start_routine)(void*);
-	void* arg;
-	int initialized;
-	ZSTD_pthread_cond_t initialized_cond;
-	ZSTD_pthread_mutex_t initialized_mutex;
-} ZSTD_thread_params_t;
-
-static unsigned __stdcall worker(void* arg) {
-	void* (*start_routine)(void*);
-	void* thread_arg;
-
-	/* Inialized thread_arg and start_routine and signal main thread that we don't need it
-	 * to wait any longer.
-	 */
-	{
-		ZSTD_thread_params_t* thread_param = (ZSTD_thread_params_t*)arg;
-		thread_arg = thread_param->arg;
-		start_routine = thread_param->start_routine;
-
-		/* Signal main thread that we are running and do not depend on its memory anymore */
-		ZSTD_pthread_mutex_lock(&thread_param->initialized_mutex);
-		thread_param->initialized = 1;
-		ZSTD_pthread_cond_signal(&thread_param->initialized_cond);
-		ZSTD_pthread_mutex_unlock(&thread_param->initialized_mutex);
-	}
-
-	start_routine(thread_arg);
-
-	return 0;
-}
-
-int ZSTD_pthread_create(ZSTD_pthread_t* thread, void const* unused, void* (*start_routine)(void*), void* arg) {
-	ZSTD_thread_params_t thread_param;
-	(void)unused;
-
-	thread_param.start_routine = start_routine;
-	thread_param.arg = arg;
-	thread_param.initialized = 0;
-	*thread = NULL;
-
-	/* Setup thread initialization synchronization */
-	if(ZSTD_pthread_cond_init(&thread_param.initialized_cond, NULL)) {
-		/* Should never happen on Windows */
-		return -1;
-	}
-	if(ZSTD_pthread_mutex_init(&thread_param.initialized_mutex, NULL)) {
-		/* Should never happen on Windows */
-		ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
-		return -1;
-	}
-
-	/* Spawn thread */
-	*thread = (HANDLE)_beginthreadex(NULL, 0, worker, &thread_param, 0, NULL);
-	if(!thread) {
-		ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex);
-		ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
-		return errno;
-	}
-
-	/* Wait for thread to be initialized */
-	ZSTD_pthread_mutex_lock(&thread_param.initialized_mutex);
-	while(!thread_param.initialized) {
-		ZSTD_pthread_cond_wait(&thread_param.initialized_cond, &thread_param.initialized_mutex);
-	}
-	ZSTD_pthread_mutex_unlock(&thread_param.initialized_mutex);
-	ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex);
-	ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
-
-	return 0;
-}
-
-int ZSTD_pthread_join(ZSTD_pthread_t thread) {
-	DWORD result;
-
-	if(!thread)
-		return 0;
-
-	result = WaitForSingleObject(thread, INFINITE);
-	CloseHandle(thread);
-
-	switch(result) {
-	case WAIT_OBJECT_0:
-		return 0;
-	case WAIT_ABANDONED:
-		return EINVAL;
-	default:
-		return GetLastError();
-	}
-}
-
-#endif /* ZSTD_MULTITHREAD */
-
-#if defined(ZSTD_MULTITHREAD) && DEBUGLEVEL >= 1 && !defined(_WIN32)
-
-#define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"
-
-int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr) {
-	*mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t));
-	if(!*mutex)
-		return 1;
-	return pthread_mutex_init(*mutex, attr);
-}
-
-int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex) {
-	if(!*mutex)
-		return 0;
-	{
-		int const ret = pthread_mutex_destroy(*mutex);
-		ZSTD_free(*mutex);
-		return ret;
-	}
-}
-
-int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr) {
-	*cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t));
-	if(!*cond)
-		return 1;
-	return pthread_cond_init(*cond, attr);
-}
-
-int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond) {
-	if(!*cond)
-		return 0;
-	{
-		int const ret = pthread_cond_destroy(*cond);
-		ZSTD_free(*cond);
-		return ret;
-	}
-}
-
-#endif
diff --git a/src/zstd/threading.h b/src/zstd/threading.h
deleted file mode 100644
index 1c125413d..000000000
--- a/src/zstd/threading.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- * Copyright (c) 2016 Tino Reichardt
- * All rights reserved.
- *
- * You can contact the author at:
- * - zstdmt source repository: https://github.com/mcmilk/zstdmt
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef THREADING_H_938743
-#define THREADING_H_938743
-
-#include "debug.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
-
-/**
- * Windows minimalist Pthread Wrapper
- */
-#ifdef WINVER
-#undef WINVER
-#endif
-#define WINVER 0x0600
-
-#ifdef _WIN32_WINNT
-#undef _WIN32_WINNT
-#endif
-#define _WIN32_WINNT 0x0600
-
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif
-
-#undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */
-#include <windows.h>
-#undef ERROR
-#define ERROR(name) ZSTD_ERROR(name)
-
-/* mutex */
-#define ZSTD_pthread_mutex_t CRITICAL_SECTION
-#define ZSTD_pthread_mutex_init(a, b) ((void)(b), InitializeCriticalSection((a)), 0)
-#define ZSTD_pthread_mutex_destroy(a) DeleteCriticalSection((a))
-#define ZSTD_pthread_mutex_lock(a) EnterCriticalSection((a))
-#define ZSTD_pthread_mutex_unlock(a) LeaveCriticalSection((a))
-
-/* condition variable */
-#define ZSTD_pthread_cond_t CONDITION_VARIABLE
-#define ZSTD_pthread_cond_init(a, b) ((void)(b), InitializeConditionVariable((a)), 0)
-#define ZSTD_pthread_cond_destroy(a) ((void)(a))
-#define ZSTD_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE)
-#define ZSTD_pthread_cond_signal(a) WakeConditionVariable((a))
-#define ZSTD_pthread_cond_broadcast(a) WakeAllConditionVariable((a))
-
-/* ZSTD_pthread_create() and ZSTD_pthread_join() */
-typedef HANDLE ZSTD_pthread_t;
-
-int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
-                        void* (*start_routine)(void*), void* arg);
-
-int ZSTD_pthread_join(ZSTD_pthread_t thread);
-
-/**
- * add here more wrappers as required
- */
-
-#elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection method */
-/* ===   POSIX Systems   === */
-#include <pthread.h>
-
-#if DEBUGLEVEL < 1
-
-#define ZSTD_pthread_mutex_t pthread_mutex_t
-#define ZSTD_pthread_mutex_init(a, b) pthread_mutex_init((a), (b))
-#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a))
-#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a))
-#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a))
-
-#define ZSTD_pthread_cond_t pthread_cond_t
-#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b))
-#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a))
-#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b))
-#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a))
-#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a))
-
-#define ZSTD_pthread_t pthread_t
-#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
-#define ZSTD_pthread_join(a) pthread_join((a), NULL)
-
-#else /* DEBUGLEVEL >= 1 */
-
-/* Debug implementation of threading.
- * In this implementation we use pointers for mutexes and condition variables.
- * This way, if we forget to init/destroy them the program will crash or ASAN
- * will report leaks.
- */
-
-#define ZSTD_pthread_mutex_t pthread_mutex_t*
-int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const * attr);
-int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex);
-#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock(*(a))
-#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock(*(a))
-
-#define ZSTD_pthread_cond_t pthread_cond_t*
-int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const * attr);
-int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond);
-#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait(*(a), *(b))
-#define ZSTD_pthread_cond_signal(a) pthread_cond_signal(*(a))
-#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast(*(a))
-
-#define ZSTD_pthread_t pthread_t
-#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
-#define ZSTD_pthread_join(a) pthread_join((a), NULL)
-
-#endif
-
-#else /* ZSTD_MULTITHREAD not defined */
-/* No multithreading support */
-
-typedef int ZSTD_pthread_mutex_t;
-#define ZSTD_pthread_mutex_init(a, b) ((void)(a), (void)(b), 0)
-#define ZSTD_pthread_mutex_destroy(a) ((void)(a))
-#define ZSTD_pthread_mutex_lock(a) ((void)(a))
-#define ZSTD_pthread_mutex_unlock(a) ((void)(a))
-
-typedef int ZSTD_pthread_cond_t;
-#define ZSTD_pthread_cond_init(a, b) ((void)(a), (void)(b), 0)
-#define ZSTD_pthread_cond_destroy(a) ((void)(a))
-#define ZSTD_pthread_cond_wait(a, b) ((void)(a), (void)(b))
-#define ZSTD_pthread_cond_signal(a) ((void)(a))
-#define ZSTD_pthread_cond_broadcast(a) ((void)(a))
-
-/* do not use ZSTD_pthread_t */
-
-#endif /* ZSTD_MULTITHREAD */
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* THREADING_H_938743 */
diff --git a/src/zstd/xxhash.h b/src/zstd/xxhash.h
deleted file mode 100644
index 6f74a76bd..000000000
--- a/src/zstd/xxhash.h
+++ /dev/null
@@ -1,5751 +0,0 @@
-/*
- *  xxHash - Fast Hash algorithm
- *  Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- *  You can contact the author at :
- *  - xxHash homepage: https://cyan4973.github.io/xxHash/
- *  - xxHash source repository : https://github.com/Cyan4973/xxHash
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef XXH_NO_XXH3
-#define XXH_NO_XXH3
-#endif
-
-#ifndef XXH_NAMESPACE
-#define XXH_NAMESPACE ZSTD_
-#endif
-
-/*!
- * @mainpage xxHash
- *
- * @file xxhash.h
- * xxHash prototypes and implementation
- */
-/* TODO: update */
-/* Notice extracted from xxHash homepage:
-
-xxHash is an extremely fast hash algorithm, running at RAM speed limits.
-It also successfully passes all tests from the SMHasher suite.
-
-Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
-
-Name            Speed       Q.Score   Author
-xxHash          5.4 GB/s     10
-CrapWow         3.2 GB/s      2       Andrew
-MurmurHash 3a   2.7 GB/s     10       Austin Appleby
-SpookyHash      2.0 GB/s     10       Bob Jenkins
-SBox            1.4 GB/s      9       Bret Mulvey
-Lookup3         1.2 GB/s      9       Bob Jenkins
-SuperFastHash   1.2 GB/s      1       Paul Hsieh
-CityHash64      1.05 GB/s    10       Pike & Alakuijala
-FNV             0.55 GB/s     5       Fowler, Noll, Vo
-CRC32           0.43 GB/s     9
-MD5-32          0.33 GB/s    10       Ronald L. Rivest
-SHA1-32         0.28 GB/s    10
-
-Q.Score is a measure of quality of the hash function.
-It depends on successfully passing SMHasher test set.
-10 is a perfect score.
-
-Note: SMHasher's CRC32 implementation is not the fastest one.
-Other speed-oriented implementations can be faster,
-especially in combination with PCLMUL instruction:
-https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
-
-A 64-bit version, named XXH64, is available since r35.
-It offers much better speed, but for 64-bit applications only.
-Name     Speed on 64 bits    Speed on 32 bits
-XXH64       13.8 GB/s            1.9 GB/s
-XXH32        6.8 GB/s            6.0 GB/s
-*/
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* ****************************
- *  INLINE mode
- ******************************/
-/*!
- * XXH_INLINE_ALL (and XXH_PRIVATE_API)
- * Use these build macros to inline xxhash into the target unit.
- * Inlining improves performance on small inputs, especially when the length is
- * expressed as a compile-time constant:
- *
- *      https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
- *
- * It also keeps xxHash symbols private to the unit, so they are not exported.
- *
- * Usage:
- *     #define XXH_INLINE_ALL
- *     #include "xxhash.h"
- *
- * Do not compile and link xxhash.o as a separate object, as it is not useful.
- */
-#if(defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) && !defined(XXH_INLINE_ALL_31684351384)
-/* this section should be traversed only once */
-#define XXH_INLINE_ALL_31684351384
-/* give access to the advanced API, required to compile implementations */
-#undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
-#define XXH_STATIC_LINKING_ONLY
-/* make all functions private */
-#undef XXH_PUBLIC_API
-#if defined(__GNUC__)
-#define XXH_PUBLIC_API static __inline __attribute__((unused))
-#elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
-#define XXH_PUBLIC_API static inline
-#elif defined(_MSC_VER)
-#define XXH_PUBLIC_API static __inline
-#else
-/* note: this version may generate warnings for unused static functions */
-#define XXH_PUBLIC_API static
-#endif
-
-/*
- * This part deals with the special case where a unit wants to inline xxHash,
- * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
- * such as part of some previously included *.h header file.
- * Without further action, the new include would just be ignored,
- * and functions would effectively _not_ be inlined (silent failure).
- * The following macros solve this situation by prefixing all inlined names,
- * avoiding naming collision with previous inclusions.
- */
-/* Before that, we unconditionally #undef all symbols,
- * in case they were already defined with XXH_NAMESPACE.
- * They will then be redefined for XXH_INLINE_ALL
- */
-#undef XXH_versionNumber
-/* XXH32 */
-#undef XXH32
-#undef XXH32_createState
-#undef XXH32_freeState
-#undef XXH32_reset
-#undef XXH32_update
-#undef XXH32_digest
-#undef XXH32_copyState
-#undef XXH32_canonicalFromHash
-#undef XXH32_hashFromCanonical
-/* XXH64 */
-#undef XXH64
-#undef XXH64_createState
-#undef XXH64_freeState
-#undef XXH64_reset
-#undef XXH64_update
-#undef XXH64_digest
-#undef XXH64_copyState
-#undef XXH64_canonicalFromHash
-#undef XXH64_hashFromCanonical
-/* XXH3_64bits */
-#undef XXH3_64bits
-#undef XXH3_64bits_withSecret
-#undef XXH3_64bits_withSeed
-#undef XXH3_64bits_withSecretandSeed
-#undef XXH3_createState
-#undef XXH3_freeState
-#undef XXH3_copyState
-#undef XXH3_64bits_reset
-#undef XXH3_64bits_reset_withSeed
-#undef XXH3_64bits_reset_withSecret
-#undef XXH3_64bits_update
-#undef XXH3_64bits_digest
-#undef XXH3_generateSecret
-/* XXH3_128bits */
-#undef XXH128
-#undef XXH3_128bits
-#undef XXH3_128bits_withSeed
-#undef XXH3_128bits_withSecret
-#undef XXH3_128bits_reset
-#undef XXH3_128bits_reset_withSeed
-#undef XXH3_128bits_reset_withSecret
-#undef XXH3_128bits_reset_withSecretandSeed
-#undef XXH3_128bits_update
-#undef XXH3_128bits_digest
-#undef XXH128_isEqual
-#undef XXH128_cmp
-#undef XXH128_canonicalFromHash
-#undef XXH128_hashFromCanonical
-/* Finally, free the namespace itself */
-#undef XXH_NAMESPACE
-
-/* employ the namespace for XXH_INLINE_ALL */
-#define XXH_NAMESPACE XXH_INLINE_
-/*
- * Some identifiers (enums, type names) are not symbols,
- * but they must nonetheless be renamed to avoid redeclaration.
- * Alternative solution: do not redeclare them.
- * However, this requires some #ifdefs, and has a more dispersed impact.
- * Meanwhile, renaming can be achieved in a single place.
- */
-#define XXH_IPREF(Id) XXH_NAMESPACE##Id
-#define XXH_OK XXH_IPREF(XXH_OK)
-#define XXH_ERROR XXH_IPREF(XXH_ERROR)
-#define XXH_errorcode XXH_IPREF(XXH_errorcode)
-#define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
-#define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
-#define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
-#define XXH32_state_s XXH_IPREF(XXH32_state_s)
-#define XXH32_state_t XXH_IPREF(XXH32_state_t)
-#define XXH64_state_s XXH_IPREF(XXH64_state_s)
-#define XXH64_state_t XXH_IPREF(XXH64_state_t)
-#define XXH3_state_s XXH_IPREF(XXH3_state_s)
-#define XXH3_state_t XXH_IPREF(XXH3_state_t)
-#define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
-/* Ensure the header is parsed again, even if it was previously included */
-#undef XXHASH_H_5627135585666179
-#undef XXHASH_H_STATIC_13879238742
-#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
-
-/* ****************************************************************
- *  Stable API
- *****************************************************************/
-#ifndef XXHASH_H_5627135585666179
-#define XXHASH_H_5627135585666179 1
-
-/*!
- * @defgroup public Public API
- * Contains details on the public xxHash functions.
- * @{
- */
-/* specific declaration modes for Windows */
-#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
-#if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
-#ifdef XXH_EXPORT
-#define XXH_PUBLIC_API __declspec(dllexport)
-#elif XXH_IMPORT
-#define XXH_PUBLIC_API __declspec(dllimport)
-#endif
-#else
-#define XXH_PUBLIC_API /* do nothing */
-#endif
-#endif
-
-#ifdef XXH_DOXYGEN
-/*!
- * @brief Emulate a namespace by transparently prefixing all symbols.
- *
- * If you want to include _and expose_ xxHash functions from within your own
- * library, but also want to avoid symbol collisions with other libraries which
- * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
- * any public symbol from xxhash library with the value of XXH_NAMESPACE
- * (therefore, avoid empty or numeric values).
- *
- * Note that no change is required within the calling program as long as it
- * includes `xxhash.h`: Regular symbol names will be automatically translated
- * by this header.
- */
-#define XXH_NAMESPACE /* YOUR NAME HERE */
-#undef XXH_NAMESPACE
-#endif
-
-#ifdef XXH_NAMESPACE
-#define XXH_CAT(A, B) A##B
-#define XXH_NAME2(A, B) XXH_CAT(A, B)
-#define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
-/* XXH32 */
-#define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
-#define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
-#define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
-#define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
-#define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
-#define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
-#define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
-#define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
-#define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
-/* XXH64 */
-#define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
-#define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
-#define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
-#define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
-#define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
-#define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
-#define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
-#define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
-#define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
-/* XXH3_64bits */
-#define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
-#define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
-#define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
-#define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
-#define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
-#define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
-#define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
-#define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
-#define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
-#define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
-#define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
-#define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
-#define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
-#define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
-#define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
-/* XXH3_128bits */
-#define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
-#define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
-#define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
-#define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
-#define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
-#define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
-#define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
-#define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
-#define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
-#define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
-#define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
-#define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
-#define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
-#define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
-#define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
-#endif
-
-/* *************************************
- *  Version
- ***************************************/
-#define XXH_VERSION_MAJOR 0
-#define XXH_VERSION_MINOR 8
-#define XXH_VERSION_RELEASE 1
-#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + XXH_VERSION_RELEASE)
-
-/*!
- * @brief Obtains the xxHash version.
- *
- * This is mostly useful when xxHash is compiled as a shared library,
- * since the returned value comes from the library, as opposed to header file.
- *
- * @return `XXH_VERSION_NUMBER` of the invoked library.
- */
-XXH_PUBLIC_API unsigned XXH_versionNumber(void);
-
-/* ****************************
- *  Common basic types
- ******************************/
-#include <stddef.h> /* size_t */
-typedef enum { XXH_OK = 0,
-	           XXH_ERROR } XXH_errorcode;
-
-/*-**********************************************************************
- *  32-bit hash
- ************************************************************************/
-#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
-/*!
- * @brief An unsigned 32-bit integer.
- *
- * Not necessarily defined to `uint32_t` but functionally equivalent.
- */
-typedef uint32_t XXH32_hash_t;
-
-#elif !defined(__VMS) && (defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
-#include <stdint.h>
-typedef uint32_t XXH32_hash_t;
-
-#else
-#include <limits.h>
-#if UINT_MAX == 0xFFFFFFFFUL
-typedef unsigned int XXH32_hash_t;
-#else
-#if ULONG_MAX == 0xFFFFFFFFUL
-typedef unsigned long XXH32_hash_t;
-#else
-#error "unsupported platform: need a 32-bit type"
-#endif
-#endif
-#endif
-
-/*!
- * @}
- *
- * @defgroup xxh32_family XXH32 family
- * @ingroup public
- * Contains functions used in the classic 32-bit xxHash algorithm.
- *
- * @note
- *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
- *   Note that @ref xxh3_family provides competitive speed
- *   for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
- *
- * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
- * @see @ref xxh32_impl for implementation details
- * @{
- */
-
-/*!
- * @brief Calculates the 32-bit hash of @p input using xxHash32.
- *
- * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
- *
- * @param input The block of data to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- * @param seed The 32-bit seed to alter the hash's output predictably.
- *
- * @pre
- *   The memory between @p input and @p input + @p length must be valid,
- *   readable, contiguous memory. However, if @p length is `0`, @p input may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- * @return The calculated 32-bit hash value.
- *
- * @see
- *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
- *    Direct equivalents for the other variants of xxHash.
- * @see
- *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
- */
-XXH_PUBLIC_API XXH32_hash_t XXH32(const void* input, size_t length, XXH32_hash_t seed);
-
-/*!
- * Streaming functions generate the xxHash value from an incremental input.
- * This method is slower than single-call functions, due to state management.
- * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
- *
- * An XXH state must first be allocated using `XXH*_createState()`.
- *
- * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
- *
- * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
- *
- * The function returns an error code, with 0 meaning OK, and any other value
- * meaning there is an error.
- *
- * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
- * This function returns the nn-bits hash as an int or long long.
- *
- * It's still possible to continue inserting input into the hash state after a
- * digest, and generate new hash values later on by invoking `XXH*_digest()`.
- *
- * When done, release the state using `XXH*_freeState()`.
- *
- * Example code for incrementally hashing a file:
- * @code{.c}
- *    #include <stdio.h>
- *    #include <xxhash.h>
- *    #define BUFFER_SIZE 256
- *
- *    // Note: XXH64 and XXH3 use the same interface.
- *    XXH32_hash_t
- *    hashFile(FILE* stream)
- *    {
- *        XXH32_state_t* state;
- *        unsigned char buf[BUFFER_SIZE];
- *        size_t amt;
- *        XXH32_hash_t hash;
- *
- *        state = XXH32_createState();       // Create a state
- *        assert(state != NULL);             // Error check here
- *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
- *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
- *            XXH32_update(state, buf, amt); // Hash the file in chunks
- *        }
- *        hash = XXH32_digest(state);        // Finalize the hash
- *        XXH32_freeState(state);            // Clean up
- *        return hash;
- *    }
- * @endcode
- */
-
-/*!
- * @typedef struct XXH32_state_s XXH32_state_t
- * @brief The opaque state struct for the XXH32 streaming API.
- *
- * @see XXH32_state_s for details.
- */
-typedef struct XXH32_state_s XXH32_state_t;
-
-/*!
- * @brief Allocates an @ref XXH32_state_t.
- *
- * Must be freed with XXH32_freeState().
- * @return An allocated XXH32_state_t on success, `NULL` on failure.
- */
-XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
-/*!
- * @brief Frees an @ref XXH32_state_t.
- *
- * Must be allocated with XXH32_createState().
- * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
- * @return XXH_OK.
- */
-XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
-/*!
- * @brief Copies one @ref XXH32_state_t to another.
- *
- * @param dst_state The state to copy to.
- * @param src_state The state to copy from.
- * @pre
- *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
- */
-XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
-
-/*!
- * @brief Resets an @ref XXH32_state_t to begin a new hash.
- *
- * This function resets and seeds a state. Call it before @ref XXH32_update().
- *
- * @param statePtr The state struct to reset.
- * @param seed The 32-bit seed to alter the hash result predictably.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- *
- * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
- */
-XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed);
-
-/*!
- * @brief Consumes a block of @p input to an @ref XXH32_state_t.
- *
- * Call this to incrementally consume blocks of data.
- *
- * @param statePtr The state struct to update.
- * @param input The block of data to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- *
- * @pre
- *   @p statePtr must not be `NULL`.
- * @pre
- *   The memory between @p input and @p input + @p length must be valid,
- *   readable, contiguous memory. However, if @p length is `0`, @p input may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
- */
-XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* statePtr, const void* input, size_t length);
-
-/*!
- * @brief Returns the calculated hash value from an @ref XXH32_state_t.
- *
- * @note
- *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
- *   digest, and update again.
- *
- * @param statePtr The state struct to calculate the hash from.
- *
- * @pre
- *  @p statePtr must not be `NULL`.
- *
- * @return The calculated xxHash32 value from that state.
- */
-XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* statePtr);
-
-/*******   Canonical representation   *******/
-
-/*
- * The default return values from XXH functions are unsigned 32 and 64 bit
- * integers.
- * This the simplest and fastest format for further post-processing.
- *
- * However, this leaves open the question of what is the order on the byte level,
- * since little and big endian conventions will store the same number differently.
- *
- * The canonical representation settles this issue by mandating big-endian
- * convention, the same convention as human-readable numbers (large digits first).
- *
- * When writing hash values to storage, sending them over a network, or printing
- * them, it's highly recommended to use the canonical representation to ensure
- * portability across a wider range of systems, present and future.
- *
- * The following functions allow transformation of hash values to and from
- * canonical format.
- */
-
-/*!
- * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
- */
-typedef struct {
-	unsigned char digest[4]; /*!< Hash bytes, big endian */
-} XXH32_canonical_t;
-
-/*!
- * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
- *
- * @param dst The @ref XXH32_canonical_t pointer to be stored to.
- * @param hash The @ref XXH32_hash_t to be converted.
- *
- * @pre
- *   @p dst must not be `NULL`.
- */
-XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
-
-/*!
- * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
- *
- * @param src The @ref XXH32_canonical_t to convert.
- *
- * @pre
- *   @p src must not be `NULL`.
- *
- * @return The converted hash.
- */
-XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
-
-#ifdef __has_attribute
-#define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
-#else
-#define XXH_HAS_ATTRIBUTE(x) 0
-#endif
-
-/* C-language Attributes are added in C23. */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
-#define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
-#else
-#define XXH_HAS_C_ATTRIBUTE(x) 0
-#endif
-
-#if defined(__cplusplus) && defined(__has_cpp_attribute)
-#define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#else
-#define XXH_HAS_CPP_ATTRIBUTE(x) 0
-#endif
-
-/*
-Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
-introduced in CPP17 and C23.
-CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
-C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
-*/
-#if XXH_HAS_C_ATTRIBUTE(x)
-#define XXH_FALLTHROUGH [[fallthrough]]
-#elif XXH_HAS_CPP_ATTRIBUTE(x)
-#define XXH_FALLTHROUGH [[fallthrough]]
-#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
-#define XXH_FALLTHROUGH __attribute__((fallthrough))
-#else
-#define XXH_FALLTHROUGH
-#endif
-
-/*!
- * @}
- * @ingroup public
- * @{
- */
-
-#ifndef XXH_NO_LONG_LONG
-/*-**********************************************************************
- *  64-bit hash
- ************************************************************************/
-#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
-/*!
- * @brief An unsigned 64-bit integer.
- *
- * Not necessarily defined to `uint64_t` but functionally equivalent.
- */
-typedef uint64_t XXH64_hash_t;
-#elif !defined(__VMS) && (defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
-#include <stdint.h>
-typedef uint64_t XXH64_hash_t;
-#else
-#include <limits.h>
-#if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
-/* LP64 ABI says uint64_t is unsigned long */
-typedef unsigned long XXH64_hash_t;
-#else
-/* the following type must have a width of 64-bit */
-typedef unsigned long long XXH64_hash_t;
-#endif
-#endif
-
-/*!
- * @}
- *
- * @defgroup xxh64_family XXH64 family
- * @ingroup public
- * @{
- * Contains functions used in the classic 64-bit xxHash algorithm.
- *
- * @note
- *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
- *   and offers true 64/128 bit hash results.
- *   It provides better speed for systems with vector processing capabilities.
- */
-
-/*!
- * @brief Calculates the 64-bit hash of @p input using xxHash64.
- *
- * This function usually runs faster on 64-bit systems, but slower on 32-bit
- * systems (see benchmark).
- *
- * @param input The block of data to be hashed, at least @p length bytes in size.
- * @param length The length of @p input, in bytes.
- * @param seed The 64-bit seed to alter the hash's output predictably.
- *
- * @pre
- *   The memory between @p input and @p input + @p length must be valid,
- *   readable, contiguous memory. However, if @p length is `0`, @p input may be
- *   `NULL`. In C++, this also must be *TriviallyCopyable*.
- *
- * @return The calculated 64-bit hash.
- *
- * @see
- *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
- *    Direct equivalents for the other variants of xxHash.
- * @see
- *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
- */
-XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
-
-/*******   Streaming   *******/
-/*!
- * @brief The opaque state struct for the XXH64 streaming API.
- *
- * @see XXH64_state_s for details.
- */
-typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
-XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
-XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
-
-XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* statePtr);
-
-/*******   Canonical representation   *******/
-typedef struct {
-	unsigned char digest[sizeof(XXH64_hash_t)];
-} XXH64_canonical_t;
-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
-XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
-
-#ifndef XXH_NO_XXH3
-/*!
- * @}
- * ************************************************************************
- * @defgroup xxh3_family XXH3 family
- * @ingroup public
- * @{
- *
- * XXH3 is a more recent hash algorithm featuring:
- *  - Improved speed for both small and large inputs
- *  - True 64-bit and 128-bit outputs
- *  - SIMD acceleration
- *  - Improved 32-bit viability
- *
- * Speed analysis methodology is explained here:
- *
- *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
- *
- * Compared to XXH64, expect XXH3 to run approximately
- * ~2x faster on large inputs and >3x faster on small ones,
- * exact differences vary depending on platform.
- *
- * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
- * but does not require it.
- * Any 32-bit and 64-bit targets that can run XXH32 smoothly
- * can run XXH3 at competitive speeds, even without vector support.
- * Further details are explained in the implementation.
- *
- * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
- * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
- *
- * XXH3 implementation is portable:
- * it has a generic C90 formulation that can be compiled on any platform,
- * all implementations generage exactly the same hash value on all platforms.
- * Starting from v0.8.0, it's also labelled "stable", meaning that
- * any future version will also generate the same hash value.
- *
- * XXH3 offers 2 variants, _64bits and _128bits.
- *
- * When only 64 bits are needed, prefer invoking the _64bits variant, as it
- * reduces the amount of mixing, resulting in faster speed on small inputs.
- * It's also generally simpler to manipulate a scalar return type than a struct.
- *
- * The API supports one-shot hashing, streaming mode, and custom secrets.
- */
-
-/*-**********************************************************************
- *  XXH3 64-bit variant
- ************************************************************************/
-
-/* XXH3_64bits():
- * default 64-bit variant, using default secret and default seed of 0.
- * It's the fastest variant. */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
-
-/*
- * XXH3_64bits_withSeed():
- * This variant generates a custom secret on the fly
- * based on default secret altered using the `seed` value.
- * While this operation is decently fast, note that it's not completely free.
- * Note: seed==0 produces the same results as XXH3_64bits().
- */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
-
-/*!
- * The bare minimum size for a custom secret.
- *
- * @see
- *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
- *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
- */
-#define XXH3_SECRET_SIZE_MIN 136
-
-/*
- * XXH3_64bits_withSecret():
- * It's possible to provide any blob of bytes as a "secret" to generate the hash.
- * This makes it more difficult for an external actor to prepare an intentional collision.
- * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
- * However, the quality of the secret impacts the dispersion of the hash algorithm.
- * Therefore, the secret _must_ look like a bunch of random bytes.
- * Avoid "trivial" or structured data such as repeated sequences or a text document.
- * Whenever in doubt about the "randomness" of the blob of bytes,
- * consider employing "XXH3_generateSecret()" instead (see below).
- * It will generate a proper high entropy secret derived from the blob of bytes.
- * Another advantage of using XXH3_generateSecret() is that
- * it guarantees that all bits within the initial blob of bytes
- * will impact every bit of the output.
- * This is not necessarily the case when using the blob of bytes directly
- * because, when hashing _small_ inputs, only a portion of the secret is employed.
- */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
-
-/*******   Streaming   *******/
-/*
- * Streaming requires state maintenance.
- * This operation costs memory and CPU.
- * As a consequence, streaming is slower than one-shot hashing.
- * For better performance, prefer one-shot functions whenever applicable.
- */
-
-/*!
- * @brief The state struct for the XXH3 streaming API.
- *
- * @see XXH3_state_s for details.
- */
-typedef struct XXH3_state_s XXH3_state_t;
-XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
-XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
-XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
-
-/*
- * XXH3_64bits_reset():
- * Initialize with default parameters.
- * digest will be equivalent to `XXH3_64bits()`.
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
-/*
- * XXH3_64bits_reset_withSeed():
- * Generate a custom secret from `seed`, and store it into `statePtr`.
- * digest will be equivalent to `XXH3_64bits_withSeed()`.
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
-/*
- * XXH3_64bits_reset_withSecret():
- * `secret` is referenced, it _must outlive_ the hash streaming session.
- * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
- * and the quality of produced hash values depends on secret's entropy
- * (secret's content should look like a bunch of random bytes).
- * When in doubt about the randomness of a candidate `secret`,
- * consider employing `XXH3_generateSecret()` instead (see below).
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
-
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t* statePtr);
-
-/* note : canonical representation of XXH3 is the same as XXH64
- * since they both produce XXH64_hash_t values */
-
-/*-**********************************************************************
- *  XXH3 128-bit variant
- ************************************************************************/
-
-/*!
- * @brief The return value from 128-bit hashes.
- *
- * Stored in little endian order, although the fields themselves are in native
- * endianness.
- */
-typedef struct {
-	XXH64_hash_t low64;  /*!< `value & 0xFFFFFFFFFFFFFFFF` */
-	XXH64_hash_t high64; /*!< `value >> 64` */
-} XXH128_hash_t;
-
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
-
-/*******   Streaming   *******/
-/*
- * Streaming requires state maintenance.
- * This operation costs memory and CPU.
- * As a consequence, streaming is slower than one-shot hashing.
- * For better performance, prefer one-shot functions whenever applicable.
- *
- * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
- * Use already declared XXH3_createState() and XXH3_freeState().
- *
- * All reset and streaming functions have same meaning as their 64-bit counterpart.
- */
-
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
-
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t* statePtr);
-
-/* Following helper functions make it possible to compare XXH128_hast_t values.
- * Since XXH128_hash_t is a structure, this capability is not offered by the language.
- * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
-
-/*!
- * XXH128_isEqual():
- * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
- */
-XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
-
-/*!
- * XXH128_cmp():
- *
- * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
- *
- * return: >0 if *h128_1  > *h128_2
- *         =0 if *h128_1 == *h128_2
- *         <0 if *h128_1  < *h128_2
- */
-XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
-
-/*******   Canonical representation   *******/
-typedef struct {
-	unsigned char digest[sizeof(XXH128_hash_t)];
-} XXH128_canonical_t;
-XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
-XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
-
-#endif /* !XXH_NO_XXH3 */
-#endif /* XXH_NO_LONG_LONG */
-
-/*!
- * @}
- */
-#endif /* XXHASH_H_5627135585666179 */
-
-#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
-#define XXHASH_H_STATIC_13879238742
-/* ****************************************************************************
- * This section contains declarations which are not guaranteed to remain stable.
- * They may change in future versions, becoming incompatible with a different
- * version of the library.
- * These declarations should only be used with static linking.
- * Never use them in association with dynamic linking!
- ***************************************************************************** */
-
-/*
- * These definitions are only present to allow static allocation
- * of XXH states, on stack or in a struct, for example.
- * Never **ever** access their members directly.
- */
-
-/*!
- * @internal
- * @brief Structure for XXH32 streaming API.
- *
- * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
- * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
- * an opaque type. This allows fields to safely be changed.
- *
- * Typedef'd to @ref XXH32_state_t.
- * Do not access the members of this struct directly.
- * @see XXH64_state_s, XXH3_state_s
- */
-struct XXH32_state_s {
-	XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
-	XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
-	XXH32_hash_t v[4];         /*!< Accumulator lanes */
-	XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
-	XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
-	XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
-};                             /* typedef'd to XXH32_state_t */
-
-#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */
-
-/*!
- * @internal
- * @brief Structure for XXH64 streaming API.
- *
- * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
- * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
- * an opaque type. This allows fields to safely be changed.
- *
- * Typedef'd to @ref XXH64_state_t.
- * Do not access the members of this struct directly.
- * @see XXH32_state_s, XXH3_state_s
- */
-struct XXH64_state_s {
-	XXH64_hash_t total_len;  /*!< Total length hashed. This is always 64-bit. */
-	XXH64_hash_t v[4];       /*!< Accumulator lanes */
-	XXH64_hash_t mem64[4];   /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
-	XXH32_hash_t memsize;    /*!< Amount of data in @ref mem64 */
-	XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/
-	XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */
-};                           /* typedef'd to XXH64_state_t */
-
-#ifndef XXH_NO_XXH3
-
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
-#include <stdalign.h>
-#define XXH_ALIGN(n) alignas(n)
-#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
-/* In C++ alignas() is a keyword */
-#define XXH_ALIGN(n) alignas(n)
-#elif defined(__GNUC__)
-#define XXH_ALIGN(n) __attribute__((aligned(n)))
-#elif defined(_MSC_VER)
-#define XXH_ALIGN(n) __declspec(align(n))
-#else
-#define XXH_ALIGN(n) /* disabled */
-#endif
-
-/* Old GCC versions only accept the attribute after the type in structures. */
-#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */     \
-    && !(defined(__cplusplus) && (__cplusplus >= 201103L))        /* >= C++11 */ \
-    && defined(__GNUC__)
-#define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
-#else
-#define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
-#endif
-
-/*!
- * @brief The size of the internal XXH3 buffer.
- *
- * This is the optimal update size for incremental hashing.
- *
- * @see XXH3_64b_update(), XXH3_128b_update().
- */
-#define XXH3_INTERNALBUFFER_SIZE 256
-
-/*!
- * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
- *
- * This is the size used in @ref XXH3_kSecret and the seeded functions.
- *
- * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
- */
-#define XXH3_SECRET_DEFAULT_SIZE 192
-
-/*!
- * @internal
- * @brief Structure for XXH3 streaming API.
- *
- * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
- * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
- * Otherwise it is an opaque type.
- * Never use this definition in combination with dynamic library.
- * This allows fields to safely be changed in the future.
- *
- * @note ** This structure has a strict alignment requirement of 64 bytes!! **
- * Do not allocate this with `malloc()` or `new`,
- * it will not be sufficiently aligned.
- * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
- *
- * Typedef'd to @ref XXH3_state_t.
- * Do never access the members of this struct directly.
- *
- * @see XXH3_INITSTATE() for stack initialization.
- * @see XXH3_createState(), XXH3_freeState().
- * @see XXH32_state_s, XXH64_state_s
- */
-struct XXH3_state_s {
-	XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
-	/*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
-	XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
-	/*!< Used to store a custom secret generated from a seed. */
-	XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
-	/*!< The internal buffer. @see XXH32_state_s::mem32 */
-	XXH32_hash_t bufferedSize;
-	/*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
-	XXH32_hash_t useSeed;
-	/*!< Reserved field. Needed for padding on 64-bit. */
-	size_t nbStripesSoFar;
-	/*!< Number or stripes processed. */
-	XXH64_hash_t totalLen;
-	/*!< Total length hashed. 64-bit even on 32-bit targets. */
-	size_t nbStripesPerBlock;
-	/*!< Number of stripes per block. */
-	size_t secretLimit;
-	/*!< Size of @ref customSecret or @ref extSecret */
-	XXH64_hash_t seed;
-	/*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
-	XXH64_hash_t reserved64;
-	/*!< Reserved field. */
-	const unsigned char* extSecret;
-	/*!< Reference to an external secret for the _withSecret variants, NULL
-	 *   for other variants. */
-	/* note: there may be some padding at the end due to alignment on 64 bytes */
-}; /* typedef'd to XXH3_state_t */
-
-#undef XXH_ALIGN_MEMBER
-
-/*!
- * @brief Initializes a stack-allocated `XXH3_state_s`.
- *
- * When the @ref XXH3_state_t structure is merely emplaced on stack,
- * it should be initialized with XXH3_INITSTATE() or a memset()
- * in case its first reset uses XXH3_NNbits_reset_withSeed().
- * This init can be omitted if the first reset uses default or _withSecret mode.
- * This operation isn't necessary when the state is created with XXH3_createState().
- * Note that this doesn't prepare the state for a streaming operation,
- * it's still necessary to use XXH3_NNbits_reset*() afterwards.
- */
-#define XXH3_INITSTATE(XXH3_state_ptr) \
-	{ (XXH3_state_ptr)->seed = 0; }
-
-/* XXH128() :
- * simple alias to pre-selected XXH3_128bits variant
- */
-XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
-
-/* ===   Experimental API   === */
-/* Symbols defined below must be considered tied to a specific library version. */
-
-/*
- * XXH3_generateSecret():
- *
- * Derive a high-entropy secret from any user-defined content, named customSeed.
- * The generated secret can be used in combination with `*_withSecret()` functions.
- * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
- * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
- *
- * The function accepts as input a custom seed of any length and any content,
- * and derives from it a high-entropy secret of length @secretSize
- * into an already allocated buffer @secretBuffer.
- * @secretSize must be >= XXH3_SECRET_SIZE_MIN
- *
- * The generated secret can then be used with any `*_withSecret()` variant.
- * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
- * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
- * are part of this list. They all accept a `secret` parameter
- * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
- * _and_ feature very high entropy (consist of random-looking bytes).
- * These conditions can be a high bar to meet, so
- * XXH3_generateSecret() can be employed to ensure proper quality.
- *
- * customSeed can be anything. It can have any size, even small ones,
- * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
- * The resulting `secret` will nonetheless provide all required qualities.
- *
- * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
-
-/*
- * XXH3_generateSecret_fromSeed():
- *
- * Generate the same secret as the _withSeed() variants.
- *
- * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
- * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
- *
- * The generated secret can be used in combination with
- *`*_withSecret()` and `_withSecretandSeed()` variants.
- * This generator is notably useful in combination with `_withSecretandSeed()`,
- * as a way to emulate a faster `_withSeed()` variant.
- */
-XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
-
-/*
- * *_withSecretandSeed() :
- * These variants generate hash values using either
- * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
- * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
- *
- * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
- * `_withSeed()` has to generate the secret on the fly for "large" keys.
- * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
- * `_withSecret()` has to generate the masks on the fly for "small" keys,
- * which requires more instructions than _withSeed() variants.
- * Therefore, _withSecretandSeed variant combines the best of both worlds.
- *
- * When @secret has been generated by XXH3_generateSecret_fromSeed(),
- * this variant produces *exactly* the same results as `_withSeed()` variant,
- * hence offering only a pure speed benefit on "large" input,
- * by skipping the need to regenerate the secret for every large input.
- *
- * Another usage scenario is to hash the secret to a 64-bit hash value,
- * for example with XXH3_64bits(), which then becomes the seed,
- * and then employ both the seed and the secret in _withSecretandSeed().
- * On top of speed, an added benefit is that each bit in the secret
- * has a 50% chance to swap each bit in the output,
- * via its impact to the seed.
- * This is not guaranteed when using the secret directly in "small data" scenarios,
- * because only portions of the secret are employed for small data.
- */
-XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSecretandSeed(const void* data, size_t len,
-                              const void* secret, size_t secretSize,
-                              XXH64_hash_t seed);
-
-XXH_PUBLIC_API XXH128_hash_t
-XXH3_128bits_withSecretandSeed(const void* data, size_t len,
-                               const void* secret, size_t secretSize,
-                               XXH64_hash_t seed64);
-
-XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
-                                    const void* secret, size_t secretSize,
-                                    XXH64_hash_t seed64);
-
-XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
-                                     const void* secret, size_t secretSize,
-                                     XXH64_hash_t seed64);
-
-#endif /* XXH_NO_XXH3 */
-#endif /* XXH_NO_LONG_LONG */
-#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
-#define XXH_IMPLEMENTATION
-#endif
-
-#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
-
-/* ======================================================================== */
-/* ======================================================================== */
-/* ======================================================================== */
-
-/*-**********************************************************************
- * xxHash implementation
- *-**********************************************************************
- * xxHash's implementation used to be hosted inside xxhash.c.
- *
- * However, inlining requires implementation to be visible to the compiler,
- * hence be included alongside the header.
- * Previously, implementation was hosted inside xxhash.c,
- * which was then #included when inlining was activated.
- * This construction created issues with a few build and install systems,
- * as it required xxhash.c to be stored in /include directory.
- *
- * xxHash implementation is now directly integrated within xxhash.h.
- * As a consequence, xxhash.c is no longer needed in /include.
- *
- * xxhash.c is still available and is still useful.
- * In a "normal" setup, when xxhash is not inlined,
- * xxhash.h only exposes the prototypes and public symbols,
- * while xxhash.c can be built into an object file xxhash.o
- * which can then be linked into the final binary.
- ************************************************************************/
-
-#if(defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) || defined(XXH_IMPLEMENTATION)) && !defined(XXH_IMPLEM_13a8737387)
-#define XXH_IMPLEM_13a8737387
-
-/* *************************************
- *  Tuning parameters
- ***************************************/
-
-/*!
- * @defgroup tuning Tuning parameters
- * @{
- *
- * Various macros to control xxHash's behavior.
- */
-#ifdef XXH_DOXYGEN
-/*!
- * @brief Define this to disable 64-bit code.
- *
- * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
- */
-#define XXH_NO_LONG_LONG
-#undef XXH_NO_LONG_LONG /* don't actually */
-/*!
- * @brief Controls how unaligned memory is accessed.
- *
- * By default, access to unaligned memory is controlled by `memcpy()`, which is
- * safe and portable.
- *
- * Unfortunately, on some target/compiler combinations, the generated assembly
- * is sub-optimal.
- *
- * The below switch allow selection of a different access method
- * in the search for improved performance.
- *
- * @par Possible options:
- *
- *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
- *   @par
- *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
- *     eliminate the function call and treat it as an unaligned access.
- *
- *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
- *   @par
- *     Depends on compiler extensions and is therefore not portable.
- *     This method is safe _if_ your compiler supports it,
- *     and *generally* as fast or faster than `memcpy`.
- *
- *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
- *  @par
- *     Casts directly and dereferences. This method doesn't depend on the
- *     compiler, but it violates the C standard as it directly dereferences an
- *     unaligned pointer. It can generate buggy code on targets which do not
- *     support unaligned memory accesses, but in some circumstances, it's the
- *     only known way to get the most performance.
- *
- *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
- *  @par
- *     Also portable. This can generate the best code on old compilers which don't
- *     inline small `memcpy()` calls, and it might also be faster on big-endian
- *     systems which lack a native byteswap instruction. However, some compilers
- *     will emit literal byteshifts even if the target supports unaligned access.
- *  .
- *
- * @warning
- *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
- *   care, as what works on one compiler/platform/optimization level may cause
- *   another to read garbage data or even crash.
- *
- * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
- *
- * Prefer these methods in priority order (0 > 3 > 1 > 2)
- */
-#define XXH_FORCE_MEMORY_ACCESS 0
-
-/*!
- * @def XXH_FORCE_ALIGN_CHECK
- * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
- * and XXH64() only).
- *
- * This is an important performance trick for architectures without decent
- * unaligned memory access performance.
- *
- * It checks for input alignment, and when conditions are met, uses a "fast
- * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
- * faster_ read speed.
- *
- * The check costs one initial branch per hash, which is generally negligible,
- * but not zero.
- *
- * Moreover, it's not useful to generate an additional code path if memory
- * access uses the same instruction for both aligned and unaligned
- * addresses (e.g. x86 and aarch64).
- *
- * In these cases, the alignment check can be removed by setting this macro to 0.
- * Then the code will always use unaligned memory access.
- * Align check is automatically disabled on x86, x64 & arm64,
- * which are platforms known to offer good unaligned memory accesses performance.
- *
- * This option does not affect XXH3 (only XXH32 and XXH64).
- */
-#define XXH_FORCE_ALIGN_CHECK 0
-
-/*!
- * @def XXH_NO_INLINE_HINTS
- * @brief When non-zero, sets all functions to `static`.
- *
- * By default, xxHash tries to force the compiler to inline almost all internal
- * functions.
- *
- * This can usually improve performance due to reduced jumping and improved
- * constant folding, but significantly increases the size of the binary which
- * might not be favorable.
- *
- * Additionally, sometimes the forced inlining can be detrimental to performance,
- * depending on the architecture.
- *
- * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
- * compiler full control on whether to inline or not.
- *
- * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
- * -fno-inline with GCC or Clang, this will automatically be defined.
- */
-#define XXH_NO_INLINE_HINTS 0
-
-/*!
- * @def XXH32_ENDJMP
- * @brief Whether to use a jump for `XXH32_finalize`.
- *
- * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
- * This is generally preferable for performance,
- * but depending on exact architecture, a jmp may be preferable.
- *
- * This setting is only possibly making a difference for very small inputs.
- */
-#define XXH32_ENDJMP 0
-
-/*!
- * @internal
- * @brief Redefines old internal names.
- *
- * For compatibility with code that uses xxHash's internals before the names
- * were changed to improve namespacing. There is no other reason to use this.
- */
-#define XXH_OLD_NAMES
-#undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
-#endif               /* XXH_DOXYGEN */
-/*!
- * @}
- */
-
-#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
-                                /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
-#if !defined(__clang__) &&                                              \
-    ((defined(__INTEL_COMPILER) && !defined(_WIN32)) ||                 \
-     (defined(__GNUC__) && ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
-                            (defined(__mips__) &&                       \
-                             (__mips <= 5 || __mips_isa_rev < 6) &&     \
-                             (!defined(__mips16) || defined(__mips_mips16e2))))))
-#define XXH_FORCE_MEMORY_ACCESS 1
-#endif
-#endif
-
-#ifndef XXH_FORCE_ALIGN_CHECK                                                                                                  /* can be defined externally */
-#if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
-#define XXH_FORCE_ALIGN_CHECK 0
-#else
-#define XXH_FORCE_ALIGN_CHECK 1
-#endif
-#endif
-
-#ifndef XXH_NO_INLINE_HINTS
-#if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
-    || defined(__NO_INLINE__)  /* -O0, -fno-inline */
-#define XXH_NO_INLINE_HINTS 1
-#else
-#define XXH_NO_INLINE_HINTS 0
-#endif
-#endif
-
-#ifndef XXH32_ENDJMP
-/* generally preferable for performance */
-#define XXH32_ENDJMP 0
-#endif
-
-/*!
- * @defgroup impl Implementation
- * @{
- */
-
-/* *************************************
- *  Includes & Memory related functions
- ***************************************/
-/* Modify the local functions below should you wish to use some other memory routines */
-/* for ZSTD_malloc(), ZSTD_free() */
-#define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h" /* size_t, ZSTD_malloc, ZSTD_free, ZSTD_memcpy */
-static void* XXH_malloc(size_t s) { return ZSTD_malloc(s); }
-static void XXH_free(void* p) { ZSTD_free(p); }
-static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_memcpy(dest, src, size); }
-
-/* *************************************
- *  Compiler Specific Options
- ***************************************/
-#ifdef _MSC_VER                 /* Visual Studio warning fix */
-#pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
-#endif
-
-#if XXH_NO_INLINE_HINTS /* disable inlining hints */
-#if defined(__GNUC__) || defined(__clang__)
-#define XXH_FORCE_INLINE static
-#else
-#define XXH_FORCE_INLINE static
-#endif
-#define XXH_NO_INLINE static
-/* enable inlining hints */
-#elif defined(__GNUC__) || defined(__clang__)
-#define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline))
-#define XXH_NO_INLINE static __attribute__((noinline))
-#elif defined(_MSC_VER) /* Visual Studio */
-#define XXH_FORCE_INLINE static __forceinline
-#define XXH_NO_INLINE static __declspec(noinline)
-#elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
-#define XXH_FORCE_INLINE static inline
-#define XXH_NO_INLINE static
-#else
-#define XXH_FORCE_INLINE static
-#define XXH_NO_INLINE static
-#endif
-
-/* *************************************
- *  Debug
- ***************************************/
-/*!
- * @ingroup tuning
- * @def XXH_DEBUGLEVEL
- * @brief Sets the debugging level.
- *
- * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
- * compiler's command line options. The value must be a number.
- */
-#ifndef XXH_DEBUGLEVEL
-#ifdef DEBUGLEVEL /* backwards compat */
-#define XXH_DEBUGLEVEL DEBUGLEVEL
-#else
-#define XXH_DEBUGLEVEL 0
-#endif
-#endif
-
-#if(XXH_DEBUGLEVEL >= 1)
-#include <assert.h> /* note: can still be disabled with NDEBUG */
-#define XXH_ASSERT(c) assert(c)
-#else
-#define XXH_ASSERT(c) ((void)0)
-#endif
-
-/* note: use after variable declarations */
-#ifndef XXH_STATIC_ASSERT
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
-#include <assert.h>
-#define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
-	do {                                     \
-		static_assert((c), m);               \
-	} while(0)
-#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
-#define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
-	do {                                     \
-		static_assert((c), m);               \
-	} while(0)
-#else
-#define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
-	do {                                     \
-		struct xxh_sa {                      \
-			char x[(c) ? 1 : -1];            \
-		};                                   \
-	} while(0)
-#endif
-#define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c)
-#endif
-
-/*!
- * @internal
- * @def XXH_COMPILER_GUARD(var)
- * @brief Used to prevent unwanted optimizations for @p var.
- *
- * It uses an empty GCC inline assembly statement with a register constraint
- * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
- * on x86) and marks it as modified.
- *
- * This is used in a few places to avoid unwanted autovectorization (e.g.
- * XXH32_round()). All vectorization we want is explicit via intrinsics,
- * and _usually_ isn't wanted elsewhere.
- *
- * We also use it to prevent unwanted constant folding for AArch64 in
- * XXH3_initCustomSecret_scalar().
- */
-#if defined(__GNUC__) || defined(__clang__)
-#define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" \
-	                                                 : "+r"(var))
-#else
-#define XXH_COMPILER_GUARD(var) ((void)0)
-#endif
-
-/* *************************************
- *  Basic Types
- ***************************************/
-#if !defined(__VMS) && (defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
-#include <stdint.h>
-typedef uint8_t xxh_u8;
-#else
-typedef unsigned char xxh_u8;
-#endif
-typedef XXH32_hash_t xxh_u32;
-
-#ifdef XXH_OLD_NAMES
-#define BYTE xxh_u8
-#define U8 xxh_u8
-#define U32 xxh_u32
-#endif
-
-/* ***   Memory access   *** */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_read32(const void* ptr)
- * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- *
- * @param ptr The pointer to read from.
- * @return The 32-bit native endian integer from the bytes at @p ptr.
- */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_readLE32(const void* ptr)
- * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- *
- * @param ptr The pointer to read from.
- * @return The 32-bit little endian integer from the bytes at @p ptr.
- */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_readBE32(const void* ptr)
- * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- *
- * @param ptr The pointer to read from.
- * @return The 32-bit big endian integer from the bytes at @p ptr.
- */
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
- * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
- *
- * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
- * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
- * always @ref XXH_alignment::XXH_unaligned.
- *
- * @param ptr The pointer to read from.
- * @param align Whether @p ptr is aligned.
- * @pre
- *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
- *   aligned.
- * @return The 32-bit little endian integer from the bytes at @p ptr.
- */
-
-#if(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
-/*
- * Manual byteshift. Best for old compilers which don't inline memcpy.
- * We actually directly use XXH_readLE32 and XXH_readBE32.
- */
-#elif(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
-
-/*
- * Force direct memory access. Only works on CPU which support unaligned memory
- * access in hardware.
- */
-static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*)memPtr; }
-
-#elif(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
-
-/*
- * __pack instructions are safer but compiler specific, hence potentially
- * problematic for some compilers.
- *
- * Currently only defined for GCC and ICC.
- */
-#ifdef XXH_OLD_NAMES
-typedef union {
-	xxh_u32 u32;
-} __attribute__((packed)) unalign;
-#endif
-static xxh_u32 XXH_read32(const void* ptr) {
-	typedef union {
-		xxh_u32 u32;
-	} __attribute__((packed)) xxh_unalign;
-	return ((const xxh_unalign*)ptr)->u32;
-}
-
-#else
-
-/*
- * Portable and safe solution. Generally efficient.
- * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
- */
-static xxh_u32 XXH_read32(const void* memPtr) {
-	xxh_u32 val;
-	XXH_memcpy(&val, memPtr, sizeof(val));
-	return val;
-}
-
-#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
-
-/* ***   Endianness   *** */
-
-/*!
- * @ingroup tuning
- * @def XXH_CPU_LITTLE_ENDIAN
- * @brief Whether the target is little endian.
- *
- * Defined to 1 if the target is little endian, or 0 if it is big endian.
- * It can be defined externally, for example on the compiler command line.
- *
- * If it is not defined,
- * a runtime check (which is usually constant folded) is used instead.
- *
- * @note
- *   This is not necessarily defined to an integer constant.
- *
- * @see XXH_isLittleEndian() for the runtime check.
- */
-#ifndef XXH_CPU_LITTLE_ENDIAN
-/*
- * Try to detect endianness automatically, to avoid the nonstandard behavior
- * in `XXH_isLittleEndian()`
- */
-#if defined(_WIN32) /* Windows is always little endian */ \
-    || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define XXH_CPU_LITTLE_ENDIAN 1
-#elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#define XXH_CPU_LITTLE_ENDIAN 0
-#else
-/*!
- * @internal
- * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
- *
- * Most compilers will constant fold this.
- */
-static int XXH_isLittleEndian(void) {
-	/*
-	 * Portable and well-defined behavior.
-	 * Don't use static: it is detrimental to performance.
-	 */
-	const union {
-		xxh_u32 u;
-		xxh_u8 c[4];
-	} one = {1};
-	return one.c[0];
-}
-#define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
-#endif
-#endif
-
-/* ****************************************
- *  Compiler-specific Functions and Macros
- ******************************************/
-#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-
-#ifdef __has_builtin
-#define XXH_HAS_BUILTIN(x) __has_builtin(x)
-#else
-#define XXH_HAS_BUILTIN(x) 0
-#endif
-
-/*!
- * @internal
- * @def XXH_rotl32(x,r)
- * @brief 32-bit rotate left.
- *
- * @param x The 32-bit integer to be rotated.
- * @param r The number of bits to rotate.
- * @pre
- *   @p r > 0 && @p r < 32
- * @note
- *   @p x and @p r may be evaluated multiple times.
- * @return The rotated result.
- */
-#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && XXH_HAS_BUILTIN(__builtin_rotateleft64)
-#define XXH_rotl32 __builtin_rotateleft32
-#define XXH_rotl64 __builtin_rotateleft64
-/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
-#elif defined(_MSC_VER)
-#define XXH_rotl32(x, r) _rotl(x, r)
-#define XXH_rotl64(x, r) _rotl64(x, r)
-#else
-#define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
-#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
-#endif
-
-/*!
- * @internal
- * @fn xxh_u32 XXH_swap32(xxh_u32 x)
- * @brief A 32-bit byteswap.
- *
- * @param x The 32-bit integer to byteswap.
- * @return @p x, byteswapped.
- */
-#if defined(_MSC_VER) /* Visual Studio */
-#define XXH_swap32 _byteswap_ulong
-#elif XXH_GCC_VERSION >= 403
-#define XXH_swap32 __builtin_bswap32
-#else
-static xxh_u32 XXH_swap32(xxh_u32 x) {
-	return ((x << 24) & 0xff000000) |
-	       ((x << 8) & 0x00ff0000) |
-	       ((x >> 8) & 0x0000ff00) |
-	       ((x >> 24) & 0x000000ff);
-}
-#endif
-
-/* ***************************
- *  Memory reads
- *****************************/
-
-/*!
- * @internal
- * @brief Enum to indicate whether a pointer is aligned.
- */
-typedef enum {
-	XXH_aligned,  /*!< Aligned */
-	XXH_unaligned /*!< Possibly unaligned */
-} XXH_alignment;
-
-/*
- * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
- *
- * This is ideal for older compilers which don't inline memcpy.
- */
-#if(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
-
-XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) {
-	const xxh_u8* bytePtr = (const xxh_u8*)memPtr;
-	return bytePtr[0] | ((xxh_u32)bytePtr[1] << 8) | ((xxh_u32)bytePtr[2] << 16) | ((xxh_u32)bytePtr[3] << 24);
-}
-
-XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) {
-	const xxh_u8* bytePtr = (const xxh_u8*)memPtr;
-	return bytePtr[3] | ((xxh_u32)bytePtr[2] << 8) | ((xxh_u32)bytePtr[1] << 16) | ((xxh_u32)bytePtr[0] << 24);
-}
-
-#else
-XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) {
-	return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
-}
-
-static xxh_u32 XXH_readBE32(const void* ptr) {
-	return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
-}
-#endif
-
-XXH_FORCE_INLINE xxh_u32
-XXH_readLE32_align(const void* ptr, XXH_alignment align) {
-	if(align == XXH_unaligned) {
-		return XXH_readLE32(ptr);
-	} else {
-		return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
-	}
-}
-
-/* *************************************
- *  Misc
- ***************************************/
-/*! @ingroup public */
-XXH_PUBLIC_API unsigned XXH_versionNumber(void) { return XXH_VERSION_NUMBER; }
-
-/* *******************************************************************
- *  32-bit hash functions
- *********************************************************************/
-/*!
- * @}
- * @defgroup xxh32_impl XXH32 implementation
- * @ingroup impl
- * @{
- */
-/* #define instead of static const, to be used as initializers */
-#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */
-#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */
-#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */
-#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */
-#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */
-
-#ifdef XXH_OLD_NAMES
-#define PRIME32_1 XXH_PRIME32_1
-#define PRIME32_2 XXH_PRIME32_2
-#define PRIME32_3 XXH_PRIME32_3
-#define PRIME32_4 XXH_PRIME32_4
-#define PRIME32_5 XXH_PRIME32_5
-#endif
-
-/*!
- * @internal
- * @brief Normal stripe processing routine.
- *
- * This shuffles the bits so that any bit from @p input impacts several bits in
- * @p acc.
- *
- * @param acc The accumulator lane.
- * @param input The stripe of input to mix.
- * @return The mixed accumulator lane.
- */
-static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
-	acc += input * XXH_PRIME32_2;
-	acc = XXH_rotl32(acc, 13);
-	acc *= XXH_PRIME32_1;
-#if(defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
-	/*
-	 * UGLY HACK:
-	 * A compiler fence is the only thing that prevents GCC and Clang from
-	 * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
-	 * reason) without globally disabling SSE4.1.
-	 *
-	 * The reason we want to avoid vectorization is because despite working on
-	 * 4 integers at a time, there are multiple factors slowing XXH32 down on
-	 * SSE4:
-	 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
-	 *   newer chips!) making it slightly slower to multiply four integers at
-	 *   once compared to four integers independently. Even when pmulld was
-	 *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
-	 *   just to multiply unless doing a long operation.
-	 *
-	 * - Four instructions are required to rotate,
-	 *      movqda tmp,  v // not required with VEX encoding
-	 *      pslld  tmp, 13 // tmp <<= 13
-	 *      psrld  v,   19 // x >>= 19
-	 *      por    v,  tmp // x |= tmp
-	 *   compared to one for scalar:
-	 *      roll   v, 13    // reliably fast across the board
-	 *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
-	 *
-	 * - Instruction level parallelism is actually more beneficial here because
-	 *   the SIMD actually serializes this operation: While v1 is rotating, v2
-	 *   can load data, while v3 can multiply. SSE forces them to operate
-	 *   together.
-	 *
-	 * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
-	 * and it is pointless writing a NEON implementation that is basically the
-	 * same speed as scalar for XXH32.
-	 */
-	XXH_COMPILER_GUARD(acc);
-#endif
-	return acc;
-}
-
-/*!
- * @internal
- * @brief Mixes all bits to finalize the hash.
- *
- * The final mix ensures that all input bits have a chance to impact any bit in
- * the output digest, resulting in an unbiased distribution.
- *
- * @param h32 The hash to avalanche.
- * @return The avalanched hash.
- */
-static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
-	h32 ^= h32 >> 15;
-	h32 *= XXH_PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= XXH_PRIME32_3;
-	h32 ^= h32 >> 16;
-	return (h32);
-}
-
-#define XXH_get32bits(p) XXH_readLE32_align(p, align)
-
-/*!
- * @internal
- * @brief Processes the last 0-15 bytes of @p ptr.
- *
- * There may be up to 15 bytes remaining to consume from the input.
- * This final stage will digest them to ensure that all input bytes are present
- * in the final mix.
- *
- * @param h32 The hash to finalize.
- * @param ptr The pointer to the remaining input.
- * @param len The remaining length, modulo 16.
- * @param align Whether @p ptr is aligned.
- * @return The finalized hash.
- */
-static xxh_u32
-XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) {
-#define XXH_PROCESS1                               \
-	do {                                           \
-		h32 += (*ptr++) * XXH_PRIME32_5;           \
-		h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
-	} while(0)
-
-#define XXH_PROCESS4                               \
-	do {                                           \
-		h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
-		ptr += 4;                                  \
-		h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
-	} while(0)
-
-	if(ptr == NULL)
-		XXH_ASSERT(len == 0);
-
-	/* Compact rerolled version; generally faster */
-	if(!XXH32_ENDJMP) {
-		len &= 15;
-		while(len >= 4) {
-			XXH_PROCESS4;
-			len -= 4;
-		}
-		while(len > 0) {
-			XXH_PROCESS1;
-			--len;
-		}
-		return XXH32_avalanche(h32);
-	} else {
-		switch(len & 15) /* or switch(bEnd - p) */ {
-		case 12:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 8:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 4:
-			XXH_PROCESS4;
-			return XXH32_avalanche(h32);
-
-		case 13:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 9:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 5:
-			XXH_PROCESS4;
-			XXH_PROCESS1;
-			return XXH32_avalanche(h32);
-
-		case 14:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 10:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 6:
-			XXH_PROCESS4;
-			XXH_PROCESS1;
-			XXH_PROCESS1;
-			return XXH32_avalanche(h32);
-
-		case 15:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 11:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 7:
-			XXH_PROCESS4;
-			XXH_FALLTHROUGH;
-		case 3:
-			XXH_PROCESS1;
-			XXH_FALLTHROUGH;
-		case 2:
-			XXH_PROCESS1;
-			XXH_FALLTHROUGH;
-		case 1:
-			XXH_PROCESS1;
-			XXH_FALLTHROUGH;
-		case 0:
-			return XXH32_avalanche(h32);
-		}
-		XXH_ASSERT(0);
-		return h32; /* reaching this point is deemed impossible */
-	}
-}
-
-#ifdef XXH_OLD_NAMES
-#define PROCESS1 XXH_PROCESS1
-#define PROCESS4 XXH_PROCESS4
-#else
-#undef XXH_PROCESS1
-#undef XXH_PROCESS4
-#endif
-
-/*!
- * @internal
- * @brief The implementation for @ref XXH32().
- *
- * @param input , len , seed Directly passed from @ref XXH32().
- * @param align Whether @p input is aligned.
- * @return The calculated hash.
- */
-XXH_FORCE_INLINE xxh_u32
-XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) {
-	xxh_u32 h32;
-
-	if(input == NULL)
-		XXH_ASSERT(len == 0);
-
-	if(len >= 16) {
-		const xxh_u8* const bEnd = input + len;
-		const xxh_u8* const limit = bEnd - 15;
-		xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
-		xxh_u32 v2 = seed + XXH_PRIME32_2;
-		xxh_u32 v3 = seed + 0;
-		xxh_u32 v4 = seed - XXH_PRIME32_1;
-
-		do {
-			v1 = XXH32_round(v1, XXH_get32bits(input));
-			input += 4;
-			v2 = XXH32_round(v2, XXH_get32bits(input));
-			input += 4;
-			v3 = XXH32_round(v3, XXH_get32bits(input));
-			input += 4;
-			v4 = XXH32_round(v4, XXH_get32bits(input));
-			input += 4;
-		} while(input < limit);
-
-		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
-	} else {
-		h32 = seed + XXH_PRIME32_5;
-	}
-
-	h32 += (xxh_u32)len;
-
-	return XXH32_finalize(h32, input, len & 15, align);
-}
-
-/*! @ingroup xxh32_family */
-XXH_PUBLIC_API XXH32_hash_t XXH32(const void* input, size_t len, XXH32_hash_t seed) {
-#if 0
-    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
-    XXH32_state_t state;
-    XXH32_reset(&state, seed);
-    XXH32_update(&state, (const xxh_u8*)input, len);
-    return XXH32_digest(&state);
-#else
-	if(XXH_FORCE_ALIGN_CHECK) {
-		if((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */
-			return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
-		}
-	}
-
-	return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
-#endif
-}
-
-/*******   Hash streaming   *******/
-/*!
- * @ingroup xxh32_family
- */
-XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) {
-	return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
-}
-/*! @ingroup xxh32_family */
-XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) {
-	XXH_free(statePtr);
-	return XXH_OK;
-}
-
-/*! @ingroup xxh32_family */
-XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) {
-	XXH_memcpy(dstState, srcState, sizeof(*dstState));
-}
-
-/*! @ingroup xxh32_family */
-XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) {
-	XXH_ASSERT(statePtr != NULL);
-	memset(statePtr, 0, sizeof(*statePtr));
-	statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
-	statePtr->v[1] = seed + XXH_PRIME32_2;
-	statePtr->v[2] = seed + 0;
-	statePtr->v[3] = seed - XXH_PRIME32_1;
-	return XXH_OK;
-}
-
-/*! @ingroup xxh32_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH32_update(XXH32_state_t* state, const void* input, size_t len) {
-	if(input == NULL) {
-		XXH_ASSERT(len == 0);
-		return XXH_OK;
-	}
-
-	{
-		const xxh_u8* p = (const xxh_u8*)input;
-		const xxh_u8* const bEnd = p + len;
-
-		state->total_len_32 += (XXH32_hash_t)len;
-		state->large_len |= (XXH32_hash_t)((len >= 16) | (state->total_len_32 >= 16));
-
-		if(state->memsize + len < 16) { /* fill in tmp buffer */
-			XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
-			state->memsize += (XXH32_hash_t)len;
-			return XXH_OK;
-		}
-
-		if(state->memsize) { /* some data left from previous update */
-			XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16 - state->memsize);
-			{
-				const xxh_u32* p32 = state->mem32;
-				state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32));
-				p32++;
-				state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32));
-				p32++;
-				state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32));
-				p32++;
-				state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
-			}
-			p += 16 - state->memsize;
-			state->memsize = 0;
-		}
-
-		if(p <= bEnd - 16) {
-			const xxh_u8* const limit = bEnd - 16;
-
-			do {
-				state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p));
-				p += 4;
-				state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p));
-				p += 4;
-				state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p));
-				p += 4;
-				state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p));
-				p += 4;
-			} while(p <= limit);
-		}
-
-		if(p < bEnd) {
-			XXH_memcpy(state->mem32, p, (size_t)(bEnd - p));
-			state->memsize = (unsigned)(bEnd - p);
-		}
-	}
-
-	return XXH_OK;
-}
-
-/*! @ingroup xxh32_family */
-XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) {
-	xxh_u32 h32;
-
-	if(state->large_len) {
-		h32 = XXH_rotl32(state->v[0], 1) + XXH_rotl32(state->v[1], 7) + XXH_rotl32(state->v[2], 12) + XXH_rotl32(state->v[3], 18);
-	} else {
-		h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
-	}
-
-	h32 += state->total_len_32;
-
-	return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
-}
-
-/*******   Canonical representation   *******/
-
-/*!
- * @ingroup xxh32_family
- * The default return values from XXH functions are unsigned 32 and 64 bit
- * integers.
- *
- * The canonical representation uses big endian convention, the same convention
- * as human-readable numbers (large digits first).
- *
- * This way, hash values can be written into a file or buffer, remaining
- * comparable across different systems.
- *
- * The following functions allow transformation of hash values to and from their
- * canonical format.
- */
-XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) {
-	/* XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); */
-	if(XXH_CPU_LITTLE_ENDIAN)
-		hash = XXH_swap32(hash);
-	XXH_memcpy(dst, &hash, sizeof(*dst));
-}
-/*! @ingroup xxh32_family */
-XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) {
-	return XXH_readBE32(src);
-}
-
-#ifndef XXH_NO_LONG_LONG
-
-/* *******************************************************************
- *  64-bit hash functions
- *********************************************************************/
-/*!
- * @}
- * @ingroup impl
- * @{
- */
-/*******   Memory access   *******/
-
-typedef XXH64_hash_t xxh_u64;
-
-#ifdef XXH_OLD_NAMES
-#define U64 xxh_u64
-#endif
-
-#if(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
-/*
- * Manual byteshift. Best for old compilers which don't inline memcpy.
- * We actually directly use XXH_readLE64 and XXH_readBE64.
- */
-#elif(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
-
-/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
-static xxh_u64 XXH_read64(const void* memPtr) {
-	return *(const xxh_u64*)memPtr;
-}
-
-#elif(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
-
-/*
- * __pack instructions are safer, but compiler specific, hence potentially
- * problematic for some compilers.
- *
- * Currently only defined for GCC and ICC.
- */
-#ifdef XXH_OLD_NAMES
-typedef union {
-	xxh_u32 u32;
-	xxh_u64 u64;
-} __attribute__((packed)) unalign64;
-#endif
-static xxh_u64 XXH_read64(const void* ptr) {
-	typedef union {
-		xxh_u32 u32;
-		xxh_u64 u64;
-	} __attribute__((packed)) xxh_unalign64;
-	return ((const xxh_unalign64*)ptr)->u64;
-}
-
-#else
-
-/*
- * Portable and safe solution. Generally efficient.
- * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
- */
-static xxh_u64 XXH_read64(const void* memPtr) {
-	xxh_u64 val;
-	XXH_memcpy(&val, memPtr, sizeof(val));
-	return val;
-}
-
-#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
-
-#if defined(_MSC_VER) /* Visual Studio */
-#define XXH_swap64 _byteswap_uint64
-#elif XXH_GCC_VERSION >= 403
-#define XXH_swap64 __builtin_bswap64
-#else
-static xxh_u64 XXH_swap64(xxh_u64 x) {
-	return ((x << 56) & 0xff00000000000000ULL) |
-	       ((x << 40) & 0x00ff000000000000ULL) |
-	       ((x << 24) & 0x0000ff0000000000ULL) |
-	       ((x << 8) & 0x000000ff00000000ULL) |
-	       ((x >> 8) & 0x00000000ff000000ULL) |
-	       ((x >> 24) & 0x0000000000ff0000ULL) |
-	       ((x >> 40) & 0x000000000000ff00ULL) |
-	       ((x >> 56) & 0x00000000000000ffULL);
-}
-#endif
-
-/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
-#if(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
-
-XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) {
-	const xxh_u8* bytePtr = (const xxh_u8*)memPtr;
-	return bytePtr[0] | ((xxh_u64)bytePtr[1] << 8) | ((xxh_u64)bytePtr[2] << 16) | ((xxh_u64)bytePtr[3] << 24) | ((xxh_u64)bytePtr[4] << 32) | ((xxh_u64)bytePtr[5] << 40) | ((xxh_u64)bytePtr[6] << 48) | ((xxh_u64)bytePtr[7] << 56);
-}
-
-XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) {
-	const xxh_u8* bytePtr = (const xxh_u8*)memPtr;
-	return bytePtr[7] | ((xxh_u64)bytePtr[6] << 8) | ((xxh_u64)bytePtr[5] << 16) | ((xxh_u64)bytePtr[4] << 24) | ((xxh_u64)bytePtr[3] << 32) | ((xxh_u64)bytePtr[2] << 40) | ((xxh_u64)bytePtr[1] << 48) | ((xxh_u64)bytePtr[0] << 56);
-}
-
-#else
-XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) {
-	return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
-}
-
-static xxh_u64 XXH_readBE64(const void* ptr) {
-	return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
-}
-#endif
-
-XXH_FORCE_INLINE xxh_u64
-XXH_readLE64_align(const void* ptr, XXH_alignment align) {
-	if(align == XXH_unaligned)
-		return XXH_readLE64(ptr);
-	else
-		return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
-}
-
-/*******   xxh64   *******/
-/*!
- * @}
- * @defgroup xxh64_impl XXH64 implementation
- * @ingroup impl
- * @{
- */
-/* #define rather that static const, to be used as initializers */
-#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
-#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
-#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
-#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
-#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
-
-#ifdef XXH_OLD_NAMES
-#define PRIME64_1 XXH_PRIME64_1
-#define PRIME64_2 XXH_PRIME64_2
-#define PRIME64_3 XXH_PRIME64_3
-#define PRIME64_4 XXH_PRIME64_4
-#define PRIME64_5 XXH_PRIME64_5
-#endif
-
-static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) {
-	acc += input * XXH_PRIME64_2;
-	acc = XXH_rotl64(acc, 31);
-	acc *= XXH_PRIME64_1;
-	return acc;
-}
-
-static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) {
-	val = XXH64_round(0, val);
-	acc ^= val;
-	acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
-	return acc;
-}
-
-static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
-	h64 ^= h64 >> 33;
-	h64 *= XXH_PRIME64_2;
-	h64 ^= h64 >> 29;
-	h64 *= XXH_PRIME64_3;
-	h64 ^= h64 >> 32;
-	return h64;
-}
-
-#define XXH_get64bits(p) XXH_readLE64_align(p, align)
-
-static xxh_u64
-XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) {
-	if(ptr == NULL)
-		XXH_ASSERT(len == 0);
-	len &= 31;
-	while(len >= 8) {
-		xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
-		ptr += 8;
-		h64 ^= k1;
-		h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4;
-		len -= 8;
-	}
-	if(len >= 4) {
-		h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
-		ptr += 4;
-		h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
-		len -= 4;
-	}
-	while(len > 0) {
-		h64 ^= (*ptr++) * XXH_PRIME64_5;
-		h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
-		--len;
-	}
-	return XXH64_avalanche(h64);
-}
-
-#ifdef XXH_OLD_NAMES
-#define PROCESS1_64 XXH_PROCESS1_64
-#define PROCESS4_64 XXH_PROCESS4_64
-#define PROCESS8_64 XXH_PROCESS8_64
-#else
-#undef XXH_PROCESS1_64
-#undef XXH_PROCESS4_64
-#undef XXH_PROCESS8_64
-#endif
-
-XXH_FORCE_INLINE xxh_u64
-XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) {
-	xxh_u64 h64;
-	if(input == NULL)
-		XXH_ASSERT(len == 0);
-
-	if(len >= 32) {
-		const xxh_u8* const bEnd = input + len;
-		const xxh_u8* const limit = bEnd - 31;
-		xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
-		xxh_u64 v2 = seed + XXH_PRIME64_2;
-		xxh_u64 v3 = seed + 0;
-		xxh_u64 v4 = seed - XXH_PRIME64_1;
-
-		do {
-			v1 = XXH64_round(v1, XXH_get64bits(input));
-			input += 8;
-			v2 = XXH64_round(v2, XXH_get64bits(input));
-			input += 8;
-			v3 = XXH64_round(v3, XXH_get64bits(input));
-			input += 8;
-			v4 = XXH64_round(v4, XXH_get64bits(input));
-			input += 8;
-		} while(input < limit);
-
-		h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
-		h64 = XXH64_mergeRound(h64, v1);
-		h64 = XXH64_mergeRound(h64, v2);
-		h64 = XXH64_mergeRound(h64, v3);
-		h64 = XXH64_mergeRound(h64, v4);
-
-	} else {
-		h64 = seed + XXH_PRIME64_5;
-	}
-
-	h64 += (xxh_u64)len;
-
-	return XXH64_finalize(h64, input, len, align);
-}
-
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t len, XXH64_hash_t seed) {
-#if 0
-    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
-    XXH64_state_t state;
-    XXH64_reset(&state, seed);
-    XXH64_update(&state, (const xxh_u8*)input, len);
-    return XXH64_digest(&state);
-#else
-	if(XXH_FORCE_ALIGN_CHECK) {
-		if((((size_t)input) & 7) == 0) { /* Input is aligned, let's leverage the speed advantage */
-			return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
-		}
-	}
-
-	return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
-
-#endif
-}
-
-/*******   Hash Streaming   *******/
-
-/*! @ingroup xxh64_family*/
-XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) {
-	return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
-}
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) {
-	XXH_free(statePtr);
-	return XXH_OK;
-}
-
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) {
-	XXH_memcpy(dstState, srcState, sizeof(*dstState));
-}
-
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) {
-	XXH_ASSERT(statePtr != NULL);
-	memset(statePtr, 0, sizeof(*statePtr));
-	statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
-	statePtr->v[1] = seed + XXH_PRIME64_2;
-	statePtr->v[2] = seed + 0;
-	statePtr->v[3] = seed - XXH_PRIME64_1;
-	return XXH_OK;
-}
-
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH64_update(XXH64_state_t* state, const void* input, size_t len) {
-	if(input == NULL) {
-		XXH_ASSERT(len == 0);
-		return XXH_OK;
-	}
-
-	{
-		const xxh_u8* p = (const xxh_u8*)input;
-		const xxh_u8* const bEnd = p + len;
-
-		state->total_len += len;
-
-		if(state->memsize + len < 32) { /* fill in tmp buffer */
-			XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
-			state->memsize += (xxh_u32)len;
-			return XXH_OK;
-		}
-
-		if(state->memsize) { /* tmp buffer is full */
-			XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32 - state->memsize);
-			state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64 + 0));
-			state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64 + 1));
-			state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64 + 2));
-			state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64 + 3));
-			p += 32 - state->memsize;
-			state->memsize = 0;
-		}
-
-		if(p + 32 <= bEnd) {
-			const xxh_u8* const limit = bEnd - 32;
-
-			do {
-				state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p));
-				p += 8;
-				state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p));
-				p += 8;
-				state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p));
-				p += 8;
-				state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p));
-				p += 8;
-			} while(p <= limit);
-		}
-
-		if(p < bEnd) {
-			XXH_memcpy(state->mem64, p, (size_t)(bEnd - p));
-			state->memsize = (unsigned)(bEnd - p);
-		}
-	}
-
-	return XXH_OK;
-}
-
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) {
-	xxh_u64 h64;
-
-	if(state->total_len >= 32) {
-		h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
-		h64 = XXH64_mergeRound(h64, state->v[0]);
-		h64 = XXH64_mergeRound(h64, state->v[1]);
-		h64 = XXH64_mergeRound(h64, state->v[2]);
-		h64 = XXH64_mergeRound(h64, state->v[3]);
-	} else {
-		h64 = state->v[2] /*seed*/ + XXH_PRIME64_5;
-	}
-
-	h64 += (xxh_u64)state->total_len;
-
-	return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
-}
-
-/******* Canonical representation   *******/
-
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) {
-	/* XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); */
-	if(XXH_CPU_LITTLE_ENDIAN)
-		hash = XXH_swap64(hash);
-	XXH_memcpy(dst, &hash, sizeof(*dst));
-}
-
-/*! @ingroup xxh64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) {
-	return XXH_readBE64(src);
-}
-
-#ifndef XXH_NO_XXH3
-
-/* *********************************************************************
- *  XXH3
- *  New generation hash designed for speed on small keys and vectorization
- ************************************************************************ */
-/*!
- * @}
- * @defgroup xxh3_impl XXH3 implementation
- * @ingroup impl
- * @{
- */
-
-/* ===   Compiler specifics   === */
-
-#if((defined(sun) || defined(__sun)) && __cplusplus)           /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
-#define XXH_RESTRICT                                           /* disable */
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
-#define XXH_RESTRICT restrict
-#else
-/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
-#define XXH_RESTRICT /* disable */
-#endif
-
-#if(defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)
-#define XXH_likely(x) __builtin_expect(x, 1)
-#define XXH_unlikely(x) __builtin_expect(x, 0)
-#else
-#define XXH_likely(x) (x)
-#define XXH_unlikely(x) (x)
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
-#define inline __inline__ /* circumvent a clang bug */
-#include <arm_neon.h>
-#undef inline
-#elif defined(__AVX2__)
-#include <immintrin.h>
-#elif defined(__SSE2__)
-#include <emmintrin.h>
-#endif
-#endif
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-
-/*
- * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
- * remaining a true 64-bit/128-bit hash function.
- *
- * This is done by prioritizing a subset of 64-bit operations that can be
- * emulated without too many steps on the average 32-bit machine.
- *
- * For example, these two lines seem similar, and run equally fast on 64-bit:
- *
- *   xxh_u64 x;
- *   x ^= (x >> 47); // good
- *   x ^= (x >> 13); // bad
- *
- * However, to a 32-bit machine, there is a major difference.
- *
- * x ^= (x >> 47) looks like this:
- *
- *   x.lo ^= (x.hi >> (47 - 32));
- *
- * while x ^= (x >> 13) looks like this:
- *
- *   // note: funnel shifts are not usually cheap.
- *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
- *   x.hi ^= (x.hi >> 13);
- *
- * The first one is significantly faster than the second, simply because the
- * shift is larger than 32. This means:
- *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
- *    32 bits in the shift.
- *  - The shift result will always fit in the lower 32 bits, and therefore,
- *    we can ignore the upper 32 bits in the xor.
- *
- * Thanks to this optimization, XXH3 only requires these features to be efficient:
- *
- *  - Usable unaligned access
- *  - A 32-bit or 64-bit ALU
- *      - If 32-bit, a decent ADC instruction
- *  - A 32 or 64-bit multiply with a 64-bit result
- *  - For the 128-bit variant, a decent byteswap helps short inputs.
- *
- * The first two are already required by XXH32, and almost all 32-bit and 64-bit
- * platforms which can run XXH32 can run XXH3 efficiently.
- *
- * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
- * notable exception.
- *
- * First of all, Thumb-1 lacks support for the UMULL instruction which
- * performs the important long multiply. This means numerous __aeabi_lmul
- * calls.
- *
- * Second of all, the 8 functional registers are just not enough.
- * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
- * Lo registers, and this shuffling results in thousands more MOVs than A32.
- *
- * A32 and T32 don't have this limitation. They can access all 14 registers,
- * do a 32->64 multiply with UMULL, and the flexible operand allowing free
- * shifts is helpful, too.
- *
- * Therefore, we do a quick sanity check.
- *
- * If compiling Thumb-1 for a target which supports ARM instructions, we will
- * emit a warning, as it is not a "sane" platform to compile for.
- *
- * Usually, if this happens, it is because of an accident and you probably need
- * to specify -march, as you likely meant to compile for a newer architecture.
- *
- * Credit: large sections of the vectorial and asm source code paths
- *         have been contributed by @easyaspi314
- */
-#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
-#warning "XXH3 is highly inefficient without ARM or Thumb-2."
-#endif
-
-/* ==========================================
- * Vectorization detection
- * ========================================== */
-
-#ifdef XXH_DOXYGEN
-/*!
- * @ingroup tuning
- * @brief Overrides the vectorization implementation chosen for XXH3.
- *
- * Can be defined to 0 to disable SIMD or any of the values mentioned in
- * @ref XXH_VECTOR_TYPE.
- *
- * If this is not defined, it uses predefined macros to determine the best
- * implementation.
- */
-#define XXH_VECTOR XXH_SCALAR
-/*!
- * @ingroup tuning
- * @brief Possible values for @ref XXH_VECTOR.
- *
- * Note that these are actually implemented as macros.
- *
- * If this is not defined, it is detected automatically.
- * @ref XXH_X86DISPATCH overrides this.
- */
-enum XXH_VECTOR_TYPE /* fake enum */ {
-	XXH_SCALAR = 0, /*!< Portable scalar version */
-	XXH_SSE2 = 1,   /*!<
-	                 * SSE2 for Pentium 4, Opteron, all x86_64.
-	                 *
-	                 * @note SSE2 is also guaranteed on Windows 10, macOS, and
-	                 * Android x86.
-	                 */
-	XXH_AVX2 = 2,   /*!< AVX2 for Haswell and Bulldozer */
-	XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
-	XXH_NEON = 4,   /*!< NEON for most ARMv7-A and all AArch64 */
-	XXH_VSX = 5,    /*!< VSX and ZVector for POWER8/z13 (64-bit) */
-};
-/*!
- * @ingroup tuning
- * @brief Selects the minimum alignment for XXH3's accumulators.
- *
- * When using SIMD, this should match the alignment required for said vector
- * type, so, for example, 32 for AVX2.
- *
- * Default: Auto detected.
- */
-#define XXH_ACC_ALIGN 8
-#endif
-
-/* Actual definition */
-#ifndef XXH_DOXYGEN
-#define XXH_SCALAR 0
-#define XXH_SSE2 1
-#define XXH_AVX2 2
-#define XXH_AVX512 3
-#define XXH_NEON 4
-#define XXH_VSX 5
-#endif
-
-#ifndef XXH_VECTOR /* can be defined on command line */
-#if(                                                                          \
-    defined(__ARM_NEON__) || defined(__ARM_NEON)                   /* gcc */  \
-    || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
-    ) &&                                                                      \
-    (defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */   \
-     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
-#define XXH_VECTOR XXH_NEON
-#elif defined(__AVX512F__)
-#define XXH_VECTOR XXH_AVX512
-#elif defined(__AVX2__)
-#define XXH_VECTOR XXH_AVX2
-#elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
-#define XXH_VECTOR XXH_SSE2
-#elif(defined(__PPC64__) && defined(__POWER8_VECTOR__)) || (defined(__s390x__) && defined(__VEC__)) && defined(__GNUC__) /* TODO: IBM XL */
-#define XXH_VECTOR XXH_VSX
-#else
-#define XXH_VECTOR XXH_SCALAR
-#endif
-#endif
-
-/*
- * Controls the alignment of the accumulator,
- * for compatibility with aligned vector loads, which are usually faster.
- */
-#ifndef XXH_ACC_ALIGN
-#if defined(XXH_X86DISPATCH)
-#define XXH_ACC_ALIGN 64       /* for compatibility with avx512 */
-#elif XXH_VECTOR == XXH_SCALAR /* scalar */
-#define XXH_ACC_ALIGN 8
-#elif XXH_VECTOR == XXH_SSE2 /* sse2 */
-#define XXH_ACC_ALIGN 16
-#elif XXH_VECTOR == XXH_AVX2 /* avx2 */
-#define XXH_ACC_ALIGN 32
-#elif XXH_VECTOR == XXH_NEON /* neon */
-#define XXH_ACC_ALIGN 16
-#elif XXH_VECTOR == XXH_VSX /* vsx */
-#define XXH_ACC_ALIGN 16
-#elif XXH_VECTOR == XXH_AVX512 /* avx512 */
-#define XXH_ACC_ALIGN 64
-#endif
-#endif
-
-#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
-#define XXH_SEC_ALIGN XXH_ACC_ALIGN
-#else
-#define XXH_SEC_ALIGN 8
-#endif
-
-/*
- * UGLY HACK:
- * GCC usually generates the best code with -O3 for xxHash.
- *
- * However, when targeting AVX2, it is overzealous in its unrolling resulting
- * in code roughly 3/4 the speed of Clang.
- *
- * There are other issues, such as GCC splitting _mm256_loadu_si256 into
- * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
- * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
- *
- * That is why when compiling the AVX2 version, it is recommended to use either
- *   -O2 -mavx2 -march=haswell
- * or
- *   -O2 -mavx2 -mno-avx256-split-unaligned-load
- * for decent performance, or to use Clang instead.
- *
- * Fortunately, we can control the first one with a pragma that forces GCC into
- * -O2, but the other one we can't control without "failed to inline always
- * inline function due to target mismatch" warnings.
- */
-#if XXH_VECTOR == XXH_AVX2                                  /* AVX2 */           \
-    && defined(__GNUC__) && !defined(__clang__)             /* GCC, not Clang */ \
-    && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
-#pragma GCC push_options
-#pragma GCC optimize("-O2")
-#endif
-
-#if XXH_VECTOR == XXH_NEON
-/*
- * NEON's setup for vmlal_u32 is a little more complicated than it is on
- * SSE2, AVX2, and VSX.
- *
- * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
- *
- * To do the same operation, the 128-bit 'Q' register needs to be split into
- * two 64-bit 'D' registers, performing this operation::
- *
- *   [                a                 |                 b                ]
- *            |              '---------. .--------'                |
- *            |                         x                          |
- *            |              .---------' '--------.                |
- *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
- *
- * Due to significant changes in aarch64, the fastest method for aarch64 is
- * completely different than the fastest method for ARMv7-A.
- *
- * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
- * D11 will modify the high half of Q5. This is similar to how modifying AH
- * will only affect bits 8-15 of AX on x86.
- *
- * VZIP takes two registers, and puts even lanes in one register and odd lanes
- * in the other.
- *
- * On ARMv7-A, this strangely modifies both parameters in place instead of
- * taking the usual 3-operand form.
- *
- * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
- * lower and upper halves of the Q register to end up with the high and low
- * halves where we want - all in one instruction.
- *
- *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
- *
- * Unfortunately we need inline assembly for this: Instructions modifying two
- * registers at once is not possible in GCC or Clang's IR, and they have to
- * create a copy.
- *
- * aarch64 requires a different approach.
- *
- * In order to make it easier to write a decent compiler for aarch64, many
- * quirks were removed, such as conditional execution.
- *
- * NEON was also affected by this.
- *
- * aarch64 cannot access the high bits of a Q-form register, and writes to a
- * D-form register zero the high bits, similar to how writes to W-form scalar
- * registers (or DWORD registers on x86_64) work.
- *
- * The formerly free vget_high intrinsics now require a vext (with a few
- * exceptions)
- *
- * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
- * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
- * operand.
- *
- * The equivalent of the VZIP.32 on the lower and upper halves would be this
- * mess:
- *
- *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
- *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
- *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
- *
- * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
- *
- *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
- *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
- *
- * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
- */
-
-/*!
- * Function-like macro:
- * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
- * {
- *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
- *     outHi = (uint32x2_t)(in >> 32);
- *     in = UNDEFINED;
- * }
- */
-#if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
-    && (defined(__GNUC__) || defined(__clang__)) && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
-#define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                          \
-	do {                                                                                              \
-		/* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */         \
-		/* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */             \
-		/* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
-		__asm__("vzip.32  %e0, %f0"                                                                   \
-		        : "+w"(in));                                                                          \
-		(outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                            \
-		(outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                           \
-	} while(0)
-#else
-#define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-	do {                                     \
-		(outLo) = vmovn_u64(in);             \
-		(outHi) = vshrn_n_u64((in), 32);     \
-	} while(0)
-#endif
-
-/*!
- * @ingroup tuning
- * @brief Controls the NEON to scalar ratio for XXH3
- *
- * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
- * 2 lanes on scalar by default.
- *
- * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
- * emulated 64-bit arithmetic is too slow.
- *
- * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
- *
- * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
- * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
- * you are only using 2/3 of the CPU bandwidth.
- *
- * This is even more noticeable on the more advanced cores like the A76 which
- * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
- *
- * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
- * remaining lanes will use scalar instructions. This improves the bandwidth
- * and also gives the integer pipelines something to do besides twiddling loop
- * counters and pointers.
- *
- * This change benefits CPUs with large micro-op buffers without negatively affecting
- * other CPUs:
- *
- *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
- *  |:----------------------|:--------------------|----------:|-----------:|------:|
- *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
- *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
- *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
- *
- * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
- *
- * @see XXH3_accumulate_512_neon()
- */
-#ifndef XXH3_NEON_LANES
-#if(defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__OPTIMIZE_SIZE__)
-#define XXH3_NEON_LANES 6
-#else
-#define XXH3_NEON_LANES XXH_ACC_NB
-#endif
-#endif
-#endif /* XXH_VECTOR == XXH_NEON */
-
-/*
- * VSX and Z Vector helpers.
- *
- * This is very messy, and any pull requests to clean this up are welcome.
- *
- * There are a lot of problems with supporting VSX and s390x, due to
- * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
- */
-#if XXH_VECTOR == XXH_VSX
-#if defined(__s390x__)
-#include <s390intrin.h>
-#else
-/* gcc's altivec.h can have the unwanted consequence to unconditionally
- * #define bool, vector, and pixel keywords,
- * with bad consequences for programs already using these keywords for other purposes.
- * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
- * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
- * but it seems that, in some cases, it isn't.
- * Force the build macro to be defined, so that keywords are not altered.
- */
-#if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
-#define __APPLE_ALTIVEC__
-#endif
-#include <altivec.h>
-#endif
-
-typedef __vector unsigned long long xxh_u64x2;
-typedef __vector unsigned char xxh_u8x16;
-typedef __vector unsigned xxh_u32x4;
-
-#ifndef XXH_VSX_BE
-#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#define XXH_VSX_BE 1
-#elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
-#warning "-maltivec=be is not recommended. Please use native endianness."
-#define XXH_VSX_BE 1
-#else
-#define XXH_VSX_BE 0
-#endif
-#endif /* !defined(XXH_VSX_BE) */
-
-#if XXH_VSX_BE
-#if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
-#define XXH_vec_revb vec_revb
-#else
-/*!
- * A polyfill for POWER9's vec_revb().
- */
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
-	xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
-	                             0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08};
-	return vec_perm(val, val, vByteSwap);
-}
-#endif
-#endif /* XXH_VSX_BE */
-
-/*!
- * Performs an unaligned vector load and byte swaps it on big endian.
- */
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void* ptr) {
-	xxh_u64x2 ret;
-	XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
-#if XXH_VSX_BE
-	ret = XXH_vec_revb(ret);
-#endif
-	return ret;
-}
-
-/*
- * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
- *
- * These intrinsics weren't added until GCC 8, despite existing for a while,
- * and they are endian dependent. Also, their meaning swap depending on version.
- * */
-#if defined(__s390x__)
-/* s390x is always big endian, no issue on this platform */
-#define XXH_vec_mulo vec_mulo
-#define XXH_vec_mule vec_mule
-#elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
-/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
-#define XXH_vec_mulo __builtin_altivec_vmulouw
-#define XXH_vec_mule __builtin_altivec_vmuleuw
-#else
-/* gcc needs inline assembly */
-/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) {
-	xxh_u64x2 result;
-	__asm__("vmulouw %0, %1, %2"
-	        : "=v"(result)
-	        : "v"(a), "v"(b));
-	return result;
-}
-XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
-	xxh_u64x2 result;
-	__asm__("vmuleuw %0, %1, %2"
-	        : "=v"(result)
-	        : "v"(a), "v"(b));
-	return result;
-}
-#endif /* XXH_vec_mulo, XXH_vec_mule */
-#endif /* XXH_VECTOR == XXH_VSX */
-
-/* prefetch
- * can be disabled, by declaring XXH_NO_PREFETCH build macro */
-#if defined(XXH_NO_PREFETCH)
-#define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
-#else
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
-#include <mmintrin.h>                                          /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
-#elif defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
-#define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-#else
-#define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
-#endif
-#endif /* XXH_NO_PREFETCH */
-
-/* ==========================================
- * XXH3 default settings
- * ========================================== */
-
-#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
-
-#if(XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
-#error "default keyset is not large enough"
-#endif
-
-/*! Pseudorandom secret taken directly from FARSH. */
-XXH_ALIGN(64)
-static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
-    0xb8,
-    0xfe,
-    0x6c,
-    0x39,
-    0x23,
-    0xa4,
-    0x4b,
-    0xbe,
-    0x7c,
-    0x01,
-    0x81,
-    0x2c,
-    0xf7,
-    0x21,
-    0xad,
-    0x1c,
-    0xde,
-    0xd4,
-    0x6d,
-    0xe9,
-    0x83,
-    0x90,
-    0x97,
-    0xdb,
-    0x72,
-    0x40,
-    0xa4,
-    0xa4,
-    0xb7,
-    0xb3,
-    0x67,
-    0x1f,
-    0xcb,
-    0x79,
-    0xe6,
-    0x4e,
-    0xcc,
-    0xc0,
-    0xe5,
-    0x78,
-    0x82,
-    0x5a,
-    0xd0,
-    0x7d,
-    0xcc,
-    0xff,
-    0x72,
-    0x21,
-    0xb8,
-    0x08,
-    0x46,
-    0x74,
-    0xf7,
-    0x43,
-    0x24,
-    0x8e,
-    0xe0,
-    0x35,
-    0x90,
-    0xe6,
-    0x81,
-    0x3a,
-    0x26,
-    0x4c,
-    0x3c,
-    0x28,
-    0x52,
-    0xbb,
-    0x91,
-    0xc3,
-    0x00,
-    0xcb,
-    0x88,
-    0xd0,
-    0x65,
-    0x8b,
-    0x1b,
-    0x53,
-    0x2e,
-    0xa3,
-    0x71,
-    0x64,
-    0x48,
-    0x97,
-    0xa2,
-    0x0d,
-    0xf9,
-    0x4e,
-    0x38,
-    0x19,
-    0xef,
-    0x46,
-    0xa9,
-    0xde,
-    0xac,
-    0xd8,
-    0xa8,
-    0xfa,
-    0x76,
-    0x3f,
-    0xe3,
-    0x9c,
-    0x34,
-    0x3f,
-    0xf9,
-    0xdc,
-    0xbb,
-    0xc7,
-    0xc7,
-    0x0b,
-    0x4f,
-    0x1d,
-    0x8a,
-    0x51,
-    0xe0,
-    0x4b,
-    0xcd,
-    0xb4,
-    0x59,
-    0x31,
-    0xc8,
-    0x9f,
-    0x7e,
-    0xc9,
-    0xd9,
-    0x78,
-    0x73,
-    0x64,
-    0xea,
-    0xc5,
-    0xac,
-    0x83,
-    0x34,
-    0xd3,
-    0xeb,
-    0xc3,
-    0xc5,
-    0x81,
-    0xa0,
-    0xff,
-    0xfa,
-    0x13,
-    0x63,
-    0xeb,
-    0x17,
-    0x0d,
-    0xdd,
-    0x51,
-    0xb7,
-    0xf0,
-    0xda,
-    0x49,
-    0xd3,
-    0x16,
-    0x55,
-    0x26,
-    0x29,
-    0xd4,
-    0x68,
-    0x9e,
-    0x2b,
-    0x16,
-    0xbe,
-    0x58,
-    0x7d,
-    0x47,
-    0xa1,
-    0xfc,
-    0x8f,
-    0xf8,
-    0xb8,
-    0xd1,
-    0x7a,
-    0xd0,
-    0x31,
-    0xce,
-    0x45,
-    0xcb,
-    0x3a,
-    0x8f,
-    0x95,
-    0x16,
-    0x04,
-    0x28,
-    0xaf,
-    0xd7,
-    0xfb,
-    0xca,
-    0xbb,
-    0x4b,
-    0x40,
-    0x7e,
-};
-
-#ifdef XXH_OLD_NAMES
-#define kSecret XXH3_kSecret
-#endif
-
-#ifdef XXH_DOXYGEN
-/*!
- * @brief Calculates a 32-bit to 64-bit long multiply.
- *
- * Implemented as a macro.
- *
- * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
- * need to (but it shouldn't need to anyways, it is about 7 instructions to do
- * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
- * use that instead of the normal method.
- *
- * If you are compiling for platforms like Thumb-1 and don't have a better option,
- * you may also want to write your own long multiply routine here.
- *
- * @param x, y Numbers to be multiplied
- * @return 64-bit product of the low 32 bits of @p x and @p y.
- */
-XXH_FORCE_INLINE xxh_u64
-XXH_mult32to64(xxh_u64 x, xxh_u64 y) {
-	return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
-}
-#elif defined(_MSC_VER) && defined(_M_IX86)
-#define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
-#else
-/*
- * Downcast + upcast is usually better than masking on older compilers like
- * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
- *
- * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
- * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
- */
-#define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
-#endif
-
-/*!
- * @brief Calculates a 64->128-bit long multiply.
- *
- * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
- * version.
- *
- * @param lhs , rhs The 64-bit integers to be multiplied
- * @return The 128-bit result represented in an @ref XXH128_hash_t.
- */
-static XXH128_hash_t
-XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
-	/*
-	 * GCC/Clang __uint128_t method.
-	 *
-	 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
-	 * This is usually the best way as it usually uses a native long 64-bit
-	 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
-	 *
-	 * Usually.
-	 *
-	 * Despite being a 32-bit platform, Clang (and emscripten) define this type
-	 * despite not having the arithmetic for it. This results in a laggy
-	 * compiler builtin call which calculates a full 128-bit multiply.
-	 * In that case it is best to use the portable one.
-	 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
-	 */
-#if(defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
-
-	__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
-	XXH128_hash_t r128;
-	r128.low64 = (xxh_u64)(product);
-	r128.high64 = (xxh_u64)(product >> 64);
-	return r128;
-
-	/*
-	 * MSVC for x64's _umul128 method.
-	 *
-	 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
-	 *
-	 * This compiles to single operand MUL on x64.
-	 */
-#elif(defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
-
-#ifndef _MSC_VER
-#pragma intrinsic(_umul128)
-#endif
-	xxh_u64 product_high;
-	xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
-	XXH128_hash_t r128;
-	r128.low64 = product_low;
-	r128.high64 = product_high;
-	return r128;
-
-	/*
-	 * MSVC for ARM64's __umulh method.
-	 *
-	 * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
-	 */
-#elif defined(_M_ARM64) || defined(_M_ARM64EC)
-
-#ifndef _MSC_VER
-#pragma intrinsic(__umulh)
-#endif
-	XXH128_hash_t r128;
-	r128.low64 = lhs * rhs;
-	r128.high64 = __umulh(lhs, rhs);
-	return r128;
-
-#else
-	/*
-	 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
-	 *
-	 * This is a fast and simple grade school multiply, which is shown below
-	 * with base 10 arithmetic instead of base 0x100000000.
-	 *
-	 *           9 3 // D2 lhs = 93
-	 *         x 7 5 // D2 rhs = 75
-	 *     ----------
-	 *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
-	 *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
-	 *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
-	 *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
-	 *     ---------
-	 *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
-	 *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
-	 *     ---------
-	 *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
-	 *
-	 * The reasons for adding the products like this are:
-	 *  1. It avoids manual carry tracking. Just like how
-	 *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
-	 *     This avoids a lot of complexity.
-	 *
-	 *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
-	 *     instruction available in ARM's Digital Signal Processing extension
-	 *     in 32-bit ARMv6 and later, which is shown below:
-	 *
-	 *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
-	 *         {
-	 *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
-	 *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
-	 *             *RdHi = (xxh_u32)(product >> 32);
-	 *         }
-	 *
-	 *     This instruction was designed for efficient long multiplication, and
-	 *     allows this to be calculated in only 4 instructions at speeds
-	 *     comparable to some 64-bit ALUs.
-	 *
-	 *  3. It isn't terrible on other platforms. Usually this will be a couple
-	 *     of 32-bit ADD/ADCs.
-	 */
-
-	/* First calculate all of the cross products. */
-	xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
-	xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
-	xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
-	xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
-
-	/* Now add the products together. These will never overflow. */
-	xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
-	xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
-	xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
-
-	XXH128_hash_t r128;
-	r128.low64 = lower;
-	r128.high64 = upper;
-	return r128;
-#endif
-}
-
-/*!
- * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
- *
- * The reason for the separate function is to prevent passing too many structs
- * around by value. This will hopefully inline the multiply, but we don't force it.
- *
- * @param lhs , rhs The 64-bit integers to multiply
- * @return The low 64 bits of the product XOR'd by the high 64 bits.
- * @see XXH_mult64to128()
- */
-static xxh_u64
-XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
-	XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
-	return product.low64 ^ product.high64;
-}
-
-/*! Seems to produce slightly better code on GCC for some reason. */
-XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
-	XXH_ASSERT(0 <= shift && shift < 64);
-	return v64 ^ (v64 >> shift);
-}
-
-/*
- * This is a fast avalanche stage,
- * suitable when input bits are already partially mixed
- */
-static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) {
-	h64 = XXH_xorshift64(h64, 37);
-	h64 *= 0x165667919E3779F9ULL;
-	h64 = XXH_xorshift64(h64, 32);
-	return h64;
-}
-
-/*
- * This is a stronger avalanche,
- * inspired by Pelle Evensen's rrmxmx
- * preferable when input has not been previously mixed
- */
-static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) {
-	/* this mix is inspired by Pelle Evensen's rrmxmx */
-	h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
-	h64 *= 0x9FB21C651E98DF25ULL;
-	h64 ^= (h64 >> 35) + len;
-	h64 *= 0x9FB21C651E98DF25ULL;
-	return XXH_xorshift64(h64, 28);
-}
-
-/* ==========================================
- * Short keys
- * ==========================================
- * One of the shortcomings of XXH32 and XXH64 was that their performance was
- * sub-optimal on short lengths. It used an iterative algorithm which strongly
- * favored lengths that were a multiple of 4 or 8.
- *
- * Instead of iterating over individual inputs, we use a set of single shot
- * functions which piece together a range of lengths and operate in constant time.
- *
- * Additionally, the number of multiplies has been significantly reduced. This
- * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
- *
- * Depending on the platform, this may or may not be faster than XXH32, but it
- * is almost guaranteed to be faster than XXH64.
- */
-
-/*
- * At very short lengths, there isn't enough input to fully hide secrets, or use
- * the entire secret.
- *
- * There is also only a limited amount of mixing we can do before significantly
- * impacting performance.
- *
- * Therefore, we use different sections of the secret and always mix two secret
- * samples with an XOR. This should have no effect on performance on the
- * seedless or withSeed variants because everything _should_ be constant folded
- * by modern compilers.
- *
- * The XOR mixing hides individual parts of the secret and increases entropy.
- *
- * This adds an extra layer of strength for custom secrets.
- */
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	XXH_ASSERT(input != NULL);
-	XXH_ASSERT(1 <= len && len <= 3);
-	XXH_ASSERT(secret != NULL);
-	/*
-	 * len = 1: combined = { input[0], 0x01, input[0], input[0] }
-	 * len = 2: combined = { input[1], 0x02, input[0], input[1] }
-	 * len = 3: combined = { input[2], 0x03, input[0], input[1] }
-	 */
-	{
-		xxh_u8 const c1 = input[0];
-		xxh_u8 const c2 = input[len >> 1];
-		xxh_u8 const c3 = input[len - 1];
-		xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
-		xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
-		xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
-		return XXH64_avalanche(keyed);
-	}
-}
-
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	XXH_ASSERT(input != NULL);
-	XXH_ASSERT(secret != NULL);
-	XXH_ASSERT(4 <= len && len <= 8);
-	seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
-	{
-		xxh_u32 const input1 = XXH_readLE32(input);
-		xxh_u32 const input2 = XXH_readLE32(input + len - 4);
-		xxh_u64 const bitflip = (XXH_readLE64(secret + 8) ^ XXH_readLE64(secret + 16)) - seed;
-		xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
-		xxh_u64 const keyed = input64 ^ bitflip;
-		return XXH3_rrmxmx(keyed, len);
-	}
-}
-
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	XXH_ASSERT(input != NULL);
-	XXH_ASSERT(secret != NULL);
-	XXH_ASSERT(9 <= len && len <= 16);
-	{
-		xxh_u64 const bitflip1 = (XXH_readLE64(secret + 24) ^ XXH_readLE64(secret + 32)) + seed;
-		xxh_u64 const bitflip2 = (XXH_readLE64(secret + 40) ^ XXH_readLE64(secret + 48)) - seed;
-		xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
-		xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
-		xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi + XXH3_mul128_fold64(input_lo, input_hi);
-		return XXH3_avalanche(acc);
-	}
-}
-
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	XXH_ASSERT(len <= 16);
-	{
-		if(XXH_likely(len > 8))
-			return XXH3_len_9to16_64b(input, len, secret, seed);
-		if(XXH_likely(len >= 4))
-			return XXH3_len_4to8_64b(input, len, secret, seed);
-		if(len)
-			return XXH3_len_1to3_64b(input, len, secret, seed);
-		return XXH64_avalanche(seed ^ (XXH_readLE64(secret + 56) ^ XXH_readLE64(secret + 64)));
-	}
-}
-
-/*
- * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
- * multiplication by zero, affecting hashes of lengths 17 to 240.
- *
- * However, they are very unlikely.
- *
- * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
- * unseeded non-cryptographic hashes, it does not attempt to defend itself
- * against specially crafted inputs, only random inputs.
- *
- * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
- * cancelling out the secret is taken an arbitrary number of times (addressed
- * in XXH3_accumulate_512), this collision is very unlikely with random inputs
- * and/or proper seeding:
- *
- * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
- * function that is only called up to 16 times per hash with up to 240 bytes of
- * input.
- *
- * This is not too bad for a non-cryptographic hash function, especially with
- * only 64 bit outputs.
- *
- * The 128-bit variant (which trades some speed for strength) is NOT affected
- * by this, although it is always a good idea to use a proper seed if you care
- * about strength.
- */
-XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
-                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) {
-#if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */ \
-    && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */     \
-    && !defined(XXH_ENABLE_AUTOVECTORIZE)     /* Define to disable like XXH32 hack */
-	/*
-	 * UGLY HACK:
-	 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
-	 * slower code.
-	 *
-	 * By forcing seed64 into a register, we disrupt the cost model and
-	 * cause it to scalarize. See `XXH32_round()`
-	 *
-	 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
-	 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
-	 * GCC 9.2, despite both emitting scalar code.
-	 *
-	 * GCC generates much better scalar code than Clang for the rest of XXH3,
-	 * which is why finding a more optimal codepath is an interest.
-	 */
-	XXH_COMPILER_GUARD(seed64);
-#endif
-	{
-		xxh_u64 const input_lo = XXH_readLE64(input);
-		xxh_u64 const input_hi = XXH_readLE64(input + 8);
-		return XXH3_mul128_fold64(
-		    input_lo ^ (XXH_readLE64(secret) + seed64),
-		    input_hi ^ (XXH_readLE64(secret + 8) - seed64));
-	}
-}
-
-/* For mid range keys, XXH3 uses a Mum-hash variant. */
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
-                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                     XXH64_hash_t seed) {
-	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-	(void)secretSize;
-	XXH_ASSERT(16 < len && len <= 128);
-
-	{
-		xxh_u64 acc = len * XXH_PRIME64_1;
-		if(len > 32) {
-			if(len > 64) {
-				if(len > 96) {
-					acc += XXH3_mix16B(input + 48, secret + 96, seed);
-					acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
-				}
-				acc += XXH3_mix16B(input + 32, secret + 64, seed);
-				acc += XXH3_mix16B(input + len - 48, secret + 80, seed);
-			}
-			acc += XXH3_mix16B(input + 16, secret + 32, seed);
-			acc += XXH3_mix16B(input + len - 32, secret + 48, seed);
-		}
-		acc += XXH3_mix16B(input + 0, secret + 0, seed);
-		acc += XXH3_mix16B(input + len - 16, secret + 16, seed);
-
-		return XXH3_avalanche(acc);
-	}
-}
-
-#define XXH3_MIDSIZE_MAX 240
-
-XXH_NO_INLINE XXH64_hash_t
-XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
-                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                      XXH64_hash_t seed) {
-	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-	(void)secretSize;
-	XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
-
-#define XXH3_MIDSIZE_STARTOFFSET 3
-#define XXH3_MIDSIZE_LASTOFFSET 17
-
-	{
-		xxh_u64 acc = len * XXH_PRIME64_1;
-		int const nbRounds = (int)len / 16;
-		int i;
-		for(i = 0; i < 8; i++) {
-			acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed);
-		}
-		acc = XXH3_avalanche(acc);
-		XXH_ASSERT(nbRounds >= 8);
-#if defined(__clang__)                                /* Clang */ \
-    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
-/*
- * UGLY HACK:
- * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
- * In everywhere else, it uses scalar code.
- *
- * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
- * would still be slower than UMAAL (see XXH_mult64to128).
- *
- * Unfortunately, Clang doesn't handle the long multiplies properly and
- * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
- * scalarized into an ugly mess of VMOV.32 instructions.
- *
- * This mess is difficult to avoid without turning autovectorization
- * off completely, but they are usually relatively minor and/or not
- * worth it to fix.
- *
- * This loop is the easiest to fix, as unlike XXH32, this pragma
- * _actually works_ because it is a loop vectorization instead of an
- * SLP vectorization.
- */
-#pragma clang loop vectorize(disable)
-#endif
-		for(i = 8; i < nbRounds; i++) {
-			acc += XXH3_mix16B(input + (16 * i), secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
-		}
-		/* last bytes */
-		acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
-		return XXH3_avalanche(acc);
-	}
-}
-
-/* =======     Long Keys     ======= */
-
-#define XXH_STRIPE_LEN 64
-#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
-#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
-
-#ifdef XXH_OLD_NAMES
-#define STRIPE_LEN XXH_STRIPE_LEN
-#define ACC_NB XXH_ACC_NB
-#endif
-
-XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) {
-	if(!XXH_CPU_LITTLE_ENDIAN)
-		v64 = XXH_swap64(v64);
-	XXH_memcpy(dst, &v64, sizeof(v64));
-}
-
-/* Several intrinsic functions below are supposed to accept __int64 as argument,
- * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
- * However, several environments do not define __int64 type,
- * requiring a workaround.
- */
-#if !defined(__VMS) && (defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
-typedef int64_t xxh_i64;
-#else
-/* the following type must have a width of 64-bit */
-typedef long long xxh_i64;
-#endif
-
-/*
- * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
- *
- * It is a hardened version of UMAC, based off of FARSH's implementation.
- *
- * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
- * implementations, and it is ridiculously fast.
- *
- * We harden it by mixing the original input to the accumulators as well as the product.
- *
- * This means that in the (relatively likely) case of a multiply by zero, the
- * original input is preserved.
- *
- * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
- * cross-pollination, as otherwise the upper and lower halves would be
- * essentially independent.
- *
- * This doesn't matter on 64-bit hashes since they all get merged together in
- * the end, so we skip the extra step.
- *
- * Both XXH3_64bits and XXH3_128bits use this subroutine.
- */
-
-#if(XXH_VECTOR == XXH_AVX512) || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
-
-#ifndef XXH_TARGET_AVX512
-#define XXH_TARGET_AVX512 /* disable attribute target */
-#endif
-
-XXH_FORCE_INLINE XXH_TARGET_AVX512 void
-XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
-                           const void* XXH_RESTRICT input,
-                           const void* XXH_RESTRICT secret) {
-	__m512i* const xacc = (__m512i*)acc;
-	XXH_ASSERT((((size_t)acc) & 63) == 0);
-	XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
-
-	{
-		/* data_vec    = input[0]; */
-		__m512i const data_vec = _mm512_loadu_si512(input);
-		/* key_vec     = secret[0]; */
-		__m512i const key_vec = _mm512_loadu_si512(secret);
-		/* data_key    = data_vec ^ key_vec; */
-		__m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
-		/* data_key_lo = data_key >> 32; */
-		__m512i const data_key_lo = _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
-		/* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
-		__m512i const product = _mm512_mul_epu32(data_key, data_key_lo);
-		/* xacc[0] += swap(data_vec); */
-		__m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
-		__m512i const sum = _mm512_add_epi64(*xacc, data_swap);
-		/* xacc[0] += product; */
-		*xacc = _mm512_add_epi64(product, sum);
-	}
-}
-
-/*
- * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
- *
- * Multiplication isn't perfect, as explained by Google in HighwayHash:
- *
- *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
- *  // varying degrees. In descending order of goodness, bytes
- *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
- *  // As expected, the upper and lower bytes are much worse.
- *
- * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
- *
- * Since our algorithm uses a pseudorandom secret to add some variance into the
- * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
- *
- * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
- * extraction.
- *
- * Both XXH3_64bits and XXH3_128bits use this subroutine.
- */
-
-XXH_FORCE_INLINE XXH_TARGET_AVX512 void
-XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) {
-	XXH_ASSERT((((size_t)acc) & 63) == 0);
-	XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
-	{
-		__m512i* const xacc = (__m512i*)acc;
-		const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
-
-		/* xacc[0] ^= (xacc[0] >> 47) */
-		__m512i const acc_vec = *xacc;
-		__m512i const shifted = _mm512_srli_epi64(acc_vec, 47);
-		__m512i const data_vec = _mm512_xor_si512(acc_vec, shifted);
-		/* xacc[0] ^= secret; */
-		__m512i const key_vec = _mm512_loadu_si512(secret);
-		__m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
-
-		/* xacc[0] *= XXH_PRIME32_1; */
-		__m512i const data_key_hi = _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
-		__m512i const prod_lo = _mm512_mul_epu32(data_key, prime32);
-		__m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32);
-		*xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
-	}
-}
-
-XXH_FORCE_INLINE XXH_TARGET_AVX512 void
-XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) {
-	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
-	XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
-	XXH_ASSERT(((size_t)customSecret & 63) == 0);
-	(void)(&XXH_writeLE64);
-	{
-		int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
-		__m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
-
-		const __m512i* const src = (const __m512i*)((const void*)XXH3_kSecret);
-		__m512i* const dest = (__m512i*)customSecret;
-		int i;
-		XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
-		XXH_ASSERT(((size_t)dest & 63) == 0);
-		for(i = 0; i < nbRounds; ++i) {
-			/* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
-			 * this will warn "discards 'const' qualifier". */
-			union {
-				const __m512i* cp;
-				void* p;
-			} remote_const_void;
-			remote_const_void.cp = src + i;
-			dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
-		}
-	}
-}
-
-#endif
-
-#if(XXH_VECTOR == XXH_AVX2) || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
-
-#ifndef XXH_TARGET_AVX2
-#define XXH_TARGET_AVX2 /* disable attribute target */
-#endif
-
-XXH_FORCE_INLINE XXH_TARGET_AVX2 void
-XXH3_accumulate_512_avx2(void* XXH_RESTRICT acc,
-                         const void* XXH_RESTRICT input,
-                         const void* XXH_RESTRICT secret) {
-	XXH_ASSERT((((size_t)acc) & 31) == 0);
-	{
-		__m256i* const xacc = (__m256i*)acc;
-		/* Unaligned. This is mainly for pointer arithmetic, and because
-		 * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason. */
-		const __m256i* const xinput = (const __m256i*)input;
-		/* Unaligned. This is mainly for pointer arithmetic, and because
-		 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
-		const __m256i* const xsecret = (const __m256i*)secret;
-
-		size_t i;
-		for(i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
-			/* data_vec    = xinput[i]; */
-			__m256i const data_vec = _mm256_loadu_si256(xinput + i);
-			/* key_vec     = xsecret[i]; */
-			__m256i const key_vec = _mm256_loadu_si256(xsecret + i);
-			/* data_key    = data_vec ^ key_vec; */
-			__m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
-			/* data_key_lo = data_key >> 32; */
-			__m256i const data_key_lo = _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-			/* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
-			__m256i const product = _mm256_mul_epu32(data_key, data_key_lo);
-			/* xacc[i] += swap(data_vec); */
-			__m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
-			__m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
-			/* xacc[i] += product; */
-			xacc[i] = _mm256_add_epi64(product, sum);
-		}
-	}
-}
-
-XXH_FORCE_INLINE XXH_TARGET_AVX2 void
-XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) {
-	XXH_ASSERT((((size_t)acc) & 31) == 0);
-	{
-		__m256i* const xacc = (__m256i*)acc;
-		/* Unaligned. This is mainly for pointer arithmetic, and because
-		 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
-		const __m256i* const xsecret = (const __m256i*)secret;
-		const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
-
-		size_t i;
-		for(i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
-			/* xacc[i] ^= (xacc[i] >> 47) */
-			__m256i const acc_vec = xacc[i];
-			__m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
-			__m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
-			/* xacc[i] ^= xsecret; */
-			__m256i const key_vec = _mm256_loadu_si256(xsecret + i);
-			__m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
-
-			/* xacc[i] *= XXH_PRIME32_1; */
-			__m256i const data_key_hi = _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-			__m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
-			__m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
-			xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
-		}
-	}
-}
-
-XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) {
-	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
-	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
-	XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
-	(void)(&XXH_writeLE64);
-	XXH_PREFETCH(customSecret);
-	{
-		__m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
-
-		const __m256i* const src = (const __m256i*)((const void*)XXH3_kSecret);
-		__m256i* dest = (__m256i*)customSecret;
-
-#if defined(__GNUC__) || defined(__clang__)
-		/*
-		 * On GCC & Clang, marking 'dest' as modified will cause the compiler:
-		 *   - do not extract the secret from sse registers in the internal loop
-		 *   - use less common registers, and avoid pushing these reg into stack
-		 */
-		XXH_COMPILER_GUARD(dest);
-#endif
-		XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
-		XXH_ASSERT(((size_t)dest & 31) == 0);
-
-		/* GCC -O2 need unroll loop manually */
-		dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
-		dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed);
-		dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed);
-		dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed);
-		dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 4), seed);
-		dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed);
-	}
-}
-
-#endif
-
-/* x86dispatch always generates SSE2 */
-#if(XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
-
-#ifndef XXH_TARGET_SSE2
-#define XXH_TARGET_SSE2 /* disable attribute target */
-#endif
-
-XXH_FORCE_INLINE XXH_TARGET_SSE2 void
-XXH3_accumulate_512_sse2(void* XXH_RESTRICT acc,
-                         const void* XXH_RESTRICT input,
-                         const void* XXH_RESTRICT secret) {
-	/* SSE2 is just a half-scale version of the AVX2 version. */
-	XXH_ASSERT((((size_t)acc) & 15) == 0);
-	{
-		__m128i* const xacc = (__m128i*)acc;
-		/* Unaligned. This is mainly for pointer arithmetic, and because
-		 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
-		const __m128i* const xinput = (const __m128i*)input;
-		/* Unaligned. This is mainly for pointer arithmetic, and because
-		 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
-		const __m128i* const xsecret = (const __m128i*)secret;
-
-		size_t i;
-		for(i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
-			/* data_vec    = xinput[i]; */
-			__m128i const data_vec = _mm_loadu_si128(xinput + i);
-			/* key_vec     = xsecret[i]; */
-			__m128i const key_vec = _mm_loadu_si128(xsecret + i);
-			/* data_key    = data_vec ^ key_vec; */
-			__m128i const data_key = _mm_xor_si128(data_vec, key_vec);
-			/* data_key_lo = data_key >> 32; */
-			__m128i const data_key_lo = _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-			/* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
-			__m128i const product = _mm_mul_epu32(data_key, data_key_lo);
-			/* xacc[i] += swap(data_vec); */
-			__m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
-			__m128i const sum = _mm_add_epi64(xacc[i], data_swap);
-			/* xacc[i] += product; */
-			xacc[i] = _mm_add_epi64(product, sum);
-		}
-	}
-}
-
-XXH_FORCE_INLINE XXH_TARGET_SSE2 void
-XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) {
-	XXH_ASSERT((((size_t)acc) & 15) == 0);
-	{
-		__m128i* const xacc = (__m128i*)acc;
-		/* Unaligned. This is mainly for pointer arithmetic, and because
-		 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
-		const __m128i* const xsecret = (const __m128i*)secret;
-		const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
-
-		size_t i;
-		for(i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
-			/* xacc[i] ^= (xacc[i] >> 47) */
-			__m128i const acc_vec = xacc[i];
-			__m128i const shifted = _mm_srli_epi64(acc_vec, 47);
-			__m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
-			/* xacc[i] ^= xsecret[i]; */
-			__m128i const key_vec = _mm_loadu_si128(xsecret + i);
-			__m128i const data_key = _mm_xor_si128(data_vec, key_vec);
-
-			/* xacc[i] *= XXH_PRIME32_1; */
-			__m128i const data_key_hi = _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
-			__m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
-			__m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
-			xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
-		}
-	}
-}
-
-XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) {
-	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
-	(void)(&XXH_writeLE64);
-	{
-		int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
-
-#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-		/* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
-		XXH_ALIGN(16)
-		const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)};
-		__m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
-#else
-		__m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
-#endif
-		int i;
-
-		const void* const src16 = XXH3_kSecret;
-		__m128i* dst16 = (__m128i*)customSecret;
-#if defined(__GNUC__) || defined(__clang__)
-		/*
-		 * On GCC & Clang, marking 'dest' as modified will cause the compiler:
-		 *   - do not extract the secret from sse registers in the internal loop
-		 *   - use less common registers, and avoid pushing these reg into stack
-		 */
-		XXH_COMPILER_GUARD(dst16);
-#endif
-		XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
-		XXH_ASSERT(((size_t)dst16 & 15) == 0);
-
-		for(i = 0; i < nbRounds; ++i) {
-			dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i*)src16 + i), seed);
-		}
-	}
-}
-
-#endif
-
-#if(XXH_VECTOR == XXH_NEON)
-
-/* forward declarations for the scalar routines */
-XXH_FORCE_INLINE void
-XXH3_scalarRound(void* XXH_RESTRICT acc, void const * XXH_RESTRICT input,
-                 void const * XXH_RESTRICT secret, size_t lane);
-
-XXH_FORCE_INLINE void
-XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
-                         void const * XXH_RESTRICT secret, size_t lane);
-
-/*!
- * @internal
- * @brief The bulk processing loop for NEON.
- *
- * The NEON code path is actually partially scalar when running on AArch64. This
- * is to optimize the pipelining and can have up to 15% speedup depending on the
- * CPU, and it also mitigates some GCC codegen issues.
- *
- * @see XXH3_NEON_LANES for configuring this and details about this optimization.
- */
-XXH_FORCE_INLINE void
-XXH3_accumulate_512_neon(void* XXH_RESTRICT acc,
-                         const void* XXH_RESTRICT input,
-                         const void* XXH_RESTRICT secret) {
-	XXH_ASSERT((((size_t)acc) & 15) == 0);
-	XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
-	{
-		uint64x2_t* const xacc = (uint64x2_t*)acc;
-		/* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
-		uint8_t const * const xinput = (const uint8_t*)input;
-		uint8_t const * const xsecret = (const uint8_t*)secret;
-
-		size_t i;
-		/* NEON for the first few lanes (these loops are normally interleaved) */
-		for(i = 0; i < XXH3_NEON_LANES / 2; i++) {
-			/* data_vec = xinput[i]; */
-			uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
-			/* key_vec  = xsecret[i];  */
-			uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
-			uint64x2_t data_key;
-			uint32x2_t data_key_lo, data_key_hi;
-			/* xacc[i] += swap(data_vec); */
-			uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
-			uint64x2_t const swapped = vextq_u64(data64, data64, 1);
-			xacc[i] = vaddq_u64(xacc[i], swapped);
-			/* data_key = data_vec ^ key_vec; */
-			data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
-			/* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
-			 * data_key_hi = (uint32x2_t) (data_key >> 32);
-			 * data_key = UNDEFINED; */
-			XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
-			/* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
-			xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
-		}
-		/* Scalar for the remainder. This may be a zero iteration loop. */
-		for(i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
-			XXH3_scalarRound(acc, input, secret, i);
-		}
-	}
-}
-
-XXH_FORCE_INLINE void
-XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) {
-	XXH_ASSERT((((size_t)acc) & 15) == 0);
-
-	{
-		uint64x2_t* xacc = (uint64x2_t*)acc;
-		uint8_t const * xsecret = (uint8_t const *)secret;
-		uint32x2_t prime = vdup_n_u32(XXH_PRIME32_1);
-
-		size_t i;
-		/* NEON for the first few lanes (these loops are normally interleaved) */
-		for(i = 0; i < XXH3_NEON_LANES / 2; i++) {
-			/* xacc[i] ^= (xacc[i] >> 47); */
-			uint64x2_t acc_vec = xacc[i];
-			uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
-			uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
-
-			/* xacc[i] ^= xsecret[i]; */
-			uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
-			uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
-
-			/* xacc[i] *= XXH_PRIME32_1 */
-			uint32x2_t data_key_lo, data_key_hi;
-			/* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
-			 * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
-			 * xacc[i] = UNDEFINED; */
-			XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
-			{ /*
-			   * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
-			   *
-			   * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
-			   * incorrectly "optimize" this:
-			   *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
-			   *   shifted = vshll_n_u32(tmp, 32);
-			   * to this:
-			   *   tmp     = "vmulq_u64"(a, b); // no such thing!
-			   *   shifted = vshlq_n_u64(tmp, 32);
-			   *
-			   * However, unlike SSE, Clang lacks a 64-bit multiply routine
-			   * for NEON, and it scalarizes two 64-bit multiplies instead.
-			   *
-			   * vmull_u32 has the same timing as vmul_u32, and it avoids
-			   * this bug completely.
-			   * See https://bugs.llvm.org/show_bug.cgi?id=39967
-			   */
-				uint64x2_t prod_hi = vmull_u32(data_key_hi, prime);
-				/* xacc[i] = prod_hi << 32; */
-				xacc[i] = vshlq_n_u64(prod_hi, 32);
-				/* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
-				xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
-			}
-		}
-		/* Scalar for the remainder. This may be a zero iteration loop. */
-		for(i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
-			XXH3_scalarScrambleRound(acc, secret, i);
-		}
-	}
-}
-
-#endif
-
-#if(XXH_VECTOR == XXH_VSX)
-
-XXH_FORCE_INLINE void
-XXH3_accumulate_512_vsx(void* XXH_RESTRICT acc,
-                        const void* XXH_RESTRICT input,
-                        const void* XXH_RESTRICT secret) {
-	/* presumed aligned */
-	unsigned int* const xacc = (unsigned int*)acc;
-	xxh_u64x2 const * const xinput = (xxh_u64x2 const *)input;   /* no alignment restriction */
-	xxh_u64x2 const * const xsecret = (xxh_u64x2 const *)secret; /* no alignment restriction */
-	xxh_u64x2 const v32 = {32, 32};
-	size_t i;
-	for(i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
-		/* data_vec = xinput[i]; */
-		xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
-		/* key_vec = xsecret[i]; */
-		xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
-		xxh_u64x2 const data_key = data_vec ^ key_vec;
-		/* shuffled = (data_key << 32) | (data_key >> 32); */
-		xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
-		/* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
-		xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
-		/* acc_vec = xacc[i]; */
-		xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
-		acc_vec += product;
-
-		/* swap high and low halves */
-#ifdef __s390x__
-		acc_vec += vec_permi(data_vec, data_vec, 2);
-#else
-		acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
-#endif
-		/* xacc[i] = acc_vec; */
-		vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
-	}
-}
-
-XXH_FORCE_INLINE void
-XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) {
-	XXH_ASSERT((((size_t)acc) & 15) == 0);
-
-	{
-		xxh_u64x2* const xacc = (xxh_u64x2*)acc;
-		const xxh_u64x2* const xsecret = (const xxh_u64x2*)secret;
-		/* constants */
-		xxh_u64x2 const v32 = {32, 32};
-		xxh_u64x2 const v47 = {47, 47};
-		xxh_u32x4 const prime = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1};
-		size_t i;
-		for(i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
-			/* xacc[i] ^= (xacc[i] >> 47); */
-			xxh_u64x2 const acc_vec = xacc[i];
-			xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
-
-			/* xacc[i] ^= xsecret[i]; */
-			xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
-			xxh_u64x2 const data_key = data_vec ^ key_vec;
-
-			/* xacc[i] *= XXH_PRIME32_1 */
-			/* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
-			xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
-			/* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
-			xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
-			xacc[i] = prod_odd + (prod_even << v32);
-		}
-	}
-}
-
-#endif
-
-/* scalar variants - universal */
-
-/*!
- * @internal
- * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
- *
- * This is extracted to its own function because the NEON path uses a combination
- * of NEON and scalar.
- */
-XXH_FORCE_INLINE void
-XXH3_scalarRound(void* XXH_RESTRICT acc,
-                 void const * XXH_RESTRICT input,
-                 void const * XXH_RESTRICT secret,
-                 size_t lane) {
-	xxh_u64* xacc = (xxh_u64*)acc;
-	xxh_u8 const * xinput = (xxh_u8 const *)input;
-	xxh_u8 const * xsecret = (xxh_u8 const *)secret;
-	XXH_ASSERT(lane < XXH_ACC_NB);
-	XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0);
-	{
-		xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
-		xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
-		xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
-		xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
-	}
-}
-
-/*!
- * @internal
- * @brief Processes a 64 byte block of data using the scalar path.
- */
-XXH_FORCE_INLINE void
-XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
-                           const void* XXH_RESTRICT input,
-                           const void* XXH_RESTRICT secret) {
-	size_t i;
-	for(i = 0; i < XXH_ACC_NB; i++) {
-		XXH3_scalarRound(acc, input, secret, i);
-	}
-}
-
-/*!
- * @internal
- * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
- *
- * This is extracted to its own function because the NEON path uses a combination
- * of NEON and scalar.
- */
-XXH_FORCE_INLINE void
-XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
-                         void const * XXH_RESTRICT secret,
-                         size_t lane) {
-	xxh_u64* const xacc = (xxh_u64*)acc;                 /* presumed aligned */
-	const xxh_u8* const xsecret = (const xxh_u8*)secret; /* no alignment restriction */
-	XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN - 1)) == 0);
-	XXH_ASSERT(lane < XXH_ACC_NB);
-	{
-		xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
-		xxh_u64 acc64 = xacc[lane];
-		acc64 = XXH_xorshift64(acc64, 47);
-		acc64 ^= key64;
-		acc64 *= XXH_PRIME32_1;
-		xacc[lane] = acc64;
-	}
-}
-
-/*!
- * @internal
- * @brief Scrambles the accumulators after a large chunk has been read
- */
-XXH_FORCE_INLINE void
-XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) {
-	size_t i;
-	for(i = 0; i < XXH_ACC_NB; i++) {
-		XXH3_scalarScrambleRound(acc, secret, i);
-	}
-}
-
-XXH_FORCE_INLINE void
-XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) {
-	/*
-	 * We need a separate pointer for the hack below,
-	 * which requires a non-const pointer.
-	 * Any decent compiler will optimize this out otherwise.
-	 */
-	const xxh_u8* kSecretPtr = XXH3_kSecret;
-	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
-
-#if defined(__clang__) && defined(__aarch64__)
-	/*
-	 * UGLY HACK:
-	 * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
-	 * placed sequentially, in order, at the top of the unrolled loop.
-	 *
-	 * While MOVK is great for generating constants (2 cycles for a 64-bit
-	 * constant compared to 4 cycles for LDR), it fights for bandwidth with
-	 * the arithmetic instructions.
-	 *
-	 *   I   L   S
-	 * MOVK
-	 * MOVK
-	 * MOVK
-	 * MOVK
-	 * ADD
-	 * SUB      STR
-	 *          STR
-	 * By forcing loads from memory (as the asm line causes Clang to assume
-	 * that XXH3_kSecretPtr has been changed), the pipelines are used more
-	 * efficiently:
-	 *   I   L   S
-	 *      LDR
-	 *  ADD LDR
-	 *  SUB     STR
-	 *          STR
-	 *
-	 * See XXH3_NEON_LANES for details on the pipsline.
-	 *
-	 * XXH3_64bits_withSeed, len == 256, Snapdragon 835
-	 *   without hack: 2654.4 MB/s
-	 *   with hack:    3202.9 MB/s
-	 */
-	XXH_COMPILER_GUARD(kSecretPtr);
-#endif
-	/*
-	 * Note: in debug mode, this overrides the asm optimization
-	 * and Clang will emit MOVK chains again.
-	 */
-	XXH_ASSERT(kSecretPtr == XXH3_kSecret);
-
-	{
-		int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
-		int i;
-		for(i = 0; i < nbRounds; i++) {
-			/*
-			 * The asm hack causes Clang to assume that kSecretPtr aliases with
-			 * customSecret, and on aarch64, this prevented LDP from merging two
-			 * loads together for free. Putting the loads together before the stores
-			 * properly generates LDP.
-			 */
-			xxh_u64 lo = XXH_readLE64(kSecretPtr + 16 * i) + seed64;
-			xxh_u64 hi = XXH_readLE64(kSecretPtr + 16 * i + 8) - seed64;
-			XXH_writeLE64((xxh_u8*)customSecret + 16 * i, lo);
-			XXH_writeLE64((xxh_u8*)customSecret + 16 * i + 8, hi);
-		}
-	}
-}
-
-typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
-typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
-typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
-
-#if(XXH_VECTOR == XXH_AVX512)
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
-#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
-#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
-
-#elif(XXH_VECTOR == XXH_AVX2)
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
-#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
-#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
-
-#elif(XXH_VECTOR == XXH_SSE2)
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
-#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
-#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
-
-#elif(XXH_VECTOR == XXH_NEON)
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_neon
-#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
-#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
-
-#elif(XXH_VECTOR == XXH_VSX)
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
-#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
-#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
-
-#else /* scalar */
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
-#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
-#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
-
-#endif
-
-#ifndef XXH_PREFETCH_DIST
-#ifdef __clang__
-#define XXH_PREFETCH_DIST 320
-#else
-#if(XXH_VECTOR == XXH_AVX512)
-#define XXH_PREFETCH_DIST 512
-#else
-#define XXH_PREFETCH_DIST 384
-#endif
-#endif /* __clang__ */
-#endif /* XXH_PREFETCH_DIST */
-
-/*
- * XXH3_accumulate()
- * Loops over XXH3_accumulate_512().
- * Assumption: nbStripes will not overflow the secret size
- */
-XXH_FORCE_INLINE void
-XXH3_accumulate(xxh_u64* XXH_RESTRICT acc,
-                const xxh_u8* XXH_RESTRICT input,
-                const xxh_u8* XXH_RESTRICT secret,
-                size_t nbStripes,
-                XXH3_f_accumulate_512 f_acc512) {
-	size_t n;
-	for(n = 0; n < nbStripes; n++) {
-		const xxh_u8* const in = input + n * XXH_STRIPE_LEN;
-		XXH_PREFETCH(in + XXH_PREFETCH_DIST);
-		f_acc512(acc,
-		         in,
-		         secret + n * XXH_SECRET_CONSUME_RATE);
-	}
-}
-
-XXH_FORCE_INLINE void
-XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
-                            const xxh_u8* XXH_RESTRICT input, size_t len,
-                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                            XXH3_f_accumulate_512 f_acc512,
-                            XXH3_f_scrambleAcc f_scramble) {
-	size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
-	size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
-	size_t const nb_blocks = (len - 1) / block_len;
-
-	size_t n;
-
-	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-
-	for(n = 0; n < nb_blocks; n++) {
-		XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock, f_acc512);
-		f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
-	}
-
-	/* last partial block */
-	XXH_ASSERT(len > XXH_STRIPE_LEN);
-	{
-		size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
-		XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
-		XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes, f_acc512);
-
-		/* last stripe */
-		{
-			const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
-#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */
-			f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
-		}
-	}
-}
-
-XXH_FORCE_INLINE xxh_u64
-XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) {
-	return XXH3_mul128_fold64(
-	    acc[0] ^ XXH_readLE64(secret),
-	    acc[1] ^ XXH_readLE64(secret + 8));
-}
-
-static XXH64_hash_t
-XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) {
-	xxh_u64 result64 = start;
-	size_t i = 0;
-
-	for(i = 0; i < 4; i++) {
-		result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
-#if defined(__clang__)                                /* Clang */ \
-    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
-    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
-    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
-		/*
-		 * UGLY HACK:
-		 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
-		 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
-		 * XXH3_64bits, len == 256, Snapdragon 835:
-		 *   without hack: 2063.7 MB/s
-		 *   with hack:    2560.7 MB/s
-		 */
-		XXH_COMPILER_GUARD(result64);
-#endif
-	}
-
-	return XXH3_avalanche(result64);
-}
-
-#define XXH3_INIT_ACC                                             \
-	{ XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
-	  XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
-
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
-                           const void* XXH_RESTRICT secret, size_t secretSize,
-                           XXH3_f_accumulate_512 f_acc512,
-                           XXH3_f_scrambleAcc f_scramble) {
-	XXH_ALIGN(XXH_ACC_ALIGN)
-	xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
-
-	XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
-
-	/* converge into final hash */
-	XXH_STATIC_ASSERT(sizeof(acc) == 64);
-	/* do not align on 8, so that the secret is different from the accumulator */
-#define XXH_SECRET_MERGEACCS_START 11
-	XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-	return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
-}
-
-/*
- * It's important for performance to transmit secret's size (when it's static)
- * so that the compiler can properly optimize the vectorized loop.
- * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
- */
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
-                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) {
-	(void)seed64;
-	return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
-}
-
-/*
- * It's preferable for performance that XXH3_hashLong is not inlined,
- * as it results in a smaller function for small data, easier to the instruction cache.
- * Note that inside this no_inline function, we do inline the internal loop,
- * and provide a statically defined secret size to allow optimization of vector loop.
- */
-XXH_NO_INLINE XXH64_hash_t
-XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
-                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) {
-	(void)seed64;
-	(void)secret;
-	(void)secretLen;
-	return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
-}
-
-/*
- * XXH3_hashLong_64b_withSeed():
- * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
- * and then use this key for long mode hashing.
- *
- * This operation is decently fast but nonetheless costs a little bit of time.
- * Try to avoid it whenever possible (typically when seed==0).
- *
- * It's important for performance that XXH3_hashLong is not inlined. Not sure
- * why (uop cache maybe?), but the difference is large and easily measurable.
- */
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
-                                    XXH64_hash_t seed,
-                                    XXH3_f_accumulate_512 f_acc512,
-                                    XXH3_f_scrambleAcc f_scramble,
-                                    XXH3_f_initCustomSecret f_initSec) {
-	if(seed == 0)
-		return XXH3_hashLong_64b_internal(input, len,
-		                                  XXH3_kSecret, sizeof(XXH3_kSecret),
-		                                  f_acc512, f_scramble);
-	{
-		XXH_ALIGN(XXH_SEC_ALIGN)
-		xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
-		f_initSec(secret, seed);
-		return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
-		                                  f_acc512, f_scramble);
-	}
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH64_hash_t
-XXH3_hashLong_64b_withSeed(const void* input, size_t len,
-                           XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) {
-	(void)secret;
-	(void)secretLen;
-	return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
-	                                           XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
-}
-
-typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
-                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
-
-XXH_FORCE_INLINE XXH64_hash_t
-XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
-                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
-                     XXH3_hashLong64_f f_hashLong) {
-	XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
-	/*
-	 * If an action is to be taken if `secretLen` condition is not respected,
-	 * it should be done here.
-	 * For now, it's a contract pre-condition.
-	 * Adding a check and a branch here would cost performance at every hash.
-	 * Also, note that function signature doesn't offer room to return an error.
-	 */
-	if(len <= 16)
-		return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
-	if(len <= 128)
-		return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
-	if(len <= XXH3_MIDSIZE_MAX)
-		return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
-	return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
-}
-
-/* ===   Public entry point   === */
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) {
-	return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) {
-	return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) {
-	return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
-}
-
-XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) {
-	if(len <= XXH3_MIDSIZE_MAX)
-		return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
-	return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);
-}
-
-/* ===   XXH3 streaming   === */
-
-/*
- * Malloc's a pointer that is always aligned to align.
- *
- * This must be freed with `XXH_alignedFree()`.
- *
- * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
- * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
- * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
- *
- * This underalignment previously caused a rather obvious crash which went
- * completely unnoticed due to XXH3_createState() not actually being tested.
- * Credit to RedSpah for noticing this bug.
- *
- * The alignment is done manually: Functions like posix_memalign or _mm_malloc
- * are avoided: To maintain portability, we would have to write a fallback
- * like this anyways, and besides, testing for the existence of library
- * functions without relying on external build tools is impossible.
- *
- * The method is simple: Overallocate, manually align, and store the offset
- * to the original behind the returned pointer.
- *
- * Align must be a power of 2 and 8 <= align <= 128.
- */
-static void* XXH_alignedMalloc(size_t s, size_t align) {
-	XXH_ASSERT(align <= 128 && align >= 8); /* range check */
-	XXH_ASSERT((align & (align - 1)) == 0); /* power of 2 */
-	XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
-	{                                       /* Overallocate to make room for manual realignment and an offset byte */
-		xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
-		if(base != NULL) {
-			/*
-			 * Get the offset needed to align this pointer.
-			 *
-			 * Even if the returned pointer is aligned, there will always be
-			 * at least one byte to store the offset to the original pointer.
-			 */
-			size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
-			/* Add the offset for the now-aligned pointer */
-			xxh_u8* ptr = base + offset;
-
-			XXH_ASSERT((size_t)ptr % align == 0);
-
-			/* Store the offset immediately before the returned pointer. */
-			ptr[-1] = (xxh_u8)offset;
-			return ptr;
-		}
-		return NULL;
-	}
-}
-/*
- * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
- * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
- */
-static void XXH_alignedFree(void* p) {
-	if(p != NULL) {
-		xxh_u8* ptr = (xxh_u8*)p;
-		/* Get the offset byte we added in XXH_malloc. */
-		xxh_u8 offset = ptr[-1];
-		/* Free the original malloc'd pointer */
-		xxh_u8* base = ptr - offset;
-		XXH_free(base);
-	}
-}
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) {
-	XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
-	if(state == NULL)
-		return NULL;
-	XXH3_INITSTATE(state);
-	return state;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) {
-	XXH_alignedFree(statePtr);
-	return XXH_OK;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API void
-XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) {
-	XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
-}
-
-static void
-XXH3_reset_internal(XXH3_state_t* statePtr,
-                    XXH64_hash_t seed,
-                    const void* secret, size_t secretSize) {
-	size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
-	size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
-	XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
-	XXH_ASSERT(statePtr != NULL);
-	/* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
-	memset((char*)statePtr + initStart, 0, initLength);
-	statePtr->acc[0] = XXH_PRIME32_3;
-	statePtr->acc[1] = XXH_PRIME64_1;
-	statePtr->acc[2] = XXH_PRIME64_2;
-	statePtr->acc[3] = XXH_PRIME64_3;
-	statePtr->acc[4] = XXH_PRIME64_4;
-	statePtr->acc[5] = XXH_PRIME32_2;
-	statePtr->acc[6] = XXH_PRIME64_5;
-	statePtr->acc[7] = XXH_PRIME32_1;
-	statePtr->seed = seed;
-	statePtr->useSeed = (seed != 0);
-	statePtr->extSecret = (const unsigned char*)secret;
-	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-	statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
-	statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset(XXH3_state_t* statePtr) {
-	if(statePtr == NULL)
-		return XXH_ERROR;
-	XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
-	return XXH_OK;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) {
-	if(statePtr == NULL)
-		return XXH_ERROR;
-	XXH3_reset_internal(statePtr, 0, secret, secretSize);
-	if(secret == NULL)
-		return XXH_ERROR;
-	if(secretSize < XXH3_SECRET_SIZE_MIN)
-		return XXH_ERROR;
-	return XXH_OK;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) {
-	if(statePtr == NULL)
-		return XXH_ERROR;
-	if(seed == 0)
-		return XXH3_64bits_reset(statePtr);
-	if((seed != statePtr->seed) || (statePtr->extSecret != NULL))
-		XXH3_initCustomSecret(statePtr->customSecret, seed);
-	XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
-	return XXH_OK;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64) {
-	if(statePtr == NULL)
-		return XXH_ERROR;
-	if(secret == NULL)
-		return XXH_ERROR;
-	if(secretSize < XXH3_SECRET_SIZE_MIN)
-		return XXH_ERROR;
-	XXH3_reset_internal(statePtr, seed64, secret, secretSize);
-	statePtr->useSeed = 1; /* always, even if seed64==0 */
-	return XXH_OK;
-}
-
-/* Note : when XXH3_consumeStripes() is invoked,
- * there must be a guarantee that at least one more byte must be consumed from input
- * so that the function can blindly consume all stripes using the "normal" secret segment */
-XXH_FORCE_INLINE void
-XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
-                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
-                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
-                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
-                    XXH3_f_accumulate_512 f_acc512,
-                    XXH3_f_scrambleAcc f_scramble) {
-	XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */
-	XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
-	if(nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
-		/* need a scrambling operation */
-		size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
-		size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
-		XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
-		f_scramble(acc, secret + secretLimit);
-		XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
-		*nbStripesSoFarPtr = nbStripesAfterBlock;
-	} else {
-		XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
-		*nbStripesSoFarPtr += nbStripes;
-	}
-}
-
-#ifndef XXH3_STREAM_USE_STACK
-#ifndef __clang__ /* clang doesn't need additional stack space */
-#define XXH3_STREAM_USE_STACK 1
-#endif
-#endif
-/*
- * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
- */
-XXH_FORCE_INLINE XXH_errorcode
-XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
-            const xxh_u8* XXH_RESTRICT input, size_t len,
-            XXH3_f_accumulate_512 f_acc512,
-            XXH3_f_scrambleAcc f_scramble) {
-	if(input == NULL) {
-		XXH_ASSERT(len == 0);
-		return XXH_OK;
-	}
-
-	XXH_ASSERT(state != NULL);
-	{
-		const xxh_u8* const bEnd = input + len;
-		const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
-#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
-		/* For some reason, gcc and MSVC seem to suffer greatly
-		 * when operating accumulators directly into state.
-		 * Operating into stack space seems to enable proper optimization.
-		 * clang, on the other hand, doesn't seem to need this trick */
-		XXH_ALIGN(XXH_ACC_ALIGN)
-		xxh_u64 acc[8];
-		memcpy(acc, state->acc, sizeof(acc));
-#else
-		xxh_u64* XXH_RESTRICT const acc = state->acc;
-#endif
-		state->totalLen += len;
-		XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
-
-		/* small input : just fill in tmp buffer */
-		if(state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
-			XXH_memcpy(state->buffer + state->bufferedSize, input, len);
-			state->bufferedSize += (XXH32_hash_t)len;
-			return XXH_OK;
-		}
-
-/* total input is now > XXH3_INTERNALBUFFER_SIZE */
-#define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
-		XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
-
-		/*
-		 * Internal buffer is partially filled (always, except at beginning)
-		 * Complete it, then consume it.
-		 */
-		if(state->bufferedSize) {
-			size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
-			XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
-			input += loadSize;
-			XXH3_consumeStripes(acc,
-			                    &state->nbStripesSoFar, state->nbStripesPerBlock,
-			                    state->buffer, XXH3_INTERNALBUFFER_STRIPES,
-			                    secret, state->secretLimit,
-			                    f_acc512, f_scramble);
-			state->bufferedSize = 0;
-		}
-		XXH_ASSERT(input < bEnd);
-
-		/* large input to consume : ingest per full block */
-		if((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
-			size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
-			XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
-			/* join to current block's end */
-			{
-				size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
-				XXH_ASSERT(nbStripesToEnd <= nbStripes);
-				XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
-				f_scramble(acc, secret + state->secretLimit);
-				state->nbStripesSoFar = 0;
-				input += nbStripesToEnd * XXH_STRIPE_LEN;
-				nbStripes -= nbStripesToEnd;
-			}
-			/* consume per entire blocks */
-			while(nbStripes >= state->nbStripesPerBlock) {
-				XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
-				f_scramble(acc, secret + state->secretLimit);
-				input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
-				nbStripes -= state->nbStripesPerBlock;
-			}
-			/* consume last partial block */
-			XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
-			input += nbStripes * XXH_STRIPE_LEN;
-			XXH_ASSERT(input < bEnd); /* at least some bytes left */
-			state->nbStripesSoFar = nbStripes;
-			/* buffer predecessor of last partial stripe */
-			XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
-			XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
-		} else {
-			/* content to consume <= block size */
-			/* Consume input by a multiple of internal buffer size */
-			if(bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
-				const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
-				do {
-					XXH3_consumeStripes(acc,
-					                    &state->nbStripesSoFar, state->nbStripesPerBlock,
-					                    input, XXH3_INTERNALBUFFER_STRIPES,
-					                    secret, state->secretLimit,
-					                    f_acc512, f_scramble);
-					input += XXH3_INTERNALBUFFER_SIZE;
-				} while(input < limit);
-				/* buffer predecessor of last partial stripe */
-				XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
-			}
-		}
-
-		/* Some remaining input (always) : buffer it */
-		XXH_ASSERT(input < bEnd);
-		XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
-		XXH_ASSERT(state->bufferedSize == 0);
-		XXH_memcpy(state->buffer, input, (size_t)(bEnd - input));
-		state->bufferedSize = (XXH32_hash_t)(bEnd - input);
-#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
-		/* save stack accumulators into state */
-		memcpy(state->acc, acc, sizeof(acc));
-#endif
-	}
-
-	return XXH_OK;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) {
-	return XXH3_update(state, (const xxh_u8*)input, len,
-	                   XXH3_accumulate_512, XXH3_scrambleAcc);
-}
-
-XXH_FORCE_INLINE void
-XXH3_digest_long(XXH64_hash_t* acc,
-                 const XXH3_state_t* state,
-                 const unsigned char* secret) {
-	/*
-	 * Digest on a local copy. This way, the state remains unaltered, and it can
-	 * continue ingesting more input afterwards.
-	 */
-	XXH_memcpy(acc, state->acc, sizeof(state->acc));
-	if(state->bufferedSize >= XXH_STRIPE_LEN) {
-		size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
-		size_t nbStripesSoFar = state->nbStripesSoFar;
-		XXH3_consumeStripes(acc,
-		                    &nbStripesSoFar, state->nbStripesPerBlock,
-		                    state->buffer, nbStripes,
-		                    secret, state->secretLimit,
-		                    XXH3_accumulate_512, XXH3_scrambleAcc);
-		/* last stripe */
-		XXH3_accumulate_512(acc,
-		                    state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
-		                    secret + state->secretLimit - XXH_SECRET_LASTACC_START);
-	} else { /* bufferedSize < XXH_STRIPE_LEN */
-		xxh_u8 lastStripe[XXH_STRIPE_LEN];
-		size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
-		XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
-		XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
-		XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
-		XXH3_accumulate_512(acc,
-		                    lastStripe,
-		                    secret + state->secretLimit - XXH_SECRET_LASTACC_START);
-	}
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t* state) {
-	const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
-	if(state->totalLen > XXH3_MIDSIZE_MAX) {
-		XXH_ALIGN(XXH_ACC_ALIGN)
-		XXH64_hash_t acc[XXH_ACC_NB];
-		XXH3_digest_long(acc, state, secret);
-		return XXH3_mergeAccs(acc,
-		                      secret + XXH_SECRET_MERGEACCS_START,
-		                      (xxh_u64)state->totalLen * XXH_PRIME64_1);
-	}
-	/* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
-	if(state->useSeed)
-		return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
-	return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
-	                              secret, state->secretLimit + XXH_STRIPE_LEN);
-}
-
-/* ==========================================
- * XXH3 128 bits (a.k.a XXH128)
- * ==========================================
- * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
- * even without counting the significantly larger output size.
- *
- * For example, extra steps are taken to avoid the seed-dependent collisions
- * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
- *
- * This strength naturally comes at the cost of some speed, especially on short
- * lengths. Note that longer hashes are about as fast as the 64-bit version
- * due to it using only a slight modification of the 64-bit loop.
- *
- * XXH128 is also more oriented towards 64-bit machines. It is still extremely
- * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
- */
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	/* A doubled version of 1to3_64b with different constants. */
-	XXH_ASSERT(input != NULL);
-	XXH_ASSERT(1 <= len && len <= 3);
-	XXH_ASSERT(secret != NULL);
-	/*
-	 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
-	 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
-	 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
-	 */
-	{
-		xxh_u8 const c1 = input[0];
-		xxh_u8 const c2 = input[len >> 1];
-		xxh_u8 const c3 = input[len - 1];
-		xxh_u32 const combinedl = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
-		xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
-		xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
-		xxh_u64 const bitfliph = (XXH_readLE32(secret + 8) ^ XXH_readLE32(secret + 12)) - seed;
-		xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
-		xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
-		XXH128_hash_t h128;
-		h128.low64 = XXH64_avalanche(keyed_lo);
-		h128.high64 = XXH64_avalanche(keyed_hi);
-		return h128;
-	}
-}
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	XXH_ASSERT(input != NULL);
-	XXH_ASSERT(secret != NULL);
-	XXH_ASSERT(4 <= len && len <= 8);
-	seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
-	{
-		xxh_u32 const input_lo = XXH_readLE32(input);
-		xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
-		xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
-		xxh_u64 const bitflip = (XXH_readLE64(secret + 16) ^ XXH_readLE64(secret + 24)) + seed;
-		xxh_u64 const keyed = input_64 ^ bitflip;
-
-		/* Shift len to the left to ensure it is even, this avoids even multiplies. */
-		XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
-
-		m128.high64 += (m128.low64 << 1);
-		m128.low64 ^= (m128.high64 >> 3);
-
-		m128.low64 = XXH_xorshift64(m128.low64, 35);
-		m128.low64 *= 0x9FB21C651E98DF25ULL;
-		m128.low64 = XXH_xorshift64(m128.low64, 28);
-		m128.high64 = XXH3_avalanche(m128.high64);
-		return m128;
-	}
-}
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	XXH_ASSERT(input != NULL);
-	XXH_ASSERT(secret != NULL);
-	XXH_ASSERT(9 <= len && len <= 16);
-	{
-		xxh_u64 const bitflipl = (XXH_readLE64(secret + 32) ^ XXH_readLE64(secret + 40)) - seed;
-		xxh_u64 const bitfliph = (XXH_readLE64(secret + 48) ^ XXH_readLE64(secret + 56)) + seed;
-		xxh_u64 const input_lo = XXH_readLE64(input);
-		xxh_u64 input_hi = XXH_readLE64(input + len - 8);
-		XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
-		/*
-		 * Put len in the middle of m128 to ensure that the length gets mixed to
-		 * both the low and high bits in the 128x64 multiply below.
-		 */
-		m128.low64 += (xxh_u64)(len - 1) << 54;
-		input_hi ^= bitfliph;
-		/*
-		 * Add the high 32 bits of input_hi to the high 32 bits of m128, then
-		 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
-		 * the high 64 bits of m128.
-		 *
-		 * The best approach to this operation is different on 32-bit and 64-bit.
-		 */
-		if(sizeof(void*) < sizeof(xxh_u64)) { /* 32-bit */
-			/*
-			 * 32-bit optimized version, which is more readable.
-			 *
-			 * On 32-bit, it removes an ADC and delays a dependency between the two
-			 * halves of m128.high64, but it generates an extra mask on 64-bit.
-			 */
-			m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
-		} else {
-			/*
-			 * 64-bit optimized (albeit more confusing) version.
-			 *
-			 * Uses some properties of addition and multiplication to remove the mask:
-			 *
-			 * Let:
-			 *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
-			 *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
-			 *    c = XXH_PRIME32_2
-			 *
-			 *    a + (b * c)
-			 * Inverse Property: x + y - x == y
-			 *    a + (b * (1 + c - 1))
-			 * Distributive Property: x * (y + z) == (x * y) + (x * z)
-			 *    a + (b * 1) + (b * (c - 1))
-			 * Identity Property: x * 1 == x
-			 *    a + b + (b * (c - 1))
-			 *
-			 * Substitute a, b, and c:
-			 *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
-			 *
-			 * Since input_hi.hi + input_hi.lo == input_hi, we get this:
-			 *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
-			 */
-			m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
-		}
-		/* m128 ^= XXH_swap64(m128 >> 64); */
-		m128.low64 ^= XXH_swap64(m128.high64);
-
-		{ /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
-			XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
-			h128.high64 += m128.high64 * XXH_PRIME64_2;
-
-			h128.low64 = XXH3_avalanche(h128.low64);
-			h128.high64 = XXH3_avalanche(h128.high64);
-			return h128;
-		}
-	}
-}
-
-/*
- * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
- */
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) {
-	XXH_ASSERT(len <= 16);
-	{
-		if(len > 8)
-			return XXH3_len_9to16_128b(input, len, secret, seed);
-		if(len >= 4)
-			return XXH3_len_4to8_128b(input, len, secret, seed);
-		if(len)
-			return XXH3_len_1to3_128b(input, len, secret, seed);
-		{
-			XXH128_hash_t h128;
-			xxh_u64 const bitflipl = XXH_readLE64(secret + 64) ^ XXH_readLE64(secret + 72);
-			xxh_u64 const bitfliph = XXH_readLE64(secret + 80) ^ XXH_readLE64(secret + 88);
-			h128.low64 = XXH64_avalanche(seed ^ bitflipl);
-			h128.high64 = XXH64_avalanche(seed ^ bitfliph);
-			return h128;
-		}
-	}
-}
-
-/*
- * A bit slower than XXH3_mix16B, but handles multiply by zero better.
- */
-XXH_FORCE_INLINE XXH128_hash_t
-XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
-              const xxh_u8* secret, XXH64_hash_t seed) {
-	acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
-	acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
-	acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
-	acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
-	return acc;
-}
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
-                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                      XXH64_hash_t seed) {
-	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-	(void)secretSize;
-	XXH_ASSERT(16 < len && len <= 128);
-
-	{
-		XXH128_hash_t acc;
-		acc.low64 = len * XXH_PRIME64_1;
-		acc.high64 = 0;
-		if(len > 32) {
-			if(len > 64) {
-				if(len > 96) {
-					acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed);
-				}
-				acc = XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
-			}
-			acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
-		}
-		acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
-		{
-			XXH128_hash_t h128;
-			h128.low64 = acc.low64 + acc.high64;
-			h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) + ((len - seed) * XXH_PRIME64_2);
-			h128.low64 = XXH3_avalanche(h128.low64);
-			h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
-			return h128;
-		}
-	}
-}
-
-XXH_NO_INLINE XXH128_hash_t
-XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
-                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                       XXH64_hash_t seed) {
-	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-	(void)secretSize;
-	XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
-
-	{
-		XXH128_hash_t acc;
-		int const nbRounds = (int)len / 32;
-		int i;
-		acc.low64 = len * XXH_PRIME64_1;
-		acc.high64 = 0;
-		for(i = 0; i < 4; i++) {
-			acc = XXH128_mix32B(acc,
-			                    input + (32 * i),
-			                    input + (32 * i) + 16,
-			                    secret + (32 * i),
-			                    seed);
-		}
-		acc.low64 = XXH3_avalanche(acc.low64);
-		acc.high64 = XXH3_avalanche(acc.high64);
-		XXH_ASSERT(nbRounds >= 4);
-		for(i = 4; i < nbRounds; i++) {
-			acc = XXH128_mix32B(acc,
-			                    input + (32 * i),
-			                    input + (32 * i) + 16,
-			                    secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
-			                    seed);
-		}
-		/* last bytes */
-		acc = XXH128_mix32B(acc,
-		                    input + len - 16,
-		                    input + len - 32,
-		                    secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
-		                    0ULL - seed);
-
-		{
-			XXH128_hash_t h128;
-			h128.low64 = acc.low64 + acc.high64;
-			h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) + ((len - seed) * XXH_PRIME64_2);
-			h128.low64 = XXH3_avalanche(h128.low64);
-			h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
-			return h128;
-		}
-	}
-}
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
-                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-                            XXH3_f_accumulate_512 f_acc512,
-                            XXH3_f_scrambleAcc f_scramble) {
-	XXH_ALIGN(XXH_ACC_ALIGN)
-	xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
-
-	XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
-
-	/* converge into final hash */
-	XXH_STATIC_ASSERT(sizeof(acc) == 64);
-	XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-	{
-		XXH128_hash_t h128;
-		h128.low64 = XXH3_mergeAccs(acc,
-		                            secret + XXH_SECRET_MERGEACCS_START,
-		                            (xxh_u64)len * XXH_PRIME64_1);
-		h128.high64 = XXH3_mergeAccs(acc,
-		                             secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
-		                             ~((xxh_u64)len * XXH_PRIME64_2));
-		return h128;
-	}
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH128_hash_t
-XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
-                           XXH64_hash_t seed64,
-                           const void* XXH_RESTRICT secret, size_t secretLen) {
-	(void)seed64;
-	(void)secret;
-	(void)secretLen;
-	return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
-	                                   XXH3_accumulate_512, XXH3_scrambleAcc);
-}
-
-/*
- * It's important for performance to pass @secretLen (when it's static)
- * to the compiler, so that it can properly optimize the vectorized loop.
- */
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
-                              XXH64_hash_t seed64,
-                              const void* XXH_RESTRICT secret, size_t secretLen) {
-	(void)seed64;
-	return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
-	                                   XXH3_accumulate_512, XXH3_scrambleAcc);
-}
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
-                                     XXH64_hash_t seed64,
-                                     XXH3_f_accumulate_512 f_acc512,
-                                     XXH3_f_scrambleAcc f_scramble,
-                                     XXH3_f_initCustomSecret f_initSec) {
-	if(seed64 == 0)
-		return XXH3_hashLong_128b_internal(input, len,
-		                                   XXH3_kSecret, sizeof(XXH3_kSecret),
-		                                   f_acc512, f_scramble);
-	{
-		XXH_ALIGN(XXH_SEC_ALIGN)
-		xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
-		f_initSec(secret, seed64);
-		return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
-		                                   f_acc512, f_scramble);
-	}
-}
-
-/*
- * It's important for performance that XXH3_hashLong is not inlined.
- */
-XXH_NO_INLINE XXH128_hash_t
-XXH3_hashLong_128b_withSeed(const void* input, size_t len,
-                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) {
-	(void)secret;
-	(void)secretLen;
-	return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
-	                                            XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
-}
-
-typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
-                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);
-
-XXH_FORCE_INLINE XXH128_hash_t
-XXH3_128bits_internal(const void* input, size_t len,
-                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
-                      XXH3_hashLong128_f f_hl128) {
-	XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
-	/*
-	 * If an action is to be taken if `secret` conditions are not respected,
-	 * it should be done here.
-	 * For now, it's a contract pre-condition.
-	 * Adding a check and a branch here would cost performance at every hash.
-	 */
-	if(len <= 16)
-		return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
-	if(len <= 128)
-		return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
-	if(len <= XXH3_MIDSIZE_MAX)
-		return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
-	return f_hl128(input, len, seed64, secret, secretLen);
-}
-
-/* ===   Public XXH128 API   === */
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) {
-	return XXH3_128bits_internal(input, len, 0,
-	                             XXH3_kSecret, sizeof(XXH3_kSecret),
-	                             XXH3_hashLong_128b_default);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH128_hash_t
-XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) {
-	return XXH3_128bits_internal(input, len, 0,
-	                             (const xxh_u8*)secret, secretSize,
-	                             XXH3_hashLong_128b_withSecret);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH128_hash_t
-XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) {
-	return XXH3_128bits_internal(input, len, seed,
-	                             XXH3_kSecret, sizeof(XXH3_kSecret),
-	                             XXH3_hashLong_128b_withSeed);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH128_hash_t
-XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) {
-	if(len <= XXH3_MIDSIZE_MAX)
-		return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
-	return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH128_hash_t
-XXH128(const void* input, size_t len, XXH64_hash_t seed) {
-	return XXH3_128bits_withSeed(input, len, seed);
-}
-
-/* ===   XXH3 128-bit streaming   === */
-
-/*
- * All initialization and update functions are identical to 64-bit streaming variant.
- * The only difference is the finalization routine.
- */
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset(XXH3_state_t* statePtr) {
-	return XXH3_64bits_reset(statePtr);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) {
-	return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) {
-	return XXH3_64bits_reset_withSeed(statePtr, seed);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed) {
-	return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) {
-	return XXH3_update(state, (const xxh_u8*)input, len,
-	                   XXH3_accumulate_512, XXH3_scrambleAcc);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t* state) {
-	const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
-	if(state->totalLen > XXH3_MIDSIZE_MAX) {
-		XXH_ALIGN(XXH_ACC_ALIGN)
-		XXH64_hash_t acc[XXH_ACC_NB];
-		XXH3_digest_long(acc, state, secret);
-		XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
-		{
-			XXH128_hash_t h128;
-			h128.low64 = XXH3_mergeAccs(acc,
-			                            secret + XXH_SECRET_MERGEACCS_START,
-			                            (xxh_u64)state->totalLen * XXH_PRIME64_1);
-			h128.high64 = XXH3_mergeAccs(acc,
-			                             secret + state->secretLimit + XXH_STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
-			                             ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
-			return h128;
-		}
-	}
-	/* len <= XXH3_MIDSIZE_MAX : short code */
-	if(state->seed)
-		return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
-	return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
-	                               secret, state->secretLimit + XXH_STRIPE_LEN);
-}
-
-/* 128-bit utility functions */
-
-#include <string.h> /* memcmp, memcpy */
-
-/* return : 1 is equal, 0 if different */
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
-	/* note : XXH128_hash_t is compact, it has no padding byte */
-	return !(memcmp(&h1, &h2, sizeof(h1)));
-}
-
-/* This prototype is compatible with stdlib's qsort().
- * return : >0 if *h128_1  > *h128_2
- *          <0 if *h128_1  < *h128_2
- *          =0 if *h128_1 == *h128_2  */
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) {
-	XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
-	XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
-	int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
-	/* note : bets that, in most cases, hash values are different */
-	if(hcmp)
-		return hcmp;
-	return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
-}
-
-/*======   Canonical representation   ======*/
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API void
-XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) {
-	XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
-	if(XXH_CPU_LITTLE_ENDIAN) {
-		hash.high64 = XXH_swap64(hash.high64);
-		hash.low64 = XXH_swap64(hash.low64);
-	}
-	XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
-	XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH128_hash_t
-XXH128_hashFromCanonical(const XXH128_canonical_t* src) {
-	XXH128_hash_t h;
-	h.high64 = XXH_readBE64(src);
-	h.low64 = XXH_readBE64(src->digest + 8);
-	return h;
-}
-
-/* ==========================================
- * Secret generators
- * ==========================================
- */
-#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
-
-XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) {
-	XXH_writeLE64(dst, XXH_readLE64(dst) ^ h128.low64);
-	XXH_writeLE64((char*)dst + 8, XXH_readLE64((char*)dst + 8) ^ h128.high64);
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API XXH_errorcode
-XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize) {
-#if(XXH_DEBUGLEVEL >= 1)
-	XXH_ASSERT(secretBuffer != NULL);
-	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
-#else
-	/* production mode, assert() are disabled */
-	if(secretBuffer == NULL)
-		return XXH_ERROR;
-	if(secretSize < XXH3_SECRET_SIZE_MIN)
-		return XXH_ERROR;
-#endif
-
-	if(customSeedSize == 0) {
-		customSeed = XXH3_kSecret;
-		customSeedSize = XXH_SECRET_DEFAULT_SIZE;
-	}
-#if(XXH_DEBUGLEVEL >= 1)
-	XXH_ASSERT(customSeed != NULL);
-#else
-	if(customSeed == NULL)
-		return XXH_ERROR;
-#endif
-
-	/* Fill secretBuffer with a copy of customSeed - repeat as needed */
-	{
-		size_t pos = 0;
-		while(pos < secretSize) {
-			size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
-			memcpy((char*)secretBuffer + pos, customSeed, toCopy);
-			pos += toCopy;
-		}
-	}
-
-	{
-		size_t const nbSeg16 = secretSize / 16;
-		size_t n;
-		XXH128_canonical_t scrambler;
-		XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
-		for(n = 0; n < nbSeg16; n++) {
-			XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
-			XXH3_combine16((char*)secretBuffer + n * 16, h128);
-		}
-		/* last segment */
-		XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
-	}
-	return XXH_OK;
-}
-
-/*! @ingroup xxh3_family */
-XXH_PUBLIC_API void
-XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed) {
-	XXH_ALIGN(XXH_SEC_ALIGN)
-	xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
-	XXH3_initCustomSecret(secret, seed);
-	XXH_ASSERT(secretBuffer != NULL);
-	memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
-}
-
-/* Pop our optimization override from above */
-#if XXH_VECTOR == XXH_AVX2                                  /* AVX2 */           \
-    && defined(__GNUC__) && !defined(__clang__)             /* GCC, not Clang */ \
-    && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
-#pragma GCC pop_options
-#endif
-
-#endif /* XXH_NO_LONG_LONG */
-
-#endif /* XXH_NO_XXH3 */
-
-/*!
- * @}
- */
-#endif /* XXH_IMPLEMENTATION */
-
-#if defined(__cplusplus)
-}
-#endif
diff --git a/src/zstd/zdict.h b/src/zstd/zdict.h
index 5a2f9fe3f..2268f948a 100644
--- a/src/zstd/zdict.h
+++ b/src/zstd/zdict.h
@@ -8,7 +8,7 @@
  * You may select, at your option, one of the above-listed licenses.
  */
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
@@ -16,34 +16,35 @@ extern "C" {
 #define ZSTD_ZDICT_H
 
 /*======  Dependencies  ======*/
-#include <stddef.h> /* size_t */
+#include <stddef.h>  /* size_t */
+
 
 /* =====   ZDICTLIB_API : control library symbols visibility   ===== */
 #ifndef ZDICTLIB_VISIBLE
-/* Backwards compatibility with old macro name */
-#ifdef ZDICTLIB_VISIBILITY
-#define ZDICTLIB_VISIBLE ZDICTLIB_VISIBILITY
-#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
-#define ZDICTLIB_VISIBLE __attribute__((visibility("default")))
-#else
-#define ZDICTLIB_VISIBLE
-#endif
+   /* Backwards compatibility with old macro name */
+#  ifdef ZDICTLIB_VISIBILITY
+#    define ZDICTLIB_VISIBLE ZDICTLIB_VISIBILITY
+#  elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZDICTLIB_VISIBLE __attribute__ ((visibility ("default")))
+#  else
+#    define ZDICTLIB_VISIBLE
+#  endif
 #endif
 
 #ifndef ZDICTLIB_HIDDEN
-#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
-#define ZDICTLIB_HIDDEN __attribute__((visibility("hidden")))
-#else
-#define ZDICTLIB_HIDDEN
-#endif
+#  if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZDICTLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+#  else
+#    define ZDICTLIB_HIDDEN
+#  endif
 #endif
 
-#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT == 1)
-#define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBLE
-#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT == 1)
-#define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBLE
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
 #else
-#define ZDICTLIB_API ZDICTLIB_VISIBLE
+#  define ZDICTLIB_API ZDICTLIB_VISIBLE
 #endif
 
 /*******************************************************************************
@@ -185,6 +186,7 @@ extern "C" {
  *
  ******************************************************************************/
 
+
 /*! ZDICT_trainFromBuffer():
  *  Train a dictionary from an array of samples.
  *  Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
@@ -206,20 +208,20 @@ extern "C" {
  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
  */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                                          const void* samplesBuffer,
-                                          const size_t* samplesSizes, unsigned nbSamples);
+                                    const void* samplesBuffer,
+                                    const size_t* samplesSizes, unsigned nbSamples);
 
 typedef struct {
-	int compressionLevel;       /**< optimize for a specific zstd compression level; 0 means default */
-	unsigned notificationLevel; /**< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
-	unsigned dictID;            /**< force dictID value; 0 means auto mode (32-bits random value)
-	                             *   NOTE: The zstd format reserves some dictionary IDs for future use.
-	                             *         You may use them in private settings, but be warned that they
-	                             *         may be used by zstd in a public dictionary registry in the future.
-	                             *         These dictionary IDs are:
-	                             *           - low range  : <= 32767
-	                             *           - high range : >= (2^31)
-	                             */
+    int      compressionLevel;   /**< optimize for a specific zstd compression level; 0 means default */
+    unsigned notificationLevel;  /**< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
+    unsigned dictID;             /**< force dictID value; 0 means auto mode (32-bits random value)
+                                  *   NOTE: The zstd format reserves some dictionary IDs for future use.
+                                  *         You may use them in private settings, but be warned that they
+                                  *         may be used by zstd in a public dictionary registry in the future.
+                                  *         These dictionary IDs are:
+                                  *           - low range  : <= 32767
+                                  *           - high range : >= (2^31)
+                                  */
 } ZDICT_params_t;
 
 /*! ZDICT_finalizeDictionary():
@@ -258,30 +260,31 @@ typedef struct {
  *         * Samples are all exactly the same
  */
 ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
-                                             const void* dictContent, size_t dictContentSize,
-                                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                             ZDICT_params_t parameters);
+                                const void* dictContent, size_t dictContentSize,
+                                const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                ZDICT_params_t parameters);
+
 
 /*======   Helper functions   ======*/
-ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize);       /**< extracts dictID; @return zero if error (not a valid dictionary) */
-ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
+ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize);  /**< extracts dictID; @return zero if error (not a valid dictionary) */
+ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize);  /* returns dict header size; returns a ZSTD error code on failure */
 ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
 ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
 
-#endif /* ZSTD_ZDICT_H */
+#endif   /* ZSTD_ZDICT_H */
 
 #if defined(ZDICT_STATIC_LINKING_ONLY) && !defined(ZSTD_ZDICT_H_STATIC)
 #define ZSTD_ZDICT_H_STATIC
 
 /* This can be overridden externally to hide static symbols. */
 #ifndef ZDICTLIB_STATIC_API
-#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT == 1)
-#define ZDICTLIB_STATIC_API __declspec(dllexport) ZDICTLIB_VISIBLE
-#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT == 1)
-#define ZDICTLIB_STATIC_API __declspec(dllimport) ZDICTLIB_VISIBLE
-#else
-#define ZDICTLIB_STATIC_API ZDICTLIB_VISIBLE
-#endif
+#  if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#    define ZDICTLIB_STATIC_API __declspec(dllexport) ZDICTLIB_VISIBLE
+#  elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#    define ZDICTLIB_STATIC_API __declspec(dllimport) ZDICTLIB_VISIBLE
+#  else
+#    define ZDICTLIB_STATIC_API ZDICTLIB_VISIBLE
+#  endif
 #endif
 
 /* ====================================================================================
@@ -291,7 +294,7 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
  * Use them only in association with static linking.
  * ==================================================================================== */
 
-#define ZDICT_DICTSIZE_MIN 256
+#define ZDICT_DICTSIZE_MIN    256
 /* Deprecated: Remove in v1.6.0 */
 #define ZDICT_CONTENTSIZE_MIN 128
 
@@ -300,28 +303,28 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
  *  For others, value 0 means default.
  */
 typedef struct {
-	unsigned k;                       /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
-	unsigned d;                       /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
-	unsigned steps;                   /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
-	unsigned nbThreads;               /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
-	double splitPoint;                /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
-	unsigned shrinkDict;              /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking  */
-	unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
-	ZDICT_params_t zParams;
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
+    unsigned shrinkDict;         /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking  */
+    unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
+    ZDICT_params_t zParams;
 } ZDICT_cover_params_t;
 
 typedef struct {
-	unsigned k;                       /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
-	unsigned d;                       /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
-	unsigned f;                       /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(20)*/
-	unsigned steps;                   /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
-	unsigned nbThreads;               /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
-	double splitPoint;                /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
-	unsigned accel;                   /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
-	unsigned shrinkDict;              /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking  */
-	unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
-
-	ZDICT_params_t zParams;
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned f;                  /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(20) -- NOTE(review): the other fields use 0 as the default sentinel; confirm whether "0 means default(20)" was intended */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
+    unsigned accel;              /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
+    unsigned shrinkDict;         /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking  */
+    unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
+
+    ZDICT_params_t zParams;
 } ZDICT_fastCover_params_t;
 
 /*! ZDICT_trainFromBuffer_cover():
@@ -339,9 +342,9 @@ typedef struct {
  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
  */
 ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
-    void* dictBuffer, size_t dictBufferCapacity,
-    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-    ZDICT_cover_params_t parameters);
+          void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t parameters);
 
 /*! ZDICT_optimizeTrainFromBuffer_cover():
  * The same requirements as above hold for all the parameters except `parameters`.
@@ -361,9 +364,9 @@ ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
  * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
  */
 ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
-    void* dictBuffer, size_t dictBufferCapacity,
+          void* dictBuffer, size_t dictBufferCapacity,
     const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-    ZDICT_cover_params_t* parameters);
+          ZDICT_cover_params_t* parameters);
 
 /*! ZDICT_trainFromBuffer_fastCover():
  *  Train a dictionary from an array of samples using a modified version of COVER algorithm.
@@ -381,10 +384,10 @@ ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
  */
-ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void* dictBuffer,
-                                                           size_t dictBufferCapacity, const void* samplesBuffer,
-                                                           const size_t* samplesSizes, unsigned nbSamples,
-                                                           ZDICT_fastCover_params_t parameters);
+ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
+                    size_t dictBufferCapacity, const void *samplesBuffer,
+                    const size_t *samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t parameters);
 
 /*! ZDICT_optimizeTrainFromBuffer_fastCover():
  * The same requirements as above hold for all the parameters except `parameters`.
@@ -405,13 +408,13 @@ ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void* dictBuffer,
  * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread.
  */
 ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
-                                                                   size_t dictBufferCapacity, const void* samplesBuffer,
-                                                                   const size_t* samplesSizes, unsigned nbSamples,
-                                                                   ZDICT_fastCover_params_t* parameters);
+                    size_t dictBufferCapacity, const void* samplesBuffer,
+                    const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters);
 
 typedef struct {
-	unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
-	ZDICT_params_t zParams;
+    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
+    ZDICT_params_t zParams;
 } ZDICT_legacy_params_t;
 
 /*! ZDICT_trainFromBuffer_legacy():
@@ -434,36 +437,38 @@ ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_legacy(
     const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
     ZDICT_legacy_params_t parameters);
 
+
 /* Deprecation warnings */
 /* It is generally possible to disable deprecation warnings from compiler,
    for example with -Wno-deprecated-declarations for gcc
    or _CRT_SECURE_NO_WARNINGS in Visual.
    Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
 #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
-#define ZDICT_DEPRECATED(message) /* disable deprecation warnings */
-#else
-#define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#if defined(__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
-#define ZDICT_DEPRECATED(message) [[deprecated(message)]]
-#elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
-#define ZDICT_DEPRECATED(message) __attribute__((deprecated(message)))
-#elif(ZDICT_GCC_VERSION >= 301)
-#define ZDICT_DEPRECATED(message) __attribute__((deprecated))
-#elif defined(_MSC_VER)
-#define ZDICT_DEPRECATED(message) __declspec(deprecated(message))
+#  define ZDICT_DEPRECATED(message) /* disable deprecation warnings */
 #else
-#pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
-#define ZDICT_DEPRECATED(message)
-#endif
+#  define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZDICT_DEPRECATED(message) [[deprecated(message)]]
+#  elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
+#    define ZDICT_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif (ZDICT_GCC_VERSION >= 301)
+#    define ZDICT_DEPRECATED(message) __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define ZDICT_DEPRECATED(message) __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
+#    define ZDICT_DEPRECATED(message)
+#  endif
 #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
 
 ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
 ZDICTLIB_STATIC_API
 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
-                                        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+
 
-#endif /* ZSTD_ZDICT_H_STATIC */
+#endif   /* ZSTD_ZDICT_H_STATIC */
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
diff --git a/src/zstd/zstd.cpp b/src/zstd/zstd.cpp
index fa39452f7..173653da9 100644
--- a/src/zstd/zstd.cpp
+++ b/src/zstd/zstd.cpp
@@ -2,25 +2,25 @@ extern "C" {
 #define XXH_NAMESPACE ZSTD_
 #define ZSTD_DISABLE_ASM
 
-#include "zstd/xxhash.c"
-#include "zstd/zstd_decompress_block.c"
-#include "zstd/zstd_ddict.c"
-#include "zstd/huf_compress.c"
-#include "zstd/fse_compress.c"
-#include "zstd/huf_decompress.c"
-#include "zstd/fse_decompress.c"
-#include "zstd/zstd_common.c"
-#include "zstd/entropy_common.c"
-#include "zstd/hist.c"
-#include "zstd/zstd_compress_superblock.c"
-#include "zstd/zstd_ldm.c"
-#include "zstd/zstd_opt.c"
-#include "zstd/zstd_lazy.c"
-#include "zstd/zstd_double_fast.c"
-#include "zstd/zstd_fast.c"
-#include "zstd/zstd_compress_literals.c"
-#include "zstd/zstd_compress_sequences.c"
-#include "zstd/error_private.c"
-#include "zstd/zstd_decompress.c"
-#include "zstd/zstd_compress.c"
+#include "zstd/common/xxhash.c"
+#include "zstd/decompress/zstd_decompress_block.c"
+#include "zstd/decompress/zstd_ddict.c"
+#include "zstd/compress/huf_compress.c"
+#include "zstd/compress/fse_compress.c"
+#include "zstd/decompress/huf_decompress.c"
+#include "zstd/common/zstd_common.c"
+#include "zstd/common/entropy_common.c"
+#include "zstd/common/fse_decompress.c"
+#include "zstd/compress/hist.c"
+#include "zstd/compress/zstd_compress_superblock.c"
+#include "zstd/compress/zstd_ldm.c"
+#include "zstd/compress/zstd_opt.c"
+#include "zstd/compress/zstd_lazy.c"
+#include "zstd/compress/zstd_double_fast.c"
+#include "zstd/compress/zstd_fast.c"
+#include "zstd/compress/zstd_compress_literals.c"
+#include "zstd/compress/zstd_compress_sequences.c"
+#include "zstd/common/error_private.c"
+#include "zstd/decompress/zstd_decompress.c"
+#include "zstd/compress/zstd_compress.c"
 };
diff --git a/src/zstd/zstd.h b/src/zstd/zstd.h
index 7f42abef8..5d1fef8a6 100644
--- a/src/zstd/zstd.h
+++ b/src/zstd/zstd.h
@@ -7,7 +7,7 @@
  * in the COPYING file in the root directory of this source tree).
  * You may select, at your option, one of the above-listed licenses.
  */
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
@@ -15,35 +15,36 @@ extern "C" {
 #define ZSTD_H_235446
 
 /* ======   Dependencies   ======*/
-#include <limits.h> /* INT_MAX */
-#include <stddef.h> /* size_t */
+#include <limits.h>   /* INT_MAX */
+#include <stddef.h>   /* size_t */
+
 
 /* =====   ZSTDLIB_API : control library symbols visibility   ===== */
 #ifndef ZSTDLIB_VISIBLE
-/* Backwards compatibility with old macro name */
-#ifdef ZSTDLIB_VISIBILITY
-#define ZSTDLIB_VISIBLE ZSTDLIB_VISIBILITY
-#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
-#define ZSTDLIB_VISIBLE __attribute__((visibility("default")))
-#else
-#define ZSTDLIB_VISIBLE
-#endif
+   /* Backwards compatibility with old macro name */
+#  ifdef ZSTDLIB_VISIBILITY
+#    define ZSTDLIB_VISIBLE ZSTDLIB_VISIBILITY
+#  elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDLIB_VISIBLE
+#  endif
 #endif
 
 #ifndef ZSTDLIB_HIDDEN
-#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
-#define ZSTDLIB_HIDDEN __attribute__((visibility("hidden")))
-#else
-#define ZSTDLIB_HIDDEN
-#endif
+#  if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+#  else
+#    define ZSTDLIB_HIDDEN
+#  endif
 #endif
 
-#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT == 1)
-#define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBLE
-#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT == 1)
-#define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBLE
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
 #else
-#define ZSTDLIB_API ZSTDLIB_VISIBLE
+#  define ZSTDLIB_API ZSTDLIB_VISIBLE
 #endif
 
 /* Deprecation warnings :
@@ -52,22 +53,23 @@ extern "C" {
  * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
  */
 #ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
-#define ZSTD_DEPRECATED(message) /* disable deprecation warnings */
-#else
-#if defined(__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
-#define ZSTD_DEPRECATED(message) [[deprecated(message)]]
-#elif(defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
-#define ZSTD_DEPRECATED(message) __attribute__((deprecated(message)))
-#elif defined(__GNUC__) && (__GNUC__ >= 3)
-#define ZSTD_DEPRECATED(message) __attribute__((deprecated))
-#elif defined(_MSC_VER)
-#define ZSTD_DEPRECATED(message) __declspec(deprecated(message))
+#  define ZSTD_DEPRECATED(message) /* disable deprecation warnings */
 #else
-#pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
-#define ZSTD_DEPRECATED(message)
-#endif
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZSTD_DEPRECATED(message) [[deprecated(message)]]
+#  elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__) /* GCC >= 4.5 or clang: attribute accepts a message */
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif defined(__GNUC__) && (__GNUC__ >= 3) /* older GCC: attribute without a message */
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define ZSTD_DEPRECATED(message) __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+#    define ZSTD_DEPRECATED(message)
+#  endif
 #endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
 
+
 /*******************************************************************************
   Introduction
 
@@ -102,10 +104,10 @@ extern "C" {
 *******************************************************************************/
 
 /*------   Version   ------*/
-#define ZSTD_VERSION_MAJOR 1
-#define ZSTD_VERSION_MINOR 5
-#define ZSTD_VERSION_RELEASE 4
-#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR * 100 * 100 + ZSTD_VERSION_MINOR * 100 + ZSTD_VERSION_RELEASE)
+#define ZSTD_VERSION_MAJOR    1
+#define ZSTD_VERSION_MINOR    5
+#define ZSTD_VERSION_RELEASE  6
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 
 /*! ZSTD_versionNumber() :
  *  Return runtime library version, the value is (MAJOR*100*100 + MINOR*100 + RELEASE). */
@@ -124,7 +126,7 @@ ZSTDLIB_API const char* ZSTD_versionString(void);
  *  Default constant
  ***************************************/
 #ifndef ZSTD_CLEVEL_DEFAULT
-#define ZSTD_CLEVEL_DEFAULT 3
+#  define ZSTD_CLEVEL_DEFAULT 3
 #endif
 
 /* *************************************
@@ -132,25 +134,27 @@ ZSTDLIB_API const char* ZSTD_versionString(void);
  ***************************************/
 
 /* All magic numbers are supposed read/written to/from files/memory using little-endian convention */
-#define ZSTD_MAGICNUMBER 0xFD2FB528           /* valid since v0.8.0 */
-#define ZSTD_MAGIC_DICTIONARY 0xEC30A437      /* valid since v0.7.0 */
-#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
-#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0
+#define ZSTD_MAGICNUMBER            0xFD2FB528    /* valid since v0.8.0 */
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437    /* valid since v0.7.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50    /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
+#define ZSTD_MAGIC_SKIPPABLE_MASK   0xFFFFFFF0
+
+#define ZSTD_BLOCKSIZELOG_MAX  17
+#define ZSTD_BLOCKSIZE_MAX     (1<<ZSTD_BLOCKSIZELOG_MAX)
 
-#define ZSTD_BLOCKSIZELOG_MAX 17
-#define ZSTD_BLOCKSIZE_MAX (1 << ZSTD_BLOCKSIZELOG_MAX)
 
 /***************************************
- *  Simple API
- ***************************************/
+*  Simple API
+***************************************/
 /*! ZSTD_compress() :
  *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data.
  *  @return : compressed size written into `dst` (<= `dstCapacity),
  *            or an error code if it fails (which can be tested using ZSTD_isError()). */
-ZSTDLIB_API size_t ZSTD_compress(void* dst, size_t dstCapacity,
-                                 const void* src, size_t srcSize,
-                                 int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  int compressionLevel);
 
 /*! ZSTD_decompress() :
  *  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
@@ -158,8 +162,8 @@ ZSTDLIB_API size_t ZSTD_compress(void* dst, size_t dstCapacity,
  *  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
  *  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
  *            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
-ZSTDLIB_API size_t ZSTD_decompress(void* dst, size_t dstCapacity,
-                                   const void* src, size_t compressedSize);
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+                              const void* src, size_t compressedSize);
 
 /*! ZSTD_getFrameContentSize() : requires v1.3.0+
  *  `src` should point to the start of a ZSTD encoded frame.
@@ -185,8 +189,8 @@ ZSTDLIB_API size_t ZSTD_decompress(void* dst, size_t dstCapacity,
  *            Each application can set its own limits.
  *   note 6 : This function replaces ZSTD_getDecompressedSize() */
 #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
-#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2)
-ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void* src, size_t srcSize);
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
 
 /*! ZSTD_getDecompressedSize() :
  *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
@@ -206,6 +210,7 @@ unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
  *        or an error code if input is invalid */
 ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
 
+
 /*======  Helper functions  ======*/
 /* ZSTD_compressBound() :
  * maximum compressed size in worst case single-pass scenario.
@@ -223,27 +228,28 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
  * for example to size a static array on stack.
  * Will produce constant value 0 if srcSize too large.
  */
-#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t) == 8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
-#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize) >> 8) + (((srcSize) < (128 << 10)) ? (((128 << 10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize);                                                                                                                                                               /*!< maximum compressed size in worst case single-pass scenario */
+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U)
+#define ZSTD_COMPRESSBOUND(srcSize)   (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
 /* ZSTD_isError() :
  * Most ZSTD_* functions returning a size_t value can be tested for error,
  * using ZSTD_isError().
  * @return 1 if error, 0 otherwise
  */
-ZSTDLIB_API unsigned ZSTD_isError(size_t code);         /*!< tells if a `size_t` function result is an error code */
-ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
-ZSTDLIB_API int ZSTD_minCLevel(void);                   /*!< minimum negative compression level allowed, requires v1.4.0+ */
-ZSTDLIB_API int ZSTD_maxCLevel(void);                   /*!< maximum compression level available */
-ZSTDLIB_API int ZSTD_defaultCLevel(void);               /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
+ZSTDLIB_API int         ZSTD_minCLevel(void);               /*!< minimum negative compression level allowed, requires v1.4.0+ */
+ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
+ZSTDLIB_API int         ZSTD_defaultCLevel(void);           /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */
+
 
 /***************************************
- *  Explicit context
- ***************************************/
+*  Explicit context
+***************************************/
 /*= Compression context
  *  When compressing many times,
  *  it is recommended to allocate a context just once,
- *  and re-use it for each successive compression operation.
+ *  and reuse it for each successive compression operation.
  *  This will make workload friendlier for system's memory.
  *  Note : re-using context is just a speed / resource optimization.
  *         It doesn't change the compression ratio, which remains identical.
@@ -252,256 +258,270 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void);               /*!< default compression
  */
 typedef struct ZSTD_CCtx_s ZSTD_CCtx;
 ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
-ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);  /* accept NULL pointer */
 
 /*! ZSTD_compressCCtx() :
  *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
- *  Important : in order to behave similarly to `ZSTD_compress()`,
- *  this function compresses at requested compression level,
- *  __ignoring any other parameter__ .
+ *  Important : in order to mirror `ZSTD_compress()` behavior,
+ *  this function compresses at the requested compression level,
+ *  __ignoring any other advanced parameter__ .
  *  If any advanced parameter was set using the advanced API,
  *  they will all be reset. Only `compressionLevel` remains.
  */
 ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
                                      void* dst, size_t dstCapacity,
-                                     const void* src, size_t srcSize,
+                               const void* src, size_t srcSize,
                                      int compressionLevel);
 
 /*= Decompression context
  *  When decompressing many times,
  *  it is recommended to allocate a context only once,
- *  and re-use it for each successive compression operation.
+ *  and reuse it for each successive decompression operation.
  *  This will make workload friendlier for system's memory.
  *  Use one context per thread for parallel execution. */
 typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
-ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer */
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);  /* accept NULL pointer */
 
 /*! ZSTD_decompressDCtx() :
  *  Same as ZSTD_decompress(),
  *  requires an allocated ZSTD_DCtx.
- *  Compatible with sticky parameters.
+ *  Compatible with sticky parameters (see below).
  */
 ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
                                        void* dst, size_t dstCapacity,
-                                       const void* src, size_t srcSize);
+                                 const void* src, size_t srcSize);
+
 
 /*********************************************
- *  Advanced compression API (Requires v1.4.0+)
- **********************************************/
+*  Advanced compression API (Requires v1.4.0+)
+**********************************************/
 
 /* API design :
  *   Parameters are pushed one by one into an existing context,
  *   using ZSTD_CCtx_set*() functions.
  *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
  *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
- *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *   __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ .
  *
  *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
  *
  *   This API supersedes all other "advanced" API entry points in the experimental section.
- *   In the future, we expect to remove from experimental API entry points which are redundant with this API.
+ *   In the future, we expect to remove API entry points from experimental which are redundant with this API.
  */
 
+
 /* Compression strategies, listed from fastest to strongest */
-typedef enum { ZSTD_fast = 1,
-	           ZSTD_dfast = 2,
-	           ZSTD_greedy = 3,
-	           ZSTD_lazy = 4,
-	           ZSTD_lazy2 = 5,
-	           ZSTD_btlazy2 = 6,
-	           ZSTD_btopt = 7,
-	           ZSTD_btultra = 8,
-	           ZSTD_btultra2 = 9
-	           /* note : new strategies _might_ be added in the future.
-	                     Only the order (from fast to strong) is guaranteed */
+typedef enum { ZSTD_fast=1,
+               ZSTD_dfast=2,
+               ZSTD_greedy=3,
+               ZSTD_lazy=4,
+               ZSTD_lazy2=5,
+               ZSTD_btlazy2=6,
+               ZSTD_btopt=7,
+               ZSTD_btultra=8,
+               ZSTD_btultra2=9
+               /* note : new strategies _might_ be added in the future.
+                         Only the order (from fast to strong) is guaranteed */
 } ZSTD_strategy;
 
 typedef enum {
 
-	/* compression parameters
-	 * Note: When compressing with a ZSTD_CDict these parameters are superseded
-	 * by the parameters used to construct the ZSTD_CDict.
-	 * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
-	ZSTD_c_compressionLevel = 100, /* Set compression parameters according to pre-defined cLevel table.
-	                                * Note that exact compression parameters are dynamically determined,
-	                                * depending on both compression level and srcSize (when known).
-	                                * Default level is ZSTD_CLEVEL_DEFAULT==3.
-	                                * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
-	                                * Note 1 : it's possible to pass a negative compression level.
-	                                * Note 2 : setting a level does not automatically set all other compression parameters
-	                                *   to default. Setting this will however eventually dynamically impact the compression
-	                                *   parameters which have not been manually set. The manually set
-	                                *   ones will 'stick'. */
-	/* Advanced compression parameters :
-	 * It's possible to pin down compression parameters to some specific values.
-	 * In which case, these values are no longer dynamically selected by the compressor */
-	ZSTD_c_windowLog = 101,    /* Maximum allowed back-reference distance, expressed as power of 2.
-	                            * This will set a memory budget for streaming decompression,
-	                            * with larger values requiring more memory
-	                            * and typically compressing more.
-	                            * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
-	                            * Special: value 0 means "use default windowLog".
-	                            * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
-	                            *       requires explicitly allowing such size at streaming decompression stage. */
-	ZSTD_c_hashLog = 102,      /* Size of the initial probe table, as a power of 2.
-	                            * Resulting memory usage is (1 << (hashLog+2)).
-	                            * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
-	                            * Larger tables improve compression ratio of strategies <= dFast,
-	                            * and improve speed of strategies > dFast.
-	                            * Special: value 0 means "use default hashLog". */
-	ZSTD_c_chainLog = 103,     /* Size of the multi-probe search table, as a power of 2.
-	                            * Resulting memory usage is (1 << (chainLog+2)).
-	                            * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
-	                            * Larger tables result in better and slower compression.
-	                            * This parameter is useless for "fast" strategy.
-	                            * It's still useful when using "dfast" strategy,
-	                            * in which case it defines a secondary probe table.
-	                            * Special: value 0 means "use default chainLog". */
-	ZSTD_c_searchLog = 104,    /* Number of search attempts, as a power of 2.
-	                            * More attempts result in better and slower compression.
-	                            * This parameter is useless for "fast" and "dFast" strategies.
-	                            * Special: value 0 means "use default searchLog". */
-	ZSTD_c_minMatch = 105,     /* Minimum size of searched matches.
-	                            * Note that Zstandard can still find matches of smaller size,
-	                            * it just tweaks its search algorithm to look for this size and larger.
-	                            * Larger values increase compression and decompression speed, but decrease ratio.
-	                            * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
-	                            * Note that currently, for all strategies < btopt, effective minimum is 4.
-	                            *                    , for all strategies > fast, effective maximum is 6.
-	                            * Special: value 0 means "use default minMatchLength". */
-	ZSTD_c_targetLength = 106, /* Impact of this field depends on strategy.
-	                            * For strategies btopt, btultra & btultra2:
-	                            *     Length of Match considered "good enough" to stop search.
-	                            *     Larger values make compression stronger, and slower.
-	                            * For strategy fast:
-	                            *     Distance between match sampling.
-	                            *     Larger values make compression faster, and weaker.
-	                            * Special: value 0 means "use default targetLength". */
-	ZSTD_c_strategy = 107,     /* See ZSTD_strategy enum definition.
-	                            * The higher the value of selected strategy, the more complex it is,
-	                            * resulting in stronger and slower compression.
-	                            * Special: value 0 means "use default strategy". */
-	/* LDM mode parameters */
-	ZSTD_c_enableLongDistanceMatching = 160, /* Enable long distance matching.
-	                                          * This parameter is designed to improve compression ratio
-	                                          * for large inputs, by finding large matches at long distance.
-	                                          * It increases memory usage and window size.
-	                                          * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
-	                                          * except when expressly set to a different value.
-	                                          * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and
-	                                          * compression strategy >= ZSTD_btopt (== compression level 16+) */
-	ZSTD_c_ldmHashLog = 161,                 /* Size of the table for long distance matching, as a power of 2.
-	                                          * Larger values increase memory usage and compression ratio,
-	                                          * but decrease compression speed.
-	                                          * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
-	                                          * default: windowlog - 7.
-	                                          * Special: value 0 means "automatically determine hashlog". */
-	ZSTD_c_ldmMinMatch = 162,                /* Minimum match size for long distance matcher.
-	                                          * Larger/too small values usually decrease compression ratio.
-	                                          * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
-	                                          * Special: value 0 means "use default value" (default: 64). */
-	ZSTD_c_ldmBucketSizeLog = 163,           /* Log size of each bucket in the LDM hash table for collision resolution.
-	                                          * Larger values improve collision resolution but decrease compression speed.
-	                                          * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
-	                                          * Special: value 0 means "use default value" (default: 3). */
-	ZSTD_c_ldmHashRateLog = 164,             /* Frequency of inserting/looking up entries into the LDM hash table.
-	                                          * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
-	                                          * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
-	                                          * Larger values improve compression speed.
-	                                          * Deviating far from default value will likely result in a compression ratio decrease.
-	                                          * Special: value 0 means "automatically determine hashRateLog". */
-
-	/* frame parameters */
-	ZSTD_c_contentSizeFlag = 200, /* Content size will be written into frame header _whenever known_ (default:1)
-	                               * Content size must be known at the beginning of compression.
-	                               * This is automatically the case when using ZSTD_compress2(),
-	                               * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
-	ZSTD_c_checksumFlag = 201,    /* A 32-bits checksum of content is written at end of frame (default:0) */
-	ZSTD_c_dictIDFlag = 202,      /* When applicable, dictionary's ID is written into frame header (default:1) */
-
-	/* multi-threading parameters */
-	/* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
-	 * Otherwise, trying to set any other value than default (0) will be a no-op and return an error.
-	 * In a situation where it's unknown if the linked library supports multi-threading or not,
-	 * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property.
-	 */
-	ZSTD_c_nbWorkers = 400,  /* Select how many threads will be spawned to compress in parallel.
-	                          * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() :
-	                          * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller,
-	                          * while compression is performed in parallel, within worker thread(s).
-	                          * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
-	                          *  in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
-	                          * More workers improve speed, but also increase memory usage.
-	                          * Default value is `0`, aka "single-threaded mode" : no worker is spawned,
-	                          * compression is performed inside Caller's thread, and all invocations are blocking */
-	ZSTD_c_jobSize = 401,    /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
-	                          * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads.
-	                          * 0 means default, which is dynamically determined based on compression parameters.
-	                          * Job size must be a minimum of overlap size, or ZSTDMT_JOBSIZE_MIN (= 512 KB), whichever is largest.
-	                          * The minimum size is automatically and transparently enforced. */
-	ZSTD_c_overlapLog = 402, /* Control the overlap size, as a fraction of window size.
-	                          * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
-	                          * It helps preserve compression ratio, while each job is compressed in parallel.
-	                          * This value is enforced only when nbWorkers >= 1.
-	                          * Larger values increase compression ratio, but decrease speed.
-	                          * Possible values range from 0 to 9 :
-	                          * - 0 means "default" : value will be determined by the library, depending on strategy
-	                          * - 1 means "no overlap"
-	                          * - 9 means "full overlap", using a full window size.
-	                          * Each intermediate rank increases/decreases load size by a factor 2 :
-	                          * 9: full window;  8: w/2;  7: w/4;  6: w/8;  5:w/16;  4: w/32;  3:w/64;  2:w/128;  1:no overlap;  0:default
-	                          * default value varies between 6 and 9, depending on strategy */
-
-	/* note : additional experimental parameters are also available
-	 * within the experimental section of the API.
-	 * At the time of this writing, they include :
-	 * ZSTD_c_rsyncable
-	 * ZSTD_c_format
-	 * ZSTD_c_forceMaxWindow
-	 * ZSTD_c_forceAttachDict
-	 * ZSTD_c_literalCompressionMode
-	 * ZSTD_c_targetCBlockSize
-	 * ZSTD_c_srcSizeHint
-	 * ZSTD_c_enableDedicatedDictSearch
-	 * ZSTD_c_stableInBuffer
-	 * ZSTD_c_stableOutBuffer
-	 * ZSTD_c_blockDelimiters
-	 * ZSTD_c_validateSequences
-	 * ZSTD_c_useBlockSplitter
-	 * ZSTD_c_useRowMatchFinder
-	 * ZSTD_c_prefetchCDictTables
-	 * ZSTD_c_enableSeqProducerFallback
-	 * ZSTD_c_maxBlockSize
-	 * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
-	 * note : never ever use experimentalParam? names directly;
-	 *        also, the enums values themselves are unstable and can still change.
-	 */
-	ZSTD_c_experimentalParam1 = 500,
-	ZSTD_c_experimentalParam2 = 10,
-	ZSTD_c_experimentalParam3 = 1000,
-	ZSTD_c_experimentalParam4 = 1001,
-	ZSTD_c_experimentalParam5 = 1002,
-	ZSTD_c_experimentalParam6 = 1003,
-	ZSTD_c_experimentalParam7 = 1004,
-	ZSTD_c_experimentalParam8 = 1005,
-	ZSTD_c_experimentalParam9 = 1006,
-	ZSTD_c_experimentalParam10 = 1007,
-	ZSTD_c_experimentalParam11 = 1008,
-	ZSTD_c_experimentalParam12 = 1009,
-	ZSTD_c_experimentalParam13 = 1010,
-	ZSTD_c_experimentalParam14 = 1011,
-	ZSTD_c_experimentalParam15 = 1012,
-	ZSTD_c_experimentalParam16 = 1013,
-	ZSTD_c_experimentalParam17 = 1014,
-	ZSTD_c_experimentalParam18 = 1015,
-	ZSTD_c_experimentalParam19 = 1016
+    /* compression parameters
+     * Note: When compressing with a ZSTD_CDict these parameters are superseded
+     * by the parameters used to construct the ZSTD_CDict.
+     * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
+    ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table.
+                              * Note that exact compression parameters are dynamically determined,
+                              * depending on both compression level and srcSize (when known).
+                              * Default level is ZSTD_CLEVEL_DEFAULT==3.
+                              * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+                              * Note 1 : it's possible to pass a negative compression level.
+                              * Note 2 : setting a level does not automatically set all other compression parameters
+                              *   to default. Setting this will however eventually dynamically impact the compression
+                              *   parameters which have not been manually set. The manually set
+                              *   ones will 'stick'. */
+    /* Advanced compression parameters :
+     * It's possible to pin down compression parameters to some specific values.
+     * In which case, these values are no longer dynamically selected by the compressor */
+    ZSTD_c_windowLog=101,    /* Maximum allowed back-reference distance, expressed as power of 2.
+                              * This will set a memory budget for streaming decompression,
+                              * with larger values requiring more memory
+                              * and typically compressing more.
+                              * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+                              * Special: value 0 means "use default windowLog".
+                              * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
+                              *       requires explicitly allowing such size at streaming decompression stage. */
+    ZSTD_c_hashLog=102,      /* Size of the initial probe table, as a power of 2.
+                              * Resulting memory usage is (1 << (hashLog+2)).
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+                              * Larger tables improve compression ratio of strategies <= dFast,
+                              * and improve speed of strategies > dFast.
+                              * Special: value 0 means "use default hashLog". */
+    ZSTD_c_chainLog=103,     /* Size of the multi-probe search table, as a power of 2.
+                              * Resulting memory usage is (1 << (chainLog+2)).
+                              * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+                              * Larger tables result in better and slower compression.
+                              * This parameter is useless for "fast" strategy.
+                              * It's still useful when using "dfast" strategy,
+                              * in which case it defines a secondary probe table.
+                              * Special: value 0 means "use default chainLog". */
+    ZSTD_c_searchLog=104,    /* Number of search attempts, as a power of 2.
+                              * More attempts result in better and slower compression.
+                              * This parameter is useless for "fast" and "dFast" strategies.
+                              * Special: value 0 means "use default searchLog". */
+    ZSTD_c_minMatch=105,     /* Minimum size of searched matches.
+                              * Note that Zstandard can still find matches of smaller size,
+                              * it just tweaks its search algorithm to look for this size and larger.
+                              * Larger values increase compression and decompression speed, but decrease ratio.
+                              * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
+                              * Note that currently, for all strategies < btopt, effective minimum is 4.
+                              *                    , for all strategies > fast, effective maximum is 6.
+                              * Special: value 0 means "use default minMatchLength". */
+    ZSTD_c_targetLength=106, /* Impact of this field depends on strategy.
+                              * For strategies btopt, btultra & btultra2:
+                              *     Length of Match considered "good enough" to stop search.
+                              *     Larger values make compression stronger, and slower.
+                              * For strategy fast:
+                              *     Distance between match sampling.
+                              *     Larger values make compression faster, and weaker.
+                              * Special: value 0 means "use default targetLength". */
+    ZSTD_c_strategy=107,     /* See ZSTD_strategy enum definition.
+                              * The higher the value of selected strategy, the more complex it is,
+                              * resulting in stronger and slower compression.
+                              * Special: value 0 means "use default strategy". */
+
+    ZSTD_c_targetCBlockSize=130, /* v1.5.6+
+                                  * Attempts to fit compressed block size into approximately targetCBlockSize.
+                                  * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX.
+                                  * Note that it's not a guarantee, just a convergence target (default:0).
+                                  * No target when targetCBlockSize == 0.
+                                  * This is helpful in low bandwidth streaming environments to improve end-to-end latency,
+                                  * when a client can make use of partial documents (a prominent example being Chrome).
+                                  * Note: this parameter is stable since v1.5.6.
+                                  * It was present as an experimental parameter in earlier versions,
+                                  * but it's not recommended using it with earlier library versions
+                                  * due to massive performance regressions.
+                                  */
+    /* LDM mode parameters */
+    ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+                                     * This parameter is designed to improve compression ratio
+                                     * for large inputs, by finding large matches at long distance.
+                                     * It increases memory usage and window size.
+                                     * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
+                                     * except when expressly set to a different value.
+                                     * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and
+                                     * compression strategy >= ZSTD_btopt (== compression level 16+) */
+    ZSTD_c_ldmHashLog=161,   /* Size of the table for long distance matching, as a power of 2.
+                              * Larger values increase memory usage and compression ratio,
+                              * but decrease compression speed.
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+                              * default: windowlog - 7.
+                              * Special: value 0 means "automatically determine hashlog". */
+    ZSTD_c_ldmMinMatch=162,  /* Minimum match size for long distance matcher.
+                              * Larger/too small values usually decrease compression ratio.
+                              * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+                              * Special: value 0 means "use default value" (default: 64). */
+    ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution.
+                              * Larger values improve collision resolution but decrease compression speed.
+                              * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+                              * Special: value 0 means "use default value" (default: 3). */
+    ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table.
+                              * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+                              * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+                              * Larger values improve compression speed.
+                              * Deviating far from default value will likely result in a compression ratio decrease.
+                              * Special: value 0 means "automatically determine hashRateLog". */
+
+    /* frame parameters */
+    ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+                              * Content size must be known at the beginning of compression.
+                              * This is automatically the case when using ZSTD_compress2(),
+                              * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
+    ZSTD_c_checksumFlag=201, /* A 32-bit checksum of content is written at end of frame (default:0) */
+    ZSTD_c_dictIDFlag=202,   /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+    /* multi-threading parameters */
+    /* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
+     * Otherwise, trying to set any other value than default (0) will be a no-op and return an error.
+     * In a situation where it's unknown if the linked library supports multi-threading or not,
+     * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property.
+     */
+    ZSTD_c_nbWorkers=400,    /* Select how many threads will be spawned to compress in parallel.
+                              * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() :
+                              * ZSTD_compressStream*() consumes input and flushes output if possible, but immediately gives back control to caller,
+                              * while compression is performed in parallel, within worker thread(s).
+                              * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
+                              *  in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
+                              * More workers improve speed, but also increase memory usage.
+                              * Default value is `0`, aka "single-threaded mode" : no worker is spawned,
+                              * compression is performed inside Caller's thread, and all invocations are blocking */
+    ZSTD_c_jobSize=401,      /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
+                              * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads.
+                              * 0 means default, which is dynamically determined based on compression parameters.
+                              * Job size must be a minimum of overlap size, or ZSTDMT_JOBSIZE_MIN (= 512 KB), whichever is largest.
+                              * The minimum size is automatically and transparently enforced. */
+    ZSTD_c_overlapLog=402,   /* Control the overlap size, as a fraction of window size.
+                              * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
+                              * It helps preserve compression ratio, while each job is compressed in parallel.
+                              * This value is enforced only when nbWorkers >= 1.
+                              * Larger values increase compression ratio, but decrease speed.
+                              * Possible values range from 0 to 9 :
+                              * - 0 means "default" : value will be determined by the library, depending on strategy
+                              * - 1 means "no overlap"
+                              * - 9 means "full overlap", using a full window size.
+                              * Each intermediate rank increases/decreases load size by a factor 2 :
+                              * 9: full window;  8: w/2;  7: w/4;  6: w/8;  5:w/16;  4: w/32;  3:w/64;  2:w/128;  1:no overlap;  0:default
+                              * default value varies between 6 and 9, depending on strategy */
+
+    /* note : additional experimental parameters are also available
+     * within the experimental section of the API.
+     * At the time of this writing, they include :
+     * ZSTD_c_rsyncable
+     * ZSTD_c_format
+     * ZSTD_c_forceMaxWindow
+     * ZSTD_c_forceAttachDict
+     * ZSTD_c_literalCompressionMode
+     * ZSTD_c_srcSizeHint
+     * ZSTD_c_enableDedicatedDictSearch
+     * ZSTD_c_stableInBuffer
+     * ZSTD_c_stableOutBuffer
+     * ZSTD_c_blockDelimiters
+     * ZSTD_c_validateSequences
+     * ZSTD_c_useBlockSplitter
+     * ZSTD_c_useRowMatchFinder
+     * ZSTD_c_prefetchCDictTables
+     * ZSTD_c_enableSeqProducerFallback
+     * ZSTD_c_maxBlockSize
+     * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+     * note : never ever use experimentalParam? names directly;
+     *        also, the enums values themselves are unstable and can still change.
+     */
+     ZSTD_c_experimentalParam1=500,
+     ZSTD_c_experimentalParam2=10,
+     ZSTD_c_experimentalParam3=1000,
+     ZSTD_c_experimentalParam4=1001,
+     ZSTD_c_experimentalParam5=1002,
+     /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */
+     ZSTD_c_experimentalParam7=1004,
+     ZSTD_c_experimentalParam8=1005,
+     ZSTD_c_experimentalParam9=1006,
+     ZSTD_c_experimentalParam10=1007,
+     ZSTD_c_experimentalParam11=1008,
+     ZSTD_c_experimentalParam12=1009,
+     ZSTD_c_experimentalParam13=1010,
+     ZSTD_c_experimentalParam14=1011,
+     ZSTD_c_experimentalParam15=1012,
+     ZSTD_c_experimentalParam16=1013,
+     ZSTD_c_experimentalParam17=1014,
+     ZSTD_c_experimentalParam18=1015,
+     ZSTD_c_experimentalParam19=1016
 } ZSTD_cParameter;
 
 typedef struct {
-	size_t error;
-	int lowerBound;
-	int upperBound;
+    size_t error;
+    int lowerBound;
+    int upperBound;
 } ZSTD_bounds;
 
 /*! ZSTD_cParam_getBounds() :
@@ -544,9 +564,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param
 ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
 
 typedef enum {
-	ZSTD_reset_session_only = 1,
-	ZSTD_reset_parameters = 2,
-	ZSTD_reset_session_and_parameters = 3
+    ZSTD_reset_session_only = 1,
+    ZSTD_reset_parameters = 2,
+    ZSTD_reset_session_and_parameters = 3
 } ZSTD_ResetDirective;
 
 /*! ZSTD_CCtx_reset() :
@@ -567,21 +587,24 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
 
 /*! ZSTD_compress2() :
  *  Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ *  (note that this entry point doesn't even expose a compression level parameter).
  *  ZSTD_compress2() always starts a new frame.
  *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
  *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
  *  - The function is always blocking, returns when compression is completed.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data, though it is possible it fails for other reasons.
  * @return : compressed size written into `dst` (<= `dstCapacity),
  *           or an error code if it fails (which can be tested using ZSTD_isError()).
  */
-ZSTDLIB_API size_t ZSTD_compress2(ZSTD_CCtx* cctx,
-                                  void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx,
+                                   void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
 
 /***********************************************
- *  Advanced decompression API (Requires v1.4.0+)
- ************************************************/
+*  Advanced decompression API (Requires v1.4.0+)
+************************************************/
 
 /* The advanced API pushes parameters one by one into an existing DCtx context.
  * Parameters are sticky, and remain valid for all following frames
@@ -593,29 +616,31 @@ ZSTDLIB_API size_t ZSTD_compress2(ZSTD_CCtx* cctx,
 
 typedef enum {
 
-	ZSTD_d_windowLogMax = 100, /* Select a size limit (in power of 2) beyond which
-	                            * the streaming API will refuse to allocate memory buffer
-	                            * in order to protect the host from unreasonable memory requirements.
-	                            * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
-	                            * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
-	                            * Special: value 0 means "use default maximum windowLog". */
-
-	/* note : additional experimental parameters are also available
-	 * within the experimental section of the API.
-	 * At the time of this writing, they include :
-	 * ZSTD_d_format
-	 * ZSTD_d_stableOutBuffer
-	 * ZSTD_d_forceIgnoreChecksum
-	 * ZSTD_d_refMultipleDDicts
-	 * ZSTD_d_disableHuffmanAssembly
-	 * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
-	 * note : never ever use experimentalParam? names directly
-	 */
-	ZSTD_d_experimentalParam1 = 1000,
-	ZSTD_d_experimentalParam2 = 1001,
-	ZSTD_d_experimentalParam3 = 1002,
-	ZSTD_d_experimentalParam4 = 1003,
-	ZSTD_d_experimentalParam5 = 1004
+    ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which
+                              * the streaming API will refuse to allocate memory buffer
+                              * in order to protect the host from unreasonable memory requirements.
+                              * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+                              * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
+                              * Special: value 0 means "use default maximum windowLog". */
+
+    /* note : additional experimental parameters are also available
+     * within the experimental section of the API.
+     * At the time of this writing, they include :
+     * ZSTD_d_format
+     * ZSTD_d_stableOutBuffer
+     * ZSTD_d_forceIgnoreChecksum
+     * ZSTD_d_refMultipleDDicts
+     * ZSTD_d_disableHuffmanAssembly
+     * ZSTD_d_maxBlockSize
+     * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+     * note : never ever use experimentalParam? names directly
+     */
+     ZSTD_d_experimentalParam1=1000,
+     ZSTD_d_experimentalParam2=1001,
+     ZSTD_d_experimentalParam3=1002,
+     ZSTD_d_experimentalParam4=1003,
+     ZSTD_d_experimentalParam5=1004,
+     ZSTD_d_experimentalParam6=1005
 
 } ZSTD_dParameter;
 
@@ -645,101 +670,104 @@ ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param
  */
 ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
 
+
 /****************************
- *  Streaming
- ****************************/
+*  Streaming
+****************************/
 
 typedef struct ZSTD_inBuffer_s {
-	const void* src; /**< start of input buffer */
-	size_t size;     /**< size of input buffer */
-	size_t pos;      /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+  const void* src;    /**< start of input buffer */
+  size_t size;        /**< size of input buffer */
+  size_t pos;         /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
 } ZSTD_inBuffer;
 
 typedef struct ZSTD_outBuffer_s {
-	void* dst;   /**< start of output buffer */
-	size_t size; /**< size of output buffer */
-	size_t pos;  /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+  void*  dst;         /**< start of output buffer */
+  size_t size;        /**< size of output buffer */
+  size_t pos;         /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
 } ZSTD_outBuffer;
 
-/*-***********************************************************************
- *  Streaming compression - HowTo
- *
- *  A ZSTD_CStream object is required to track streaming operation.
- *  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
- *  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
- *  It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
- *
- *  For parallel execution, use one separate ZSTD_CStream per thread.
- *
- *  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
- *
- *  Parameters are sticky : when starting a new compression on the same context,
- *  it will re-use the same sticky parameters as previous compression session.
- *  When in doubt, it's recommended to fully initialize the context before usage.
- *  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
- *  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
- *  set more specific parameters, the pledged source size, or load a dictionary.
- *
- *  Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
- *  consume input stream. The function will automatically update both `pos`
- *  fields within `input` and `output`.
- *  Note that the function may not consume the entire input, for example, because
- *  the output buffer is already full, in which case `input.pos < input.size`.
- *  The caller must check if input has been entirely consumed.
- *  If not, the caller must make some room to receive more compressed data,
- *  and then present again remaining input data.
- *  note: ZSTD_e_continue is guaranteed to make some forward progress when called,
- *        but doesn't guarantee maximal forward progress. This is especially relevant
- *        when compressing with multiple threads. The call won't block if it can
- *        consume some input, but if it can't it will wait for some, but not all,
- *        output to be flushed.
- * @return : provides a minimum amount of data remaining to be flushed from internal buffers
- *           or an error code, which can be tested using ZSTD_isError().
- *
- *  At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
- *  using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
- *  Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
- *  In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
- *  You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
- *  operation.
- *  note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
- *        block until the flush is complete or the output buffer is full.
- *  @return : 0 if internal buffers are entirely flushed,
- *            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
- *            or an error code, which can be tested using ZSTD_isError().
- *
- *  Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
- *  It will perform a flush and write frame epilogue.
- *  The epilogue is required for decoders to consider a frame completed.
- *  flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
- *  You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
- *  start a new frame.
- *  note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
- *        block until the flush is complete or the output buffer is full.
- *  @return : 0 if frame fully completed and fully flushed,
- *            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
- *            or an error code, which can be tested using ZSTD_isError().
- *
- * *******************************************************************/
 
-typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
-                                /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+
+/*-***********************************************************************
+*  Streaming compression - HowTo
+*
+*  A ZSTD_CStream object is required to track streaming operation.
+*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+*  It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+*
+*  For parallel execution, use one separate ZSTD_CStream per thread.
+*
+*  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+*  Parameters are sticky : when starting a new compression on the same context,
+*  it will reuse the same sticky parameters as previous compression session.
+*  When in doubt, it's recommended to fully initialize the context before usage.
+*  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+*  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+*  set more specific parameters, the pledged source size, or load a dictionary.
+*
+*  Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+*  consume input stream. The function will automatically update both `pos`
+*  fields within `input` and `output`.
+*  Note that the function may not consume the entire input, for example, because
+*  the output buffer is already full, in which case `input.pos < input.size`.
+*  The caller must check if input has been entirely consumed.
+*  If not, the caller must make some room to receive more compressed data,
+*  and then present again remaining input data.
+*  note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+*        but doesn't guarantee maximal forward progress. This is especially relevant
+*        when compressing with multiple threads. The call won't block if it can
+*        consume some input, but if it can't it will wait for some, but not all,
+*        output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+*           or an error code, which can be tested using ZSTD_isError().
+*
+*  At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+*  using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+*  Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+*  In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+*  You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+*  operation.
+*  note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+*        block until the flush is complete or the output buffer is full.
+*  @return : 0 if internal buffers are entirely flushed,
+*            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*            or an error code, which can be tested using ZSTD_isError().
+*
+*  Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+*  You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+*  start a new frame.
+*  note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+*        block until the flush is complete or the output buffer is full.
+*  @return : 0 if frame fully completed and fully flushed,
+*            >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*            or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
 /*===== ZSTD_CStream management functions =====*/
 ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
-ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); /* accept NULL pointer */
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);  /* accept NULL pointer */
 
 /*===== Streaming compression functions =====*/
 typedef enum {
-	ZSTD_e_continue = 0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
-	ZSTD_e_flush = 1,    /* flush any data provided so far,
-	                      * it creates (at least) one new block, that can be decoded immediately on reception;
-	                      * frame will continue: any future data can still reference previously compressed data, improving compression.
-	                      * note : multithreaded compression will block to flush as much output as possible. */
-	ZSTD_e_end = 2       /* flush any remaining data _and_ close current frame.
-	                      * note that frame is only closed after compressed data is fully flushed (return value == 0).
-	                      * After that point, any additional data starts a new frame.
-	                      * note : each frame is independent (does not reference any content from previous frame).
-	                      : note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush=1,    /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression.
+                        * note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_end=2       /* flush any remaining data _and_ close current frame.
+                        * note that frame is only closed after compressed data is fully flushed (return value == 0).
+                        * After that point, any additional data starts a new frame.
+                        * note : each frame is independent (does not reference any content from previous frame).
+                        * note : multithreaded compression will block to flush as much output as possible. */
 } ZSTD_EndDirective;
 
 /*! ZSTD_compressStream2() : Requires v1.4.0+
@@ -763,11 +791,17 @@ typedef enum {
  *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
  *            Before starting a new compression job, or changing compression parameters,
  *            it is required to fully flush internal buffers.
+ *  - note: if an operation ends with an error, it may leave @cctx in an undefined state.
+ *          Therefore, it's UB to invoke ZSTD_compressStream2() or ZSTD_compressStream() on such a state.
+ *          In order to be re-employed after an error, a state must be reset,
+ *          which can be done explicitly (ZSTD_CCtx_reset()),
+ *          or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx())
  */
-ZSTDLIB_API size_t ZSTD_compressStream2(ZSTD_CCtx* cctx,
-                                        ZSTD_outBuffer* output,
-                                        ZSTD_inBuffer* input,
-                                        ZSTD_EndDirective endOp);
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+                                         ZSTD_outBuffer* output,
+                                         ZSTD_inBuffer* input,
+                                         ZSTD_EndDirective endOp);
+
 
 /* These buffer sizes are softly recommended.
  * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
@@ -782,8 +816,9 @@ ZSTDLIB_API size_t ZSTD_compressStream2(ZSTD_CCtx* cctx,
  * In which cases, prefer using large buffers, as large as practical,
  * for both input and output, to reduce the nb of roundtrips.
  */
-ZSTDLIB_API size_t ZSTD_CStreamInSize(void);  /**< recommended size for input buffer */
-ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
+
 
 /* *****************************************************************************
  * This following is a legacy streaming API, available since v1.0+ .
@@ -814,38 +849,39 @@ ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
 /*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
 ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
 
+
 /*-***************************************************************************
- *  Streaming decompression - HowTo
- *
- *  A ZSTD_DStream object is required to track streaming operations.
- *  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
- *  ZSTD_DStream objects can be re-used multiple times.
- *
- *  Use ZSTD_initDStream() to start a new decompression operation.
- * @return : recommended first input size
- *  Alternatively, use advanced API to set specific properties.
- *
- *  Use ZSTD_decompressStream() repetitively to consume your input.
- *  The function will update both `pos` fields.
- *  If `input.pos < input.size`, some input has not been consumed.
- *  It's up to the caller to present again remaining data.
- *  The function tries to flush all data decoded immediately, respecting output buffer size.
- *  If `output.pos < output.size`, decoder has flushed everything it could.
- *  But if `output.pos == output.size`, there might be some data left within internal buffers.,
- *  In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
- *  Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
- * @return : 0 when a frame is completely decoded and fully flushed,
- *        or an error code, which can be tested using ZSTD_isError(),
- *        or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
- *                                the return value is a suggested next input size (just a hint for better latency)
- *                                that will never request more than the remaining frame size.
- * *******************************************************************************/
-
-typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
-                                /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be reused multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+*  Alternatively, use advanced API to set specific properties.
+*
+*  Use ZSTD_decompressStream() repetitively to consume your input.
+*  The function will update both `pos` fields.
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present again remaining data.
+*  The function tries to flush all data decoded immediately, respecting output buffer size.
+*  If `output.pos < output.size`, decoder has flushed everything it could.
+*  But if `output.pos == output.size`, there might be some data left within internal buffers.
+*  In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+*  Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+*        or an error code, which can be tested using ZSTD_isError(),
+*        or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+*                                the return value is a suggested next input size (just a hint for better latency)
+*                                that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
 /*===== ZSTD_DStream management functions =====*/
 ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
-ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer */
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer */
 
 /*===== Streaming decompression functions =====*/
 
@@ -873,15 +909,22 @@ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
  * @return : 0 when a frame is completely decoded and fully flushed,
  *           or an error code, which can be tested using ZSTD_isError(),
  *           or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
+ *
+ * Note: when an operation returns with an error code, the @zds state may be left in an undefined state.
+ *       It's UB to invoke `ZSTD_decompressStream()` on such a state.
+ *       In order to reuse such a state, it must first be reset,
+ *       which can be done explicitly (`ZSTD_DCtx_reset()`),
+ *       or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`)
  */
 ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
 
-ZSTDLIB_API size_t ZSTD_DStreamInSize(void);  /*!< recommended size for input buffer */
-ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guaranteed to successfully flush at least one complete block in all circumstances. */
+
 
 /**************************
- *  Simple dictionary API
- ***************************/
+*  Simple dictionary API
+***************************/
 /*! ZSTD_compress_usingDict() :
  *  Compression at an explicit compression level using a Dictionary.
  *  A dictionary can be any arbitrary data segment (also called a prefix),
@@ -891,8 +934,8 @@ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output b
  *  Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
 ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
                                            void* dst, size_t dstCapacity,
-                                           const void* src, size_t srcSize,
-                                           const void* dict, size_t dictSize,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
                                            int compressionLevel);
 
 /*! ZSTD_decompress_usingDict() :
@@ -903,8 +946,9 @@ ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
  *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
 ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
                                              void* dst, size_t dstCapacity,
-                                             const void* src, size_t srcSize,
-                                             const void* dict, size_t dictSize);
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
 
 /***********************************
  *  Bulk processing dictionary API
@@ -929,7 +973,7 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize
 /*! ZSTD_freeCDict() :
  *  Function frees memory allocated by ZSTD_createCDict().
  *  If a NULL pointer is passed, no operation is performed. */
-ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
 
 /*! ZSTD_compress_usingCDict() :
  *  Compression using a digested Dictionary.
@@ -938,8 +982,9 @@ ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
  *     and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
 ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
                                             void* dst, size_t dstCapacity,
-                                            const void* src, size_t srcSize,
-                                            const ZSTD_CDict* cdict);
+                                      const void* src, size_t srcSize,
+                                      const ZSTD_CDict* cdict);
+
 
 typedef struct ZSTD_DDict_s ZSTD_DDict;
 
@@ -951,15 +996,16 @@ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize
 /*! ZSTD_freeDDict() :
  *  Function frees memory allocated with ZSTD_createDDict()
  *  If a NULL pointer is passed, no operation is performed. */
-ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+ZSTDLIB_API size_t      ZSTD_freeDDict(ZSTD_DDict* ddict);
 
 /*! ZSTD_decompress_usingDDict() :
  *  Decompression using a digested Dictionary.
  *  Recommended when same dictionary is used multiple times. */
 ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
                                               void* dst, size_t dstCapacity,
-                                              const void* src, size_t srcSize,
-                                              const ZSTD_DDict* ddict);
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_DDict* ddict);
+
 
 /********************************
  *  Dictionary helper functions
@@ -995,15 +1041,19 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
  *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
 ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
 
+
 /*******************************************************************************
  * Advanced dictionary and prefix API (Requires v1.4.0+)
  *
  * This API allows dictionaries to be used with ZSTD_compress2(),
- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and
- * only reset with the context is reset with ZSTD_reset_parameters or
- * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
+ * Dictionaries are sticky: they remain valid when the same context is reused,
+ * and are only reset when the context is reset
+ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
+ * In contrast, Prefixes are single-use.
  ******************************************************************************/
 
+
 /*! ZSTD_CCtx_loadDictionary() : Requires v1.4.0+
  *  Create an internal CDict from `dict` buffer.
  *  Decompression will have to use same dictionary.
@@ -1021,7 +1071,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
  *           In such a case, dictionary buffer must outlive its users.
  *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
- *           to precisely select how dictionary content must be interpreted. */
+ *           to precisely select how dictionary content must be interpreted.
+ *  Note 5 : This method does not benefit from LDM (long distance mode).
+ *           If you want to employ LDM on some large dictionary content,
+ *           prefer employing ZSTD_CCtx_refPrefix() described below.
+ */
 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
 
 /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+
@@ -1044,6 +1098,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
  *  Decompression will need same prefix to properly regenerate data.
  *  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
  *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ *  This method is compatible with LDM (long distance mode).
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
@@ -1057,7 +1112,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
  *  Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
  *           Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
 ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
-                                       const void* prefix, size_t prefixSize);
+                                 const void* prefix, size_t prefixSize);
 
 /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+
  *  Create an internal DDict from dict buffer, to be used to decompress all future frames.
@@ -1112,7 +1167,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
  *           A full dictionary is more costly, as it requires building tables.
  */
 ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
-                                       const void* prefix, size_t prefixSize);
+                                 const void* prefix, size_t prefixSize);
 
 /* ===   Memory management   === */
 
@@ -1126,7 +1181,8 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
 ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
 ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 
-#endif /* ZSTD_H_235446 */
+#endif  /* ZSTD_H_235446 */
+
 
 /* **************************************************************************************
  *   ADVANCED AND EXPERIMENTAL FUNCTIONS
@@ -1142,13 +1198,13 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 
 /* This can be overridden externally to hide static symbols. */
 #ifndef ZSTDLIB_STATIC_API
-#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT == 1)
-#define ZSTDLIB_STATIC_API __declspec(dllexport) ZSTDLIB_VISIBLE
-#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT == 1)
-#define ZSTDLIB_STATIC_API __declspec(dllimport) ZSTDLIB_VISIBLE
-#else
-#define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
-#endif
+#  if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#    define ZSTDLIB_STATIC_API __declspec(dllexport) ZSTDLIB_VISIBLE
+#  elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#    define ZSTDLIB_STATIC_API __declspec(dllimport) ZSTDLIB_VISIBLE
+#  else
+#    define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
+#  endif
 #endif
 
 /****************************************************************************************
@@ -1161,210 +1217,213 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
  * Some of them might be removed in the future (especially when redundant with existing stable functions)
  * ***************************************************************************************/
 
-#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */
-#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2)
-#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */
-#define ZSTD_SKIPPABLEHEADERSIZE 8
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1)   /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format)    ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE    8
 
 /* compression parameter bounds */
-#define ZSTD_WINDOWLOG_MAX_32 30
-#define ZSTD_WINDOWLOG_MAX_64 31
-#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
-#define ZSTD_WINDOWLOG_MIN 10
-#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
-#define ZSTD_HASHLOG_MIN 6
-#define ZSTD_CHAINLOG_MAX_32 29
-#define ZSTD_CHAINLOG_MAX_64 30
-#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
-#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
-#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX - 1)
-#define ZSTD_SEARCHLOG_MIN 1
-#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */
-#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */
-#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX
-#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
-#define ZSTD_STRATEGY_MIN ZSTD_fast
-#define ZSTD_STRATEGY_MAX ZSTD_btultra2
+#define ZSTD_WINDOWLOG_MAX_32    30
+#define ZSTD_WINDOWLOG_MAX_64    31
+#define ZSTD_WINDOWLOG_MAX     ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN       10
+#define ZSTD_HASHLOG_MAX       ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN          6
+#define ZSTD_CHAINLOG_MAX_32     29
+#define ZSTD_CHAINLOG_MAX_64     30
+#define ZSTD_CHAINLOG_MAX      ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN        ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX      (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN        1
+#define ZSTD_MINMATCH_MAX         7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN         3   /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX    ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN        ZSTD_fast
+#define ZSTD_STRATEGY_MAX        ZSTD_btultra2
 #define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */
 
-#define ZSTD_OVERLAPLOG_MIN 0
-#define ZSTD_OVERLAPLOG_MAX 9
 
-#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame                           \
-	                                     * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,              \
-	                                     * to preserve host's memory from unreasonable requirements.                         \
-	                                     * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,). \
-	                                     * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+#define ZSTD_OVERLAPLOG_MIN       0
+#define ZSTD_OVERLAPLOG_MAX       9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27   /* by default, the streaming decoder will refuse any frame
+                                           * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+                                           * to preserve host's memory from unreasonable requirements.
+                                           * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+                                           * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
 
 /* LDM parameter bounds */
-#define ZSTD_LDM_HASHLOG_MIN ZSTD_HASHLOG_MIN
-#define ZSTD_LDM_HASHLOG_MAX ZSTD_HASHLOG_MAX
-#define ZSTD_LDM_MINMATCH_MIN 4
-#define ZSTD_LDM_MINMATCH_MAX 4096
-#define ZSTD_LDM_BUCKETSIZELOG_MIN 1
-#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
-#define ZSTD_LDM_HASHRATELOG_MIN 0
+#define ZSTD_LDM_HASHLOG_MIN      ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX      ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN        4
+#define ZSTD_LDM_MINMATCH_MAX     4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN   1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX   8
+#define ZSTD_LDM_HASHRATELOG_MIN     0
 #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
 
 /* Advanced parameter bounds */
-#define ZSTD_TARGETCBLOCKSIZE_MIN 64
-#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
-#define ZSTD_SRCSIZEHINT_MIN 0
-#define ZSTD_SRCSIZEHINT_MAX INT_MAX
+#define ZSTD_TARGETCBLOCKSIZE_MIN   1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */
+#define ZSTD_TARGETCBLOCKSIZE_MAX   ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN        0
+#define ZSTD_SRCSIZEHINT_MAX        INT_MAX
+
 
 /* ---  Advanced types  --- */
 
 typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
 
 typedef struct {
-	unsigned int offset; /* The offset of the match. (NOT the same as the offset code)
-	                      * If offset == 0 and matchLength == 0, this sequence represents the last
-	                      * literals in the block of litLength size.
-	                      */
-
-	unsigned int litLength;   /* Literal length of the sequence. */
-	unsigned int matchLength; /* Match length of the sequence. */
-
-	/* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0.
-	 * In this case, we will treat the sequence as a marker for a block boundary.
-	 */
-
-	unsigned int rep; /* Represents which repeat offset is represented by the field 'offset'.
-	                   * Ranges from [0, 3].
-	                   *
-	                   * Repeat offsets are essentially previous offsets from previous sequences sorted in
-	                   * recency order. For more detail, see doc/zstd_compression_format.md
-	                   *
-	                   * If rep == 0, then 'offset' does not contain a repeat offset.
-	                   * If rep > 0:
-	                   *  If litLength != 0:
-	                   *      rep == 1 --> offset == repeat_offset_1
-	                   *      rep == 2 --> offset == repeat_offset_2
-	                   *      rep == 3 --> offset == repeat_offset_3
-	                   *  If litLength == 0:
-	                   *      rep == 1 --> offset == repeat_offset_2
-	                   *      rep == 2 --> offset == repeat_offset_3
-	                   *      rep == 3 --> offset == repeat_offset_1 - 1
-	                   *
-	                   * Note: This field is optional. ZSTD_generateSequences() will calculate the value of
-	                   * 'rep', but repeat offsets do not necessarily need to be calculated from an external
-	                   * sequence provider's perspective. For example, ZSTD_compressSequences() does not
-	                   * use this 'rep' field at all (as of now).
-	                   */
+    unsigned int offset;      /* The offset of the match. (NOT the same as the offset code)
+                               * If offset == 0 and matchLength == 0, this sequence represents the last
+                               * literals in the block of litLength size.
+                               */
+
+    unsigned int litLength;   /* Literal length of the sequence. */
+    unsigned int matchLength; /* Match length of the sequence. */
+
+                              /* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0.
+                               * In this case, we will treat the sequence as a marker for a block boundary.
+                               */
+
+    unsigned int rep;         /* Represents which repeat offset is represented by the field 'offset'.
+                               * Ranges from [0, 3].
+                               *
+                               * Repeat offsets are essentially previous offsets from previous sequences sorted in
+                               * recency order. For more detail, see doc/zstd_compression_format.md
+                               *
+                               * If rep == 0, then 'offset' does not contain a repeat offset.
+                               * If rep > 0:
+                               *  If litLength != 0:
+                               *      rep == 1 --> offset == repeat_offset_1
+                               *      rep == 2 --> offset == repeat_offset_2
+                               *      rep == 3 --> offset == repeat_offset_3
+                               *  If litLength == 0:
+                               *      rep == 1 --> offset == repeat_offset_2
+                               *      rep == 2 --> offset == repeat_offset_3
+                               *      rep == 3 --> offset == repeat_offset_1 - 1
+                               *
+                               * Note: This field is optional. ZSTD_generateSequences() will calculate the value of
+                               * 'rep', but repeat offsets do not necessarily need to be calculated from an external
+                               * sequence provider's perspective. For example, ZSTD_compressSequences() does not
+                               * use this 'rep' field at all (as of now).
+                               */
 } ZSTD_Sequence;
 
 typedef struct {
-	unsigned windowLog;     /**< largest match distance : larger == more compression, more memory needed during decompression */
-	unsigned chainLog;      /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
-	unsigned hashLog;       /**< dispatch table : larger == faster, more memory */
-	unsigned searchLog;     /**< nb of searches : larger == more compression, slower */
-	unsigned minMatch;      /**< match length searched : larger == faster decompression, sometimes less compression */
-	unsigned targetLength;  /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
-	ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */
+    unsigned windowLog;       /**< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;        /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;         /**< dispatch table : larger == faster, more memory */
+    unsigned searchLog;       /**< nb of searches : larger == more compression, slower */
+    unsigned minMatch;        /**< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;    /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;   /**< see ZSTD_strategy definition above */
 } ZSTD_compressionParameters;
 
 typedef struct {
-	int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
-	int checksumFlag;    /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
-	int noDictIDFlag;    /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+    int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+    int checksumFlag;    /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+    int noDictIDFlag;    /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
 } ZSTD_frameParameters;
 
 typedef struct {
-	ZSTD_compressionParameters cParams;
-	ZSTD_frameParameters fParams;
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
 } ZSTD_parameters;
 
 typedef enum {
-	ZSTD_dct_auto = 0,       /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
-	ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
-	ZSTD_dct_fullDict = 2    /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_auto = 0,       /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict = 2    /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
 } ZSTD_dictContentType_e;
 
 typedef enum {
-	ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
-	ZSTD_dlm_byRef = 1   /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+    ZSTD_dlm_byCopy = 0,  /**< Copy dictionary content internally */
+    ZSTD_dlm_byRef = 1    /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
 } ZSTD_dictLoadMethod_e;
 
 typedef enum {
-	ZSTD_f_zstd1 = 0,          /* zstd frame format, specified in zstd_compression_format.md (default) */
-	ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number.
-	                            * Useful to save 4 bytes per generated frame.
-	                            * Decoder cannot recognise automatically this format, requiring this instruction. */
+    ZSTD_f_zstd1 = 0,           /* zstd frame format, specified in zstd_compression_format.md (default) */
+    ZSTD_f_zstd1_magicless = 1  /* Variant of zstd frame format, without initial 4-bytes magic number.
+                                 * Useful to save 4 bytes per generated frame.
+                                 * Decoder cannot recognise automatically this format, requiring this instruction. */
 } ZSTD_format_e;
 
 typedef enum {
-	/* Note: this enum controls ZSTD_d_forceIgnoreChecksum */
-	ZSTD_d_validateChecksum = 0,
-	ZSTD_d_ignoreChecksum = 1
+    /* Note: this enum controls ZSTD_d_forceIgnoreChecksum */
+    ZSTD_d_validateChecksum = 0,
+    ZSTD_d_ignoreChecksum = 1
 } ZSTD_forceIgnoreChecksum_e;
 
 typedef enum {
-	/* Note: this enum controls ZSTD_d_refMultipleDDicts */
-	ZSTD_rmd_refSingleDDict = 0,
-	ZSTD_rmd_refMultipleDDicts = 1
+    /* Note: this enum controls ZSTD_d_refMultipleDDicts */
+    ZSTD_rmd_refSingleDDict = 0,
+    ZSTD_rmd_refMultipleDDicts = 1
 } ZSTD_refMultipleDDicts_e;
 
 typedef enum {
-	/* Note: this enum and the behavior it controls are effectively internal
-	 * implementation details of the compressor. They are expected to continue
-	 * to evolve and should be considered only in the context of extremely
-	 * advanced performance tuning.
-	 *
-	 * Zstd currently supports the use of a CDict in three ways:
-	 *
-	 * - The contents of the CDict can be copied into the working context. This
-	 *   means that the compression can search both the dictionary and input
-	 *   while operating on a single set of internal tables. This makes
-	 *   the compression faster per-byte of input. However, the initial copy of
-	 *   the CDict's tables incurs a fixed cost at the beginning of the
-	 *   compression. For small compressions (< 8 KB), that copy can dominate
-	 *   the cost of the compression.
-	 *
-	 * - The CDict's tables can be used in-place. In this model, compression is
-	 *   slower per input byte, because the compressor has to search two sets of
-	 *   tables. However, this model incurs no start-up cost (as long as the
-	 *   working context's tables can be reused). For small inputs, this can be
-	 *   faster than copying the CDict's tables.
-	 *
-	 * - The CDict's tables are not used at all, and instead we use the working
-	 *   context alone to reload the dictionary and use params based on the source
-	 *   size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
-	 *   This method is effective when the dictionary sizes are very small relative
-	 *   to the input size, and the input size is fairly large to begin with.
-	 *
-	 * Zstd has a simple internal heuristic that selects which strategy to use
-	 * at the beginning of a compression. However, if experimentation shows that
-	 * Zstd is making poor choices, it is possible to override that choice with
-	 * this enum.
-	 */
-	ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
-	ZSTD_dictForceAttach = 1,   /* Never copy the dictionary. */
-	ZSTD_dictForceCopy = 2,     /* Always copy the dictionary. */
-	ZSTD_dictForceLoad = 3      /* Always reload the dictionary */
+    /* Note: this enum and the behavior it controls are effectively internal
+     * implementation details of the compressor. They are expected to continue
+     * to evolve and should be considered only in the context of extremely
+     * advanced performance tuning.
+     *
+     * Zstd currently supports the use of a CDict in three ways:
+     *
+     * - The contents of the CDict can be copied into the working context. This
+     *   means that the compression can search both the dictionary and input
+     *   while operating on a single set of internal tables. This makes
+     *   the compression faster per-byte of input. However, the initial copy of
+     *   the CDict's tables incurs a fixed cost at the beginning of the
+     *   compression. For small compressions (< 8 KB), that copy can dominate
+     *   the cost of the compression.
+     *
+     * - The CDict's tables can be used in-place. In this model, compression is
+     *   slower per input byte, because the compressor has to search two sets of
+     *   tables. However, this model incurs no start-up cost (as long as the
+     *   working context's tables can be reused). For small inputs, this can be
+     *   faster than copying the CDict's tables.
+     *
+     * - The CDict's tables are not used at all, and instead we use the working
+     *   context alone to reload the dictionary and use params based on the source
+     *   size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+     *   This method is effective when the dictionary sizes are very small relative
+     *   to the input size, and the input size is fairly large to begin with.
+     *
+     * Zstd has a simple internal heuristic that selects which strategy to use
+     * at the beginning of a compression. However, if experimentation shows that
+     * Zstd is making poor choices, it is possible to override that choice with
+     * this enum.
+     */
+    ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+    ZSTD_dictForceAttach   = 1, /* Never copy the dictionary. */
+    ZSTD_dictForceCopy     = 2, /* Always copy the dictionary. */
+    ZSTD_dictForceLoad     = 3  /* Always reload the dictionary */
 } ZSTD_dictAttachPref_e;
 
 typedef enum {
-	ZSTD_lcm_auto = 0,        /**< Automatically determine the compression mode based on the compression level.
-	                           *   Negative compression levels will be uncompressed, and positive compression
-	                           *   levels will be compressed. */
-	ZSTD_lcm_huffman = 1,     /**< Always attempt Huffman compression. Uncompressed literals will still be
-	                           *   emitted if Huffman compression is not profitable. */
-	ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */
+  ZSTD_lcm_auto = 0,          /**< Automatically determine the compression mode based on the compression level.
+                               *   Negative compression levels will be uncompressed, and positive compression
+                               *   levels will be compressed. */
+  ZSTD_lcm_huffman = 1,       /**< Always attempt Huffman compression. Uncompressed literals will still be
+                               *   emitted if Huffman compression is not profitable. */
+  ZSTD_lcm_uncompressed = 2   /**< Always emit uncompressed literals. */
 } ZSTD_literalCompressionMode_e;
 
 typedef enum {
-	/* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final
-	 * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable
-	 * or ZSTD_ps_disable allow for a force enable/disable the feature.
-	 */
-	ZSTD_ps_auto = 0,   /* Let the library automatically determine whether the feature shall be enabled */
-	ZSTD_ps_enable = 1, /* Force-enable the feature */
-	ZSTD_ps_disable = 2 /* Do not use the feature */
+  /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final
+   * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable
+   * or ZSTD_ps_disable forces the feature to be enabled or disabled, respectively.
+   */
+  ZSTD_ps_auto = 0,         /* Let the library automatically determine whether the feature shall be enabled */
+  ZSTD_ps_enable = 1,       /* Force-enable the feature */
+  ZSTD_ps_disable = 2       /* Do not use the feature */
 } ZSTD_paramSwitch_e;
 
 /***************************************
- *  Frame size functions
- ***************************************/
+*  Frame header and size functions
+***************************************/
 
 /*! ZSTD_findDecompressedSize() :
  *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
@@ -1410,6 +1469,30 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
  *           or an error code (if srcSize is too small) */
 ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
 
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+    unsigned long long windowSize;       /* can be very large, but is <= frameContentSize */
+    unsigned blockSizeMax;
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+    unsigned _reserved1;
+    unsigned _reserved2;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ *  Decodes the frame header, or reports that a larger `srcSize` is required.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small; the returned value is the `srcSize` needed,
+ *           or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ *  same as ZSTD_getFrameHeader(),
+ *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+
 /*! ZSTD_decompressionMargin() :
  * Zstd supports in-place decompression, where the input and output buffers overlap.
  * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
@@ -1448,15 +1531,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSi
  *                     Unless you explicitly set the windowLog smaller than
  *                     ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
  */
-#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)(ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ +                                                            \
-	                                                                 4 /* checksum */ +                                                                                       \
-	                                                                 ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize)-1) / blockSize)) /* 3 bytes per block */ + \
-	                                                                 (blockSize) /* One block of margin */                                                                    \
-	                                                                 ))
+#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)(                                              \
+        ZSTD_FRAMEHEADERSIZE_MAX                                                              /* Frame header */ + \
+        4                                                                                         /* checksum */ + \
+        ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
+        (blockSize)                                                                    /* One block of margin */   \
+    ))
 
 typedef enum {
-	ZSTD_sf_noBlockDelimiters = 0,      /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
-	ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */
+  ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+  ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
 } ZSTD_sequenceFormat_e;
 
 /*! ZSTD_sequenceBound() :
@@ -1469,21 +1553,34 @@ typedef enum {
 ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
 
 /*! ZSTD_generateSequences() :
+ * WARNING: This function is meant for debugging and informational purposes ONLY!
+ * Its implementation is flawed, and it will be deleted in a future version.
+ * It is not guaranteed to succeed, as there are several cases where it will give
+ * up and fail. You should NOT use this function in production code.
+ *
+ * This function is deprecated, and will be removed in a future version.
+ *
  * Generate sequences using ZSTD_compress2(), given a source buffer.
  *
+ * @param zc The compression context to be used for ZSTD_compress2(). Set any
+ *           compression parameters you need on this context.
+ * @param outSeqs The output sequences buffer of size @p outSeqsSize
+ * @param outSeqsSize The size of the output sequences buffer.
+ *                    ZSTD_sequenceBound(srcSize) is an upper bound on the number
+ *                    of sequences that can be generated.
+ * @param src The source buffer, of size @p srcSize, to generate sequences from.
+ * @param srcSize The size of the source buffer.
+ *
  * Each block will end with a dummy sequence
  * with offset == 0, matchLength == 0, and litLength == length of last literals.
  * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
  * simply acts as a block delimiter.
  *
- * @zc can be used to insert custom compression params.
- * This function invokes ZSTD_compress2().
- *
- * The output of this function can be fed into ZSTD_compressSequences() with CCtx
- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
- * @return : number of sequences generated
+ * @returns The number of sequences generated, necessarily less than
+ *          ZSTD_sequenceBound(srcSize), or an error code that can be checked
+ *          with ZSTD_isError().
  */
-
+ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
 ZSTDLIB_STATIC_API size_t
 ZSTD_generateSequences(ZSTD_CCtx* zc,
                        ZSTD_Sequence* outSeqs, size_t outSeqsSize,
@@ -1533,9 +1630,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
  * @return : final compressed size, or a ZSTD error code.
  */
 ZSTDLIB_STATIC_API size_t
-ZSTD_compressSequences(ZSTD_CCtx* cctx, void* dst, size_t dstSize,
-                       const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
-                       const void* src, size_t srcSize);
+ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize,
+                        const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                        const void* src, size_t srcSize);
+
 
 /*! ZSTD_writeSkippableFrame() :
  * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer.
@@ -1551,7 +1649,7 @@ ZSTD_compressSequences(ZSTD_CCtx* cctx, void* dst, size_t dstSize,
  * @return : number of bytes written or a ZSTD error.
  */
 ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity,
-                                                   const void* src, size_t srcSize, unsigned magicVariant);
+                                            const void* src, size_t srcSize, unsigned magicVariant);
 
 /*! ZSTD_readSkippableFrame() :
  * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer.
@@ -1565,70 +1663,75 @@ ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity
  * @return : number of bytes written or a ZSTD error.
  */
 ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant,
-                                           const void* src, size_t srcSize);
+                                            const void* src, size_t srcSize);
 
 /*! ZSTD_isSkippableFrame() :
  *  Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame.
  */
 ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
 
+
+
 /***************************************
- *  Memory management
- ***************************************/
+*  Memory management
+***************************************/
 
 /*! ZSTD_estimate*() :
  *  These functions make it possible to estimate memory usage
  *  of a future {D,C}Ctx, before its creation.
+ *  This is useful in combination with ZSTD_initStatic(),
+ *  which makes it possible to employ a static buffer for ZSTD_CCtx* state.
  *
  *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
- *  for any compression level up to selected one.
- *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
- *         does not include space for a window buffer.
- *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ *  to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2()
+ *  associated with any compression level up to the maximum specified one.
  *  The estimate will assume the input may be arbitrarily large,
  *  which is the worst case.
  *
+ *  Note that the size estimation is specific to one-shot compression:
+ *  it is not valid for streaming (see ZSTD_estimateCStreamSize*())
+ *  nor other potential ways of using a ZSTD_CCtx* state.
+ *
  *  When srcSize can be bound by a known and rather "small" value,
- *  this fact can be used to provide a tighter estimation
- *  because the CCtx compression context will need less memory.
- *  This tighter estimation can be provided by more advanced functions
+ *  this knowledge can be used to provide a tighter budget estimation
+ *  because the ZSTD_CCtx* state will need less memory for small inputs.
+ *  This tighter estimation can be provided by employing more advanced functions
  *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
  *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
  *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
  *
  *  Note : only single-threaded compression is supported.
  *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
- *
- *  Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
- *  Size estimates assume that no external sequence producer is registered.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
 
 /*! ZSTD_estimateCStreamSize() :
- *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
- *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression
+ *  using any compression level up to the max specified one.
+ *  It will also consider src size to be arbitrarily "large", which is a worst case scenario.
  *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
  *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
  *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
  *  Note : CStream size estimation is only correct for single-threaded compression.
- *  ZSTD_DStream memory budget depends on window Size.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
+ *  Size estimates assume that no external sequence producer is registered.
+ *
+ *  ZSTD_DStream memory budget depends on the frame's window size.
  *  This information can be passed manually, using ZSTD_estimateDStreamSize,
  *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Any frame requesting a window size larger than the specified maximum will be rejected.
  *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
  *         an internal ?Dict will be created, which additional size is not estimated here.
  *         In this case, get total size by adding ZSTD_estimate?DictSize
- *  Note 2 : only single-threaded compression is supported.
- *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
- *  Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
- *  Size estimates assume that no external sequence producer is registered.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
 
 /*! ZSTD_estimate?DictSize() :
@@ -1661,53 +1764,50 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadM
  *  Limitation 2 : static cctx currently not compatible with multi-threading.
  *  Limitation 3 : static dctx is incompatible with legacy support.
  */
-ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
-ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */
+ZSTDLIB_STATIC_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
 
-ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
-ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */
+ZSTDLIB_STATIC_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
 
 ZSTDLIB_STATIC_API const ZSTD_CDict* ZSTD_initStaticCDict(
-    void* workspace, size_t workspaceSize,
-    const void* dict, size_t dictSize,
-    ZSTD_dictLoadMethod_e dictLoadMethod,
-    ZSTD_dictContentType_e dictContentType,
-    ZSTD_compressionParameters cParams);
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
 
 ZSTDLIB_STATIC_API const ZSTD_DDict* ZSTD_initStaticDDict(
-    void* workspace, size_t workspaceSize,
-    const void* dict, size_t dictSize,
-    ZSTD_dictLoadMethod_e dictLoadMethod,
-    ZSTD_dictContentType_e dictContentType);
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
+
 
 /*! Custom memory allocation :
  *  These prototypes make it possible to pass your own allocation/free functions.
  *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
  *  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
  */
-typedef void* (*ZSTD_allocFunction)(void* opaque, size_t size);
-typedef void (*ZSTD_freeFunction)(void* opaque, void* address);
-typedef struct {
-	ZSTD_allocFunction customAlloc;
-	ZSTD_freeFunction customFree;
-	void* opaque;
-} ZSTD_customMem;
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
 static
 #ifdef __GNUC__
-    __attribute__((__unused__))
+__attribute__((__unused__))
 #endif
-    ZSTD_customMem const ZSTD_defaultCMem = {NULL, NULL, NULL}; /**< this constant defers to stdlib's functions */
+ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
 
-ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_STATIC_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
 ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
-ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_STATIC_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
 ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
 
 ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
-                                                         ZSTD_dictLoadMethod_e dictLoadMethod,
-                                                         ZSTD_dictContentType_e dictContentType,
-                                                         ZSTD_compressionParameters cParams,
-                                                         ZSTD_customMem customMem);
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
 
 /*! Thread pool :
  *  These prototypes make it possible to share a thread pool among multiple compression contexts.
@@ -1721,9 +1821,10 @@ ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_
  */
 typedef struct POOL_ctx_s ZSTD_threadPool;
 ZSTDLIB_STATIC_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads);
-ZSTDLIB_STATIC_API void ZSTD_freeThreadPool(ZSTD_threadPool* pool); /* accept NULL pointer */
+ZSTDLIB_STATIC_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool);  /* accept NULL pointer */
 ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool);
 
+
 /*
  * This API is temporary and is expected to change or disappear in the future!
  */
@@ -1740,9 +1841,10 @@ ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_advanced(
     ZSTD_dictContentType_e dictContentType,
     ZSTD_customMem customMem);
 
+
 /***************************************
- *  Advanced compression functions
- ***************************************/
+*  Advanced compression functions
+***************************************/
 
 /*! ZSTD_createCDict_byReference() :
  *  Create a digested dictionary for compression
@@ -1776,12 +1878,26 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
 ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
 
 /*! ZSTD_CCtx_setCParams() :
- *  Set all parameters provided within @cparams into the working @cctx.
+ *  Set all parameters provided within @p cparams into the working @p cctx.
  *  Note : if modifying parameters during compression (MT mode only),
  *         note that changes to the .windowLog parameter will be ignored.
- * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ *         On failure, no parameters are updated.
+ */
 ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams);
 
+/*! ZSTD_CCtx_setFParams() :
+ *  Set all parameters provided within @p fparams into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams);
+
+/*! ZSTD_CCtx_setParams() :
+ *  Set all parameters provided within @p params into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params);
+
 /*! ZSTD_compress_advanced() :
  *  Note : this function is now DEPRECATED.
  *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
@@ -1790,8 +1906,8 @@ ZSTD_DEPRECATED("use ZSTD_compress2")
 ZSTDLIB_STATIC_API
 size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
                               void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize,
-                              const void* dict, size_t dictSize,
+                        const void* src, size_t srcSize,
+                        const void* dict,size_t dictSize,
                               ZSTD_parameters params);
 
 /*! ZSTD_compress_usingCDict_advanced() :
@@ -1801,10 +1917,11 @@ size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
 ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary")
 ZSTDLIB_STATIC_API
 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
-                                         void* dst, size_t dstCapacity,
-                                         const void* src, size_t srcSize,
-                                         const ZSTD_CDict* cdict,
-                                         ZSTD_frameParameters fParams);
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_CDict* cdict,
+                                              ZSTD_frameParameters fParams);
+
 
 /*! ZSTD_CCtx_loadDictionary_byReference() :
  *  Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
@@ -1826,23 +1943,23 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
 /* these parameters can be used with ZSTD_setParameter()
  * they are not guaranteed to remain supported in the future */
 
-/* Enables rsyncable mode,
- * which makes compressed files more rsync friendly
- * by adding periodic synchronization points to the compressed data.
- * The target average block size is ZSTD_c_jobSize / 2.
- * It's possible to modify the job size to increase or decrease
- * the granularity of the synchronization point.
- * Once the jobSize is smaller than the window size,
- * it will result in compression ratio degradation.
- * NOTE 1: rsyncable mode only works when multithreading is enabled.
- * NOTE 2: rsyncable performs poorly in combination with long range mode,
- * since it will decrease the effectiveness of synchronization points,
- * though mileage may vary.
- * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
- * If the selected compression level is already running significantly slower,
- * the overall speed won't be significantly impacted.
- */
-#define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
+ /* Enables rsyncable mode,
+  * which makes compressed files more rsync friendly
+  * by adding periodic synchronization points to the compressed data.
+  * The target average block size is ZSTD_c_jobSize / 2.
+  * It's possible to modify the job size to increase or decrease
+  * the granularity of the synchronization point.
+  * Once the jobSize is smaller than the window size,
+  * it will result in compression ratio degradation.
+  * NOTE 1: rsyncable mode only works when multithreading is enabled.
+  * NOTE 2: rsyncable performs poorly in combination with long range mode,
+  * since it will decrease the effectiveness of synchronization points,
+  * though mileage may vary.
+  * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
+  * If the selected compression level is already running significantly slower,
+  * the overall speed won't be significantly impacted.
+  */
+ #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
 
 /* Select a compression format.
  * The value must be of type ZSTD_format_e.
@@ -1871,11 +1988,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  */
 #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
 
-/* Tries to fit compressed block size to be around targetCBlockSize.
- * No target when targetCBlockSize == 0.
- * There is no guarantee on compressed block size (default:0) */
-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
-
 /* User's best guess of source size.
  * Hint is not valid when srcSizeHint == 0.
  * There is no guarantee that hint is close to actual source size,
@@ -2106,7 +2218,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * This parameter can be used to set an upper bound on the blocksize
  * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
  * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
- * compressBound() innacurate). Only currently meant to be used for testing.
+ * compressBound() inaccurate). Only currently meant to be used for testing.
  *
  */
 #define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18
@@ -2142,6 +2254,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  */
 ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
 
+
 /*! ZSTD_CCtx_params :
  *  Quick howto :
  *  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
@@ -2160,7 +2273,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cPa
  *  for static allocation of CCtx for single-threaded compression.
  */
 ZSTDLIB_STATIC_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
-ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */
+ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);  /* accept NULL pointer */
 
 /*! ZSTD_CCtxParams_reset() :
  *  Reset params to default values.
@@ -2204,7 +2317,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* p
  *       with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
  */
 ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
-    ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
 
 /*! ZSTD_compressStream2_simpleArgs() :
  *  Same as ZSTD_compressStream2(),
@@ -2212,15 +2325,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
  *  This variant might be helpful for binders from dynamic languages
  *  which have troubles handling structures containing memory pointers.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs(
-    ZSTD_CCtx* cctx,
-    void* dst, size_t dstCapacity, size_t* dstPos,
-    const void* src, size_t srcSize, size_t* srcPos,
-    ZSTD_EndDirective endOp);
+ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
+
 
 /***************************************
- *  Advanced decompression functions
- ***************************************/
+*  Advanced decompression functions
+***************************************/
 
 /*! ZSTD_isFrame() :
  *  Tells if the content of `buffer` starts with a valid Frame Identifier.
@@ -2353,6 +2467,23 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  */
 #define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
 
+/* ZSTD_d_maxBlockSize
+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
+ *
+ * Forces the decompressor to reject blocks whose content size is
+ * larger than the configured maxBlockSize. When maxBlockSize is
+ * larger than the windowSize, the windowSize is used instead.
+ * This saves memory on the decoder when you know all blocks are small.
+ *
+ * This option is typically used in conjunction with ZSTD_c_maxBlockSize.
+ *
+ * WARNING: This causes the decoder to reject otherwise valid frames
+ * that have block sizes larger than the configured maxBlockSize.
+ */
+#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6
+
+
 /*! ZSTD_DCtx_setFormat() :
  *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
  *  Instruct the decoder context about what kind of data to decode next.
@@ -2369,17 +2500,18 @@ size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
  *  This can be helpful for binders from dynamic languages
  *  which have troubles handling structures containing memory pointers.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs(
-    ZSTD_DCtx* dctx,
-    void* dst, size_t dstCapacity, size_t* dstPos,
-    const void* src, size_t srcSize, size_t* srcPos);
+ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos);
+
 
 /********************************************************************
- *  Advanced streaming functions
- *  Warning : most of these functions are now redundant with the Advanced API.
- *  Once Advanced API reaches "stable" status,
- *  redundant functions will be deprecated, and then at some point removed.
- ********************************************************************/
+*  Advanced streaming functions
+*  Warning : most of these functions are now redundant with the Advanced API.
+*  Once Advanced API reaches "stable" status,
+*  redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
 
 /*=====   Advanced Streaming compression functions  =====*/
 
@@ -2398,8 +2530,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs(
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
-                                int compressionLevel,
-                                unsigned long long pledgedSrcSize);
+                         int compressionLevel,
+                         unsigned long long pledgedSrcSize);
 
 /*! ZSTD_initCStream_usingDict() :
  * This function is DEPRECATED, and is equivalent to:
@@ -2416,16 +2548,13 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
-                                  const void* dict, size_t dictSize,
-                                  int compressionLevel);
+                     const void* dict, size_t dictSize,
+                           int compressionLevel);
 
 /*! ZSTD_initCStream_advanced() :
- * This function is DEPRECATED, and is approximately equivalent to:
+ * This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
- *     for ((param, value) : params) {
- *         ZSTD_CCtx_setParameter(zcs, param, value);
- *     }
+ *     ZSTD_CCtx_setParams(zcs, params);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
  *
@@ -2437,9 +2566,9 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
-                                 const void* dict, size_t dictSize,
-                                 ZSTD_parameters params,
-                                 unsigned long long pledgedSrcSize);
+                    const void* dict, size_t dictSize,
+                          ZSTD_parameters params,
+                          unsigned long long pledgedSrcSize);
 
 /*! ZSTD_initCStream_usingCDict() :
  * This function is DEPRECATED, and equivalent to:
@@ -2454,12 +2583,9 @@ ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
 
 /*! ZSTD_initCStream_usingCDict_advanced() :
- *   This function is DEPRECATED, and is approximately equivalent to:
+ *   This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
- *     for ((fParam, value) : fParams) {
- *         ZSTD_CCtx_setParameter(zcs, fParam, value);
- *     }
+ *     ZSTD_CCtx_setFParams(zcs, fParams);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_refCDict(zcs, cdict);
  *
@@ -2471,9 +2597,9 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
-                                            const ZSTD_CDict* cdict,
-                                            ZSTD_frameParameters fParams,
-                                            unsigned long long pledgedSrcSize);
+                               const ZSTD_CDict* cdict,
+                                     ZSTD_frameParameters fParams,
+                                     unsigned long long pledgedSrcSize);
 
 /*! ZSTD_resetCStream() :
  * This function is DEPRECATED, and is equivalent to:
@@ -2484,7 +2610,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
  *       explicitly specified.
  *
  *  start a new frame, using same parameters from previous frame.
- *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ *  This is typically useful to skip dictionary loading stage, since it will reuse it in-place.
  *  Note that zcs must be init at least once before using ZSTD_resetCStream().
  *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
  *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
@@ -2497,13 +2623,14 @@ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API
 size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 
+
 typedef struct {
-	unsigned long long ingested; /* nb input bytes read and buffered */
-	unsigned long long consumed; /* nb input bytes actually compressed */
-	unsigned long long produced; /* nb of compressed bytes generated and buffered */
-	unsigned long long flushed;  /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
-	unsigned currentJobID;       /* MT only : latest started job nb */
-	unsigned nbActiveWorkers;    /* MT only : nb of workers actively compressing at probe time */
+    unsigned long long ingested;   /* nb input bytes read and buffered */
+    unsigned long long consumed;   /* nb input bytes actually compressed */
+    unsigned long long produced;   /* nb of compressed bytes generated and buffered */
+    unsigned long long flushed;    /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+    unsigned currentJobID;         /* MT only : latest started job nb */
+    unsigned nbActiveWorkers;      /* MT only : nb of workers actively compressing at probe time */
 } ZSTD_frameProgression;
 
 /* ZSTD_getFrameProgression() :
@@ -2529,6 +2656,7 @@ ZSTDLIB_STATIC_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCt
  */
 ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
 
+
 /*=====   Advanced Streaming decompression functions  =====*/
 
 /*!
@@ -2558,25 +2686,210 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
  *
  *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
  *
- * re-use decompression parameters from previous init; saves dictionary loading
+ * reuse decompression parameters from previous init; saves dictionary loading
  */
 ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 
-/*********************************************************************
- *  Buffer-less and synchronous inner streaming functions
+
+/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API *********************
+ *
+ * *** OVERVIEW ***
+ * The Block-Level Sequence Producer API allows users to provide their own custom
+ * sequence producer which libzstd invokes to process each block. The produced list
+ * of sequences (literals and matches) is then post-processed by libzstd to produce
+ * valid compressed blocks.
+ *
+ * This block-level offload API is a more granular complement of the existing
+ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers
+ * an easier migration story for applications already integrated with libzstd: the
+ * user application continues to invoke the same compression functions
+ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits
+ * from the specific advantages of the external sequence producer. For example,
+ * the sequence producer could be tuned to take advantage of known characteristics
+ * of the input, to offer better speed / ratio, or could leverage hardware
+ * acceleration not available within libzstd itself.
+ *
+ * See contrib/externalSequenceProducer for an example program employing the
+ * Block-Level Sequence Producer API.
+ *
+ * *** USAGE ***
+ * The user is responsible for implementing a function of type
+ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following
+ * arguments to the user-provided function:
+ *
+ *   - sequenceProducerState: a pointer to a user-managed state for the sequence
+ *     producer.
+ *
+ *   - outSeqs, outSeqsCapacity: an output buffer for the sequence producer.
+ *     outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory
+ *     backing outSeqs is managed by the CCtx.
+ *
+ *   - src, srcSize: an input buffer for the sequence producer to parse.
+ *     srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
+ *
+ *   - dict, dictSize: a history buffer, which may be empty, which the sequence
+ *     producer may reference as it parses the src buffer. Currently, zstd will
+ *     always pass dictSize == 0 into external sequence producers, but this will
+ *     change in the future.
+ *
+ *   - compressionLevel: a signed integer representing the zstd compression level
+ *     set by the user for the current operation. The sequence producer may choose
+ *     to use this information to change its compression strategy and speed/ratio
+ *     tradeoff. Note: the compression level does not reflect zstd parameters set
+ *     through the advanced API.
+ *
+ *   - windowSize: a size_t representing the maximum allowed offset for external
+ *     sequences. Note that sequence offsets are sometimes allowed to exceed the
+ *     windowSize if a dictionary is present, see doc/zstd_compression_format.md
+ *     for details.
+ *
+ * The user-provided function shall return a size_t representing the number of
+ * sequences written to outSeqs. This return value will be treated as an error
+ * code if it is greater than outSeqsCapacity. The return value must be non-zero
+ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided
+ * for convenience, but any value greater than outSeqsCapacity will be treated as
+ * an error code.
+ *
+ * If the user-provided function does not return an error code, the sequences
+ * written to outSeqs must be a valid parse of the src buffer. Data corruption may
+ * occur if the parse is not valid. A parse is defined to be valid if the
+ * following conditions hold:
+ *   - The sum of matchLengths and literalLengths must equal srcSize.
+ *   - All sequences in the parse, except for the final sequence, must have
+ *     matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have
+ *     matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0.
+ *   - All offsets must respect the windowSize parameter as specified in
+ *     doc/zstd_compression_format.md.
+ *   - If the final sequence has matchLength == 0, it must also have offset == 0.
+ *
+ * zstd will only validate these conditions (and fail compression if they do not
+ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
+ * validation has a performance cost.
+ *
+ * If the user-provided function returns an error, zstd will either fall back
+ * to an internal sequence producer or fail the compression operation. The user can
+ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback
+ * cParam. Fallback compression will follow any other cParam settings, such as
+ * compression level, the same as in a normal compression operation.
+ *
+ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F
+ * function by calling
+ *         ZSTD_registerSequenceProducer(cctx,
+ *                                       sequenceProducerState,
+ *                                       sequenceProducer)
+ * This setting will persist until the next parameter reset of the CCtx.
+ *
+ * The sequenceProducerState must be initialized by the user before calling
+ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the
+ * sequenceProducerState.
+ *
+ * *** LIMITATIONS ***
+ * This API is compatible with all zstd compression APIs which respect advanced parameters.
+ * However, there are three limitations:
+ *
+ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported.
+ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level
+ * external sequence producer.
+ *   - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some
+ *     cases (see its documentation for details). Users must explicitly set
+ *     ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external
+ *     sequence producer is registered.
+ *   - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
+ *     whenever the window size (1 << ZSTD_c_windowLog) is < 128MB, but that's subject
+ *     to change. Users should check the docs on ZSTD_c_enableLongDistanceMatching whenever
+ *     the Block-Level Sequence Producer API is used with advanced settings (like ZSTD_c_windowLog).
+ *
+ * Second, history buffers are not currently supported. Concretely, zstd will always pass
+ * dictSize == 0 to the external sequence producer (for now). This has two implications:
+ *   - Dictionaries are not currently supported. Compression will *not* fail if the user
+ *     references a dictionary, but the dictionary won't have any effect.
+ *   - Stream history is not currently supported. All advanced compression APIs, including
+ *     streaming APIs, work with external sequence producers, but each block is treated as
+ *     an independent chunk without history from previous blocks.
+ *
+ * Third, multi-threading within a single compression is not currently supported. In other words,
+ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered.
+ * Multi-threading across compressions is fine: simply create one CCtx per thread.
  *
- *  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
- *  But it's also a complex one, with several restrictions, documented below.
- *  Prefer normal streaming API for an easier experience.
- ********************************************************************* */
+ * Long-term, we plan to overcome all three limitations. There is no technical blocker to
+ * overcoming them. It is purely a question of engineering effort.
+ */
+
+#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
+
+typedef size_t (*ZSTD_sequenceProducer_F) (
+  void* sequenceProducerState,
+  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel,
+  size_t windowSize
+);
+
+/*! ZSTD_registerSequenceProducer() :
+ * Instruct zstd to use a block-level external sequence producer function.
+ *
+ * The sequenceProducerState must be initialized by the caller, and the caller is
+ * responsible for managing its lifetime. This parameter is sticky across
+ * compressions. It will remain set until the user explicitly resets compression
+ * parameters.
+ *
+ * Sequence producer registration is considered to be an "advanced parameter",
+ * part of the "advanced API". This means it will only have an effect on compression
+ * APIs which respect advanced parameters, such as compress2() and compressStream2().
+ * Older compression APIs such as compressCCtx(), which predate the introduction of
+ * "advanced parameters", will ignore any external sequence producer setting.
+ *
+ * The sequence producer can be "cleared" by registering a NULL function pointer. This
+ * removes all limitations described above in the "LIMITATIONS" section of the API docs.
+ *
+ * The user is strongly encouraged to read the full API documentation (above) before
+ * calling this function. */
+ZSTDLIB_STATIC_API void
+ZSTD_registerSequenceProducer(
+  ZSTD_CCtx* cctx,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F sequenceProducer
+);
+
+/*! ZSTD_CCtxParams_registerSequenceProducer() :
+ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params.
+ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(),
+ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx().
+ *
+ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx()
+ * is required, then this function is for you. Otherwise, you probably don't need it.
+ *
+ * See tests/zstreamtest.c for example usage. */
+ZSTDLIB_STATIC_API void
+ZSTD_CCtxParams_registerSequenceProducer(
+  ZSTD_CCtx_params* params,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F sequenceProducer
+);
+
+
+/*********************************************************************
+*  Buffer-less and synchronous inner streaming functions (DEPRECATED)
+*
+*  This API is deprecated, and will be removed in a future version.
+*  It allows streaming (de)compression with user allocated buffers.
+*  However, it is hard to use, and not as well tested as the rest of
+*  our API.
+*
+*  Please use the normal streaming API instead: ZSTD_compressStream2,
+*  and ZSTD_decompressStream.
+*  If there is functionality that you need but the normal streaming API
+*  doesn't provide, please open an issue on our GitHub.
+********************************************************************* */
 
 /**
   Buffer-less streaming compression (synchronous mode)
 
   A ZSTD_CCtx object is required to track streaming operations.
   Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
-  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+  ZSTD_CCtx object can be reused multiple times within successive compression operations.
 
   Start by initializing a context.
   Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
@@ -2597,19 +2910,24 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
   It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
   Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
 
-  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+  `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again.
 */
 
 /*=====   Buffer-less streaming compression functions  =====*/
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
 
 ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.")
 ZSTDLIB_STATIC_API
 size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
 
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */
@@ -2618,13 +2936,13 @@ ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
 ZSTDLIB_STATIC_API
-size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
 /**
   Buffer-less streaming decompression (synchronous mode)
 
   A ZSTD_DCtx object is required to track streaming operations.
   Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
-  A ZSTD_DCtx object can be re-used multiple times.
+  A ZSTD_DCtx object can be reused multiple times.
 
   First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
   Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
@@ -2693,31 +3011,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 */
 
 /*=====   Buffer-less streaming decompression functions  =====*/
-typedef enum { ZSTD_frame,
-	           ZSTD_skippableFrame } ZSTD_frameType_e;
-typedef struct {
-	unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
-	unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
-	unsigned blockSizeMax;
-	ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
-	unsigned headerSize;
-	unsigned dictID;
-	unsigned checksumFlag;
-	unsigned _reserved1;
-	unsigned _reserved2;
-} ZSTD_frameHeader;
 
-/*! ZSTD_getFrameHeader() :
- *  decode Frame Header, or requires larger `srcSize`.
- * @return : 0, `zfhPtr` is correctly filled,
- *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */
-/*! ZSTD_getFrameHeader_advanced() :
- *  same as ZSTD_getFrameHeader(),
- *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
-ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
 
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
@@ -2728,20 +3023,30 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, si
 
 /* misc */
 ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.")
-ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
-typedef enum { ZSTDnit_frameHeader,
-	           ZSTDnit_blockHeader,
-	           ZSTDnit_block,
-	           ZSTDnit_lastBlock,
-	           ZSTDnit_checksum,
-	           ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_STATIC_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
 ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 
-/* ============================ */
-/**       Block level API       */
-/* ============================ */
+
+
+
+/* ========================================= */
+/**       Block level API (DEPRECATED)       */
+/* ========================================= */
 
 /*!
+
+    This API is deprecated in favor of the regular compression API.
+    You can get the frame header down to 2 bytes by setting:
+      - ZSTD_c_format = ZSTD_f_zstd1_magicless
+      - ZSTD_c_contentSizeFlag = 0
+      - ZSTD_c_checksumFlag = 0
+      - ZSTD_c_dictIDFlag = 0
+
+    This API is not as well tested as our normal API, so we recommend not using it.
+    We will be removing it in a future version. If the normal API doesn't provide
+    the functionality you need, please open a GitHub issue.
+
     Block functions produce and decode raw zstd blocks, without frame metadata.
     Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
     But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
@@ -2768,172 +3073,17 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 */
 
 /*=====   Raw zstd block functions  =====*/
-ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx);
-ZSTDLIB_STATIC_API size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
+ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
+ZSTDLIB_STATIC_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
-ZSTDLIB_STATIC_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
-
-/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API *********************
- *
- * *** OVERVIEW ***
- * The Block-Level Sequence Producer API allows users to provide their own custom
- * sequence producer which libzstd invokes to process each block. The produced list
- * of sequences (literals and matches) is then post-processed by libzstd to produce
- * valid compressed blocks.
- *
- * This block-level offload API is a more granular complement of the existing
- * frame-level offload API compressSequences() (introduced in v1.5.1). It offers
- * an easier migration story for applications already integrated with libzstd: the
- * user application continues to invoke the same compression functions
- * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits
- * from the specific advantages of the external sequence producer. For example,
- * the sequence producer could be tuned to take advantage of known characteristics
- * of the input, to offer better speed / ratio, or could leverage hardware
- * acceleration not available within libzstd itself.
- *
- * See contrib/externalSequenceProducer for an example program employing the
- * Block-Level Sequence Producer API.
- *
- * *** USAGE ***
- * The user is responsible for implementing a function of type
- * ZSTD_sequenceProducer_F. For each block, zstd will pass the following
- * arguments to the user-provided function:
- *
- *   - sequenceProducerState: a pointer to a user-managed state for the sequence
- *     producer.
- *
- *   - outSeqs, outSeqsCapacity: an output buffer for the sequence producer.
- *     outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory
- *     backing outSeqs is managed by the CCtx.
- *
- *   - src, srcSize: an input buffer for the sequence producer to parse.
- *     srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
- *
- *   - dict, dictSize: a history buffer, which may be empty, which the sequence
- *     producer may reference as it parses the src buffer. Currently, zstd will
- *     always pass dictSize == 0 into external sequence producers, but this will
- *     change in the future.
- *
- *   - compressionLevel: a signed integer representing the zstd compression level
- *     set by the user for the current operation. The sequence producer may choose
- *     to use this information to change its compression strategy and speed/ratio
- *     tradeoff. Note: the compression level does not reflect zstd parameters set
- *     through the advanced API.
- *
- *   - windowSize: a size_t representing the maximum allowed offset for external
- *     sequences. Note that sequence offsets are sometimes allowed to exceed the
- *     windowSize if a dictionary is present, see doc/zstd_compression_format.md
- *     for details.
- *
- * The user-provided function shall return a size_t representing the number of
- * sequences written to outSeqs. This return value will be treated as an error
- * code if it is greater than outSeqsCapacity. The return value must be non-zero
- * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided
- * for convenience, but any value greater than outSeqsCapacity will be treated as
- * an error code.
- *
- * If the user-provided function does not return an error code, the sequences
- * written to outSeqs must be a valid parse of the src buffer. Data corruption may
- * occur if the parse is not valid. A parse is defined to be valid if the
- * following conditions hold:
- *   - The sum of matchLengths and literalLengths must equal srcSize.
- *   - All sequences in the parse, except for the final sequence, must have
- *     matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have
- *     matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0.
- *   - All offsets must respect the windowSize parameter as specified in
- *     doc/zstd_compression_format.md.
- *   - If the final sequence has matchLength == 0, it must also have offset == 0.
- *
- * zstd will only validate these conditions (and fail compression if they do not
- * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
- * validation has a performance cost.
- *
- * If the user-provided function returns an error, zstd will either fall back
- * to an internal sequence producer or fail the compression operation. The user can
- * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback
- * cParam. Fallback compression will follow any other cParam settings, such as
- * compression level, the same as in a normal compression operation.
- *
- * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F
- * function by calling
- *         ZSTD_registerSequenceProducer(cctx,
- *                                       sequenceProducerState,
- *                                       sequenceProducer)
- * This setting will persist until the next parameter reset of the CCtx.
- *
- * The sequenceProducerState must be initialized by the user before calling
- * ZSTD_registerSequenceProducer(). The user is responsible for destroying the
- * sequenceProducerState.
- *
- * *** LIMITATIONS ***
- * This API is compatible with all zstd compression APIs which respect advanced parameters.
- * However, there are three limitations:
- *
- * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported.
- * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level
- * external sequence producer.
- *   - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some
- *     cases (see its documentation for details). Users must explicitly set
- *     ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external
- *     sequence producer is registered.
- *   - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
- *     whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should
- *     check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence
- *     Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog).
- *
- * Second, history buffers are not currently supported. Concretely, zstd will always pass
- * dictSize == 0 to the external sequence producer (for now). This has two implications:
- *   - Dictionaries are not currently supported. Compression will *not* fail if the user
- *     references a dictionary, but the dictionary won't have any effect.
- *   - Stream history is not currently supported. All advanced compression APIs, including
- *     streaming APIs, work with external sequence producers, but each block is treated as
- *     an independent chunk without history from previous blocks.
- *
- * Third, multi-threading within a single compression is not currently supported. In other words,
- * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered.
- * Multi-threading across compressions is fine: simply create one CCtx per thread.
- *
- * Long-term, we plan to overcome all three limitations. There is no technical blocker to
- * overcoming them. It is purely a question of engineering effort.
- */
-
-#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
-
-typedef size_t ZSTD_sequenceProducer_F(
-    void* sequenceProducerState,
-    ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
-    const void* src, size_t srcSize,
-    const void* dict, size_t dictSize,
-    int compressionLevel,
-    size_t windowSize);
-
-/*! ZSTD_registerSequenceProducer() :
- * Instruct zstd to use a block-level external sequence producer function.
- *
- * The sequenceProducerState must be initialized by the caller, and the caller is
- * responsible for managing its lifetime. This parameter is sticky across
- * compressions. It will remain set until the user explicitly resets compression
- * parameters.
- *
- * Sequence producer registration is considered to be an "advanced parameter",
- * part of the "advanced API". This means it will only have an effect on compression
- * APIs which respect advanced parameters, such as compress2() and compressStream2().
- * Older compression APIs such as compressCCtx(), which predate the introduction of
- * "advanced parameters", will ignore any external sequence producer setting.
- *
- * The sequence producer can be "cleared" by registering a NULL function pointer. This
- * removes all limitations described above in the "LIMITATIONS" section of the API docs.
- *
- * The user is strongly encouraged to read the full API documentation (above) before
- * calling this function. */
-ZSTDLIB_STATIC_API void
-ZSTD_registerSequenceProducer(
-    ZSTD_CCtx* cctx,
-    void* sequenceProducerState,
-    ZSTD_sequenceProducer_F* sequenceProducer);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
+ZSTDLIB_STATIC_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
 
-#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+#endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
diff --git a/src/zstd/zstd_compress.c b/src/zstd/zstd_compress.c
deleted file mode 100644
index fc7ceeafb..000000000
--- a/src/zstd/zstd_compress.c
+++ /dev/null
@@ -1,6835 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/*-*************************************
- *  Dependencies
- ***************************************/
-#include "zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
-#include "mem.h"
-#include "hist.h"               /* HIST_countFast_wksp */
-#define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */
-#include "fse.h"
-#include "huf.h"
-#include "zstd_compress_internal.h"
-#include "zstd_compress_sequences.h"
-#include "zstd_compress_literals.h"
-#include "zstd_fast.h"
-#include "zstd_double_fast.h"
-#include "zstd_lazy.h"
-#include "zstd_opt.h"
-#include "zstd_ldm.h"
-#include "zstd_compress_superblock.h"
-#include "bits.h" /* ZSTD_highbit32 */
-
-/* ***************************************************************
- *  Tuning parameters
- *****************************************************************/
-/*!
- * COMPRESS_HEAPMODE :
- * Select how default decompression function ZSTD_compress() allocates its context,
- * on stack (0, default), or into heap (1).
- * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected.
- */
-#ifndef ZSTD_COMPRESS_HEAPMODE
-#define ZSTD_COMPRESS_HEAPMODE 0
-#endif
-
-/*!
- * ZSTD_HASHLOG3_MAX :
- * Maximum size of the hash table dedicated to find 3-bytes matches,
- * in log format, aka 17 => 1 << 17 == 128Ki positions.
- * This structure is only used in zstd_opt.
- * Since allocation is centralized for all strategies, it has to be known here.
- * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3,
- * so that zstd_opt.c doesn't need to know about this constant.
- */
-#ifndef ZSTD_HASHLOG3_MAX
-#define ZSTD_HASHLOG3_MAX 17
-#endif
-
-/*-*************************************
- *  Helper functions
- ***************************************/
-/* ZSTD_compressBound()
- * Note that the result from this function is only valid for
- * the one-pass compression functions.
- * When employing the streaming mode,
- * if flushes are frequently altering the size of blocks,
- * the overhead from block headers can make the compressed data larger
- * than the return value of ZSTD_compressBound().
- */
-size_t ZSTD_compressBound(size_t srcSize) {
-	size_t const r = ZSTD_COMPRESSBOUND(srcSize);
-	if(r == 0)
-		return ERROR(srcSize_wrong);
-	return r;
-}
-
-/*-*************************************
- *  Context memory management
- ***************************************/
-struct ZSTD_CDict_s {
-	const void* dictContent;
-	size_t dictContentSize;
-	ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */
-	U32* entropyWorkspace;                  /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
-	ZSTD_cwksp workspace;
-	ZSTD_matchState_t matchState;
-	ZSTD_compressedBlockState_t cBlockState;
-	ZSTD_customMem customMem;
-	U32 dictID;
-	int compressionLevel;                 /* 0 indicates that advanced API was used to select CDict params */
-	ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use
-	                                       * row-based matchfinder. Unless the cdict is reloaded, we will use
-	                                       * the same greedy/lazy matchfinder at compression time.
-	                                       */
-};                                        /* typedef'd to ZSTD_CDict within "zstd.h" */
-
-ZSTD_CCtx* ZSTD_createCCtx(void) {
-	return ZSTD_createCCtx_advanced(ZSTD_defaultCMem);
-}
-
-static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) {
-	assert(cctx != NULL);
-	ZSTD_memset(cctx, 0, sizeof(*cctx));
-	cctx->customMem = memManager;
-	cctx->bmi2 = ZSTD_cpuSupportsBmi2();
-	{
-		size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);
-		assert(!ZSTD_isError(err));
-		(void)err;
-	}
-}
-
-ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) {
-	ZSTD_STATIC_ASSERT(zcss_init == 0);
-	ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (0ULL - 1));
-	if((!customMem.customAlloc) ^ (!customMem.customFree))
-		return NULL;
-	{
-		ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_customMalloc(sizeof(ZSTD_CCtx), customMem);
-		if(!cctx)
-			return NULL;
-		ZSTD_initCCtx(cctx, customMem);
-		return cctx;
-	}
-}
-
-ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) {
-	ZSTD_cwksp ws;
-	ZSTD_CCtx* cctx;
-	if(workspaceSize <= sizeof(ZSTD_CCtx))
-		return NULL; /* minimum size */
-	if((size_t)workspace & 7)
-		return NULL; /* must be 8-aligned */
-	ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc);
-
-	cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx));
-	if(cctx == NULL)
-		return NULL;
-
-	ZSTD_memset(cctx, 0, sizeof(ZSTD_CCtx));
-	ZSTD_cwksp_move(&cctx->workspace, &ws);
-	cctx->staticSize = workspaceSize;
-
-	/* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
-	if(!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t)))
-		return NULL;
-	cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
-	cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
-	cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE);
-	cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
-	return cctx;
-}
-
-/**
- * Clears and frees all of the dictionaries in the CCtx.
- */
-static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx) {
-	ZSTD_customFree(cctx->localDict.dictBuffer, cctx->customMem);
-	ZSTD_freeCDict(cctx->localDict.cdict);
-	ZSTD_memset(&cctx->localDict, 0, sizeof(cctx->localDict));
-	ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));
-	cctx->cdict = NULL;
-}
-
-static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict) {
-	size_t const bufferSize = dict.dictBuffer != NULL ? dict.dictSize : 0;
-	size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict);
-	return bufferSize + cdictSize;
-}
-
-static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) {
-	assert(cctx != NULL);
-	assert(cctx->staticSize == 0);
-	ZSTD_clearAllDicts(cctx);
-#ifdef ZSTD_MULTITHREAD
-	ZSTDMT_freeCCtx(cctx->mtctx);
-	cctx->mtctx = NULL;
-#endif
-	ZSTD_cwksp_free(&cctx->workspace, cctx->customMem);
-}
-
-size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) {
-	if(cctx == NULL)
-		return 0; /* support free on NULL */
-	RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
-	                "not compatible with static CCtx");
-	{
-		int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
-		ZSTD_freeCCtxContent(cctx);
-		if(!cctxInWorkspace)
-			ZSTD_customFree(cctx, cctx->customMem);
-	}
-	return 0;
-}
-
-static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx) {
-#ifdef ZSTD_MULTITHREAD
-	return ZSTDMT_sizeof_CCtx(cctx->mtctx);
-#else
-	(void)cctx;
-	return 0;
-#endif
-}
-
-size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx) {
-	if(cctx == NULL)
-		return 0; /* support sizeof on NULL */
-	/* cctx may be in the workspace */
-	return (cctx->workspace.workspace == cctx ? 0 : sizeof(*cctx)) + ZSTD_cwksp_sizeof(&cctx->workspace) + ZSTD_sizeof_localDict(cctx->localDict) + ZSTD_sizeof_mtctx(cctx);
-}
-
-size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) {
-	return ZSTD_sizeof_CCtx(zcs); /* same object */
-}
-
-/* private API call, for dictBuilder only */
-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); }
-
-/* Returns true if the strategy supports using a row based matchfinder */
-static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) {
-	return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2);
-}
-
-/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder
- * for this compression.
- */
-static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) {
-	assert(mode != ZSTD_ps_auto);
-	return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable);
-}
-
-/* Returns row matchfinder usage given an initial mode and cParams */
-static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode,
-                                                         const ZSTD_compressionParameters* const cParams) {
-#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON)
-	int const kHasSIMD128 = 1;
-#else
-	int const kHasSIMD128 = 0;
-#endif
-	if(mode != ZSTD_ps_auto)
-		return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */
-	mode = ZSTD_ps_disable;
-	if(!ZSTD_rowMatchFinderSupported(cParams->strategy))
-		return mode;
-	if(kHasSIMD128) {
-		if(cParams->windowLog > 14)
-			mode = ZSTD_ps_enable;
-	} else {
-		if(cParams->windowLog > 17)
-			mode = ZSTD_ps_enable;
-	}
-	return mode;
-}
-
-/* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */
-static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode,
-                                                        const ZSTD_compressionParameters* const cParams) {
-	if(mode != ZSTD_ps_auto)
-		return mode;
-	return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable;
-}
-
-/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */
-static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
-                                   const ZSTD_paramSwitch_e useRowMatchFinder,
-                                   const U32 forDDSDict) {
-	assert(useRowMatchFinder != ZSTD_ps_auto);
-	/* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate.
-	 * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder.
-	 */
-	return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
-}
-
-/* Returns ZSTD_ps_enable if compression parameters are such that we should
- * enable long distance matching (wlog >= 27, strategy >= btopt).
- * Returns ZSTD_ps_disable otherwise.
- */
-static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
-                                                const ZSTD_compressionParameters* const cParams) {
-	if(mode != ZSTD_ps_auto)
-		return mode;
-	return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
-}
-
-static int ZSTD_resolveExternalSequenceValidation(int mode) {
-	return mode;
-}
-
-/* Resolves maxBlockSize to the default if no value is present. */
-static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
-	if(maxBlockSize == 0) {
-		return ZSTD_BLOCKSIZE_MAX;
-	} else {
-		return maxBlockSize;
-	}
-}
-
-static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
-	if(value != ZSTD_ps_auto)
-		return value;
-	if(cLevel < 10) {
-		return ZSTD_ps_disable;
-	} else {
-		return ZSTD_ps_enable;
-	}
-}
-
-/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
- * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
-static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
-	return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
-}
-
-static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
-    ZSTD_compressionParameters cParams) {
-	ZSTD_CCtx_params cctxParams;
-	/* should not matter, as all cParams are presumed properly defined */
-	ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT);
-	cctxParams.cParams = cParams;
-
-	/* Adjust advanced params according to cParams */
-	cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams);
-	if(cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) {
-		ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams);
-		assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog);
-		assert(cctxParams.ldmParams.hashRateLog < 32);
-	}
-	cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
-	cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
-	cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
-	cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
-	cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes,
-	                                                                         cctxParams.compressionLevel);
-	assert(!ZSTD_checkCParams(cParams));
-	return cctxParams;
-}
-
-static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced(
-    ZSTD_customMem customMem) {
-	ZSTD_CCtx_params* params;
-	if((!customMem.customAlloc) ^ (!customMem.customFree))
-		return NULL;
-	params = (ZSTD_CCtx_params*)ZSTD_customCalloc(
-	    sizeof(ZSTD_CCtx_params), customMem);
-	if(!params) {
-		return NULL;
-	}
-	ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);
-	params->customMem = customMem;
-	return params;
-}
-
-ZSTD_CCtx_params* ZSTD_createCCtxParams(void) {
-	return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem);
-}
-
-size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params) {
-	if(params == NULL) {
-		return 0;
-	}
-	ZSTD_customFree(params, params->customMem);
-	return 0;
-}
-
-size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params) {
-	return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);
-}
-
-size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) {
-	RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
-	ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
-	cctxParams->compressionLevel = compressionLevel;
-	cctxParams->fParams.contentSizeFlag = 1;
-	return 0;
-}
-
-#define ZSTD_NO_CLEVEL 0
-
-/**
- * Initializes `cctxParams` from `params` and `compressionLevel`.
- * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL.
- */
-static void
-ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
-                              const ZSTD_parameters* params,
-                              int compressionLevel) {
-	assert(!ZSTD_checkCParams(params->cParams));
-	ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
-	cctxParams->cParams = params->cParams;
-	cctxParams->fParams = params->fParams;
-	/* Should not matter, as all cParams are presumed properly defined.
-	 * But, set it for tracing anyway.
-	 */
-	cctxParams->compressionLevel = compressionLevel;
-	cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
-	cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
-	cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
-	cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
-	cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
-	cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
-	DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
-	         cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
-}
-
-size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) {
-	RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
-	FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
-	ZSTD_CCtxParams_init_internal(cctxParams, &params, ZSTD_NO_CLEVEL);
-	return 0;
-}
-
-/**
- * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
- * @param params Validated zstd parameters.
- */
-static void ZSTD_CCtxParams_setZstdParams(
-    ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) {
-	assert(!ZSTD_checkCParams(params->cParams));
-	cctxParams->cParams = params->cParams;
-	cctxParams->fParams = params->fParams;
-	/* Should not matter, as all cParams are presumed properly defined.
-	 * But, set it for tracing anyway.
-	 */
-	cctxParams->compressionLevel = ZSTD_NO_CLEVEL;
-}
-
-ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) {
-	ZSTD_bounds bounds = {0, 0, 0};
-
-	switch(param) {
-	case ZSTD_c_compressionLevel:
-		bounds.lowerBound = ZSTD_minCLevel();
-		bounds.upperBound = ZSTD_maxCLevel();
-		return bounds;
-
-	case ZSTD_c_windowLog:
-		bounds.lowerBound = ZSTD_WINDOWLOG_MIN;
-		bounds.upperBound = ZSTD_WINDOWLOG_MAX;
-		return bounds;
-
-	case ZSTD_c_hashLog:
-		bounds.lowerBound = ZSTD_HASHLOG_MIN;
-		bounds.upperBound = ZSTD_HASHLOG_MAX;
-		return bounds;
-
-	case ZSTD_c_chainLog:
-		bounds.lowerBound = ZSTD_CHAINLOG_MIN;
-		bounds.upperBound = ZSTD_CHAINLOG_MAX;
-		return bounds;
-
-	case ZSTD_c_searchLog:
-		bounds.lowerBound = ZSTD_SEARCHLOG_MIN;
-		bounds.upperBound = ZSTD_SEARCHLOG_MAX;
-		return bounds;
-
-	case ZSTD_c_minMatch:
-		bounds.lowerBound = ZSTD_MINMATCH_MIN;
-		bounds.upperBound = ZSTD_MINMATCH_MAX;
-		return bounds;
-
-	case ZSTD_c_targetLength:
-		bounds.lowerBound = ZSTD_TARGETLENGTH_MIN;
-		bounds.upperBound = ZSTD_TARGETLENGTH_MAX;
-		return bounds;
-
-	case ZSTD_c_strategy:
-		bounds.lowerBound = ZSTD_STRATEGY_MIN;
-		bounds.upperBound = ZSTD_STRATEGY_MAX;
-		return bounds;
-
-	case ZSTD_c_contentSizeFlag:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_checksumFlag:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_dictIDFlag:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_nbWorkers:
-		bounds.lowerBound = 0;
-#ifdef ZSTD_MULTITHREAD
-		bounds.upperBound = ZSTDMT_NBWORKERS_MAX;
-#else
-		bounds.upperBound = 0;
-#endif
-		return bounds;
-
-	case ZSTD_c_jobSize:
-		bounds.lowerBound = 0;
-#ifdef ZSTD_MULTITHREAD
-		bounds.upperBound = ZSTDMT_JOBSIZE_MAX;
-#else
-		bounds.upperBound = 0;
-#endif
-		return bounds;
-
-	case ZSTD_c_overlapLog:
-#ifdef ZSTD_MULTITHREAD
-		bounds.lowerBound = ZSTD_OVERLAPLOG_MIN;
-		bounds.upperBound = ZSTD_OVERLAPLOG_MAX;
-#else
-		bounds.lowerBound = 0;
-		bounds.upperBound = 0;
-#endif
-		return bounds;
-
-	case ZSTD_c_enableDedicatedDictSearch:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_enableLongDistanceMatching:
-		bounds.lowerBound = (int)ZSTD_ps_auto;
-		bounds.upperBound = (int)ZSTD_ps_disable;
-		return bounds;
-
-	case ZSTD_c_ldmHashLog:
-		bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN;
-		bounds.upperBound = ZSTD_LDM_HASHLOG_MAX;
-		return bounds;
-
-	case ZSTD_c_ldmMinMatch:
-		bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN;
-		bounds.upperBound = ZSTD_LDM_MINMATCH_MAX;
-		return bounds;
-
-	case ZSTD_c_ldmBucketSizeLog:
-		bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN;
-		bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX;
-		return bounds;
-
-	case ZSTD_c_ldmHashRateLog:
-		bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN;
-		bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX;
-		return bounds;
-
-	/* experimental parameters */
-	case ZSTD_c_rsyncable:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_forceMaxWindow:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_format:
-		ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
-		bounds.lowerBound = ZSTD_f_zstd1;
-		bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? */
-		return bounds;
-
-	case ZSTD_c_forceAttachDict:
-		ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceLoad);
-		bounds.lowerBound = ZSTD_dictDefaultAttach;
-		bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */
-		return bounds;
-
-	case ZSTD_c_literalCompressionMode:
-		ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable);
-		bounds.lowerBound = (int)ZSTD_ps_auto;
-		bounds.upperBound = (int)ZSTD_ps_disable;
-		return bounds;
-
-	case ZSTD_c_targetCBlockSize:
-		bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN;
-		bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX;
-		return bounds;
-
-	case ZSTD_c_srcSizeHint:
-		bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN;
-		bounds.upperBound = ZSTD_SRCSIZEHINT_MAX;
-		return bounds;
-
-	case ZSTD_c_stableInBuffer:
-	case ZSTD_c_stableOutBuffer:
-		bounds.lowerBound = (int)ZSTD_bm_buffered;
-		bounds.upperBound = (int)ZSTD_bm_stable;
-		return bounds;
-
-	case ZSTD_c_blockDelimiters:
-		bounds.lowerBound = (int)ZSTD_sf_noBlockDelimiters;
-		bounds.upperBound = (int)ZSTD_sf_explicitBlockDelimiters;
-		return bounds;
-
-	case ZSTD_c_validateSequences:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_useBlockSplitter:
-		bounds.lowerBound = (int)ZSTD_ps_auto;
-		bounds.upperBound = (int)ZSTD_ps_disable;
-		return bounds;
-
-	case ZSTD_c_useRowMatchFinder:
-		bounds.lowerBound = (int)ZSTD_ps_auto;
-		bounds.upperBound = (int)ZSTD_ps_disable;
-		return bounds;
-
-	case ZSTD_c_deterministicRefPrefix:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_prefetchCDictTables:
-		bounds.lowerBound = (int)ZSTD_ps_auto;
-		bounds.upperBound = (int)ZSTD_ps_disable;
-		return bounds;
-
-	case ZSTD_c_enableSeqProducerFallback:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	case ZSTD_c_maxBlockSize:
-		bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
-		bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
-		return bounds;
-
-	case ZSTD_c_searchForExternalRepcodes:
-		bounds.lowerBound = (int)ZSTD_ps_auto;
-		bounds.upperBound = (int)ZSTD_ps_disable;
-		return bounds;
-
-	default:
-		bounds.error = ERROR(parameter_unsupported);
-		return bounds;
-	}
-}
-
-/* ZSTD_cParam_clampBounds:
- * Clamps the value into the bounded range.
- */
-static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) {
-	ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);
-	if(ZSTD_isError(bounds.error))
-		return bounds.error;
-	if(*value < bounds.lowerBound)
-		*value = bounds.lowerBound;
-	if(*value > bounds.upperBound)
-		*value = bounds.upperBound;
-	return 0;
-}
-
-#define BOUNDCHECK(cParam, val)                                       \
-	{                                                                 \
-		RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam, val),       \
-		                parameter_outOfBound, "Param out of bounds"); \
-	}
-
-static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) {
-	switch(param) {
-	case ZSTD_c_compressionLevel:
-	case ZSTD_c_hashLog:
-	case ZSTD_c_chainLog:
-	case ZSTD_c_searchLog:
-	case ZSTD_c_minMatch:
-	case ZSTD_c_targetLength:
-	case ZSTD_c_strategy:
-		return 1;
-
-	case ZSTD_c_format:
-	case ZSTD_c_windowLog:
-	case ZSTD_c_contentSizeFlag:
-	case ZSTD_c_checksumFlag:
-	case ZSTD_c_dictIDFlag:
-	case ZSTD_c_forceMaxWindow:
-	case ZSTD_c_nbWorkers:
-	case ZSTD_c_jobSize:
-	case ZSTD_c_overlapLog:
-	case ZSTD_c_rsyncable:
-	case ZSTD_c_enableDedicatedDictSearch:
-	case ZSTD_c_enableLongDistanceMatching:
-	case ZSTD_c_ldmHashLog:
-	case ZSTD_c_ldmMinMatch:
-	case ZSTD_c_ldmBucketSizeLog:
-	case ZSTD_c_ldmHashRateLog:
-	case ZSTD_c_forceAttachDict:
-	case ZSTD_c_literalCompressionMode:
-	case ZSTD_c_targetCBlockSize:
-	case ZSTD_c_srcSizeHint:
-	case ZSTD_c_stableInBuffer:
-	case ZSTD_c_stableOutBuffer:
-	case ZSTD_c_blockDelimiters:
-	case ZSTD_c_validateSequences:
-	case ZSTD_c_useBlockSplitter:
-	case ZSTD_c_useRowMatchFinder:
-	case ZSTD_c_deterministicRefPrefix:
-	case ZSTD_c_prefetchCDictTables:
-	case ZSTD_c_enableSeqProducerFallback:
-	case ZSTD_c_maxBlockSize:
-	case ZSTD_c_searchForExternalRepcodes:
-	default:
-		return 0;
-	}
-}
-
-size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) {
-	DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value);
-	if(cctx->streamStage != zcss_init) {
-		if(ZSTD_isUpdateAuthorized(param)) {
-			cctx->cParamsChanged = 1;
-		} else {
-			RETURN_ERROR(stage_wrong, "can only set params in cctx init stage");
-		}
-	}
-
-	switch(param) {
-	case ZSTD_c_nbWorkers:
-		RETURN_ERROR_IF((value != 0) && cctx->staticSize, parameter_unsupported,
-		                "MT not compatible with static alloc");
-		break;
-
-	case ZSTD_c_compressionLevel:
-	case ZSTD_c_windowLog:
-	case ZSTD_c_hashLog:
-	case ZSTD_c_chainLog:
-	case ZSTD_c_searchLog:
-	case ZSTD_c_minMatch:
-	case ZSTD_c_targetLength:
-	case ZSTD_c_strategy:
-	case ZSTD_c_ldmHashRateLog:
-	case ZSTD_c_format:
-	case ZSTD_c_contentSizeFlag:
-	case ZSTD_c_checksumFlag:
-	case ZSTD_c_dictIDFlag:
-	case ZSTD_c_forceMaxWindow:
-	case ZSTD_c_forceAttachDict:
-	case ZSTD_c_literalCompressionMode:
-	case ZSTD_c_jobSize:
-	case ZSTD_c_overlapLog:
-	case ZSTD_c_rsyncable:
-	case ZSTD_c_enableDedicatedDictSearch:
-	case ZSTD_c_enableLongDistanceMatching:
-	case ZSTD_c_ldmHashLog:
-	case ZSTD_c_ldmMinMatch:
-	case ZSTD_c_ldmBucketSizeLog:
-	case ZSTD_c_targetCBlockSize:
-	case ZSTD_c_srcSizeHint:
-	case ZSTD_c_stableInBuffer:
-	case ZSTD_c_stableOutBuffer:
-	case ZSTD_c_blockDelimiters:
-	case ZSTD_c_validateSequences:
-	case ZSTD_c_useBlockSplitter:
-	case ZSTD_c_useRowMatchFinder:
-	case ZSTD_c_deterministicRefPrefix:
-	case ZSTD_c_prefetchCDictTables:
-	case ZSTD_c_enableSeqProducerFallback:
-	case ZSTD_c_maxBlockSize:
-	case ZSTD_c_searchForExternalRepcodes:
-		break;
-
-	default:
-		RETURN_ERROR(parameter_unsupported, "unknown parameter");
-	}
-	return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value);
-}
-
-size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
-                                    ZSTD_cParameter param, int value) {
-	DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value);
-	switch(param) {
-	case ZSTD_c_format:
-		BOUNDCHECK(ZSTD_c_format, value);
-		CCtxParams->format = (ZSTD_format_e)value;
-		return (size_t)CCtxParams->format;
-
-	case ZSTD_c_compressionLevel: {
-		FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
-		if(value == 0)
-			CCtxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* 0 == default */
-		else
-			CCtxParams->compressionLevel = value;
-		if(CCtxParams->compressionLevel >= 0)
-			return (size_t)CCtxParams->compressionLevel;
-		return 0; /* return type (size_t) cannot represent negative values */
-	}
-
-	case ZSTD_c_windowLog:
-		if(value != 0) /* 0 => use default */
-			BOUNDCHECK(ZSTD_c_windowLog, value);
-		CCtxParams->cParams.windowLog = (U32)value;
-		return CCtxParams->cParams.windowLog;
-
-	case ZSTD_c_hashLog:
-		if(value != 0) /* 0 => use default */
-			BOUNDCHECK(ZSTD_c_hashLog, value);
-		CCtxParams->cParams.hashLog = (U32)value;
-		return CCtxParams->cParams.hashLog;
-
-	case ZSTD_c_chainLog:
-		if(value != 0) /* 0 => use default */
-			BOUNDCHECK(ZSTD_c_chainLog, value);
-		CCtxParams->cParams.chainLog = (U32)value;
-		return CCtxParams->cParams.chainLog;
-
-	case ZSTD_c_searchLog:
-		if(value != 0) /* 0 => use default */
-			BOUNDCHECK(ZSTD_c_searchLog, value);
-		CCtxParams->cParams.searchLog = (U32)value;
-		return (size_t)value;
-
-	case ZSTD_c_minMatch:
-		if(value != 0) /* 0 => use default */
-			BOUNDCHECK(ZSTD_c_minMatch, value);
-		CCtxParams->cParams.minMatch = (U32)value;
-		return CCtxParams->cParams.minMatch;
-
-	case ZSTD_c_targetLength:
-		BOUNDCHECK(ZSTD_c_targetLength, value);
-		CCtxParams->cParams.targetLength = (U32)value;
-		return CCtxParams->cParams.targetLength;
-
-	case ZSTD_c_strategy:
-		if(value != 0) /* 0 => use default */
-			BOUNDCHECK(ZSTD_c_strategy, value);
-		CCtxParams->cParams.strategy = (ZSTD_strategy)value;
-		return (size_t)CCtxParams->cParams.strategy;
-
-	case ZSTD_c_contentSizeFlag:
-		/* Content size written in frame header _when known_ (default:1) */
-		DEBUGLOG(4, "set content size flag = %u", (value != 0));
-		CCtxParams->fParams.contentSizeFlag = value != 0;
-		return (size_t)CCtxParams->fParams.contentSizeFlag;
-
-	case ZSTD_c_checksumFlag:
-		/* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
-		CCtxParams->fParams.checksumFlag = value != 0;
-		return (size_t)CCtxParams->fParams.checksumFlag;
-
-	case ZSTD_c_dictIDFlag: /* When applicable, dictionary's dictID is provided in frame header (default:1) */
-		DEBUGLOG(4, "set dictIDFlag = %u", (value != 0));
-		CCtxParams->fParams.noDictIDFlag = !value;
-		return !CCtxParams->fParams.noDictIDFlag;
-
-	case ZSTD_c_forceMaxWindow:
-		CCtxParams->forceWindow = (value != 0);
-		return (size_t)CCtxParams->forceWindow;
-
-	case ZSTD_c_forceAttachDict: {
-		const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
-		BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
-		CCtxParams->attachDictPref = pref;
-		return CCtxParams->attachDictPref;
-	}
-
-	case ZSTD_c_literalCompressionMode: {
-		const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
-		BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
-		CCtxParams->literalCompressionMode = lcm;
-		return CCtxParams->literalCompressionMode;
-	}
-
-	case ZSTD_c_nbWorkers:
-#ifndef ZSTD_MULTITHREAD
-		RETURN_ERROR_IF(value != 0, parameter_unsupported, "not compiled with multithreading");
-		return 0;
-#else
-		FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
-		CCtxParams->nbWorkers = value;
-		return CCtxParams->nbWorkers;
-#endif
-
-	case ZSTD_c_jobSize:
-#ifndef ZSTD_MULTITHREAD
-		RETURN_ERROR_IF(value != 0, parameter_unsupported, "not compiled with multithreading");
-		return 0;
-#else
-		/* Adjust to the minimum non-default value. */
-		if(value != 0 && value < ZSTDMT_JOBSIZE_MIN)
-			value = ZSTDMT_JOBSIZE_MIN;
-		FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
-		assert(value >= 0);
-		CCtxParams->jobSize = value;
-		return CCtxParams->jobSize;
-#endif
-
-	case ZSTD_c_overlapLog:
-#ifndef ZSTD_MULTITHREAD
-		RETURN_ERROR_IF(value != 0, parameter_unsupported, "not compiled with multithreading");
-		return 0;
-#else
-		FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
-		CCtxParams->overlapLog = value;
-		return CCtxParams->overlapLog;
-#endif
-
-	case ZSTD_c_rsyncable:
-#ifndef ZSTD_MULTITHREAD
-		RETURN_ERROR_IF(value != 0, parameter_unsupported, "not compiled with multithreading");
-		return 0;
-#else
-		FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
-		CCtxParams->rsyncable = value;
-		return CCtxParams->rsyncable;
-#endif
-
-	case ZSTD_c_enableDedicatedDictSearch:
-		CCtxParams->enableDedicatedDictSearch = (value != 0);
-		return (size_t)CCtxParams->enableDedicatedDictSearch;
-
-	case ZSTD_c_enableLongDistanceMatching:
-		BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value);
-		CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
-		return CCtxParams->ldmParams.enableLdm;
-
-	case ZSTD_c_ldmHashLog:
-		if(value != 0) /* 0 ==> auto */
-			BOUNDCHECK(ZSTD_c_ldmHashLog, value);
-		CCtxParams->ldmParams.hashLog = (U32)value;
-		return CCtxParams->ldmParams.hashLog;
-
-	case ZSTD_c_ldmMinMatch:
-		if(value != 0) /* 0 ==> default */
-			BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
-		CCtxParams->ldmParams.minMatchLength = (U32)value;
-		return CCtxParams->ldmParams.minMatchLength;
-
-	case ZSTD_c_ldmBucketSizeLog:
-		if(value != 0) /* 0 ==> default */
-			BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
-		CCtxParams->ldmParams.bucketSizeLog = (U32)value;
-		return CCtxParams->ldmParams.bucketSizeLog;
-
-	case ZSTD_c_ldmHashRateLog:
-		if(value != 0) /* 0 ==> default */
-			BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
-		CCtxParams->ldmParams.hashRateLog = (U32)value;
-		return CCtxParams->ldmParams.hashRateLog;
-
-	case ZSTD_c_targetCBlockSize:
-		if(value != 0) /* 0 ==> default */
-			BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
-		CCtxParams->targetCBlockSize = (U32)value;
-		return CCtxParams->targetCBlockSize;
-
-	case ZSTD_c_srcSizeHint:
-		if(value != 0) /* 0 ==> default */
-			BOUNDCHECK(ZSTD_c_srcSizeHint, value);
-		CCtxParams->srcSizeHint = value;
-		return (size_t)CCtxParams->srcSizeHint;
-
-	case ZSTD_c_stableInBuffer:
-		BOUNDCHECK(ZSTD_c_stableInBuffer, value);
-		CCtxParams->inBufferMode = (ZSTD_bufferMode_e)value;
-		return CCtxParams->inBufferMode;
-
-	case ZSTD_c_stableOutBuffer:
-		BOUNDCHECK(ZSTD_c_stableOutBuffer, value);
-		CCtxParams->outBufferMode = (ZSTD_bufferMode_e)value;
-		return CCtxParams->outBufferMode;
-
-	case ZSTD_c_blockDelimiters:
-		BOUNDCHECK(ZSTD_c_blockDelimiters, value);
-		CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value;
-		return CCtxParams->blockDelimiters;
-
-	case ZSTD_c_validateSequences:
-		BOUNDCHECK(ZSTD_c_validateSequences, value);
-		CCtxParams->validateSequences = value;
-		return CCtxParams->validateSequences;
-
-	case ZSTD_c_useBlockSplitter:
-		BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
-		CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value;
-		return CCtxParams->useBlockSplitter;
-
-	case ZSTD_c_useRowMatchFinder:
-		BOUNDCHECK(ZSTD_c_useRowMatchFinder, value);
-		CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value;
-		return CCtxParams->useRowMatchFinder;
-
-	case ZSTD_c_deterministicRefPrefix:
-		BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
-		CCtxParams->deterministicRefPrefix = !!value;
-		return CCtxParams->deterministicRefPrefix;
-
-	case ZSTD_c_prefetchCDictTables:
-		BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
-		CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
-		return CCtxParams->prefetchCDictTables;
-
-	case ZSTD_c_enableSeqProducerFallback:
-		BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
-		CCtxParams->enableMatchFinderFallback = value;
-		return CCtxParams->enableMatchFinderFallback;
-
-	case ZSTD_c_maxBlockSize:
-		if(value != 0) /* 0 ==> default */
-			BOUNDCHECK(ZSTD_c_maxBlockSize, value);
-		CCtxParams->maxBlockSize = value;
-		return CCtxParams->maxBlockSize;
-
-	case ZSTD_c_searchForExternalRepcodes:
-		BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
-		CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
-		return CCtxParams->searchForExternalRepcodes;
-
-	default:
-		RETURN_ERROR(parameter_unsupported, "unknown parameter");
-	}
-}
-
-size_t ZSTD_CCtx_getParameter(ZSTD_CCtx const * cctx, ZSTD_cParameter param, int* value) {
-	return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value);
-}
-
-size_t ZSTD_CCtxParams_getParameter(
-    ZSTD_CCtx_params const * CCtxParams, ZSTD_cParameter param, int* value) {
-	switch(param) {
-	case ZSTD_c_format:
-		*value = CCtxParams->format;
-		break;
-	case ZSTD_c_compressionLevel:
-		*value = CCtxParams->compressionLevel;
-		break;
-	case ZSTD_c_windowLog:
-		*value = (int)CCtxParams->cParams.windowLog;
-		break;
-	case ZSTD_c_hashLog:
-		*value = (int)CCtxParams->cParams.hashLog;
-		break;
-	case ZSTD_c_chainLog:
-		*value = (int)CCtxParams->cParams.chainLog;
-		break;
-	case ZSTD_c_searchLog:
-		*value = CCtxParams->cParams.searchLog;
-		break;
-	case ZSTD_c_minMatch:
-		*value = CCtxParams->cParams.minMatch;
-		break;
-	case ZSTD_c_targetLength:
-		*value = CCtxParams->cParams.targetLength;
-		break;
-	case ZSTD_c_strategy:
-		*value = (unsigned)CCtxParams->cParams.strategy;
-		break;
-	case ZSTD_c_contentSizeFlag:
-		*value = CCtxParams->fParams.contentSizeFlag;
-		break;
-	case ZSTD_c_checksumFlag:
-		*value = CCtxParams->fParams.checksumFlag;
-		break;
-	case ZSTD_c_dictIDFlag:
-		*value = !CCtxParams->fParams.noDictIDFlag;
-		break;
-	case ZSTD_c_forceMaxWindow:
-		*value = CCtxParams->forceWindow;
-		break;
-	case ZSTD_c_forceAttachDict:
-		*value = CCtxParams->attachDictPref;
-		break;
-	case ZSTD_c_literalCompressionMode:
-		*value = CCtxParams->literalCompressionMode;
-		break;
-	case ZSTD_c_nbWorkers:
-#ifndef ZSTD_MULTITHREAD
-		assert(CCtxParams->nbWorkers == 0);
-#endif
-		*value = CCtxParams->nbWorkers;
-		break;
-	case ZSTD_c_jobSize:
-#ifndef ZSTD_MULTITHREAD
-		RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
-#else
-		assert(CCtxParams->jobSize <= INT_MAX);
-		*value = (int)CCtxParams->jobSize;
-		break;
-#endif
-	case ZSTD_c_overlapLog:
-#ifndef ZSTD_MULTITHREAD
-		RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
-#else
-		*value = CCtxParams->overlapLog;
-		break;
-#endif
-	case ZSTD_c_rsyncable:
-#ifndef ZSTD_MULTITHREAD
-		RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
-#else
-		*value = CCtxParams->rsyncable;
-		break;
-#endif
-	case ZSTD_c_enableDedicatedDictSearch:
-		*value = CCtxParams->enableDedicatedDictSearch;
-		break;
-	case ZSTD_c_enableLongDistanceMatching:
-		*value = CCtxParams->ldmParams.enableLdm;
-		break;
-	case ZSTD_c_ldmHashLog:
-		*value = CCtxParams->ldmParams.hashLog;
-		break;
-	case ZSTD_c_ldmMinMatch:
-		*value = CCtxParams->ldmParams.minMatchLength;
-		break;
-	case ZSTD_c_ldmBucketSizeLog:
-		*value = CCtxParams->ldmParams.bucketSizeLog;
-		break;
-	case ZSTD_c_ldmHashRateLog:
-		*value = CCtxParams->ldmParams.hashRateLog;
-		break;
-	case ZSTD_c_targetCBlockSize:
-		*value = (int)CCtxParams->targetCBlockSize;
-		break;
-	case ZSTD_c_srcSizeHint:
-		*value = (int)CCtxParams->srcSizeHint;
-		break;
-	case ZSTD_c_stableInBuffer:
-		*value = (int)CCtxParams->inBufferMode;
-		break;
-	case ZSTD_c_stableOutBuffer:
-		*value = (int)CCtxParams->outBufferMode;
-		break;
-	case ZSTD_c_blockDelimiters:
-		*value = (int)CCtxParams->blockDelimiters;
-		break;
-	case ZSTD_c_validateSequences:
-		*value = (int)CCtxParams->validateSequences;
-		break;
-	case ZSTD_c_useBlockSplitter:
-		*value = (int)CCtxParams->useBlockSplitter;
-		break;
-	case ZSTD_c_useRowMatchFinder:
-		*value = (int)CCtxParams->useRowMatchFinder;
-		break;
-	case ZSTD_c_deterministicRefPrefix:
-		*value = (int)CCtxParams->deterministicRefPrefix;
-		break;
-	case ZSTD_c_prefetchCDictTables:
-		*value = (int)CCtxParams->prefetchCDictTables;
-		break;
-	case ZSTD_c_enableSeqProducerFallback:
-		*value = CCtxParams->enableMatchFinderFallback;
-		break;
-	case ZSTD_c_maxBlockSize:
-		*value = (int)CCtxParams->maxBlockSize;
-		break;
-	case ZSTD_c_searchForExternalRepcodes:
-		*value = (int)CCtxParams->searchForExternalRepcodes;
-		break;
-	default:
-		RETURN_ERROR(parameter_unsupported, "unknown parameter");
-	}
-	return 0;
-}
-
-/** ZSTD_CCtx_setParametersUsingCCtxParams() :
- *  just applies `params` into `cctx`
- *  no action is performed, parameters are merely stored.
- *  If ZSTDMT is enabled, parameters are pushed to cctx->mtctx.
- *    This is possible even if a compression is ongoing.
- *    In which case, new parameters will be applied on the fly, starting with next compression job.
- */
-size_t ZSTD_CCtx_setParametersUsingCCtxParams(
-    ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) {
-	DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams");
-	RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-	                "The context is in the wrong stage!");
-	RETURN_ERROR_IF(cctx->cdict, stage_wrong,
-	                "Can't override parameters with cdict attached (some must "
-	                "be inherited from the cdict).");
-
-	cctx->requestedParams = *params;
-	return 0;
-}
-
-size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) {
-	DEBUGLOG(4, "ZSTD_CCtx_setCParams");
-	assert(cctx != NULL);
-	if(cctx->streamStage != zcss_init) {
-		/* All parameters in @cparams are allowed to be updated during MT compression.
-		 * This must be signaled, so that MT compression picks up the changes */
-		cctx->cParamsChanged = 1;
-	}
-	/* only update if parameters are valid */
-	FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), "");
-	cctx->requestedParams.cParams = cparams;
-	return 0;
-}
-
-size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) {
-	DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize);
-	RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-	                "Can't set pledgedSrcSize when not in init stage.");
-	cctx->pledgedSrcSizePlusOne = pledgedSrcSize + 1;
-	return 0;
-}
-
-static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(
-    int const compressionLevel,
-    size_t const dictSize);
-static int ZSTD_dedicatedDictSearch_isSupported(
-    const ZSTD_compressionParameters* cParams);
-static void ZSTD_dedicatedDictSearch_revertCParams(
-    ZSTD_compressionParameters* cParams);
-
-/**
- * Initializes the local dict using the requested parameters.
- * NOTE: This does not use the pledged src size, because it may be used for more
- * than one compression.
- */
-static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) {
-	ZSTD_localDict* const dl = &cctx->localDict;
-	if(dl->dict == NULL) {
-		/* No local dictionary. */
-		assert(dl->dictBuffer == NULL);
-		assert(dl->cdict == NULL);
-		assert(dl->dictSize == 0);
-		return 0;
-	}
-	if(dl->cdict != NULL) {
-		assert(cctx->cdict == dl->cdict);
-		/* Local dictionary already initialized. */
-		return 0;
-	}
-	assert(dl->dictSize > 0);
-	assert(cctx->cdict == NULL);
-	assert(cctx->prefixDict.dict == NULL);
-
-	dl->cdict = ZSTD_createCDict_advanced2(
-	    dl->dict,
-	    dl->dictSize,
-	    ZSTD_dlm_byRef,
-	    dl->dictContentType,
-	    &cctx->requestedParams,
-	    cctx->customMem);
-	RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed");
-	cctx->cdict = dl->cdict;
-	return 0;
-}
-
-size_t ZSTD_CCtx_loadDictionary_advanced(
-    ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
-    ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) {
-	RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-	                "Can't load a dictionary when ctx is not in init stage.");
-	DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
-	ZSTD_clearAllDicts(cctx);         /* in case one already exists */
-	if(dict == NULL || dictSize == 0) /* no dictionary mode */
-		return 0;
-	if(dictLoadMethod == ZSTD_dlm_byRef) {
-		cctx->localDict.dict = dict;
-	} else {
-		void* dictBuffer;
-		RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
-		                "no malloc for static CCtx");
-		dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
-		RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
-		ZSTD_memcpy(dictBuffer, dict, dictSize);
-		cctx->localDict.dictBuffer = dictBuffer;
-		cctx->localDict.dict = dictBuffer;
-	}
-	cctx->localDict.dictSize = dictSize;
-	cctx->localDict.dictContentType = dictContentType;
-	return 0;
-}
-
-size_t ZSTD_CCtx_loadDictionary_byReference(
-    ZSTD_CCtx* cctx, const void* dict, size_t dictSize) {
-	return ZSTD_CCtx_loadDictionary_advanced(
-	    cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
-}
-
-size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) {
-	return ZSTD_CCtx_loadDictionary_advanced(
-	    cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
-}
-
-size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) {
-	RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-	                "Can't ref a dict when ctx not in init stage.");
-	/* Free the existing local cdict (if any) to save memory. */
-	ZSTD_clearAllDicts(cctx);
-	cctx->cdict = cdict;
-	return 0;
-}
-
-size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool) {
-	RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-	                "Can't ref a pool when ctx not in init stage.");
-	cctx->pool = pool;
-	return 0;
-}
-
-size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) {
-	return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent);
-}
-
-size_t ZSTD_CCtx_refPrefix_advanced(
-    ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) {
-	RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-	                "Can't ref a prefix when ctx not in init stage.");
-	ZSTD_clearAllDicts(cctx);
-	if(prefix != NULL && prefixSize > 0) {
-		cctx->prefixDict.dict = prefix;
-		cctx->prefixDict.dictSize = prefixSize;
-		cctx->prefixDict.dictContentType = dictContentType;
-	}
-	return 0;
-}
-
-/*! ZSTD_CCtx_reset() :
- *  Also dumps dictionary */
-size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) {
-	if((reset == ZSTD_reset_session_only) || (reset == ZSTD_reset_session_and_parameters)) {
-		cctx->streamStage = zcss_init;
-		cctx->pledgedSrcSizePlusOne = 0;
-	}
-	if((reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters)) {
-		RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-		                "Can't reset parameters only when not in init stage.");
-		ZSTD_clearAllDicts(cctx);
-		ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
-		return ZSTD_CCtxParams_reset(&cctx->requestedParams);
-	}
-	return 0;
-}
-
-/** ZSTD_checkCParams() :
-    control CParam values remain within authorized range.
-    @return : 0, or an error code if one value is beyond authorized range */
-size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) {
-	BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog);
-	BOUNDCHECK(ZSTD_c_chainLog, (int)cParams.chainLog);
-	BOUNDCHECK(ZSTD_c_hashLog, (int)cParams.hashLog);
-	BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog);
-	BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch);
-	BOUNDCHECK(ZSTD_c_targetLength, (int)cParams.targetLength);
-	BOUNDCHECK(ZSTD_c_strategy, cParams.strategy);
-	return 0;
-}
-
-/** ZSTD_clampCParams() :
- *  make CParam values within valid range.
- *  @return : valid CParams */
-static ZSTD_compressionParameters
-ZSTD_clampCParams(ZSTD_compressionParameters cParams) {
-#define CLAMP_TYPE(cParam, val, type)                             \
-	{                                                             \
-		ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
-		if((int)val < bounds.lowerBound)                          \
-			val = (type)bounds.lowerBound;                        \
-		else if((int)val > bounds.upperBound)                     \
-			val = (type)bounds.upperBound;                        \
-	}
-#define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
-	CLAMP(ZSTD_c_windowLog, cParams.windowLog);
-	CLAMP(ZSTD_c_chainLog, cParams.chainLog);
-	CLAMP(ZSTD_c_hashLog, cParams.hashLog);
-	CLAMP(ZSTD_c_searchLog, cParams.searchLog);
-	CLAMP(ZSTD_c_minMatch, cParams.minMatch);
-	CLAMP(ZSTD_c_targetLength, cParams.targetLength);
-	CLAMP_TYPE(ZSTD_c_strategy, cParams.strategy, ZSTD_strategy);
-	return cParams;
-}
-
-/** ZSTD_cycleLog() :
- *  condition for correct operation : hashLog > 1 */
-U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) {
-	U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
-	return hashLog - btScale;
-}
-
-/** ZSTD_dictAndWindowLog() :
- * Returns an adjusted window log that is large enough to fit the source and the dictionary.
- * The zstd format says that the entire dictionary is valid if one byte of the dictionary
- * is within the window. So the hashLog and chainLog should be large enough to reference both
- * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing
- * the hashLog and windowLog.
- * NOTE: srcSize must not be ZSTD_CONTENTSIZE_UNKNOWN.
- */
-static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) {
-	const U64 maxWindowSize = 1ULL << ZSTD_WINDOWLOG_MAX;
-	/* No dictionary ==> No change */
-	if(dictSize == 0) {
-		return windowLog;
-	}
-	assert(windowLog <= ZSTD_WINDOWLOG_MAX);
-	assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */
-	{
-		U64 const windowSize = 1ULL << windowLog;
-		U64 const dictAndWindowSize = dictSize + windowSize;
-		/* If the window size is already large enough to fit both the source and the dictionary
-		 * then just use the window size. Otherwise adjust so that it fits the dictionary and
-		 * the window.
-		 */
-		if(windowSize >= dictSize + srcSize) {
-			return windowLog; /* Window size large enough already */
-		} else if(dictAndWindowSize >= maxWindowSize) {
-			return ZSTD_WINDOWLOG_MAX; /* Larger than max window log */
-		} else {
-			return ZSTD_highbit32((U32)dictAndWindowSize - 1) + 1;
-		}
-	}
-}
-
-/** ZSTD_adjustCParams_internal() :
- *  optimize `cPar` for a specified input (`srcSize` and `dictSize`).
- *  mostly downsize to reduce memory consumption and initialization latency.
- * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known.
- * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`.
- *  note : `srcSize==0` means 0!
- *  condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */
-static ZSTD_compressionParameters
-ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
-                            unsigned long long srcSize,
-                            size_t dictSize,
-                            ZSTD_cParamMode_e mode,
-                            ZSTD_paramSwitch_e useRowMatchFinder) {
-	const U64 minSrcSize = 513; /* (1<<9) + 1 */
-	const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX - 1);
-	assert(ZSTD_checkCParams(cPar) == 0);
-
-	switch(mode) {
-	case ZSTD_cpm_unknown:
-	case ZSTD_cpm_noAttachDict:
-		/* If we don't know the source size, don't make any
-		 * assumptions about it. We will already have selected
-		 * smaller parameters if a dictionary is in use.
-		 */
-		break;
-	case ZSTD_cpm_createCDict:
-		/* Assume a small source size when creating a dictionary
-		 * with an unknown source size.
-		 */
-		if(dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
-			srcSize = minSrcSize;
-		break;
-	case ZSTD_cpm_attachDict:
-		/* Dictionary has its own dedicated parameters which have
-		 * already been selected. We are selecting parameters
-		 * for only the source.
-		 */
-		dictSize = 0;
-		break;
-	default:
-		assert(0);
-		break;
-	}
-
-	/* resize windowLog if input is small enough, to use less memory */
-	if((srcSize <= maxWindowResize) && (dictSize <= maxWindowResize)) {
-		U32 const tSize = (U32)(srcSize + dictSize);
-		static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
-		U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : ZSTD_highbit32(tSize - 1) + 1;
-		if(cPar.windowLog > srcLog)
-			cPar.windowLog = srcLog;
-	}
-	if(srcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
-		U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize);
-		U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
-		if(cPar.hashLog > dictAndWindowLog + 1)
-			cPar.hashLog = dictAndWindowLog + 1;
-		if(cycleLog > dictAndWindowLog)
-			cPar.chainLog -= (cycleLog - dictAndWindowLog);
-	}
-
-	if(cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
-		cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */
-
-	/* We can't use more than 32 bits of hash in total, so that means that we require:
-	 * (hashLog + 8) <= 32 && (chainLog + 8) <= 32
-	 */
-	if(mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
-		U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
-		if(cPar.hashLog > maxShortCacheHashLog) {
-			cPar.hashLog = maxShortCacheHashLog;
-		}
-		if(cPar.chainLog > maxShortCacheHashLog) {
-			cPar.chainLog = maxShortCacheHashLog;
-		}
-	}
-
-	/* At this point, we aren't 100% sure if we are using the row match finder.
-	 * Unless it is explicitly disabled, conservatively assume that it is enabled.
-	 * In this case it will only be disabled for small sources, so shrinking the
-	 * hash log a little bit shouldn't result in any ratio loss.
-	 */
-	if(useRowMatchFinder == ZSTD_ps_auto)
-		useRowMatchFinder = ZSTD_ps_enable;
-
-	/* We can't hash more than 32-bits in total. So that means that we require:
-	 * (hashLog - rowLog + 8) <= 32
-	 */
-	if(ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) {
-		/* Switch to 32-entry rows if searchLog is 5 (or more) */
-		U32 const rowLog = BOUNDED(4, cPar.searchLog, 6);
-		U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS;
-		U32 const maxHashLog = maxRowHashLog + rowLog;
-		assert(cPar.hashLog >= rowLog);
-		if(cPar.hashLog > maxHashLog) {
-			cPar.hashLog = maxHashLog;
-		}
-	}
-
-	return cPar;
-}
-
-ZSTD_compressionParameters
-ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
-                   unsigned long long srcSize,
-                   size_t dictSize) {
-	cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */
-	if(srcSize == 0)
-		srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
-	return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto);
-}
-
-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
-
-static void ZSTD_overrideCParams(
-    ZSTD_compressionParameters* cParams,
-    const ZSTD_compressionParameters* overrides) {
-	if(overrides->windowLog)
-		cParams->windowLog = overrides->windowLog;
-	if(overrides->hashLog)
-		cParams->hashLog = overrides->hashLog;
-	if(overrides->chainLog)
-		cParams->chainLog = overrides->chainLog;
-	if(overrides->searchLog)
-		cParams->searchLog = overrides->searchLog;
-	if(overrides->minMatch)
-		cParams->minMatch = overrides->minMatch;
-	if(overrides->targetLength)
-		cParams->targetLength = overrides->targetLength;
-	if(overrides->strategy)
-		cParams->strategy = overrides->strategy;
-}
-
-ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
-    const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) {
-	ZSTD_compressionParameters cParams;
-	if(srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
-		srcSizeHint = CCtxParams->srcSizeHint;
-	}
-	cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode);
-	if(CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable)
-		cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
-	ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
-	assert(!ZSTD_checkCParams(cParams));
-	/* srcSizeHint == 0 means 0 */
-	return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder);
-}
-
-static size_t
-ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
-                       const ZSTD_paramSwitch_e useRowMatchFinder,
-                       const U32 enableDedicatedDictSearch,
-                       const U32 forCCtx) {
-	/* chain table size should be 0 for fast or row-hash strategies */
-	size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx)
-	                             ? ((size_t)1 << cParams->chainLog)
-	                             : 0;
-	size_t const hSize = ((size_t)1) << cParams->hashLog;
-	U32 const hashLog3 = (forCCtx && cParams->minMatch == 3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
-	size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
-	/* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't
-	 * surrounded by redzones in ASAN. */
-	size_t const tableSpace = chainSize * sizeof(U32) + hSize * sizeof(U32) + h3Size * sizeof(U32);
-	size_t const optPotentialSpace =
-	    ZSTD_cwksp_aligned_alloc_size((MaxML + 1) * sizeof(U32)) + ZSTD_cwksp_aligned_alloc_size((MaxLL + 1) * sizeof(U32)) + ZSTD_cwksp_aligned_alloc_size((MaxOff + 1) * sizeof(U32)) + ZSTD_cwksp_aligned_alloc_size((1 << Litbits) * sizeof(U32)) + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM + 1) * sizeof(ZSTD_match_t)) + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM + 1) * sizeof(ZSTD_optimal_t));
-	size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
-	                                       ? ZSTD_cwksp_aligned_alloc_size(hSize * sizeof(U16))
-	                                       : 0;
-	size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
-	                            ? optPotentialSpace
-	                            : 0;
-	size_t const slackSpace = ZSTD_cwksp_slack_space_required();
-
-	/* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */
-	ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4);
-	assert(useRowMatchFinder != ZSTD_ps_auto);
-
-	DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
-	         (U32)chainSize, (U32)hSize, (U32)h3Size);
-	return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
-}
-
-/* Helper function for calculating memory requirements.
- * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
-static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
-	U32 const divider = (minMatch == 3 || useSequenceProducer) ? 3 : 4;
-	return blockSize / divider;
-}
-
-static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-    const ZSTD_compressionParameters* cParams,
-    const ldmParams_t* ldmParams,
-    const int isStatic,
-    const ZSTD_paramSwitch_e useRowMatchFinder,
-    const size_t buffInSize,
-    const size_t buffOutSize,
-    const U64 pledgedSrcSize,
-    int useSequenceProducer,
-    size_t maxBlockSize) {
-	size_t const windowSize = (size_t)BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
-	size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
-	size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
-	size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
-	size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE);
-	size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
-	size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1);
-
-	size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
-	size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
-	size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
-
-	size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + ZSTD_cwksp_alloc_size(buffOutSize);
-
-	size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
-
-	size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
-	size_t const externalSeqSpace = useSequenceProducer
-	                                    ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
-	                                    : 0;
-
-	size_t const neededSpace =
-	    cctxSpace +
-	    entropySpace +
-	    blockStateSpace +
-	    ldmSpace +
-	    ldmSeqSpace +
-	    matchStateSize +
-	    tokenSpace +
-	    bufferSpace +
-	    externalSeqSpace;
-
-	DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
-	return neededSpace;
-}
-
-size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) {
-	ZSTD_compressionParameters const cParams =
-	    ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
-	ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
-	                                                                            &cParams);
-
-	RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
-	/* estimateCCtxSize is for one-shot compression. So no buffers should
-	 * be needed. However, we still allocate two 0-sized buffers, which can
-	 * take space under ASAN. */
-	return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-	    &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
-}
-
-size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) {
-	ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
-	if(ZSTD_rowMatchFinderSupported(cParams.strategy)) {
-		/* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
-		size_t noRowCCtxSize;
-		size_t rowCCtxSize;
-		initialParams.useRowMatchFinder = ZSTD_ps_disable;
-		noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
-		initialParams.useRowMatchFinder = ZSTD_ps_enable;
-		rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
-		return MAX(noRowCCtxSize, rowCCtxSize);
-	} else {
-		return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
-	}
-}
-
-static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) {
-	int tier = 0;
-	size_t largestSize = 0;
-	static const unsigned long long srcSizeTiers[4] = {16 KB, 128 KB, 256 KB, ZSTD_CONTENTSIZE_UNKNOWN};
-	for(; tier < 4; ++tier) {
-		/* Choose the set of cParams for a given level across all srcSizes that give the largest cctxSize */
-		ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeTiers[tier], 0, ZSTD_cpm_noAttachDict);
-		largestSize = MAX(ZSTD_estimateCCtxSize_usingCParams(cParams), largestSize);
-	}
-	return largestSize;
-}
-
-size_t ZSTD_estimateCCtxSize(int compressionLevel) {
-	int level;
-	size_t memBudget = 0;
-	for(level = MIN(compressionLevel, 1); level <= compressionLevel; level++) {
-		/* Ensure monotonically increasing memory usage as compression level increases */
-		size_t const newMB = ZSTD_estimateCCtxSize_internal(level);
-		if(newMB > memBudget)
-			memBudget = newMB;
-	}
-	return memBudget;
-}
-
-size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) {
-	RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
-	{
-		ZSTD_compressionParameters const cParams =
-		    ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
-		size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
-		size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
-		                              ? ((size_t)1 << cParams.windowLog) + blockSize
-		                              : 0;
-		size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
-		                               ? ZSTD_compressBound(blockSize) + 1
-		                               : 0;
-		ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
-
-		return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-		    &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
-		    ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
-	}
-}
-
-size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) {
-	ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
-	if(ZSTD_rowMatchFinderSupported(cParams.strategy)) {
-		/* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
-		size_t noRowCCtxSize;
-		size_t rowCCtxSize;
-		initialParams.useRowMatchFinder = ZSTD_ps_disable;
-		noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
-		initialParams.useRowMatchFinder = ZSTD_ps_enable;
-		rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
-		return MAX(noRowCCtxSize, rowCCtxSize);
-	} else {
-		return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
-	}
-}
-
-static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) {
-	ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
-	return ZSTD_estimateCStreamSize_usingCParams(cParams);
-}
-
-size_t ZSTD_estimateCStreamSize(int compressionLevel) {
-	int level;
-	size_t memBudget = 0;
-	for(level = MIN(compressionLevel, 1); level <= compressionLevel; level++) {
-		size_t const newMB = ZSTD_estimateCStreamSize_internal(level);
-		if(newMB > memBudget)
-			memBudget = newMB;
-	}
-	return memBudget;
-}
-
-/* ZSTD_getFrameProgression():
- * tells how much data has been consumed (input) and produced (output) for current frame.
- * able to count progression inside worker threads (non-blocking mode).
- */
-ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx) {
-#ifdef ZSTD_MULTITHREAD
-	if(cctx->appliedParams.nbWorkers > 0) {
-		return ZSTDMT_getFrameProgression(cctx->mtctx);
-	}
-#endif
-	{
-		ZSTD_frameProgression fp;
-		size_t const buffered = (cctx->inBuff == NULL) ? 0 : cctx->inBuffPos - cctx->inToCompress;
-		if(buffered)
-			assert(cctx->inBuffPos >= cctx->inToCompress);
-		assert(buffered <= ZSTD_BLOCKSIZE_MAX);
-		fp.ingested = cctx->consumedSrcSize + buffered;
-		fp.consumed = cctx->consumedSrcSize;
-		fp.produced = cctx->producedCSize;
-		fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */
-		fp.currentJobID = 0;
-		fp.nbActiveWorkers = 0;
-		return fp;
-	}
-}
-
-/*! ZSTD_toFlushNow()
- *  Only useful for multithreading scenarios currently (nbWorkers >= 1).
- */
-size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx) {
-#ifdef ZSTD_MULTITHREAD
-	if(cctx->appliedParams.nbWorkers > 0) {
-		return ZSTDMT_toFlushNow(cctx->mtctx);
-	}
-#endif
-	(void)cctx;
-	return 0; /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */
-}
-
-static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1,
-                                    ZSTD_compressionParameters cParams2) {
-	(void)cParams1;
-	(void)cParams2;
-	assert(cParams1.windowLog == cParams2.windowLog);
-	assert(cParams1.chainLog == cParams2.chainLog);
-	assert(cParams1.hashLog == cParams2.hashLog);
-	assert(cParams1.searchLog == cParams2.searchLog);
-	assert(cParams1.minMatch == cParams2.minMatch);
-	assert(cParams1.targetLength == cParams2.targetLength);
-	assert(cParams1.strategy == cParams2.strategy);
-}
-
-void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) {
-	int i;
-	for(i = 0; i < ZSTD_REP_NUM; ++i)
-		bs->rep[i] = repStartValue[i];
-	bs->entropy.huf.repeatMode = HUF_repeat_none;
-	bs->entropy.fse.offcode_repeatMode = FSE_repeat_none;
-	bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none;
-	bs->entropy.fse.litlength_repeatMode = FSE_repeat_none;
-}
-
-/*! ZSTD_invalidateMatchState()
- *  Invalidate all the matches in the match finder tables.
- *  Requires nextSrc and base to be set (can be NULL).
- */
-static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) {
-	ZSTD_window_clear(&ms->window);
-
-	ms->nextToUpdate = ms->window.dictLimit;
-	ms->loadedDictEnd = 0;
-	ms->opt.litLengthSum = 0; /* force reset of btopt stats */
-	ms->dictMatchState = NULL;
-}
-
-/**
- * Controls, for this matchState reset, whether the tables need to be cleared /
- * prepared for the coming compression (ZSTDcrp_makeClean), or whether the
- * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a
- * subsequent operation will overwrite the table space anyways (e.g., copying
- * the matchState contents in from a CDict).
- */
-typedef enum {
-	ZSTDcrp_makeClean,
-	ZSTDcrp_leaveDirty
-} ZSTD_compResetPolicy_e;
-
-/**
- * Controls, for this matchState reset, whether indexing can continue where it
- * left off (ZSTDirp_continue), or whether it needs to be restarted from zero
- * (ZSTDirp_reset).
- */
-typedef enum {
-	ZSTDirp_continue,
-	ZSTDirp_reset
-} ZSTD_indexResetPolicy_e;
-
-typedef enum {
-	ZSTD_resetTarget_CDict,
-	ZSTD_resetTarget_CCtx
-} ZSTD_resetTarget_e;
-
-static size_t
-ZSTD_reset_matchState(ZSTD_matchState_t* ms,
-                      ZSTD_cwksp* ws,
-                      const ZSTD_compressionParameters* cParams,
-                      const ZSTD_paramSwitch_e useRowMatchFinder,
-                      const ZSTD_compResetPolicy_e crp,
-                      const ZSTD_indexResetPolicy_e forceResetIndex,
-                      const ZSTD_resetTarget_e forWho) {
-	/* disable chain table allocation for fast or row-based strategies */
-	size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder,
-	                                                 ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict))
-	                             ? ((size_t)1 << cParams->chainLog)
-	                             : 0;
-	size_t const hSize = ((size_t)1) << cParams->hashLog;
-	U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch == 3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
-	size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
-
-	DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
-	assert(useRowMatchFinder != ZSTD_ps_auto);
-	if(forceResetIndex == ZSTDirp_reset) {
-		ZSTD_window_init(&ms->window);
-		ZSTD_cwksp_mark_tables_dirty(ws);
-	}
-
-	ms->hashLog3 = hashLog3;
-
-	ZSTD_invalidateMatchState(ms);
-
-	assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */
-
-	ZSTD_cwksp_clear_tables(ws);
-
-	DEBUGLOG(5, "reserving table space");
-	/* table Space */
-	ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32));
-	ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32));
-	ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32));
-	RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
-	                "failed a workspace allocation in ZSTD_reset_matchState");
-
-	DEBUGLOG(4, "reset table : %u", crp != ZSTDcrp_leaveDirty);
-	if(crp != ZSTDcrp_leaveDirty) {
-		/* reset tables only */
-		ZSTD_cwksp_clean_tables(ws);
-	}
-
-	/* opt parser space */
-	if((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
-		DEBUGLOG(4, "reserving optimal parser space");
-		ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1 << Litbits) * sizeof(unsigned));
-		ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL + 1) * sizeof(unsigned));
-		ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML + 1) * sizeof(unsigned));
-		ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff + 1) * sizeof(unsigned));
-		ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM + 1) * sizeof(ZSTD_match_t));
-		ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM + 1) * sizeof(ZSTD_optimal_t));
-	}
-
-	if(ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
-		{ /* Row match finder needs an additional table of hashes ("tags") */
-			size_t const tagTableSize = hSize * sizeof(U16);
-			ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
-			if(ms->tagTable)
-				ZSTD_memset(ms->tagTable, 0, tagTableSize);
-		}
-		{ /* Switch to 32-entry rows if searchLog is 5 (or more) */
-			U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
-			assert(cParams->hashLog >= rowLog);
-			ms->rowHashLog = cParams->hashLog - rowLog;
-		}
-	}
-
-	ms->cParams = *cParams;
-
-	RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
-	                "failed a workspace allocation in ZSTD_reset_matchState");
-	return 0;
-}
-
-/* ZSTD_indexTooCloseToMax() :
- * minor optimization : prefer memset() rather than reduceIndex()
- * which is measurably slow in some circumstances (reported for Visual Studio).
- * Works when re-using a context for a lot of smallish inputs :
- * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN,
- * memset() will be triggered before reduceIndex().
- */
-#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB)
-static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) {
-	return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN);
-}
-
-/** ZSTD_dictTooBig():
- * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in
- * one go generically. So we ensure that in that case we reset the tables to zero,
- * so that we can load as much of the dictionary as possible.
- */
-static int ZSTD_dictTooBig(size_t const loadedDictSize) {
-	return loadedDictSize > ZSTD_CHUNKSIZE_MAX;
-}
-
-/*! ZSTD_resetCCtx_internal() :
- * @param loadedDictSize The size of the dictionary to be loaded
- * into the context, if any. If no dictionary is used, or the
- * dictionary is being attached / copied, then pass 0.
- * note : `params` are assumed fully validated at this stage.
- */
-static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
-                                      ZSTD_CCtx_params const * params,
-                                      U64 const pledgedSrcSize,
-                                      size_t const loadedDictSize,
-                                      ZSTD_compResetPolicy_e const crp,
-                                      ZSTD_buffered_policy_e const zbuff) {
-	ZSTD_cwksp* const ws = &zc->workspace;
-	DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
-	         (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
-	assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
-
-	zc->isFirstBlock = 1;
-
-	/* Set applied params early so we can modify them for LDM,
-	 * and point params at the applied params.
-	 */
-	zc->appliedParams = *params;
-	params = &zc->appliedParams;
-
-	assert(params->useRowMatchFinder != ZSTD_ps_auto);
-	assert(params->useBlockSplitter != ZSTD_ps_auto);
-	assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
-	assert(params->maxBlockSize != 0);
-	if(params->ldmParams.enableLdm == ZSTD_ps_enable) {
-		/* Adjust long distance matching parameters */
-		ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
-		assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog);
-		assert(params->ldmParams.hashRateLog < 32);
-	}
-
-	{
-		size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
-		size_t const blockSize = MIN(params->maxBlockSize, windowSize);
-		size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer);
-		size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
-		                               ? ZSTD_compressBound(blockSize) + 1
-		                               : 0;
-		size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered)
-		                              ? windowSize + blockSize
-		                              : 0;
-		size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize);
-
-		int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window);
-		int const dictTooBig = ZSTD_dictTooBig(loadedDictSize);
-		ZSTD_indexResetPolicy_e needsIndexReset =
-		    (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue;
-
-		size_t const neededSpace =
-		    ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-		        &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
-		        buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize);
-		int resizeWorkspace;
-
-		FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
-
-		if(!zc->staticSize)
-			ZSTD_cwksp_bump_oversized_duration(ws, 0);
-
-		{ /* Check if workspace is large enough, alloc a new one if needed */
-			int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
-			int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
-			resizeWorkspace = workspaceTooSmall || workspaceWasteful;
-			DEBUGLOG(4, "Need %zu B workspace", neededSpace);
-			DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
-
-			if(resizeWorkspace) {
-				DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB",
-				         ZSTD_cwksp_sizeof(ws) >> 10,
-				         neededSpace >> 10);
-
-				RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize");
-
-				needsIndexReset = ZSTDirp_reset;
-
-				ZSTD_cwksp_free(ws, zc->customMem);
-				FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), "");
-
-				DEBUGLOG(5, "reserving object space");
-				/* Statically sized space.
-				 * entropyWorkspace never moves,
-				 * though prev/next block swap places */
-				assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t)));
-				zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
-				RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock");
-				zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
-				RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
-				zc->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE);
-				RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
-			}
-		}
-
-		ZSTD_cwksp_clear(ws);
-
-		/* init params */
-		zc->blockState.matchState.cParams = params->cParams;
-		zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
-		zc->pledgedSrcSizePlusOne = pledgedSrcSize + 1;
-		zc->consumedSrcSize = 0;
-		zc->producedCSize = 0;
-		if(pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)
-			zc->appliedParams.fParams.contentSizeFlag = 0;
-		DEBUGLOG(4, "pledged content size : %u ; flag : %u",
-		         (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
-		zc->blockSize = blockSize;
-
-		XXH64_reset(&zc->xxhState, 0);
-		zc->stage = ZSTDcs_init;
-		zc->dictID = 0;
-		zc->dictContentSize = 0;
-
-		ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
-
-		/* ZSTD_wildcopy() is used to copy into the literals buffer,
-		 * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
-		 */
-		zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
-		zc->seqStore.maxNbLit = blockSize;
-
-		/* buffers */
-		zc->bufferedPolicy = zbuff;
-		zc->inBuffSize = buffInSize;
-		zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
-		zc->outBuffSize = buffOutSize;
-		zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize);
-
-		/* ldm bucketOffsets table */
-		if(params->ldmParams.enableLdm == ZSTD_ps_enable) {
-			/* TODO: avoid memset? */
-			size_t const numBuckets =
-			    ((size_t)1) << (params->ldmParams.hashLog -
-			                    params->ldmParams.bucketSizeLog);
-			zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets);
-			ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets);
-		}
-
-		/* sequences storage */
-		ZSTD_referenceExternalSequences(zc, NULL, 0);
-		zc->seqStore.maxNbSeq = maxNbSeq;
-		zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
-		zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
-		zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
-		zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
-
-		FORWARD_IF_ERROR(ZSTD_reset_matchState(
-		                     &zc->blockState.matchState,
-		                     ws,
-		                     &params->cParams,
-		                     params->useRowMatchFinder,
-		                     crp,
-		                     needsIndexReset,
-		                     ZSTD_resetTarget_CCtx),
-		                 "");
-
-		/* ldm hash table */
-		if(params->ldmParams.enableLdm == ZSTD_ps_enable) {
-			/* TODO: avoid memset? */
-			size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
-			zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
-			ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
-			zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
-			zc->maxNbLdmSequences = maxNbLdmSeq;
-
-			ZSTD_window_init(&zc->ldmState.window);
-			zc->ldmState.loadedDictEnd = 0;
-		}
-
-		/* reserve space for block-level external sequences */
-		if(params->useSequenceProducer) {
-			size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
-			zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
-			zc->externalMatchCtx.seqBuffer =
-			    (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
-		}
-
-		DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
-		assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
-
-		zc->initialized = 1;
-
-		return 0;
-	}
-}
-
-/* ZSTD_invalidateRepCodes() :
- * ensures next compression will not use repcodes from previous block.
- * Note : only works with regular variant;
- *        do not use with extDict variant ! */
-void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
-	int i;
-	for(i = 0; i < ZSTD_REP_NUM; i++)
-		cctx->blockState.prevCBlock->rep[i] = 0;
-	assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
-}
-
-/* These are the approximate sizes for each strategy past which copying the
- * dictionary tables into the working context is faster than using them
- * in-place.
- */
-static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX + 1] = {
-    8 KB,  /* unused */
-    8 KB,  /* ZSTD_fast */
-    16 KB, /* ZSTD_dfast */
-    32 KB, /* ZSTD_greedy */
-    32 KB, /* ZSTD_lazy */
-    32 KB, /* ZSTD_lazy2 */
-    32 KB, /* ZSTD_btlazy2 */
-    32 KB, /* ZSTD_btopt */
-    8 KB,  /* ZSTD_btultra */
-    8 KB   /* ZSTD_btultra2 */
-};
-
-static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
-                                 const ZSTD_CCtx_params* params,
-                                 U64 pledgedSrcSize) {
-	size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
-	int const dedicatedDictSearch = cdict->matchState.dedicatedDictSearch;
-	return dedicatedDictSearch || ((pledgedSrcSize <= cutoff || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN || params->attachDictPref == ZSTD_dictForceAttach) && params->attachDictPref != ZSTD_dictForceCopy && !params->forceWindow); /* dictMatchState isn't correctly
-	                                                                                                                                                                                                                                     * handled in _enforceMaxDist */
-}
-
-static size_t
-ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
-                                const ZSTD_CDict* cdict,
-                                ZSTD_CCtx_params params,
-                                U64 pledgedSrcSize,
-                                ZSTD_buffered_policy_e zbuff) {
-	DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu",
-	         (unsigned long long)pledgedSrcSize);
-	{
-		ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams;
-		unsigned const windowLog = params.cParams.windowLog;
-		assert(windowLog != 0);
-		/* Resize working context table params for input only, since the dict
-		 * has its own tables. */
-		/* pledgedSrcSize == 0 means 0! */
-
-		if(cdict->matchState.dedicatedDictSearch) {
-			ZSTD_dedicatedDictSearch_revertCParams(&adjusted_cdict_cParams);
-		}
-
-		params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
-		                                             cdict->dictContentSize, ZSTD_cpm_attachDict,
-		                                             params.useRowMatchFinder);
-		params.cParams.windowLog = windowLog;
-		params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */
-		FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
-		                                         /* loadedDictSize */ 0,
-		                                         ZSTDcrp_makeClean, zbuff),
-		                 "");
-		assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy);
-	}
-
-	{
-		const U32 cdictEnd = (U32)(cdict->matchState.window.nextSrc - cdict->matchState.window.base);
-		const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit;
-		if(cdictLen == 0) {
-			/* don't even attach dictionaries with no contents */
-			DEBUGLOG(4, "skipping attaching empty dictionary");
-		} else {
-			DEBUGLOG(4, "attaching dictionary into context");
-			cctx->blockState.matchState.dictMatchState = &cdict->matchState;
-
-			/* prep working match state so dict matches never have negative indices
-			 * when they are translated to the working context's index space. */
-			if(cctx->blockState.matchState.window.dictLimit < cdictEnd) {
-				cctx->blockState.matchState.window.nextSrc =
-				    cctx->blockState.matchState.window.base + cdictEnd;
-				ZSTD_window_clear(&cctx->blockState.matchState.window);
-			}
-			/* loadedDictEnd is expressed within the referential of the active context */
-			cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
-		}
-	}
-
-	cctx->dictID = cdict->dictID;
-	cctx->dictContentSize = cdict->dictContentSize;
-
-	/* copy block state */
-	ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
-
-	return 0;
-}
-
-static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const * src, size_t tableSize,
-                                        ZSTD_compressionParameters const * cParams) {
-	if(ZSTD_CDictIndicesAreTagged(cParams)) {
-		/* Remove tags from the CDict table if they are present.
-		 * See docs on "short cache" in zstd_compress_internal.h for context. */
-		size_t i;
-		for(i = 0; i < tableSize; i++) {
-			U32 const taggedIndex = src[i];
-			U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
-			dst[i] = index;
-		}
-	} else {
-		ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
-	}
-}
-
-static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
-                                            const ZSTD_CDict* cdict,
-                                            ZSTD_CCtx_params params,
-                                            U64 pledgedSrcSize,
-                                            ZSTD_buffered_policy_e zbuff) {
-	const ZSTD_compressionParameters* cdict_cParams = &cdict->matchState.cParams;
-
-	assert(!cdict->matchState.dedicatedDictSearch);
-	DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu",
-	         (unsigned long long)pledgedSrcSize);
-
-	{
-		unsigned const windowLog = params.cParams.windowLog;
-		assert(windowLog != 0);
-		/* Copy only compression parameters related to tables. */
-		params.cParams = *cdict_cParams;
-		params.cParams.windowLog = windowLog;
-		params.useRowMatchFinder = cdict->useRowMatchFinder;
-		FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
-		                                         /* loadedDictSize */ 0,
-		                                         ZSTDcrp_leaveDirty, zbuff),
-		                 "");
-		assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
-		assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
-		assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog);
-	}
-
-	ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);
-	assert(params.useRowMatchFinder != ZSTD_ps_auto);
-
-	/* copy tables */
-	{
-		size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */)
-		                             ? ((size_t)1 << cdict_cParams->chainLog)
-		                             : 0;
-		size_t const hSize = (size_t)1 << cdict_cParams->hashLog;
-
-		ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
-		                            cdict->matchState.hashTable,
-		                            hSize, cdict_cParams);
-
-		/* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
-		if(ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
-			ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
-			                            cdict->matchState.chainTable,
-			                            chainSize, cdict_cParams);
-		}
-		/* copy tag table */
-		if(ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
-			size_t const tagTableSize = hSize * sizeof(U16);
-			ZSTD_memcpy(cctx->blockState.matchState.tagTable,
-			            cdict->matchState.tagTable,
-			            tagTableSize);
-		}
-	}
-
-	/* Zero the hashTable3, since the cdict never fills it */
-	{
-		int const h3log = cctx->blockState.matchState.hashLog3;
-		size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
-		assert(cdict->matchState.hashLog3 == 0);
-		ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
-	}
-
-	ZSTD_cwksp_mark_tables_clean(&cctx->workspace);
-
-	/* copy dictionary offsets */
-	{
-		ZSTD_matchState_t const * srcMatchState = &cdict->matchState;
-		ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
-		dstMatchState->window = srcMatchState->window;
-		dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
-		dstMatchState->loadedDictEnd = srcMatchState->loadedDictEnd;
-	}
-
-	cctx->dictID = cdict->dictID;
-	cctx->dictContentSize = cdict->dictContentSize;
-
-	/* copy block state */
-	ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
-
-	return 0;
-}
-
-/* We have a choice between copying the dictionary context into the working
- * context, or referencing the dictionary context from the working context
- * in-place. We decide here which strategy to use. */
-static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
-                                        const ZSTD_CDict* cdict,
-                                        const ZSTD_CCtx_params* params,
-                                        U64 pledgedSrcSize,
-                                        ZSTD_buffered_policy_e zbuff) {
-
-	DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)",
-	         (unsigned)pledgedSrcSize);
-
-	if(ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) {
-		return ZSTD_resetCCtx_byAttachingCDict(
-		    cctx, cdict, *params, pledgedSrcSize, zbuff);
-	} else {
-		return ZSTD_resetCCtx_byCopyingCDict(
-		    cctx, cdict, *params, pledgedSrcSize, zbuff);
-	}
-}
-
-/*! ZSTD_copyCCtx_internal() :
- *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
- *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
- *  The "context", in this case, refers to the hash and chain tables,
- *  entropy tables, and dictionary references.
- * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx.
- * @return : 0, or an error code */
-static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
-                                     const ZSTD_CCtx* srcCCtx,
-                                     ZSTD_frameParameters fParams,
-                                     U64 pledgedSrcSize,
-                                     ZSTD_buffered_policy_e zbuff) {
-	RETURN_ERROR_IF(srcCCtx->stage != ZSTDcs_init, stage_wrong,
-	                "Can't copy a ctx that's not in init stage.");
-	DEBUGLOG(5, "ZSTD_copyCCtx_internal");
-	ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
-	{
-		ZSTD_CCtx_params params = dstCCtx->requestedParams;
-		/* Copy only compression parameters related to tables. */
-		params.cParams = srcCCtx->appliedParams.cParams;
-		assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto);
-		assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto);
-		assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto);
-		params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder;
-		params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
-		params.ldmParams = srcCCtx->appliedParams.ldmParams;
-		params.fParams = fParams;
-		params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
-		ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
-		                        /* loadedDictSize */ 0,
-		                        ZSTDcrp_leaveDirty, zbuff);
-		assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog);
-		assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy);
-		assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog);
-		assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog);
-		assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3);
-	}
-
-	ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace);
-
-	/* copy tables */
-	{
-		size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy,
-		                                                 srcCCtx->appliedParams.useRowMatchFinder,
-		                                                 0 /* forDDSDict */)
-		                             ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog)
-		                             : 0;
-		size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
-		int const h3log = srcCCtx->blockState.matchState.hashLog3;
-		size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
-
-		ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable,
-		            srcCCtx->blockState.matchState.hashTable,
-		            hSize * sizeof(U32));
-		ZSTD_memcpy(dstCCtx->blockState.matchState.chainTable,
-		            srcCCtx->blockState.matchState.chainTable,
-		            chainSize * sizeof(U32));
-		ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable3,
-		            srcCCtx->blockState.matchState.hashTable3,
-		            h3Size * sizeof(U32));
-	}
-
-	ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace);
-
-	/* copy dictionary offsets */
-	{
-		const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState;
-		ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState;
-		dstMatchState->window = srcMatchState->window;
-		dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
-		dstMatchState->loadedDictEnd = srcMatchState->loadedDictEnd;
-	}
-	dstCCtx->dictID = srcCCtx->dictID;
-	dstCCtx->dictContentSize = srcCCtx->dictContentSize;
-
-	/* copy block state */
-	ZSTD_memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock));
-
-	return 0;
-}
-
-/*! ZSTD_copyCCtx() :
- *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
- *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
- *  pledgedSrcSize==0 means "unknown".
- *   @return : 0, or an error code */
-size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) {
-	ZSTD_frameParameters fParams = {1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/};
-	ZSTD_buffered_policy_e const zbuff = srcCCtx->bufferedPolicy;
-	ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered == 1);
-	if(pledgedSrcSize == 0)
-		pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
-	fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN);
-
-	return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx,
-	                              fParams, pledgedSrcSize,
-	                              zbuff);
-}
-
-#define ZSTD_ROWSIZE 16
-/*! ZSTD_reduceTable() :
- *  reduce table indexes by `reducerValue`, or squash to zero.
- *  PreserveMark preserves "unsorted mark" for btlazy2 strategy.
- *  It must be set to a clear 0/1 value, to remove branch during inlining.
- *  Presume table size is a multiple of ZSTD_ROWSIZE
- *  to help auto-vectorization */
-FORCE_INLINE_TEMPLATE void
-ZSTD_reduceTable_internal(U32* const table, U32 const size, U32 const reducerValue, int const preserveMark) {
-	int const nbRows = (int)size / ZSTD_ROWSIZE;
-	int cellNb = 0;
-	int rowNb;
-	/* Protect special index values < ZSTD_WINDOW_START_INDEX. */
-	U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX;
-	assert((size & (ZSTD_ROWSIZE - 1)) == 0); /* multiple of ZSTD_ROWSIZE */
-	assert(size < (1U << 31));                /* can be casted to int */
-
-#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
-	/* To validate that the table re-use logic is sound, and that we don't
-	 * access table space that we haven't cleaned, we re-"poison" the table
-	 * space every time we mark it dirty.
-	 *
-	 * This function however is intended to operate on those dirty tables and
-	 * re-clean them. So when this function is used correctly, we can unpoison
-	 * the memory it operated on. This introduces a blind spot though, since
-	 * if we now try to operate on __actually__ poisoned memory, we will not
-	 * detect that. */
-	__msan_unpoison(table, size * sizeof(U32));
-#endif
-
-	for(rowNb = 0; rowNb < nbRows; rowNb++) {
-		int column;
-		for(column = 0; column < ZSTD_ROWSIZE; column++) {
-			U32 newVal;
-			if(preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) {
-				/* This write is pointless, but is required(?) for the compiler
-				 * to auto-vectorize the loop. */
-				newVal = ZSTD_DUBT_UNSORTED_MARK;
-			} else if(table[cellNb] < reducerThreshold) {
-				newVal = 0;
-			} else {
-				newVal = table[cellNb] - reducerValue;
-			}
-			table[cellNb] = newVal;
-			cellNb++;
-		}
-	}
-}
-
-static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue) {
-	ZSTD_reduceTable_internal(table, size, reducerValue, 0);
-}
-
-static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue) {
-	ZSTD_reduceTable_internal(table, size, reducerValue, 1);
-}
-
-/*! ZSTD_reduceIndex() :
- *   rescale all indexes to avoid future overflow (indexes are U32) */
-static void ZSTD_reduceIndex(ZSTD_matchState_t* ms, ZSTD_CCtx_params const * params, const U32 reducerValue) {
-	{
-		U32 const hSize = (U32)1 << params->cParams.hashLog;
-		ZSTD_reduceTable(ms->hashTable, hSize, reducerValue);
-	}
-
-	if(ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) {
-		U32 const chainSize = (U32)1 << params->cParams.chainLog;
-		if(params->cParams.strategy == ZSTD_btlazy2)
-			ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
-		else
-			ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
-	}
-
-	if(ms->hashLog3) {
-		U32 const h3Size = (U32)1 << ms->hashLog3;
-		ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue);
-	}
-}
-
-/*-*******************************************************
- *  Block entropic compression
- *********************************************************/
-
-/* See doc/zstd_compression_format.md for detailed format description */
-
-int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) {
-	const seqDef* const sequences = seqStorePtr->sequencesStart;
-	BYTE* const llCodeTable = seqStorePtr->llCode;
-	BYTE* const ofCodeTable = seqStorePtr->ofCode;
-	BYTE* const mlCodeTable = seqStorePtr->mlCode;
-	U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
-	U32 u;
-	int longOffsets = 0;
-	assert(nbSeq <= seqStorePtr->maxNbSeq);
-	for(u = 0; u < nbSeq; u++) {
-		U32 const llv = sequences[u].litLength;
-		U32 const ofCode = ZSTD_highbit32(sequences[u].offBase);
-		U32 const mlv = sequences[u].mlBase;
-		llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
-		ofCodeTable[u] = (BYTE)ofCode;
-		mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
-		assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN));
-		if(MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN)
-			longOffsets = 1;
-	}
-	if(seqStorePtr->longLengthType == ZSTD_llt_literalLength)
-		llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
-	if(seqStorePtr->longLengthType == ZSTD_llt_matchLength)
-		mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
-	return longOffsets;
-}
-
-/* ZSTD_useTargetCBlockSize():
- * Returns if target compressed block size param is being used.
- * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize.
- * Returns 1 if true, 0 otherwise. */
-static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) {
-	DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize);
-	return (cctxParams->targetCBlockSize != 0);
-}
-
-/* ZSTD_blockSplitterEnabled():
- * Returns if block splitting param is being used
- * If used, compression will do best effort to split a block in order to improve compression ratio.
- * At the time this function is called, the parameter must be finalized.
- * Returns 1 if true, 0 otherwise. */
-static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) {
-	DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter);
-	assert(cctxParams->useBlockSplitter != ZSTD_ps_auto);
-	return (cctxParams->useBlockSplitter == ZSTD_ps_enable);
-}
-
-/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types
- * and size of the sequences statistics
- */
-typedef struct {
-	U32 LLtype;
-	U32 Offtype;
-	U32 MLtype;
-	size_t size;
-	size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
-	int longOffsets;
-} ZSTD_symbolEncodingTypeStats_t;
-
-/* ZSTD_buildSequencesStatistics():
- * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field.
- * Modifies `nextEntropy` to have the appropriate values as a side effect.
- * nbSeq must be greater than 0.
- *
- * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
- */
-static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildSequencesStatistics(
-    const seqStore_t* seqStorePtr, size_t nbSeq,
-    const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
-    BYTE* dst, const BYTE* const dstEnd,
-    ZSTD_strategy strategy, unsigned* countWorkspace,
-    void* entropyWorkspace, size_t entropyWkspSize) {
-	BYTE* const ostart = dst;
-	const BYTE* const oend = dstEnd;
-	BYTE* op = ostart;
-	FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
-	FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
-	FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
-	const BYTE* const ofCodeTable = seqStorePtr->ofCode;
-	const BYTE* const llCodeTable = seqStorePtr->llCode;
-	const BYTE* const mlCodeTable = seqStorePtr->mlCode;
-	ZSTD_symbolEncodingTypeStats_t stats;
-
-	stats.lastCountSize = 0;
-	/* convert length/distances into codes */
-	stats.longOffsets = ZSTD_seqToCodes(seqStorePtr);
-	assert(op <= oend);
-	assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
-	/* build CTable for Literal Lengths */
-	{
-		unsigned max = MaxLL;
-		size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
-		DEBUGLOG(5, "Building LL table");
-		nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
-		stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode,
-		                                       countWorkspace, max, mostFrequent, nbSeq,
-		                                       LLFSELog, prevEntropy->litlengthCTable,
-		                                       LL_defaultNorm, LL_defaultNormLog,
-		                                       ZSTD_defaultAllowed, strategy);
-		assert(set_basic < set_compressed && set_rle < set_compressed);
-		assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
-		{
-			size_t const countSize = ZSTD_buildCTable(
-			    op, (size_t)(oend - op),
-			    CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype,
-			    countWorkspace, max, llCodeTable, nbSeq,
-			    LL_defaultNorm, LL_defaultNormLog, MaxLL,
-			    prevEntropy->litlengthCTable,
-			    sizeof(prevEntropy->litlengthCTable),
-			    entropyWorkspace, entropyWkspSize);
-			if(ZSTD_isError(countSize)) {
-				DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed");
-				stats.size = countSize;
-				return stats;
-			}
-			if(stats.LLtype == set_compressed)
-				stats.lastCountSize = countSize;
-			op += countSize;
-			assert(op <= oend);
-		}
-	}
-	/* build CTable for Offsets */
-	{
-		unsigned max = MaxOff;
-		size_t const mostFrequent = HIST_countFast_wksp(
-		    countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
-		/* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
-		ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
-		DEBUGLOG(5, "Building OF table");
-		nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
-		stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode,
-		                                        countWorkspace, max, mostFrequent, nbSeq,
-		                                        OffFSELog, prevEntropy->offcodeCTable,
-		                                        OF_defaultNorm, OF_defaultNormLog,
-		                                        defaultPolicy, strategy);
-		assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
-		{
-			size_t const countSize = ZSTD_buildCTable(
-			    op, (size_t)(oend - op),
-			    CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype,
-			    countWorkspace, max, ofCodeTable, nbSeq,
-			    OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-			    prevEntropy->offcodeCTable,
-			    sizeof(prevEntropy->offcodeCTable),
-			    entropyWorkspace, entropyWkspSize);
-			if(ZSTD_isError(countSize)) {
-				DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed");
-				stats.size = countSize;
-				return stats;
-			}
-			if(stats.Offtype == set_compressed)
-				stats.lastCountSize = countSize;
-			op += countSize;
-			assert(op <= oend);
-		}
-	}
-	/* build CTable for MatchLengths */
-	{
-		unsigned max = MaxML;
-		size_t const mostFrequent = HIST_countFast_wksp(
-		    countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
-		DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend - op));
-		nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
-		stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode,
-		                                       countWorkspace, max, mostFrequent, nbSeq,
-		                                       MLFSELog, prevEntropy->matchlengthCTable,
-		                                       ML_defaultNorm, ML_defaultNormLog,
-		                                       ZSTD_defaultAllowed, strategy);
-		assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
-		{
-			size_t const countSize = ZSTD_buildCTable(
-			    op, (size_t)(oend - op),
-			    CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype,
-			    countWorkspace, max, mlCodeTable, nbSeq,
-			    ML_defaultNorm, ML_defaultNormLog, MaxML,
-			    prevEntropy->matchlengthCTable,
-			    sizeof(prevEntropy->matchlengthCTable),
-			    entropyWorkspace, entropyWkspSize);
-			if(ZSTD_isError(countSize)) {
-				DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed");
-				stats.size = countSize;
-				return stats;
-			}
-			if(stats.MLtype == set_compressed)
-				stats.lastCountSize = countSize;
-			op += countSize;
-			assert(op <= oend);
-		}
-	}
-	stats.size = (size_t)(op - ostart);
-	return stats;
-}
-
-/* ZSTD_entropyCompressSeqStore_internal():
- * compresses both literals and sequences
- * Returns compressed size of block, or a zstd error.
- */
-#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
-MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore_internal(
-    const seqStore_t* seqStorePtr,
-    const ZSTD_entropyCTables_t* prevEntropy,
-    ZSTD_entropyCTables_t* nextEntropy,
-    const ZSTD_CCtx_params* cctxParams,
-    void* dst, size_t dstCapacity,
-    void* entropyWorkspace, size_t entropyWkspSize,
-    const int bmi2) {
-	ZSTD_strategy const strategy = cctxParams->cParams.strategy;
-	unsigned* count = (unsigned*)entropyWorkspace;
-	FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
-	FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
-	FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
-	const seqDef* const sequences = seqStorePtr->sequencesStart;
-	const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
-	const BYTE* const ofCodeTable = seqStorePtr->ofCode;
-	const BYTE* const llCodeTable = seqStorePtr->llCode;
-	const BYTE* const mlCodeTable = seqStorePtr->mlCode;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstCapacity;
-	BYTE* op = ostart;
-	size_t lastCountSize;
-	int longOffsets = 0;
-
-	entropyWorkspace = count + (MaxSeq + 1);
-	entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
-
-	DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
-	ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1 << MAX(MLFSELog, LLFSELog)));
-	assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
-
-	/* Compress literals */
-	{
-		const BYTE* const literals = seqStorePtr->litStart;
-		size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
-		size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
-		/* Base suspicion of uncompressibility on ratio of literals to sequences */
-		unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
-		size_t const litSize = (size_t)(seqStorePtr->lit - literals);
-
-		size_t const cSize = ZSTD_compressLiterals(
-		    op, dstCapacity,
-		    literals, litSize,
-		    entropyWorkspace, entropyWkspSize,
-		    &prevEntropy->huf, &nextEntropy->huf,
-		    cctxParams->cParams.strategy,
-		    ZSTD_literalsCompressionIsDisabled(cctxParams),
-		    suspectUncompressible, bmi2);
-		FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
-		assert(cSize <= dstCapacity);
-		op += cSize;
-	}
-
-	/* Sequences Header */
-	RETURN_ERROR_IF((oend - op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
-	                dstSize_tooSmall, "Can't fit seq hdr in output buf!");
-	if(nbSeq < 128) {
-		*op++ = (BYTE)nbSeq;
-	} else if(nbSeq < LONGNBSEQ) {
-		op[0] = (BYTE)((nbSeq >> 8) + 0x80);
-		op[1] = (BYTE)nbSeq;
-		op += 2;
-	} else {
-		op[0] = 0xFF;
-		MEM_writeLE16(op + 1, (U16)(nbSeq - LONGNBSEQ));
-		op += 3;
-	}
-	assert(op <= oend);
-	if(nbSeq == 0) {
-		/* Copy the old tables over as if we repeated them */
-		ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
-		return (size_t)(op - ostart);
-	}
-	{
-		BYTE* const seqHead = op++;
-		/* build stats for sequences */
-		const ZSTD_symbolEncodingTypeStats_t stats =
-		    ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
-		                                  &prevEntropy->fse, &nextEntropy->fse,
-		                                  op, oend,
-		                                  strategy, count,
-		                                  entropyWorkspace, entropyWkspSize);
-		FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!");
-		*seqHead = (BYTE)((stats.LLtype << 6) + (stats.Offtype << 4) + (stats.MLtype << 2));
-		lastCountSize = stats.lastCountSize;
-		op += stats.size;
-		longOffsets = stats.longOffsets;
-	}
-
-	{
-		size_t const bitstreamSize = ZSTD_encodeSequences(
-		    op, (size_t)(oend - op),
-		    CTable_MatchLength, mlCodeTable,
-		    CTable_OffsetBits, ofCodeTable,
-		    CTable_LitLength, llCodeTable,
-		    sequences, nbSeq,
-		    longOffsets, bmi2);
-		FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
-		op += bitstreamSize;
-		assert(op <= oend);
-		/* zstd versions <= 1.3.4 mistakenly report corruption when
-		 * FSE_readNCount() receives a buffer < 4 bytes.
-		 * Fixed by https://github.com/facebook/zstd/pull/1146.
-		 * This can happen when the last set_compressed table present is 2
-		 * bytes and the bitstream is only one byte.
-		 * In this exceedingly rare case, we will simply emit an uncompressed
-		 * block, since it isn't worth optimizing.
-		 */
-		if(lastCountSize && (lastCountSize + bitstreamSize) < 4) {
-			/* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
-			assert(lastCountSize + bitstreamSize == 3);
-			DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
-			            "emitting an uncompressed block.");
-			return 0;
-		}
-	}
-
-	DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart));
-	return (size_t)(op - ostart);
-}
-
-MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore(
-    const seqStore_t* seqStorePtr,
-    const ZSTD_entropyCTables_t* prevEntropy,
-    ZSTD_entropyCTables_t* nextEntropy,
-    const ZSTD_CCtx_params* cctxParams,
-    void* dst, size_t dstCapacity,
-    size_t srcSize,
-    void* entropyWorkspace, size_t entropyWkspSize,
-    int bmi2) {
-	size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
-	    seqStorePtr, prevEntropy, nextEntropy, cctxParams,
-	    dst, dstCapacity,
-	    entropyWorkspace, entropyWkspSize, bmi2);
-	if(cSize == 0)
-		return 0;
-	/* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
-	 * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
-	 */
-	if((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) {
-		DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity);
-		return 0; /* block not compressed */
-	}
-	FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");
-
-	/* Check compressibility */
-	{
-		size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
-		if(cSize >= maxCSize)
-			return 0; /* block not compressed */
-	}
-	DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
-	/* libzstd decoder before  > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly.
-	 * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above.
-	 */
-	assert(cSize < ZSTD_BLOCKSIZE_MAX);
-	return cSize;
-}
-
-/* ZSTD_selectBlockCompressor() :
- * Not static, but internal use only (used by long distance matcher)
- * assumption : strat is a valid strategy */
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) {
-	static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX + 1] = {
-	    {ZSTD_compressBlock_fast /* default for 0 */,
-	     ZSTD_compressBlock_fast,
-	     ZSTD_compressBlock_doubleFast,
-	     ZSTD_compressBlock_greedy,
-	     ZSTD_compressBlock_lazy,
-	     ZSTD_compressBlock_lazy2,
-	     ZSTD_compressBlock_btlazy2,
-	     ZSTD_compressBlock_btopt,
-	     ZSTD_compressBlock_btultra,
-	     ZSTD_compressBlock_btultra2},
-	    {ZSTD_compressBlock_fast_extDict /* default for 0 */,
-	     ZSTD_compressBlock_fast_extDict,
-	     ZSTD_compressBlock_doubleFast_extDict,
-	     ZSTD_compressBlock_greedy_extDict,
-	     ZSTD_compressBlock_lazy_extDict,
-	     ZSTD_compressBlock_lazy2_extDict,
-	     ZSTD_compressBlock_btlazy2_extDict,
-	     ZSTD_compressBlock_btopt_extDict,
-	     ZSTD_compressBlock_btultra_extDict,
-	     ZSTD_compressBlock_btultra_extDict},
-	    {ZSTD_compressBlock_fast_dictMatchState /* default for 0 */,
-	     ZSTD_compressBlock_fast_dictMatchState,
-	     ZSTD_compressBlock_doubleFast_dictMatchState,
-	     ZSTD_compressBlock_greedy_dictMatchState,
-	     ZSTD_compressBlock_lazy_dictMatchState,
-	     ZSTD_compressBlock_lazy2_dictMatchState,
-	     ZSTD_compressBlock_btlazy2_dictMatchState,
-	     ZSTD_compressBlock_btopt_dictMatchState,
-	     ZSTD_compressBlock_btultra_dictMatchState,
-	     ZSTD_compressBlock_btultra_dictMatchState},
-	    {NULL /* default for 0 */,
-	     NULL,
-	     NULL,
-	     ZSTD_compressBlock_greedy_dedicatedDictSearch,
-	     ZSTD_compressBlock_lazy_dedicatedDictSearch,
-	     ZSTD_compressBlock_lazy2_dedicatedDictSearch,
-	     NULL,
-	     NULL,
-	     NULL,
-	     NULL}};
-	ZSTD_blockCompressor selectedCompressor;
-	ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);
-
-	assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
-	DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder);
-	if(ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) {
-		static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = {
-		    {ZSTD_compressBlock_greedy_row,
-		     ZSTD_compressBlock_lazy_row,
-		     ZSTD_compressBlock_lazy2_row},
-		    {ZSTD_compressBlock_greedy_extDict_row,
-		     ZSTD_compressBlock_lazy_extDict_row,
-		     ZSTD_compressBlock_lazy2_extDict_row},
-		    {ZSTD_compressBlock_greedy_dictMatchState_row,
-		     ZSTD_compressBlock_lazy_dictMatchState_row,
-		     ZSTD_compressBlock_lazy2_dictMatchState_row},
-		    {ZSTD_compressBlock_greedy_dedicatedDictSearch_row,
-		     ZSTD_compressBlock_lazy_dedicatedDictSearch_row,
-		     ZSTD_compressBlock_lazy2_dedicatedDictSearch_row}};
-		DEBUGLOG(4, "Selecting a row-based matchfinder");
-		assert(useRowMatchFinder != ZSTD_ps_auto);
-		selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy];
-	} else {
-		selectedCompressor = blockCompressor[(int)dictMode][(int)strat];
-	}
-	assert(selectedCompressor != NULL);
-	return selectedCompressor;
-}
-
-static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,
-                                   const BYTE* anchor, size_t lastLLSize) {
-	ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize);
-	seqStorePtr->lit += lastLLSize;
-}
-
-void ZSTD_resetSeqStore(seqStore_t* ssPtr) {
-	ssPtr->lit = ssPtr->litStart;
-	ssPtr->sequences = ssPtr->sequencesStart;
-	ssPtr->longLengthType = ZSTD_llt_none;
-}
-
-/* ZSTD_postProcessSequenceProducerResult() :
- * Validates and post-processes sequences obtained through the external matchfinder API:
- *   - Checks whether nbExternalSeqs represents an error condition.
- *   - Appends a block delimiter to outSeqs if one is not already present.
- *     See zstd.h for context regarding block delimiters.
- * Returns the number of sequences after post-processing, or an error code. */
-static size_t ZSTD_postProcessSequenceProducerResult(
-    ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize) {
-	RETURN_ERROR_IF(
-	    nbExternalSeqs > outSeqsCapacity,
-	    sequenceProducer_failed,
-	    "External sequence producer returned error code %lu",
-	    (unsigned long)nbExternalSeqs);
-
-	RETURN_ERROR_IF(
-	    nbExternalSeqs == 0 && srcSize > 0,
-	    sequenceProducer_failed,
-	    "Got zero sequences from external sequence producer for a non-empty src buffer!");
-
-	if(srcSize == 0) {
-		ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
-		return 1;
-	}
-
-	{
-		ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1];
-
-		/* We can return early if lastSeq is already a block delimiter. */
-		if(lastSeq.offset == 0 && lastSeq.matchLength == 0) {
-			return nbExternalSeqs;
-		}
-
-		/* This error condition is only possible if the external matchfinder
-		 * produced an invalid parse, by definition of ZSTD_sequenceBound(). */
-		RETURN_ERROR_IF(
-		    nbExternalSeqs == outSeqsCapacity,
-		    sequenceProducer_failed,
-		    "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!");
-
-		/* lastSeq is not a block delimiter, so we need to append one. */
-		ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
-		return nbExternalSeqs + 1;
-	}
-}
-
-/* ZSTD_fastSequenceLengthSum() :
- * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*.
- * Similar to another function in zstd_compress.c (determine_blockSize),
- * except it doesn't check for a block delimiter to end summation.
- * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P).
- * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */
-static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const * seqBuf, size_t seqBufSize) {
-	size_t matchLenSum, litLenSum, i;
-	matchLenSum = 0;
-	litLenSum = 0;
-	for(i = 0; i < seqBufSize; i++) {
-		litLenSum += seqBuf[i].litLength;
-		matchLenSum += seqBuf[i].matchLength;
-	}
-	return litLenSum + matchLenSum;
-}
-
-typedef enum { ZSTDbss_compress,
-	           ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
-
-static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) {
-	ZSTD_matchState_t* const ms = &zc->blockState.matchState;
-	DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize);
-	assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
-	/* Assert that we have correctly flushed the ctx params into the ms's copy */
-	ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
-	/* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
-	 * additional 1. We need to revisit and change this logic to be more consistent */
-	if(srcSize < MIN_CBLOCK_SIZE + ZSTD_blockHeaderSize + 1 + 1) {
-		if(zc->appliedParams.cParams.strategy >= ZSTD_btopt) {
-			ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
-		} else {
-			ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch);
-		}
-		return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */
-	}
-	ZSTD_resetSeqStore(&(zc->seqStore));
-	/* required for optimal parser to read stats from dictionary */
-	ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy;
-	/* tell the optimal parser how we expect to compress literals */
-	ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode;
-	/* a gap between an attached dict and the current window is not safe,
-	 * they must remain adjacent,
-	 * and when that stops being the case, the dict must be unset */
-	assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit);
-
-	/* limited update after a very long match */
-	{
-		const BYTE* const base = ms->window.base;
-		const BYTE* const istart = (const BYTE*)src;
-		const U32 curr = (U32)(istart - base);
-		if(sizeof(ptrdiff_t) == 8)
-			assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */
-		if(curr > ms->nextToUpdate + 384)
-			ms->nextToUpdate = curr - MIN(192, (U32)(curr - ms->nextToUpdate - 384));
-	}
-
-	/* select and store sequences */
-	{
-		ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms);
-		size_t lastLLSize;
-		{
-			int i;
-			for(i = 0; i < ZSTD_REP_NUM; ++i)
-				zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
-		}
-		if(zc->externSeqStore.pos < zc->externSeqStore.size) {
-			assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
-
-			/* External matchfinder + LDM is technically possible, just not implemented yet.
-			 * We need to revisit soon and implement it. */
-			RETURN_ERROR_IF(
-			    zc->appliedParams.useSequenceProducer,
-			    parameter_combination_unsupported,
-			    "Long-distance matching with external sequence producer enabled is not currently supported.");
-
-			/* Updates ldmSeqStore.pos */
-			lastLLSize =
-			    ZSTD_ldm_blockCompress(&zc->externSeqStore,
-			                           ms, &zc->seqStore,
-			                           zc->blockState.nextCBlock->rep,
-			                           zc->appliedParams.useRowMatchFinder,
-			                           src, srcSize);
-			assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
-		} else if(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
-			rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
-
-			/* External matchfinder + LDM is technically possible, just not implemented yet.
-			 * We need to revisit soon and implement it. */
-			RETURN_ERROR_IF(
-			    zc->appliedParams.useSequenceProducer,
-			    parameter_combination_unsupported,
-			    "Long-distance matching with external sequence producer enabled is not currently supported.");
-
-			ldmSeqStore.seq = zc->ldmSequences;
-			ldmSeqStore.capacity = zc->maxNbLdmSequences;
-			/* Updates ldmSeqStore.size */
-			FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore,
-			                                            &zc->appliedParams.ldmParams,
-			                                            src, srcSize),
-			                 "");
-			/* Updates ldmSeqStore.pos */
-			lastLLSize =
-			    ZSTD_ldm_blockCompress(&ldmSeqStore,
-			                           ms, &zc->seqStore,
-			                           zc->blockState.nextCBlock->rep,
-			                           zc->appliedParams.useRowMatchFinder,
-			                           src, srcSize);
-			assert(ldmSeqStore.pos == ldmSeqStore.size);
-		} else if(zc->appliedParams.useSequenceProducer) {
-			assert(
-			    zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize));
-			assert(zc->externalMatchCtx.mFinder != NULL);
-
-			{
-				U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
-
-				size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
-				    zc->externalMatchCtx.mState,
-				    zc->externalMatchCtx.seqBuffer,
-				    zc->externalMatchCtx.seqBufferCapacity,
-				    src, srcSize,
-				    NULL, 0, /* dict and dictSize, currently not supported */
-				    zc->appliedParams.compressionLevel,
-				    windowSize);
-
-				size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
-				    zc->externalMatchCtx.seqBuffer,
-				    nbExternalSeqs,
-				    zc->externalMatchCtx.seqBufferCapacity,
-				    srcSize);
-
-				/* Return early if there is no error, since we don't need to worry about last literals */
-				if(!ZSTD_isError(nbPostProcessedSeqs)) {
-					ZSTD_sequencePosition seqPos = {0, 0, 0};
-					size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs);
-					RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
-					FORWARD_IF_ERROR(
-					    ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
-					        zc, &seqPos,
-					        zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
-					        src, srcSize,
-					        zc->appliedParams.searchForExternalRepcodes),
-					    "Failed to copy external sequences to seqStore!");
-					ms->ldmSeqStore = NULL;
-					DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs);
-					return ZSTDbss_compress;
-				}
-
-				/* Propagate the error if fallback is disabled */
-				if(!zc->appliedParams.enableMatchFinderFallback) {
-					return nbPostProcessedSeqs;
-				}
-
-				/* Fallback to software matchfinder */
-				{
-					ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
-					                                                                        zc->appliedParams.useRowMatchFinder,
-					                                                                        dictMode);
-					ms->ldmSeqStore = NULL;
-					DEBUGLOG(
-					    5,
-					    "External sequence producer returned error code %lu. Falling back to internal parser.",
-					    (unsigned long)nbExternalSeqs);
-					lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
-				}
-			}
-		} else { /* not long range mode and no external matchfinder */
-			ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
-			                                                                        zc->appliedParams.useRowMatchFinder,
-			                                                                        dictMode);
-			ms->ldmSeqStore = NULL;
-			lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
-		}
-		{
-			const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;
-			ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
-		}
-	}
-	return ZSTDbss_compress;
-}
-
-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) {
-	const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
-	const seqDef* seqStoreSeqs = seqStore->sequencesStart;
-	size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
-	size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
-	size_t literalsRead = 0;
-	size_t lastLLSize;
-
-	ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
-	size_t i;
-	repcodes_t updatedRepcodes;
-
-	assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
-	/* Ensure we have enough space for last literals "sequence" */
-	assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
-	ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
-	for(i = 0; i < seqStoreSeqSize; ++i) {
-		U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
-		outSeqs[i].litLength = seqStoreSeqs[i].litLength;
-		outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
-		outSeqs[i].rep = 0;
-
-		if(i == seqStore->longLengthPos) {
-			if(seqStore->longLengthType == ZSTD_llt_literalLength) {
-				outSeqs[i].litLength += 0x10000;
-			} else if(seqStore->longLengthType == ZSTD_llt_matchLength) {
-				outSeqs[i].matchLength += 0x10000;
-			}
-		}
-
-		if(seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
-			/* Derive the correct offset corresponding to a repcode */
-			outSeqs[i].rep = seqStoreSeqs[i].offBase;
-			if(outSeqs[i].litLength != 0) {
-				rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
-			} else {
-				if(outSeqs[i].rep == 3) {
-					rawOffset = updatedRepcodes.rep[0] - 1;
-				} else {
-					rawOffset = updatedRepcodes.rep[outSeqs[i].rep];
-				}
-			}
-		}
-		outSeqs[i].offset = rawOffset;
-		/* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
-		   so we provide seqStoreSeqs[i].offset - 1 */
-		ZSTD_updateRep(updatedRepcodes.rep,
-		               seqStoreSeqs[i].offBase,
-		               seqStoreSeqs[i].litLength == 0);
-		literalsRead += outSeqs[i].litLength;
-	}
-	/* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
-	 * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
-	 * for the block boundary, according to the API.
-	 */
-	assert(seqStoreLiteralsSize >= literalsRead);
-	lastLLSize = seqStoreLiteralsSize - literalsRead;
-	outSeqs[i].litLength = (U32)lastLLSize;
-	outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
-	seqStoreSeqSize++;
-	zc->seqCollector.seqIndex += seqStoreSeqSize;
-}
-
-size_t ZSTD_sequenceBound(size_t srcSize) {
-	return (srcSize / ZSTD_MINMATCH_MIN) + 1;
-}
-
-size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
-                              size_t outSeqsSize, const void* src, size_t srcSize) {
-	const size_t dstCapacity = ZSTD_compressBound(srcSize);
-	void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
-	SeqCollector seqCollector;
-
-	RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
-
-	seqCollector.collectSequences = 1;
-	seqCollector.seqStart = outSeqs;
-	seqCollector.seqIndex = 0;
-	seqCollector.maxSequences = outSeqsSize;
-	zc->seqCollector = seqCollector;
-
-	ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
-	ZSTD_customFree(dst, ZSTD_defaultCMem);
-	return zc->seqCollector.seqIndex;
-}
-
-size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) {
-	size_t in = 0;
-	size_t out = 0;
-	for(; in < seqsSize; ++in) {
-		if(sequences[in].offset == 0 && sequences[in].matchLength == 0) {
-			if(in != seqsSize - 1) {
-				sequences[in + 1].litLength += sequences[in].litLength;
-			}
-		} else {
-			sequences[out] = sequences[in];
-			++out;
-		}
-	}
-	return out;
-}
-
-/* Unrolled loop to read four size_ts of input at a time. Returns 1 if is RLE, 0 if not. */
-static int ZSTD_isRLE(const BYTE* src, size_t length) {
-	const BYTE* ip = src;
-	const BYTE value = ip[0];
-	const size_t valueST = (size_t)((U64)value * 0x0101010101010101ULL);
-	const size_t unrollSize = sizeof(size_t) * 4;
-	const size_t unrollMask = unrollSize - 1;
-	const size_t prefixLength = length & unrollMask;
-	size_t i;
-	if(length == 1)
-		return 1;
-	/* Check if prefix is RLE first before using unrolled loop */
-	if(prefixLength && ZSTD_count(ip + 1, ip, ip + prefixLength) != prefixLength - 1) {
-		return 0;
-	}
-	for(i = prefixLength; i != length; i += unrollSize) {
-		size_t u;
-		for(u = 0; u < unrollSize; u += sizeof(size_t)) {
-			if(MEM_readST(ip + i + u) != valueST) {
-				return 0;
-			}
-		}
-	}
-	return 1;
-}
-
-/* Returns true if the given block may be RLE.
- * This is just a heuristic based on the compressibility.
- * It may return both false positives and false negatives.
- */
-static int ZSTD_maybeRLE(seqStore_t const * seqStore) {
-	size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
-	size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart);
-
-	return nbSeqs < 4 && nbLits < 10;
-}
-
-static void
-ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) {
-	ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
-	bs->prevCBlock = bs->nextCBlock;
-	bs->nextCBlock = tmp;
-}
-
-/* Writes the block header */
-static void
-writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) {
-	U32 const cBlockHeader = cSize == 1 ? lastBlock + (((U32)bt_rle) << 1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed) << 1) + (U32)(cSize << 3);
-	MEM_writeLE24(op, cBlockHeader);
-	DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock);
-}
-
-/** ZSTD_buildBlockEntropyStats_literals() :
- *  Builds entropy for the literals.
- *  Stores literals block type (raw, rle, compressed, repeat) and
- *  huffman description table to hufMetadata.
- *  Requires ENTROPY_WORKSPACE_SIZE workspace
- * @return : size of huffman description table, or an error code
- */
-static size_t
-ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
-                                     const ZSTD_hufCTables_t* prevHuf,
-                                     ZSTD_hufCTables_t* nextHuf,
-                                     ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                     const int literalsCompressionIsDisabled,
-                                     void* workspace, size_t wkspSize,
-                                     int hufFlags) {
-	BYTE* const wkspStart = (BYTE*)workspace;
-	BYTE* const wkspEnd = wkspStart + wkspSize;
-	BYTE* const countWkspStart = wkspStart;
-	unsigned* const countWksp = (unsigned*)workspace;
-	const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
-	BYTE* const nodeWksp = countWkspStart + countWkspSize;
-	const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp);
-	unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
-	unsigned huffLog = LitHufLog;
-	HUF_repeat repeat = prevHuf->repeatMode;
-	DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);
-
-	/* Prepare nextEntropy assuming reusing the existing table */
-	ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-
-	if(literalsCompressionIsDisabled) {
-		DEBUGLOG(5, "set_basic - disabled");
-		hufMetadata->hType = set_basic;
-		return 0;
-	}
-
-	/* small ? don't even attempt compression (speed opt) */
-#ifndef COMPRESS_LITERALS_SIZE_MIN
-#define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */
-#endif
-	{
-		size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
-		if(srcSize <= minLitSize) {
-			DEBUGLOG(5, "set_basic - too small");
-			hufMetadata->hType = set_basic;
-			return 0;
-		}
-	}
-
-	/* Scan input and build symbol stats */
-	{
-		size_t const largest =
-		    HIST_count_wksp(countWksp, &maxSymbolValue,
-		                    (const BYTE*)src, srcSize,
-		                    workspace, wkspSize);
-		FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
-		if(largest == srcSize) {
-			/* only one literal symbol */
-			DEBUGLOG(5, "set_rle");
-			hufMetadata->hType = set_rle;
-			return 0;
-		}
-		if(largest <= (srcSize >> 7) + 4) {
-			/* heuristic: likely not compressible */
-			DEBUGLOG(5, "set_basic - no gain");
-			hufMetadata->hType = set_basic;
-			return 0;
-		}
-	}
-
-	/* Validate the previous Huffman table */
-	if(repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const *)prevHuf->CTable, countWksp, maxSymbolValue)) {
-		repeat = HUF_repeat_none;
-	}
-
-	/* Build Huffman Tree */
-	ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
-	huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags);
-	assert(huffLog <= LitHufLog);
-	{
-		size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
-		                                            maxSymbolValue, huffLog,
-		                                            nodeWksp, nodeWkspSize);
-		FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
-		huffLog = (U32)maxBits;
-	}
-	{ /* Build and write the CTable */
-		size_t const newCSize = HUF_estimateCompressedSize(
-		    (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
-		size_t const hSize = HUF_writeCTable_wksp(
-		    hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
-		    (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
-		    nodeWksp, nodeWkspSize);
-		/* Check against repeating the previous CTable */
-		if(repeat != HUF_repeat_none) {
-			size_t const oldCSize = HUF_estimateCompressedSize(
-			    (HUF_CElt const *)prevHuf->CTable, countWksp, maxSymbolValue);
-			if(oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
-				DEBUGLOG(5, "set_repeat - smaller");
-				ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-				hufMetadata->hType = set_repeat;
-				return 0;
-			}
-		}
-		if(newCSize + hSize >= srcSize) {
-			DEBUGLOG(5, "set_basic - no gains");
-			ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-			hufMetadata->hType = set_basic;
-			return 0;
-		}
-		DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
-		hufMetadata->hType = set_compressed;
-		nextHuf->repeatMode = HUF_repeat_check;
-		return hSize;
-	}
-}
-
-/* ZSTD_buildDummySequencesStatistics():
- * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic,
- * and updates nextEntropy to the appropriate repeatMode.
- */
-static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
-	ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0};
-	nextEntropy->litlength_repeatMode = FSE_repeat_none;
-	nextEntropy->offcode_repeatMode = FSE_repeat_none;
-	nextEntropy->matchlength_repeatMode = FSE_repeat_none;
-	return stats;
-}
-
-/** ZSTD_buildBlockEntropyStats_sequences() :
- *  Builds entropy for the sequences.
- *  Stores symbol compression modes and fse table to fseMetadata.
- *  Requires ENTROPY_WORKSPACE_SIZE wksp.
- * @return : size of fse tables or error code */
-static size_t
-ZSTD_buildBlockEntropyStats_sequences(
-    const seqStore_t* seqStorePtr,
-    const ZSTD_fseCTables_t* prevEntropy,
-    ZSTD_fseCTables_t* nextEntropy,
-    const ZSTD_CCtx_params* cctxParams,
-    ZSTD_fseCTablesMetadata_t* fseMetadata,
-    void* workspace, size_t wkspSize) {
-	ZSTD_strategy const strategy = cctxParams->cParams.strategy;
-	size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
-	BYTE* const ostart = fseMetadata->fseTablesBuffer;
-	BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
-	BYTE* op = ostart;
-	unsigned* countWorkspace = (unsigned*)workspace;
-	unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1);
-	size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace);
-	ZSTD_symbolEncodingTypeStats_t stats;
-
-	DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq);
-	stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
-	                                                   prevEntropy, nextEntropy, op, oend,
-	                                                   strategy, countWorkspace,
-	                                                   entropyWorkspace, entropyWorkspaceSize)
-	                   : ZSTD_buildDummySequencesStatistics(nextEntropy);
-	FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!");
-	fseMetadata->llType = (symbolEncodingType_e)stats.LLtype;
-	fseMetadata->ofType = (symbolEncodingType_e)stats.Offtype;
-	fseMetadata->mlType = (symbolEncodingType_e)stats.MLtype;
-	fseMetadata->lastCountSize = stats.lastCountSize;
-	return stats.size;
-}
-
-/** ZSTD_buildBlockEntropyStats() :
- *  Builds entropy for the block.
- *  Requires workspace size ENTROPY_WORKSPACE_SIZE
- * @return : 0 on success, or an error code
- *  Note : also employed in superblock
- */
-size_t ZSTD_buildBlockEntropyStats(
-    const seqStore_t* seqStorePtr,
-    const ZSTD_entropyCTables_t* prevEntropy,
-    ZSTD_entropyCTables_t* nextEntropy,
-    const ZSTD_CCtx_params* cctxParams,
-    ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-    void* workspace, size_t wkspSize) {
-	size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
-	int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD);
-	int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0;
-
-	entropyMetadata->hufMetadata.hufDesSize =
-	    ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize,
-	                                         &prevEntropy->huf, &nextEntropy->huf,
-	                                         &entropyMetadata->hufMetadata,
-	                                         ZSTD_literalsCompressionIsDisabled(cctxParams),
-	                                         workspace, wkspSize, hufFlags);
-
-	FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
-	entropyMetadata->fseMetadata.fseTablesSize =
-	    ZSTD_buildBlockEntropyStats_sequences(seqStorePtr,
-	                                          &prevEntropy->fse, &nextEntropy->fse,
-	                                          cctxParams,
-	                                          &entropyMetadata->fseMetadata,
-	                                          workspace, wkspSize);
-	FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed");
-	return 0;
-}
-
-/* Returns the size estimate for the literals section (header + content) of a block */
-static size_t
-ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
-                               const ZSTD_hufCTables_t* huf,
-                               const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                               void* workspace, size_t wkspSize,
-                               int writeEntropy) {
-	unsigned* const countWksp = (unsigned*)workspace;
-	unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
-	size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB);
-	U32 singleStream = litSize < 256;
-
-	if(hufMetadata->hType == set_basic)
-		return litSize;
-	else if(hufMetadata->hType == set_rle)
-		return 1;
-	else if(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) {
-		size_t const largest = HIST_count_wksp(countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
-		if(ZSTD_isError(largest))
-			return litSize;
-		{
-			size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
-			if(writeEntropy)
-				cLitSizeEstimate += hufMetadata->hufDesSize;
-			if(!singleStream)
-				cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */
-			return cLitSizeEstimate + literalSectionHeaderSize;
-		}
-	}
-	assert(0); /* impossible */
-	return 0;
-}
-
-/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */
-static size_t
-ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
-                                  const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
-                                  const FSE_CTable* fseCTable,
-                                  const U8* additionalBits,
-                                  short const * defaultNorm, U32 defaultNormLog, U32 defaultMax,
-                                  void* workspace, size_t wkspSize) {
-	unsigned* const countWksp = (unsigned*)workspace;
-	const BYTE* ctp = codeTable;
-	const BYTE* const ctStart = ctp;
-	const BYTE* const ctEnd = ctStart + nbSeq;
-	size_t cSymbolTypeSizeEstimateInBits = 0;
-	unsigned max = maxCode;
-
-	HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */
-	if(type == set_basic) {
-		/* We selected this encoding type, so it must be valid. */
-		assert(max <= defaultMax);
-		(void)defaultMax;
-		cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max);
-	} else if(type == set_rle) {
-		cSymbolTypeSizeEstimateInBits = 0;
-	} else if(type == set_compressed || type == set_repeat) {
-		cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max);
-	}
-	if(ZSTD_isError(cSymbolTypeSizeEstimateInBits)) {
-		return nbSeq * 10;
-	}
-	while(ctp < ctEnd) {
-		if(additionalBits)
-			cSymbolTypeSizeEstimateInBits += additionalBits[*ctp];
-		else
-			cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */
-		ctp++;
-	}
-	return cSymbolTypeSizeEstimateInBits >> 3;
-}
-
-/* Returns the size estimate for the sequences section (header + content) of a block */
-static size_t
-ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
-                                 const BYTE* llCodeTable,
-                                 const BYTE* mlCodeTable,
-                                 size_t nbSeq,
-                                 const ZSTD_fseCTables_t* fseTables,
-                                 const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                 void* workspace, size_t wkspSize,
-                                 int writeEntropy) {
-	size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ);
-	size_t cSeqSizeEstimate = 0;
-	cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff,
-	                                                      fseTables->offcodeCTable, NULL,
-	                                                      OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-	                                                      workspace, wkspSize);
-	cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL,
-	                                                      fseTables->litlengthCTable, LL_bits,
-	                                                      LL_defaultNorm, LL_defaultNormLog, MaxLL,
-	                                                      workspace, wkspSize);
-	cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML,
-	                                                      fseTables->matchlengthCTable, ML_bits,
-	                                                      ML_defaultNorm, ML_defaultNormLog, MaxML,
-	                                                      workspace, wkspSize);
-	if(writeEntropy)
-		cSeqSizeEstimate += fseMetadata->fseTablesSize;
-	return cSeqSizeEstimate + sequencesSectionHeaderSize;
-}
-
-/* Returns the size estimate for a given stream of literals, of, ll, ml */
-static size_t
-ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
-                       const BYTE* ofCodeTable,
-                       const BYTE* llCodeTable,
-                       const BYTE* mlCodeTable,
-                       size_t nbSeq,
-                       const ZSTD_entropyCTables_t* entropy,
-                       const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                       void* workspace, size_t wkspSize,
-                       int writeLitEntropy, int writeSeqEntropy) {
-	size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize,
-	                                                           &entropy->huf, &entropyMetadata->hufMetadata,
-	                                                           workspace, wkspSize, writeLitEntropy);
-	size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
-	                                                        nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
-	                                                        workspace, wkspSize, writeSeqEntropy);
-	return seqSize + literalsSize + ZSTD_blockHeaderSize;
-}
-
-/* Builds entropy statistics and uses them for blocksize estimation.
- *
- * @return: estimated compressed size of the seqStore, or a zstd error.
- */
-static size_t
-ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) {
-	ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
-	DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
-	FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
-	                                             &zc->blockState.prevCBlock->entropy,
-	                                             &zc->blockState.nextCBlock->entropy,
-	                                             &zc->appliedParams,
-	                                             entropyMetadata,
-	                                             zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE),
-	                 "");
-	return ZSTD_estimateBlockSize(
-	    seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
-	    seqStore->ofCode, seqStore->llCode, seqStore->mlCode,
-	    (size_t)(seqStore->sequences - seqStore->sequencesStart),
-	    &zc->blockState.nextCBlock->entropy,
-	    entropyMetadata,
-	    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
-	    (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1);
-}
-
-/* Returns literals bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) {
-	size_t literalsBytes = 0;
-	size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
-	size_t i;
-	for(i = 0; i < nbSeqs; ++i) {
-		seqDef const seq = seqStore->sequencesStart[i];
-		literalsBytes += seq.litLength;
-		if(i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) {
-			literalsBytes += 0x10000;
-		}
-	}
-	return literalsBytes;
-}
-
-/* Returns match bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
-	size_t matchBytes = 0;
-	size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
-	size_t i;
-	for(i = 0; i < nbSeqs; ++i) {
-		seqDef seq = seqStore->sequencesStart[i];
-		matchBytes += seq.mlBase + MINMATCH;
-		if(i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) {
-			matchBytes += 0x10000;
-		}
-	}
-	return matchBytes;
-}
-
-/* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx).
- * Stores the result in resultSeqStore.
- */
-static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
-                                     const seqStore_t* originalSeqStore,
-                                     size_t startIdx, size_t endIdx) {
-	*resultSeqStore = *originalSeqStore;
-	if(startIdx > 0) {
-		resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx;
-		resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
-	}
-
-	/* Move longLengthPos into the correct position if necessary */
-	if(originalSeqStore->longLengthType != ZSTD_llt_none) {
-		if(originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) {
-			resultSeqStore->longLengthType = ZSTD_llt_none;
-		} else {
-			resultSeqStore->longLengthPos -= (U32)startIdx;
-		}
-	}
-	resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
-	resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
-	if(endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) {
-		/* This accounts for possible last literals if the derived chunk reaches the end of the block */
-		assert(resultSeqStore->lit == originalSeqStore->lit);
-	} else {
-		size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
-		resultSeqStore->lit = resultSeqStore->litStart + literalsBytes;
-	}
-	resultSeqStore->llCode += startIdx;
-	resultSeqStore->mlCode += startIdx;
-	resultSeqStore->ofCode += startIdx;
-}
-
-/**
- * Returns the raw offset represented by the combination of offBase, ll0, and repcode history.
- * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq().
- */
-static U32
-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) {
-	U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */
-	assert(OFFBASE_IS_REPCODE(offBase));
-	if(adjustedRepCode == ZSTD_REP_NUM) {
-		assert(ll0);
-		/* litlength == 0 and offCode == 2 implies selection of first repcode - 1
-		 * This is only valid if it results in a valid offset value, aka > 0.
-		 * Note : it may happen that `rep[0]==1` in exceptional circumstances.
-		 * In which case this function will return 0, which is an invalid offset.
-		 * It's not an issue though, since this value will be
-		 * compared and discarded within ZSTD_seqStore_resolveOffCodes().
-		 */
-		return rep[0] - 1;
-	}
-	return rep[adjustedRepCode];
-}
-
-/**
- * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise
- * due to emission of RLE/raw blocks that disturb the offset history,
- * and replaces any repcodes within the seqStore that may be invalid.
- *
- * dRepcodes are updated as would be on the decompression side.
- * cRepcodes are updated exactly in accordance with the seqStore.
- *
- * Note : this function assumes seq->offBase respects the following numbering scheme :
- *        0 : invalid
- *        1-3 : repcode 1-3
- *        4+ : real_offset+3
- */
-static void
-ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
-                              const seqStore_t* const seqStore, U32 const nbSeq) {
-	U32 idx = 0;
-	for(; idx < nbSeq; ++idx) {
-		seqDef* const seq = seqStore->sequencesStart + idx;
-		U32 const ll0 = (seq->litLength == 0);
-		U32 const offBase = seq->offBase;
-		assert(offBase > 0);
-		if(OFFBASE_IS_REPCODE(offBase)) {
-			U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
-			U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
-			/* Adjust simulated decompression repcode history if we come across a mismatch. Replace
-			 * the repcode with the offset it actually references, determined by the compression
-			 * repcode history.
-			 */
-			if(dRawOffset != cRawOffset) {
-				seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
-			}
-		}
-		/* Compression repcode history is always updated with values directly from the unmodified seqStore.
-		 * Decompression repcode history may use modified seq->offset value taken from compression repcode history.
-		 */
-		ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
-		ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
-	}
-}
-
-/* ZSTD_compressSeqStore_singleBlock():
- * Compresses a seqStore into a block with a block header, into the buffer dst.
- *
- * Returns the total size of that block (including header) or a ZSTD error code.
- */
-static size_t
-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
-                                  const seqStore_t* const seqStore,
-                                  repcodes_t* const dRep, repcodes_t* const cRep,
-                                  void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize,
-                                  U32 lastBlock, U32 isPartition) {
-	const U32 rleMaxLength = 25;
-	BYTE* op = (BYTE*)dst;
-	const BYTE* ip = (const BYTE*)src;
-	size_t cSize;
-	size_t cSeqsSize;
-
-	/* In case of an RLE or raw block, the simulated decompression repcode history must be reset */
-	repcodes_t const dRepOriginal = *dRep;
-	DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock");
-	if(isPartition)
-		ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
-
-	RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit");
-	cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore,
-	                                         &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
-	                                         &zc->appliedParams,
-	                                         op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize,
-	                                         srcSize,
-	                                         zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
-	                                         zc->bmi2);
-	FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!");
-
-	if(!zc->isFirstBlock &&
-	   cSeqsSize < rleMaxLength &&
-	   ZSTD_isRLE((BYTE const *)src, srcSize)) {
-		/* We don't want to emit our first block as a RLE even if it qualifies because
-		 * doing so will cause the decoder (cli only) to throw a "should consume all input error."
-		 * This is only an issue for zstd <= v1.4.3
-		 */
-		cSeqsSize = 1;
-	}
-
-	if(zc->seqCollector.collectSequences) {
-		ZSTD_copyBlockSequences(zc);
-		ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
-		return 0;
-	}
-
-	if(cSeqsSize == 0) {
-		cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
-		FORWARD_IF_ERROR(cSize, "Nocompress block failed");
-		DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize);
-		*dRep = dRepOriginal; /* reset simulated decompression repcode history */
-	} else if(cSeqsSize == 1) {
-		cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock);
-		FORWARD_IF_ERROR(cSize, "RLE compress block failed");
-		DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize);
-		*dRep = dRepOriginal; /* reset simulated decompression repcode history */
-	} else {
-		ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
-		writeBlockHeader(op, cSeqsSize, srcSize, lastBlock);
-		cSize = ZSTD_blockHeaderSize + cSeqsSize;
-		DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize);
-	}
-
-	if(zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
-		zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-
-	return cSize;
-}
-
-/* Struct to keep track of where we are in our recursive calls. */
-typedef struct {
-	U32* splitLocations; /* Array of split indices */
-	size_t idx;          /* The current index within splitLocations being worked on */
-} seqStoreSplits;
-
-#define MIN_SEQUENCES_BLOCK_SPLITTING 300
-
-/* Helper function to perform the recursive search for block splits.
- * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
- * If advantageous to split, then we recurse down the two sub-blocks.
- * If not, or if an error occurred in estimation, then we do not recurse.
- *
- * Note: The recursion depth is capped by a heuristic minimum number of sequences,
- * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
- * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
- * In practice, recursion depth usually doesn't go beyond 4.
- *
- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
- * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
- * maximum of 128 KB, this value is actually impossible to reach.
- */
-static void
-ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
-                             ZSTD_CCtx* zc, const seqStore_t* origSeqStore) {
-	seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
-	seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
-	seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
-	size_t estimatedOriginalSize;
-	size_t estimatedFirstHalfSize;
-	size_t estimatedSecondHalfSize;
-	size_t midIdx = (startIdx + endIdx) / 2;
-
-	DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
-	assert(endIdx >= startIdx);
-	if(endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
-		DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
-		return;
-	}
-	ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
-	ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
-	ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
-	estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
-	estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
-	estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
-	DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
-	         estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
-	if(ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
-		return;
-	}
-	if(estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) {
-		DEBUGLOG(5, "split decided at seqNb:%zu", midIdx);
-		ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);
-		splits->splitLocations[splits->idx] = (U32)midIdx;
-		splits->idx++;
-		ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore);
-	}
-}
-
-/* Base recursive function.
- * Populates a table with intra-block partition indices that can improve compression ratio.
- *
- * @return: number of splits made (which equals the size of the partition table - 1).
- */
-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) {
-	seqStoreSplits splits;
-	splits.splitLocations = partitions;
-	splits.idx = 0;
-	if(nbSeq <= 4) {
-		DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq);
-		/* Refuse to try and split anything with less than 4 sequences */
-		return 0;
-	}
-	ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore);
-	splits.splitLocations[splits.idx] = nbSeq;
-	DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx + 1);
-	return splits.idx;
-}
-
-/* ZSTD_compressBlock_splitBlock():
- * Attempts to split a given block into multiple blocks to improve compression ratio.
- *
- * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
- */
-static size_t
-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc,
-                                       void* dst, size_t dstCapacity,
-                                       const void* src, size_t blockSize,
-                                       U32 lastBlock, U32 nbSeq) {
-	size_t cSize = 0;
-	const BYTE* ip = (const BYTE*)src;
-	BYTE* op = (BYTE*)dst;
-	size_t i = 0;
-	size_t srcBytesTotal = 0;
-	U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
-	seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
-	seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore;
-	size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
-
-	/* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
-	 * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
-	 * separate repcode histories that simulate repcode history on compression and decompression side,
-	 * and use the histories to determine whether we must replace a particular repcode with its raw offset.
-	 *
-	 * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed
-	 *    or RLE. This allows us to retrieve the offset value that an invalid repcode references within
-	 *    a nocompress/RLE block.
-	 * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use
-	 *    the replacement offset value rather than the original repcode to update the repcode history.
-	 *    dRep also will be the final repcode history sent to the next block.
-	 *
-	 * See ZSTD_seqStore_resolveOffCodes() for more details.
-	 */
-	repcodes_t dRep;
-	repcodes_t cRep;
-	ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
-	ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
-	ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
-
-	DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
-	         (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
-	         (unsigned)zc->blockState.matchState.nextToUpdate);
-
-	if(numSplits == 0) {
-		size_t cSizeSingleBlock =
-		    ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
-		                                      &dRep, &cRep,
-		                                      op, dstCapacity,
-		                                      ip, blockSize,
-		                                      lastBlock, 0 /* isPartition */);
-		FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
-		DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
-		assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX);
-		assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize);
-		return cSizeSingleBlock;
-	}
-
-	ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
-	for(i = 0; i <= numSplits; ++i) {
-		size_t cSizeChunk;
-		U32 const lastPartition = (i == numSplits);
-		U32 lastBlockEntireSrc = 0;
-
-		size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
-		srcBytesTotal += srcBytes;
-		if(lastPartition) {
-			/* This is the final partition, need to account for possible last literals */
-			srcBytes += blockSize - srcBytesTotal;
-			lastBlockEntireSrc = lastBlock;
-		} else {
-			ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i + 1]);
-		}
-
-		cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore,
-		                                               &dRep, &cRep,
-		                                               op, dstCapacity,
-		                                               ip, srcBytes,
-		                                               lastBlockEntireSrc, 1 /* isPartition */);
-		DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size",
-		         ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
-		FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
-
-		ip += srcBytes;
-		op += cSizeChunk;
-		dstCapacity -= cSizeChunk;
-		cSize += cSizeChunk;
-		*currSeqStore = *nextSeqStore;
-		assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize);
-	}
-	/* cRep and dRep may have diverged during the compression.
-	 * If so, we use the dRep repcodes for the next block.
-	 */
-	ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
-	return cSize;
-}
-
-static size_t
-ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
-                              void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize, U32 lastBlock) {
-	U32 nbSeq;
-	size_t cSize;
-	DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
-	assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable);
-
-	{
-		const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
-		FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
-		if(bss == ZSTDbss_noCompress) {
-			if(zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
-				zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-			cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
-			FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
-			DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
-			return cSize;
-		}
-		nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart);
-	}
-
-	cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq);
-	FORWARD_IF_ERROR(cSize, "Splitting blocks failed!");
-	return cSize;
-}
-
-static size_t
-ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
-                            void* dst, size_t dstCapacity,
-                            const void* src, size_t srcSize, U32 frame) {
-	/* This is an estimated upper bound for the length of an rle block.
-	 * This isn't the actual upper bound.
-	 * Finding the real threshold needs further investigation.
-	 */
-	const U32 rleMaxLength = 25;
-	size_t cSize;
-	const BYTE* ip = (const BYTE*)src;
-	BYTE* op = (BYTE*)dst;
-	DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
-	         (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
-	         (unsigned)zc->blockState.matchState.nextToUpdate);
-
-	{
-		const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
-		FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
-		if(bss == ZSTDbss_noCompress) {
-			cSize = 0;
-			goto out;
-		}
-	}
-
-	if(zc->seqCollector.collectSequences) {
-		ZSTD_copyBlockSequences(zc);
-		ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
-		return 0;
-	}
-
-	/* encode sequences and literals */
-	cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore,
-	                                     &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
-	                                     &zc->appliedParams,
-	                                     dst, dstCapacity,
-	                                     srcSize,
-	                                     zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
-	                                     zc->bmi2);
-
-	if(frame &&
-	   /* We don't want to emit our first block as a RLE even if it qualifies because
-	    * doing so will cause the decoder (cli only) to throw a "should consume all input error."
-	    * This is only an issue for zstd <= v1.4.3
-	    */
-	   !zc->isFirstBlock &&
-	   cSize < rleMaxLength &&
-	   ZSTD_isRLE(ip, srcSize)) {
-		cSize = 1;
-		op[0] = ip[0];
-	}
-
-out:
-	if(!ZSTD_isError(cSize) && cSize > 1) {
-		ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
-	}
-	/* We check that dictionaries have offset codes available for the first
-	 * block. After the first block, the offcode table might not have large
-	 * enough codes to represent the offsets in the data.
-	 */
-	if(zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
-		zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-
-	return cSize;
-}
-
-static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
-                                                       void* dst, size_t dstCapacity,
-                                                       const void* src, size_t srcSize,
-                                                       const size_t bss, U32 lastBlock) {
-	DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()");
-	if(bss == ZSTDbss_compress) {
-		if(/* We don't want to emit our first block as a RLE even if it qualifies because
-		    * doing so will cause the decoder (cli only) to throw a "should consume all input error."
-		    * This is only an issue for zstd <= v1.4.3
-		    */
-		   !zc->isFirstBlock &&
-		   ZSTD_maybeRLE(&zc->seqStore) &&
-		   ZSTD_isRLE((BYTE const *)src, srcSize)) {
-			return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const *)src, srcSize, lastBlock);
-		}
-		/* Attempt superblock compression.
-		 *
-		 * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the
-		 * standard ZSTD_compressBound(). This is a problem, because even if we have
-		 * space now, taking an extra byte now could cause us to run out of space later
-		 * and violate ZSTD_compressBound().
-		 *
-		 * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize.
-		 *
-		 * In order to respect ZSTD_compressBound() we must attempt to emit a raw
-		 * uncompressed block in these cases:
-		 *   * cSize == 0: Return code for an uncompressed block.
-		 *   * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize).
-		 *     ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of
-		 *     output space.
-		 *   * cSize >= blockBound(srcSize): We have expanded the block too much so
-		 *     emit an uncompressed block.
-		 */
-		{
-			size_t const cSize =
-			    ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
-			if(cSize != ERROR(dstSize_tooSmall)) {
-				size_t const maxCSize =
-				    srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
-				FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
-				if(cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
-					ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
-					return cSize;
-				}
-			}
-		}
-	} /* if (bss == ZSTDbss_compress)*/
-
-	DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
-	/* Superblock compression failed, attempt to emit a single no compress block.
-	 * The decoder will be able to stream this block since it is uncompressed.
-	 */
-	return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
-}
-
-static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc,
-                                                  void* dst, size_t dstCapacity,
-                                                  const void* src, size_t srcSize,
-                                                  U32 lastBlock) {
-	size_t cSize = 0;
-	const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
-	DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)",
-	         (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize);
-	FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
-
-	cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock);
-	FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed");
-
-	if(zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
-		zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-
-	return cSize;
-}
-
-static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
-                                         ZSTD_cwksp* ws,
-                                         ZSTD_CCtx_params const * params,
-                                         void const * ip,
-                                         void const * iend) {
-	U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy);
-	U32 const maxDist = (U32)1 << params->cParams.windowLog;
-	if(ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) {
-		U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);
-		ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
-		ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
-		ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
-		ZSTD_cwksp_mark_tables_dirty(ws);
-		ZSTD_reduceIndex(ms, params, correction);
-		ZSTD_cwksp_mark_tables_clean(ws);
-		if(ms->nextToUpdate < correction)
-			ms->nextToUpdate = 0;
-		else
-			ms->nextToUpdate -= correction;
-		/* invalidate dictionaries on overflow correction */
-		ms->loadedDictEnd = 0;
-		ms->dictMatchState = NULL;
-	}
-}
-
-/*! ZSTD_compress_frameChunk() :
- *   Compress a chunk of data into one or multiple blocks.
- *   All blocks will be terminated, all input will be consumed.
- *   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
- *   Frame is supposed already started (header already produced)
- *  @return : compressed size, or an error code
- */
-static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
-                                       void* dst, size_t dstCapacity,
-                                       const void* src, size_t srcSize,
-                                       U32 lastFrameChunk) {
-	size_t blockSize = cctx->blockSize;
-	size_t remaining = srcSize;
-	const BYTE* ip = (const BYTE*)src;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* op = ostart;
-	U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog;
-
-	assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX);
-
-	DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize);
-	if(cctx->appliedParams.fParams.checksumFlag && srcSize)
-		XXH64_update(&cctx->xxhState, src, srcSize);
-
-	while(remaining) {
-		ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
-		U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
-
-		/* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
-		 * additional 1. We need to revisit and change this logic to be more consistent */
-		RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
-		                dstSize_tooSmall,
-		                "not enough space to store compressed block");
-		if(remaining < blockSize)
-			blockSize = remaining;
-
-		ZSTD_overflowCorrectIfNeeded(
-		    ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize);
-		ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
-		ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
-
-		/* Ensure hash/chain table insertion resumes no sooner than lowlimit */
-		if(ms->nextToUpdate < ms->window.lowLimit)
-			ms->nextToUpdate = ms->window.lowLimit;
-
-		{
-			size_t cSize;
-			if(ZSTD_useTargetCBlockSize(&cctx->appliedParams)) {
-				cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock);
-				FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed");
-				assert(cSize > 0);
-				assert(cSize <= blockSize + ZSTD_blockHeaderSize);
-			} else if(ZSTD_blockSplitterEnabled(&cctx->appliedParams)) {
-				cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock);
-				FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed");
-				assert(cSize > 0 || cctx->seqCollector.collectSequences == 1);
-			} else {
-				cSize = ZSTD_compressBlock_internal(cctx,
-				                                    op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize,
-				                                    ip, blockSize, 1 /* frame */);
-				FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed");
-
-				if(cSize == 0) { /* block is not compressible */
-					cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
-					FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
-				} else {
-					U32 const cBlockHeader = cSize == 1 ? lastBlock + (((U32)bt_rle) << 1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed) << 1) + (U32)(cSize << 3);
-					MEM_writeLE24(op, cBlockHeader);
-					cSize += ZSTD_blockHeaderSize;
-				}
-			} /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/
-
-			ip += blockSize;
-			assert(remaining >= blockSize);
-			remaining -= blockSize;
-			op += cSize;
-			assert(dstCapacity >= cSize);
-			dstCapacity -= cSize;
-			cctx->isFirstBlock = 0;
-			DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u",
-			         (unsigned)cSize);
-		}
-	}
-
-	if(lastFrameChunk && (op > ostart))
-		cctx->stage = ZSTDcs_ending;
-	return (size_t)(op - ostart);
-}
-
-static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
-                                    const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) {
-	BYTE* const op = (BYTE*)dst;
-	U32 const dictIDSizeCodeLength = (dictID > 0) + (dictID >= 256) + (dictID >= 65536); /* 0-3 */
-	U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength;  /* 0-3 */
-	U32 const checksumFlag = params->fParams.checksumFlag > 0;
-	U32 const windowSize = (U32)1 << params->cParams.windowLog;
-	U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
-	BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
-	U32 const fcsCode = params->fParams.contentSizeFlag ? (pledgedSrcSize >= 256) + (pledgedSrcSize >= 65536 + 256) + (pledgedSrcSize >= 0xFFFFFFFFU) : 0; /* 0-3 */
-	BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag << 2) + (singleSegment << 5) + (fcsCode << 6));
-	size_t pos = 0;
-
-	assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN));
-	RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall,
-	                "dst buf is too small to fit worst-case frame header size.");
-	DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u",
-	         !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode);
-	if(params->format == ZSTD_f_zstd1) {
-		MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
-		pos = 4;
-	}
-	op[pos++] = frameHeaderDescriptionByte;
-	if(!singleSegment)
-		op[pos++] = windowLogByte;
-	switch(dictIDSizeCode) {
-	default:
-		assert(0); /* impossible */
-		ZSTD_FALLTHROUGH;
-	case 0:
-		break;
-	case 1:
-		op[pos] = (BYTE)(dictID);
-		pos++;
-		break;
-	case 2:
-		MEM_writeLE16(op + pos, (U16)dictID);
-		pos += 2;
-		break;
-	case 3:
-		MEM_writeLE32(op + pos, dictID);
-		pos += 4;
-		break;
-	}
-	switch(fcsCode) {
-	default:
-		assert(0); /* impossible */
-		ZSTD_FALLTHROUGH;
-	case 0:
-		if(singleSegment)
-			op[pos++] = (BYTE)(pledgedSrcSize);
-		break;
-	case 1:
-		MEM_writeLE16(op + pos, (U16)(pledgedSrcSize - 256));
-		pos += 2;
-		break;
-	case 2:
-		MEM_writeLE32(op + pos, (U32)(pledgedSrcSize));
-		pos += 4;
-		break;
-	case 3:
-		MEM_writeLE64(op + pos, (U64)(pledgedSrcSize));
-		pos += 8;
-		break;
-	}
-	return pos;
-}
-
-/* ZSTD_writeSkippableFrame_advanced() :
- * Writes out a skippable frame with the specified magic number variant (16 are supported),
- * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data.
- *
- * Returns the total number of bytes written, or a ZSTD error code.
- */
-size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity,
-                                const void* src, size_t srcSize, unsigned magicVariant) {
-	BYTE* op = (BYTE*)dst;
-	RETURN_ERROR_IF(dstCapacity < srcSize + ZSTD_SKIPPABLEHEADERSIZE /* Skippable frame overhead */,
-	                dstSize_tooSmall, "Not enough room for skippable frame");
-	RETURN_ERROR_IF(srcSize > (unsigned)0xFFFFFFFF, srcSize_wrong, "Src size too large for skippable frame");
-	RETURN_ERROR_IF(magicVariant > 15, parameter_outOfBound, "Skippable frame magic number variant not supported");
-
-	MEM_writeLE32(op, (U32)(ZSTD_MAGIC_SKIPPABLE_START + magicVariant));
-	MEM_writeLE32(op + 4, (U32)srcSize);
-	ZSTD_memcpy(op + 8, src, srcSize);
-	return srcSize + ZSTD_SKIPPABLEHEADERSIZE;
-}
-
-/* ZSTD_writeLastEmptyBlock() :
- * output an empty Block with end-of-frame mark to complete a frame
- * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
- *           or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
- */
-size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) {
-	RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall,
-	                "dst buf is too small to write frame trailer empty block.");
-	{
-		U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw) << 1); /* 0 size */
-		MEM_writeLE24(dst, cBlockHeader24);
-		return ZSTD_blockHeaderSize;
-	}
-}
-
-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) {
-	RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
-	                "wrong cctx stage");
-	RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable,
-	                parameter_unsupported,
-	                "incompatible with ldm");
-	cctx->externSeqStore.seq = seq;
-	cctx->externSeqStore.size = nbSeq;
-	cctx->externSeqStore.capacity = nbSeq;
-	cctx->externSeqStore.pos = 0;
-	cctx->externSeqStore.posInSequence = 0;
-	return 0;
-}
-
-static size_t ZSTD_compressContinue_internal(ZSTD_CCtx* cctx,
-                                             void* dst, size_t dstCapacity,
-                                             const void* src, size_t srcSize,
-                                             U32 frame, U32 lastFrameChunk) {
-	ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
-	size_t fhSize = 0;
-
-	DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
-	         cctx->stage, (unsigned)srcSize);
-	RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong,
-	                "missing init (ZSTD_compressBegin)");
-
-	if(frame && (cctx->stage == ZSTDcs_init)) {
-		fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams,
-		                               cctx->pledgedSrcSizePlusOne - 1, cctx->dictID);
-		FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
-		assert(fhSize <= dstCapacity);
-		dstCapacity -= fhSize;
-		dst = (char*)dst + fhSize;
-		cctx->stage = ZSTDcs_ongoing;
-	}
-
-	if(!srcSize)
-		return fhSize; /* do not generate an empty block if no input */
-
-	if(!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) {
-		ms->forceNonContiguous = 0;
-		ms->nextToUpdate = ms->window.dictLimit;
-	}
-	if(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
-		ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0);
-	}
-
-	if(!frame) {
-		/* overflow check and correction for block mode */
-		ZSTD_overflowCorrectIfNeeded(
-		    ms, &cctx->workspace, &cctx->appliedParams,
-		    src, (BYTE const *)src + srcSize);
-	}
-
-	DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize);
-	{
-		size_t const cSize = frame ? ZSTD_compress_frameChunk(cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : ZSTD_compressBlock_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame */);
-		FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed");
-		cctx->consumedSrcSize += srcSize;
-		cctx->producedCSize += (cSize + fhSize);
-		assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
-		if(cctx->pledgedSrcSizePlusOne != 0) { /* control src size */
-			ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
-			RETURN_ERROR_IF(
-			    cctx->consumedSrcSize + 1 > cctx->pledgedSrcSizePlusOne,
-			    srcSize_wrong,
-			    "error : pledgedSrcSize = %u, while realSrcSize >= %u",
-			    (unsigned)cctx->pledgedSrcSizePlusOne - 1,
-			    (unsigned)cctx->consumedSrcSize);
-		}
-		return cSize + fhSize;
-	}
-}
-
-size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
-                             void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize) {
-	DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
-	return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
-}
-
-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) {
-	ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
-	assert(!ZSTD_checkCParams(cParams));
-	return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog);
-}
-
-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) {
-	DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
-	{
-		size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
-		RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block");
-	}
-
-	return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
-}
-
-/*! ZSTD_loadDictionaryContent() :
- *  @return : 0, or an error code
- */
-static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
-                                         ldmState_t* ls,
-                                         ZSTD_cwksp* ws,
-                                         ZSTD_CCtx_params const * params,
-                                         const void* src, size_t srcSize,
-                                         ZSTD_dictTableLoadMethod_e dtlm,
-                                         ZSTD_tableFillPurpose_e tfp) {
-	const BYTE* ip = (const BYTE*)src;
-	const BYTE* const iend = ip + srcSize;
-	int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;
-
-	/* Assert that the ms params match the params we're being given */
-	ZSTD_assertEqualCParams(params->cParams, ms->cParams);
-
-	{ /* Ensure large dictionaries can't cause index overflow */
-
-		/* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
-		 * Dictionaries right at the edge will immediately trigger overflow
-		 * correction, but I don't want to insert extra constraints here.
-		 */
-		U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
-
-		int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
-		if(CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
-			/* Some dictionary matchfinders in zstd use "short cache",
-			 * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
-			 * CDict hashtable entry as a tag rather than as part of an index.
-			 * When short cache is used, we need to truncate the dictionary
-			 * so that its indices don't overlap with the tag. */
-			U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
-			maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
-			assert(!loadLdmDict);
-		}
-
-		/* If the dictionary is too large, only load the suffix of the dictionary. */
-		if(srcSize > maxDictSize) {
-			ip = iend - maxDictSize;
-			src = ip;
-			srcSize = maxDictSize;
-		}
-	}
-
-	if(srcSize > ZSTD_CHUNKSIZE_MAX) {
-		/* We must have cleared our windows when our source is this large. */
-		assert(ZSTD_window_isEmpty(ms->window));
-		if(loadLdmDict)
-			assert(ZSTD_window_isEmpty(ls->window));
-	}
-
-	DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
-	ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
-	ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
-	ms->forceNonContiguous = params->deterministicRefPrefix;
-
-	if(loadLdmDict) {
-		ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
-		ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
-	}
-
-	if(srcSize <= HASH_READ_SIZE)
-		return 0;
-
-	ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend);
-
-	if(loadLdmDict)
-		ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
-
-	switch(params->cParams.strategy) {
-	case ZSTD_fast:
-		ZSTD_fillHashTable(ms, iend, dtlm, tfp);
-		break;
-	case ZSTD_dfast:
-		ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
-		break;
-
-	case ZSTD_greedy:
-	case ZSTD_lazy:
-	case ZSTD_lazy2:
-		assert(srcSize >= HASH_READ_SIZE);
-		if(ms->dedicatedDictSearch) {
-			assert(ms->chainTable != NULL);
-			ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend - HASH_READ_SIZE);
-		} else {
-			assert(params->useRowMatchFinder != ZSTD_ps_auto);
-			if(params->useRowMatchFinder == ZSTD_ps_enable) {
-				size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
-				ZSTD_memset(ms->tagTable, 0, tagTableSize);
-				ZSTD_row_update(ms, iend - HASH_READ_SIZE);
-				DEBUGLOG(4, "Using row-based hash table for lazy dict");
-			} else {
-				ZSTD_insertAndFindFirstIndex(ms, iend - HASH_READ_SIZE);
-				DEBUGLOG(4, "Using chain-based hash table for lazy dict");
-			}
-		}
-		break;
-
-	case ZSTD_btlazy2: /* we want the dictionary table fully sorted */
-	case ZSTD_btopt:
-	case ZSTD_btultra:
-	case ZSTD_btultra2:
-		assert(srcSize >= HASH_READ_SIZE);
-		ZSTD_updateTree(ms, iend - HASH_READ_SIZE, iend);
-		break;
-
-	default:
-		assert(0); /* not possible : not a valid strategy id */
-	}
-
-	ms->nextToUpdate = (U32)(iend - ms->window.base);
-	return 0;
-}
-
-/* Dictionaries that assign zero probability to symbols that show up causes problems
- * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check
- * and only dictionaries with 100% valid symbols can be assumed valid.
- */
-static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
-	U32 s;
-	if(dictMaxSymbolValue < maxSymbolValue) {
-		return FSE_repeat_check;
-	}
-	for(s = 0; s <= maxSymbolValue; ++s) {
-		if(normalizedCounter[s] == 0) {
-			return FSE_repeat_check;
-		}
-	}
-	return FSE_repeat_valid;
-}
-
-size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
-                         const void* const dict, size_t dictSize) {
-	short offcodeNCount[MaxOff + 1];
-	unsigned offcodeMaxValue = MaxOff;
-	const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */
-	const BYTE* const dictEnd = dictPtr + dictSize;
-	dictPtr += 8;
-	bs->entropy.huf.repeatMode = HUF_repeat_check;
-
-	{
-		unsigned maxSymbolValue = 255;
-		unsigned hasZeroWeights = 1;
-		size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr,
-		                                            dictEnd - dictPtr, &hasZeroWeights);
-
-		/* We only set the loaded table as valid if it contains all non-zero
-		 * weights. Otherwise, we set it to check */
-		if(!hasZeroWeights)
-			bs->entropy.huf.repeatMode = HUF_repeat_valid;
-
-		RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, "");
-		RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, "");
-		dictPtr += hufHeaderSize;
-	}
-
-	{
-		unsigned offcodeLog;
-		size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd - dictPtr);
-		RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
-		RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
-		/* fill all offset symbols to avoid garbage at end of table */
-		RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
-		                    bs->entropy.fse.offcodeCTable,
-		                    offcodeNCount, MaxOff, offcodeLog,
-		                    workspace, HUF_WORKSPACE_SIZE)),
-		                dictionary_corrupted, "");
-		/* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
-		dictPtr += offcodeHeaderSize;
-	}
-
-	{
-		short matchlengthNCount[MaxML + 1];
-		unsigned matchlengthMaxValue = MaxML, matchlengthLog;
-		size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd - dictPtr);
-		RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
-		RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
-		RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
-		                    bs->entropy.fse.matchlengthCTable,
-		                    matchlengthNCount, matchlengthMaxValue, matchlengthLog,
-		                    workspace, HUF_WORKSPACE_SIZE)),
-		                dictionary_corrupted, "");
-		bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML);
-		dictPtr += matchlengthHeaderSize;
-	}
-
-	{
-		short litlengthNCount[MaxLL + 1];
-		unsigned litlengthMaxValue = MaxLL, litlengthLog;
-		size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd - dictPtr);
-		RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
-		RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
-		RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
-		                    bs->entropy.fse.litlengthCTable,
-		                    litlengthNCount, litlengthMaxValue, litlengthLog,
-		                    workspace, HUF_WORKSPACE_SIZE)),
-		                dictionary_corrupted, "");
-		bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL);
-		dictPtr += litlengthHeaderSize;
-	}
-
-	RETURN_ERROR_IF(dictPtr + 12 > dictEnd, dictionary_corrupted, "");
-	bs->rep[0] = MEM_readLE32(dictPtr + 0);
-	bs->rep[1] = MEM_readLE32(dictPtr + 4);
-	bs->rep[2] = MEM_readLE32(dictPtr + 8);
-	dictPtr += 12;
-
-	{
-		size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
-		U32 offcodeMax = MaxOff;
-		if(dictContentSize <= ((U32)-1) - 128 KB) {
-			U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
-			offcodeMax = ZSTD_highbit32(maxOffset);              /* Calculate minimum offset code required to represent maxOffset */
-		}
-		/* All offset values <= dictContentSize + 128 KB must be representable for a valid table */
-		bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff));
-
-		/* All repCodes must be <= dictContentSize and != 0 */
-		{
-			U32 u;
-			for(u = 0; u < 3; u++) {
-				RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
-				RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
-			}
-		}
-	}
-
-	return dictPtr - (const BYTE*)dict;
-}
-
-/* Dictionary format :
- * See :
- * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#dictionary-format
- */
-/*! ZSTD_loadZstdDictionary() :
- * @return : dictID, or an error code
- *  assumptions : magic number supposed already checked
- *                dictSize supposed >= 8
- */
-static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
-                                      ZSTD_matchState_t* ms,
-                                      ZSTD_cwksp* ws,
-                                      ZSTD_CCtx_params const * params,
-                                      const void* dict, size_t dictSize,
-                                      ZSTD_dictTableLoadMethod_e dtlm,
-                                      ZSTD_tableFillPurpose_e tfp,
-                                      void* workspace) {
-	const BYTE* dictPtr = (const BYTE*)dict;
-	const BYTE* const dictEnd = dictPtr + dictSize;
-	size_t dictID;
-	size_t eSize;
-	ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1 << MAX(MLFSELog, LLFSELog)));
-	assert(dictSize >= 8);
-	assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
-
-	dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */);
-	eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize);
-	FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
-	dictPtr += eSize;
-
-	{
-		size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
-		FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
-		                     ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp),
-		                 "");
-	}
-	return dictID;
-}
-
-/** ZSTD_compress_insertDictionary() :
- *   @return : dictID, or an error code */
-static size_t
-ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
-                               ZSTD_matchState_t* ms,
-                               ldmState_t* ls,
-                               ZSTD_cwksp* ws,
-                               const ZSTD_CCtx_params* params,
-                               const void* dict, size_t dictSize,
-                               ZSTD_dictContentType_e dictContentType,
-                               ZSTD_dictTableLoadMethod_e dtlm,
-                               ZSTD_tableFillPurpose_e tfp,
-                               void* workspace) {
-	DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
-	if((dict == NULL) || (dictSize < 8)) {
-		RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
-		return 0;
-	}
-
-	ZSTD_reset_compressedBlockState(bs);
-
-	/* dict restricted modes */
-	if(dictContentType == ZSTD_dct_rawContent)
-		return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
-
-	if(MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
-		if(dictContentType == ZSTD_dct_auto) {
-			DEBUGLOG(4, "raw content dictionary detected");
-			return ZSTD_loadDictionaryContent(
-			    ms, ls, ws, params, dict, dictSize, dtlm, tfp);
-		}
-		RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
-		assert(0); /* impossible */
-	}
-
-	/* dict as full zstd dictionary */
-	return ZSTD_loadZstdDictionary(
-	    bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
-}
-
-#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
-#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)
-
-/*! ZSTD_compressBegin_internal() :
- * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
- * @return : 0, or an error code */
-static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
-                                          const void* dict, size_t dictSize,
-                                          ZSTD_dictContentType_e dictContentType,
-                                          ZSTD_dictTableLoadMethod_e dtlm,
-                                          const ZSTD_CDict* cdict,
-                                          const ZSTD_CCtx_params* params, U64 pledgedSrcSize,
-                                          ZSTD_buffered_policy_e zbuff) {
-	size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize;
-#if ZSTD_TRACE
-	cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0;
-#endif
-	DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog);
-	/* params are supposed to be fully validated at this point */
-	assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
-	assert(!((dict) && (cdict))); /* either dict or cdict, not both */
-	if((cdict) && (cdict->dictContentSize > 0) && (pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN || cdict->compressionLevel == 0) && (params->attachDictPref != ZSTD_dictForceLoad)) {
-		return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff);
-	}
-
-	FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
-	                                         dictContentSize,
-	                                         ZSTDcrp_makeClean, zbuff),
-	                 "");
-	{
-		size_t const dictID = cdict ? ZSTD_compress_insertDictionary(
-		                                  cctx->blockState.prevCBlock, &cctx->blockState.matchState,
-		                                  &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
-		                                  cdict->dictContentSize, cdict->dictContentType, dtlm,
-		                                  ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
-		                            : ZSTD_compress_insertDictionary(
-		                                  cctx->blockState.prevCBlock, &cctx->blockState.matchState,
-		                                  &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
-		                                  dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
-		FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
-		assert(dictID <= UINT_MAX);
-		cctx->dictID = (U32)dictID;
-		cctx->dictContentSize = dictContentSize;
-	}
-	return 0;
-}
-
-size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
-                                            const void* dict, size_t dictSize,
-                                            ZSTD_dictContentType_e dictContentType,
-                                            ZSTD_dictTableLoadMethod_e dtlm,
-                                            const ZSTD_CDict* cdict,
-                                            const ZSTD_CCtx_params* params,
-                                            unsigned long long pledgedSrcSize) {
-	DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog);
-	/* compression parameters verification and optimization */
-	FORWARD_IF_ERROR(ZSTD_checkCParams(params->cParams), "");
-	return ZSTD_compressBegin_internal(cctx,
-	                                   dict, dictSize, dictContentType, dtlm,
-	                                   cdict,
-	                                   params, pledgedSrcSize,
-	                                   ZSTDb_not_buffered);
-}
-
-/*! ZSTD_compressBegin_advanced() :
- *   @return : 0, or an error code */
-size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
-                                   const void* dict, size_t dictSize,
-                                   ZSTD_parameters params, unsigned long long pledgedSrcSize) {
-	ZSTD_CCtx_params cctxParams;
-	ZSTD_CCtxParams_init_internal(&cctxParams, &params, ZSTD_NO_CLEVEL);
-	return ZSTD_compressBegin_advanced_internal(cctx,
-	                                            dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast,
-	                                            NULL /*cdict*/,
-	                                            &cctxParams, pledgedSrcSize);
-}
-
-size_t
-ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) {
-	ZSTD_CCtx_params cctxParams;
-	{
-		ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
-		ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
-	}
-	DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
-	return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
-	                                   &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
-}
-
-size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) {
-	return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
-}
-
-/*! ZSTD_writeEpilogue() :
- *   Ends a frame.
- *   @return : nb of bytes written into dst (or an error code) */
-static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) {
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* op = ostart;
-	size_t fhSize = 0;
-
-	DEBUGLOG(4, "ZSTD_writeEpilogue");
-	RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing");
-
-	/* special case : empty frame */
-	if(cctx->stage == ZSTDcs_init) {
-		fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
-		FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
-		dstCapacity -= fhSize;
-		op += fhSize;
-		cctx->stage = ZSTDcs_ongoing;
-	}
-
-	if(cctx->stage != ZSTDcs_ending) {
-		/* write one last empty block, make it the "last" block */
-		U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw) << 1) + 0;
-		RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "no room for epilogue");
-		MEM_writeLE32(op, cBlockHeader24);
-		op += ZSTD_blockHeaderSize;
-		dstCapacity -= ZSTD_blockHeaderSize;
-	}
-
-	if(cctx->appliedParams.fParams.checksumFlag) {
-		U32 const checksum = (U32)XXH64_digest(&cctx->xxhState);
-		RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "no room for checksum");
-		DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum);
-		MEM_writeLE32(op, checksum);
-		op += 4;
-	}
-
-	cctx->stage = ZSTDcs_created; /* return to "created but no init" status */
-	return op - ostart;
-}
-
-void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) {
-#if ZSTD_TRACE
-	if(cctx->traceCtx && ZSTD_trace_compress_end != NULL) {
-		int const streaming = cctx->inBuffSize > 0 || cctx->outBuffSize > 0 || cctx->appliedParams.nbWorkers > 0;
-		ZSTD_Trace trace;
-		ZSTD_memset(&trace, 0, sizeof(trace));
-		trace.version = ZSTD_VERSION_NUMBER;
-		trace.streaming = streaming;
-		trace.dictionaryID = cctx->dictID;
-		trace.dictionarySize = cctx->dictContentSize;
-		trace.uncompressedSize = cctx->consumedSrcSize;
-		trace.compressedSize = cctx->producedCSize + extraCSize;
-		trace.params = &cctx->appliedParams;
-		trace.cctx = cctx;
-		ZSTD_trace_compress_end(cctx->traceCtx, &trace);
-	}
-	cctx->traceCtx = 0;
-#else
-	(void)cctx;
-	(void)extraCSize;
-#endif
-}
-
-size_t ZSTD_compressEnd(ZSTD_CCtx* cctx,
-                        void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize) {
-	size_t endResult;
-	size_t const cSize = ZSTD_compressContinue_internal(cctx,
-	                                                    dst, dstCapacity, src, srcSize,
-	                                                    1 /* frame mode */, 1 /* last chunk */);
-	FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed");
-	endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity - cSize);
-	FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed");
-	assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
-	if(cctx->pledgedSrcSizePlusOne != 0) { /* control src size */
-		ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
-		DEBUGLOG(4, "end of frame : controlling src size");
-		RETURN_ERROR_IF(
-		    cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize + 1,
-		    srcSize_wrong,
-		    "error : pledgedSrcSize = %u, while realSrcSize = %u",
-		    (unsigned)cctx->pledgedSrcSizePlusOne - 1,
-		    (unsigned)cctx->consumedSrcSize);
-	}
-	ZSTD_CCtx_trace(cctx, endResult);
-	return cSize + endResult;
-}
-
-size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
-                              void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize,
-                              const void* dict, size_t dictSize,
-                              ZSTD_parameters params) {
-	DEBUGLOG(4, "ZSTD_compress_advanced");
-	FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
-	ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, ZSTD_NO_CLEVEL);
-	return ZSTD_compress_advanced_internal(cctx,
-	                                       dst, dstCapacity,
-	                                       src, srcSize,
-	                                       dict, dictSize,
-	                                       &cctx->simpleApiParams);
-}
-
-/* Internal */
-size_t ZSTD_compress_advanced_internal(
-    ZSTD_CCtx* cctx,
-    void* dst, size_t dstCapacity,
-    const void* src, size_t srcSize,
-    const void* dict, size_t dictSize,
-    const ZSTD_CCtx_params* params) {
-	DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize);
-	FORWARD_IF_ERROR(ZSTD_compressBegin_internal(cctx,
-	                                             dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
-	                                             params, srcSize, ZSTDb_not_buffered),
-	                 "");
-	return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
-}
-
-size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
-                               void* dst, size_t dstCapacity,
-                               const void* src, size_t srcSize,
-                               const void* dict, size_t dictSize,
-                               int compressionLevel) {
-	{
-		ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict);
-		assert(params.fParams.contentSizeFlag == 1);
-		ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
-	}
-	DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize);
-	return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams);
-}
-
-size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
-                         void* dst, size_t dstCapacity,
-                         const void* src, size_t srcSize,
-                         int compressionLevel) {
-	DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize);
-	assert(cctx != NULL);
-	return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel);
-}
-
-size_t ZSTD_compress(void* dst, size_t dstCapacity,
-                     const void* src, size_t srcSize,
-                     int compressionLevel) {
-	size_t result;
-#if ZSTD_COMPRESS_HEAPMODE
-	ZSTD_CCtx* cctx = ZSTD_createCCtx();
-	RETURN_ERROR_IF(!cctx, memory_allocation, "ZSTD_createCCtx failed");
-	result = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel);
-	ZSTD_freeCCtx(cctx);
-#else
-	ZSTD_CCtx ctxBody;
-	ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem);
-	result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel);
-	ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */
-#endif
-	return result;
-}
-
-/* =====  Dictionary API  ===== */
-
-/*! ZSTD_estimateCDictSize_advanced() :
- *  Estimate amount of memory that will be needed to create a dictionary with following arguments */
-size_t ZSTD_estimateCDictSize_advanced(
-    size_t dictSize, ZSTD_compressionParameters cParams,
-    ZSTD_dictLoadMethod_e dictLoadMethod) {
-	DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict));
-	return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
-	       /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small
-	        * in case we are using DDS with row-hash. */
-	       + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams),
-	                                /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) +
-	       (dictLoadMethod == ZSTD_dlm_byRef ? 0
-	                                         : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))));
-}
-
-size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) {
-	ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
-	return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy);
-}
-
-size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) {
-	if(cdict == NULL)
-		return 0; /* support sizeof on NULL */
-	DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict));
-	/* cdict may be in the workspace */
-	return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict)) + ZSTD_cwksp_sizeof(&cdict->workspace);
-}
-
-static size_t ZSTD_initCDict_internal(
-    ZSTD_CDict* cdict,
-    const void* dictBuffer, size_t dictSize,
-    ZSTD_dictLoadMethod_e dictLoadMethod,
-    ZSTD_dictContentType_e dictContentType,
-    ZSTD_CCtx_params params) {
-	DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType);
-	assert(!ZSTD_checkCParams(params.cParams));
-	cdict->matchState.cParams = params.cParams;
-	cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch;
-	if((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
-		cdict->dictContent = dictBuffer;
-	} else {
-		void* internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*)));
-		RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!");
-		cdict->dictContent = internalBuffer;
-		ZSTD_memcpy(internalBuffer, dictBuffer, dictSize);
-	}
-	cdict->dictContentSize = dictSize;
-	cdict->dictContentType = dictContentType;
-
-	cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE);
-
-	/* Reset the state to no dictionary */
-	ZSTD_reset_compressedBlockState(&cdict->cBlockState);
-	FORWARD_IF_ERROR(ZSTD_reset_matchState(
-	                     &cdict->matchState,
-	                     &cdict->workspace,
-	                     &params.cParams,
-	                     params.useRowMatchFinder,
-	                     ZSTDcrp_makeClean,
-	                     ZSTDirp_reset,
-	                     ZSTD_resetTarget_CDict),
-	                 "");
-	/* (Maybe) load the dictionary
-	 * Skips loading the dictionary if it is < 8 bytes.
-	 */
-	{
-		params.compressionLevel = ZSTD_CLEVEL_DEFAULT;
-		params.fParams.contentSizeFlag = 1;
-		{
-			size_t const dictID = ZSTD_compress_insertDictionary(
-			    &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
-			    &params, cdict->dictContent, cdict->dictContentSize,
-			    dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
-			FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
-			assert(dictID <= (size_t)(U32)-1);
-			cdict->dictID = (U32)dictID;
-		}
-	}
-
-	return 0;
-}
-
-static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize,
-                                                      ZSTD_dictLoadMethod_e dictLoadMethod,
-                                                      ZSTD_compressionParameters cParams,
-                                                      ZSTD_paramSwitch_e useRowMatchFinder,
-                                                      U32 enableDedicatedDictSearch,
-                                                      ZSTD_customMem customMem) {
-	if((!customMem.customAlloc) ^ (!customMem.customFree))
-		return NULL;
-
-	{
-		size_t const workspaceSize =
-		    ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +
-		    ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) +
-		    ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) +
-		    (dictLoadMethod == ZSTD_dlm_byRef ? 0
-		                                      : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))));
-		void* const workspace = ZSTD_customMalloc(workspaceSize, customMem);
-		ZSTD_cwksp ws;
-		ZSTD_CDict* cdict;
-
-		if(!workspace) {
-			ZSTD_customFree(workspace, customMem);
-			return NULL;
-		}
-
-		ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_dynamic_alloc);
-
-		cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));
-		assert(cdict != NULL);
-		ZSTD_cwksp_move(&cdict->workspace, &ws);
-		cdict->customMem = customMem;
-		cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */
-		cdict->useRowMatchFinder = useRowMatchFinder;
-		return cdict;
-	}
-}
-
-ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,
-                                      ZSTD_dictLoadMethod_e dictLoadMethod,
-                                      ZSTD_dictContentType_e dictContentType,
-                                      ZSTD_compressionParameters cParams,
-                                      ZSTD_customMem customMem) {
-	ZSTD_CCtx_params cctxParams;
-	ZSTD_memset(&cctxParams, 0, sizeof(cctxParams));
-	ZSTD_CCtxParams_init(&cctxParams, 0);
-	cctxParams.cParams = cParams;
-	cctxParams.customMem = customMem;
-	return ZSTD_createCDict_advanced2(
-	    dictBuffer, dictSize,
-	    dictLoadMethod, dictContentType,
-	    &cctxParams, customMem);
-}
-
-ZSTD_CDict* ZSTD_createCDict_advanced2(
-    const void* dict, size_t dictSize,
-    ZSTD_dictLoadMethod_e dictLoadMethod,
-    ZSTD_dictContentType_e dictContentType,
-    const ZSTD_CCtx_params* originalCctxParams,
-    ZSTD_customMem customMem) {
-	ZSTD_CCtx_params cctxParams = *originalCctxParams;
-	ZSTD_compressionParameters cParams;
-	ZSTD_CDict* cdict;
-
-	DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType);
-	if(!customMem.customAlloc ^ !customMem.customFree)
-		return NULL;
-
-	if(cctxParams.enableDedicatedDictSearch) {
-		cParams = ZSTD_dedicatedDictSearch_getCParams(
-		    cctxParams.compressionLevel, dictSize);
-		ZSTD_overrideCParams(&cParams, &cctxParams.cParams);
-	} else {
-		cParams = ZSTD_getCParamsFromCCtxParams(
-		    &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
-	}
-
-	if(!ZSTD_dedicatedDictSearch_isSupported(&cParams)) {
-		/* Fall back to non-DDSS params */
-		cctxParams.enableDedicatedDictSearch = 0;
-		cParams = ZSTD_getCParamsFromCCtxParams(
-		    &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
-	}
-
-	DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch);
-	cctxParams.cParams = cParams;
-	cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
-
-	cdict = ZSTD_createCDict_advanced_internal(dictSize,
-	                                           dictLoadMethod, cctxParams.cParams,
-	                                           cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch,
-	                                           customMem);
-
-	if(ZSTD_isError(ZSTD_initCDict_internal(cdict,
-	                                        dict, dictSize,
-	                                        dictLoadMethod, dictContentType,
-	                                        cctxParams))) {
-		ZSTD_freeCDict(cdict);
-		return NULL;
-	}
-
-	return cdict;
-}
-
-ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) {
-	ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
-	ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize,
-	                                                    ZSTD_dlm_byCopy, ZSTD_dct_auto,
-	                                                    cParams, ZSTD_defaultCMem);
-	if(cdict)
-		cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel;
-	return cdict;
-}
-
-ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) {
-	ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
-	ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize,
-	                                                    ZSTD_dlm_byRef, ZSTD_dct_auto,
-	                                                    cParams, ZSTD_defaultCMem);
-	if(cdict)
-		cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel;
-	return cdict;
-}
-
-size_t ZSTD_freeCDict(ZSTD_CDict* cdict) {
-	if(cdict == NULL)
-		return 0; /* support free on NULL */
-	{
-		ZSTD_customMem const cMem = cdict->customMem;
-		int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict);
-		ZSTD_cwksp_free(&cdict->workspace, cMem);
-		if(!cdictInWorkspace) {
-			ZSTD_customFree(cdict, cMem);
-		}
-		return 0;
-	}
-}
-
-/*! ZSTD_initStaticCDict_advanced() :
- *  Generate a digested dictionary in provided memory area.
- *  workspace: The memory area to emplace the dictionary into.
- *             Provided pointer must 8-bytes aligned.
- *             It must outlive dictionary usage.
- *  workspaceSize: Use ZSTD_estimateCDictSize()
- *                 to determine how large workspace must be.
- *  cParams : use ZSTD_getCParams() to transform a compression level
- *            into its relevants cParams.
- * @return : pointer to ZSTD_CDict*, or NULL if error (size too small)
- *  Note : there is no corresponding "free" function.
- *         Since workspace was allocated externally, it must be freed externally.
- */
-const ZSTD_CDict* ZSTD_initStaticCDict(
-    void* workspace, size_t workspaceSize,
-    const void* dict, size_t dictSize,
-    ZSTD_dictLoadMethod_e dictLoadMethod,
-    ZSTD_dictContentType_e dictContentType,
-    ZSTD_compressionParameters cParams) {
-	ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams);
-	/* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */
-	size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0);
-	size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + matchStateSize;
-	ZSTD_CDict* cdict;
-	ZSTD_CCtx_params params;
-
-	if((size_t)workspace & 7)
-		return NULL; /* 8-aligned */
-
-	{
-		ZSTD_cwksp ws;
-		ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc);
-		cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));
-		if(cdict == NULL)
-			return NULL;
-		ZSTD_cwksp_move(&cdict->workspace, &ws);
-	}
-
-	DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u",
-	         (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize));
-	if(workspaceSize < neededSize)
-		return NULL;
-
-	ZSTD_CCtxParams_init(&params, 0);
-	params.cParams = cParams;
-	params.useRowMatchFinder = useRowMatchFinder;
-	cdict->useRowMatchFinder = useRowMatchFinder;
-
-	if(ZSTD_isError(ZSTD_initCDict_internal(cdict,
-	                                        dict, dictSize,
-	                                        dictLoadMethod, dictContentType,
-	                                        params)))
-		return NULL;
-
-	return cdict;
-}
-
-ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) {
-	assert(cdict != NULL);
-	return cdict->matchState.cParams;
-}
-
-/*! ZSTD_getDictID_fromCDict() :
- *  Provides the dictID of the dictionary loaded into `cdict`.
- *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
- *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
-unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict) {
-	if(cdict == NULL)
-		return 0;
-	return cdict->dictID;
-}
-
-/* ZSTD_compressBegin_usingCDict_internal() :
- * Implementation of various ZSTD_compressBegin_usingCDict* functions.
- */
-static size_t ZSTD_compressBegin_usingCDict_internal(
-    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
-    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) {
-	ZSTD_CCtx_params cctxParams;
-	DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal");
-	RETURN_ERROR_IF(cdict == NULL, dictionary_wrong, "NULL pointer!");
-	/* Initialize the cctxParams from the cdict */
-	{
-		ZSTD_parameters params;
-		params.fParams = fParams;
-		params.cParams = (pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN || cdict->compressionLevel == 0) ? ZSTD_getCParamsFromCDict(cdict)
-		                                                                                                                                                                                                                                              : ZSTD_getCParams(cdict->compressionLevel,
-		                                                                                                                                                                                                                                                                pledgedSrcSize,
-		                                                                                                                                                                                                                                                                cdict->dictContentSize);
-		ZSTD_CCtxParams_init_internal(&cctxParams, &params, cdict->compressionLevel);
-	}
-	/* Increase window log to fit the entire dictionary and source if the
-	 * source size is known. Limit the increase to 19, which is the
-	 * window log for compression level 1 with the largest source size.
-	 */
-	if(pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
-		U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19);
-		U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1;
-		cctxParams.cParams.windowLog = MAX(cctxParams.cParams.windowLog, limitedSrcLog);
-	}
-	return ZSTD_compressBegin_internal(cctx,
-	                                   NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast,
-	                                   cdict,
-	                                   &cctxParams, pledgedSrcSize,
-	                                   ZSTDb_not_buffered);
-}
-
-/* ZSTD_compressBegin_usingCDict_advanced() :
- * This function is DEPRECATED.
- * cdict must be != NULL */
-size_t ZSTD_compressBegin_usingCDict_advanced(
-    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
-    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) {
-	return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize);
-}
-
-/* ZSTD_compressBegin_usingCDict() :
- * cdict must be != NULL */
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) {
-	ZSTD_frameParameters const fParams = {0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/};
-	return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
-}
-
-/*! ZSTD_compress_usingCDict_internal():
- * Implementation of various ZSTD_compress_usingCDict* functions.
- */
-static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
-                                                void* dst, size_t dstCapacity,
-                                                const void* src, size_t srcSize,
-                                                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) {
-	FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
-	return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
-}
-
-/*! ZSTD_compress_usingCDict_advanced():
- * This function is DEPRECATED.
- */
-size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
-                                         void* dst, size_t dstCapacity,
-                                         const void* src, size_t srcSize,
-                                         const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) {
-	return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
-}
-
-/*! ZSTD_compress_usingCDict() :
- *  Compression using a digested Dictionary.
- *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
- *  Note that compression parameters are decided at CDict creation time
- *  while frame parameters are hardcoded */
-size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
-                                void* dst, size_t dstCapacity,
-                                const void* src, size_t srcSize,
-                                const ZSTD_CDict* cdict) {
-	ZSTD_frameParameters const fParams = {1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/};
-	return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
-}
-
-/* ******************************************************************
- *  Streaming
- ********************************************************************/
-
-ZSTD_CStream* ZSTD_createCStream(void) {
-	DEBUGLOG(3, "ZSTD_createCStream");
-	return ZSTD_createCStream_advanced(ZSTD_defaultCMem);
-}
-
-ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize) {
-	return ZSTD_initStaticCCtx(workspace, workspaceSize);
-}
-
-ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem) { /* CStream and CCtx are now same object */
-	return ZSTD_createCCtx_advanced(customMem);
-}
-
-size_t ZSTD_freeCStream(ZSTD_CStream* zcs) {
-	return ZSTD_freeCCtx(zcs); /* same object */
-}
-
-/*======   Initialization   ======*/
-
-size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; }
-
-size_t ZSTD_CStreamOutSize(void) {
-	return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */;
-}
-
-static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const * cdict, ZSTD_CCtx_params const * params, U64 pledgedSrcSize) {
-	if(cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize))
-		return ZSTD_cpm_attachDict;
-	else
-		return ZSTD_cpm_noAttachDict;
-}
-
-/* ZSTD_resetCStream():
- * pledgedSrcSize == 0 means "unknown" */
-size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) {
-	/* temporary : 0 interpreted as "unknown" during transition period.
-	 * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
-	 * 0 will be interpreted as "empty" in the future.
-	 */
-	U64 const pledgedSrcSize = (pss == 0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
-	DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize);
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize), "");
-	return 0;
-}
-
-/*! ZSTD_initCStream_internal() :
- *  Note : for lib/compress only. Used by zstdmt_compress.c.
- *  Assumption 1 : params are valid
- *  Assumption 2 : either dict, or cdict, is defined, not both */
-size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
-                                 const void* dict, size_t dictSize, const ZSTD_CDict* cdict,
-                                 const ZSTD_CCtx_params* params,
-                                 unsigned long long pledgedSrcSize) {
-	DEBUGLOG(4, "ZSTD_initCStream_internal");
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize), "");
-	assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
-	zcs->requestedParams = *params;
-	assert(!((dict) && (cdict))); /* either dict or cdict, not both */
-	if(dict) {
-		FORWARD_IF_ERROR(ZSTD_CCtx_loadDictionary(zcs, dict, dictSize), "");
-	} else {
-		/* Dictionary is cleared if !cdict */
-		FORWARD_IF_ERROR(ZSTD_CCtx_refCDict(zcs, cdict), "");
-	}
-	return 0;
-}
-
-/* ZSTD_initCStream_usingCDict_advanced() :
- * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */
-size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
-                                            const ZSTD_CDict* cdict,
-                                            ZSTD_frameParameters fParams,
-                                            unsigned long long pledgedSrcSize) {
-	DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced");
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize), "");
-	zcs->requestedParams.fParams = fParams;
-	FORWARD_IF_ERROR(ZSTD_CCtx_refCDict(zcs, cdict), "");
-	return 0;
-}
-
-/* note : cdict must outlive compression session */
-size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) {
-	DEBUGLOG(4, "ZSTD_initCStream_usingCDict");
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_refCDict(zcs, cdict), "");
-	return 0;
-}
-
-/* ZSTD_initCStream_advanced() :
- * pledgedSrcSize must be exact.
- * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
- * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */
-size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
-                                 const void* dict, size_t dictSize,
-                                 ZSTD_parameters params, unsigned long long pss) {
-	/* for compatibility with older programs relying on this behavior.
-	 * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN.
-	 * This line will be removed in the future.
-	 */
-	U64 const pledgedSrcSize = (pss == 0 && params.fParams.contentSizeFlag == 0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
-	DEBUGLOG(4, "ZSTD_initCStream_advanced");
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize), "");
-	FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
-	ZSTD_CCtxParams_setZstdParams(&zcs->requestedParams, &params);
-	FORWARD_IF_ERROR(ZSTD_CCtx_loadDictionary(zcs, dict, dictSize), "");
-	return 0;
-}
-
-size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) {
-	DEBUGLOG(4, "ZSTD_initCStream_usingDict");
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_loadDictionary(zcs, dict, dictSize), "");
-	return 0;
-}
-
-size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) {
-	/* temporary : 0 interpreted as "unknown" during transition period.
-	 * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
-	 * 0 will be interpreted as "empty" in the future.
-	 */
-	U64 const pledgedSrcSize = (pss == 0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
-	DEBUGLOG(4, "ZSTD_initCStream_srcSize");
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_refCDict(zcs, NULL), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize), "");
-	return 0;
-}
-
-size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) {
-	DEBUGLOG(4, "ZSTD_initCStream");
-	FORWARD_IF_ERROR(ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_refCDict(zcs, NULL), "");
-	FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel), "");
-	return 0;
-}
-
-/*======   Compression   ======*/
-
-static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) {
-	if(cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
-		return cctx->blockSize - cctx->stableIn_notConsumed;
-	}
-	assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered);
-	{
-		size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
-		if(hintInSize == 0)
-			hintInSize = cctx->blockSize;
-		return hintInSize;
-	}
-}
-
-/** ZSTD_compressStream_generic():
- *  internal function for all *compressStream*() variants
- * @return : hint size for next input to complete ongoing block */
-static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
-                                          ZSTD_outBuffer* output,
-                                          ZSTD_inBuffer* input,
-                                          ZSTD_EndDirective const flushMode) {
-	const char* const istart = (assert(input != NULL), (const char*)input->src);
-	const char* const iend = (istart != NULL) ? istart + input->size : istart;
-	const char* ip = (istart != NULL) ? istart + input->pos : istart;
-	char* const ostart = (assert(output != NULL), (char*)output->dst);
-	char* const oend = (ostart != NULL) ? ostart + output->size : ostart;
-	char* op = (ostart != NULL) ? ostart + output->pos : ostart;
-	U32 someMoreWork = 1;
-
-	/* check expectations */
-	DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos);
-	assert(zcs != NULL);
-	if(zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
-		assert(input->pos >= zcs->stableIn_notConsumed);
-		input->pos -= zcs->stableIn_notConsumed;
-		ip -= zcs->stableIn_notConsumed;
-		zcs->stableIn_notConsumed = 0;
-	}
-	if(zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
-		assert(zcs->inBuff != NULL);
-		assert(zcs->inBuffSize > 0);
-	}
-	if(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered) {
-		assert(zcs->outBuff != NULL);
-		assert(zcs->outBuffSize > 0);
-	}
-	if(input->src == NULL)
-		assert(input->size == 0);
-	assert(input->pos <= input->size);
-	if(output->dst == NULL)
-		assert(output->size == 0);
-	assert(output->pos <= output->size);
-	assert((U32)flushMode <= (U32)ZSTD_e_end);
-
-	while(someMoreWork) {
-		switch(zcs->streamStage) {
-		case zcss_init:
-			RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!");
-
-		case zcss_load:
-			if((flushMode == ZSTD_e_end) && ((size_t)(oend - op) >= ZSTD_compressBound(iend - ip)   /* Enough output space */
-			                                 || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */
-			   && (zcs->inBuffPos == 0)) {
-				/* shortcut to compression pass directly into output buffer */
-				size_t const cSize = ZSTD_compressEnd(zcs,
-				                                      op, oend - op, ip, iend - ip);
-				DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
-				FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
-				ip = iend;
-				op += cSize;
-				zcs->frameEnded = 1;
-				ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
-				someMoreWork = 0;
-				break;
-			}
-			/* complete loading into inBuffer in buffered mode */
-			if(zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
-				size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
-				size_t const loaded = ZSTD_limitCopy(
-				    zcs->inBuff + zcs->inBuffPos, toLoad,
-				    ip, iend - ip);
-				zcs->inBuffPos += loaded;
-				if(ip)
-					ip += loaded;
-				if((flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget)) {
-					/* not enough input to fill full block : stop here */
-					someMoreWork = 0;
-					break;
-				}
-				if((flushMode == ZSTD_e_flush) && (zcs->inBuffPos == zcs->inToCompress)) {
-					/* empty */
-					someMoreWork = 0;
-					break;
-				}
-			} else {
-				assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
-				if((flushMode == ZSTD_e_continue) && ((size_t)(iend - ip) < zcs->blockSize)) {
-					/* can't compress a full block : stop here */
-					zcs->stableIn_notConsumed = (size_t)(iend - ip);
-					ip = iend; /* pretend to have consumed input */
-					someMoreWork = 0;
-					break;
-				}
-				if((flushMode == ZSTD_e_flush) && (ip == iend)) {
-					/* empty */
-					someMoreWork = 0;
-					break;
-				}
-			}
-			/* compress current block (note : this stage cannot be stopped in the middle) */
-			DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
-			{
-				int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered);
-				void* cDst;
-				size_t cSize;
-				size_t oSize = oend - op;
-				size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress
-				                                   : MIN((size_t)(iend - ip), zcs->blockSize);
-				if(oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
-					cDst = op; /* compress into output buffer, to skip flush stage */
-				else
-					cDst = zcs->outBuff, oSize = zcs->outBuffSize;
-				if(inputBuffered) {
-					unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip == iend);
-					cSize = lastBlock ? ZSTD_compressEnd(zcs, cDst, oSize,
-					                                     zcs->inBuff + zcs->inToCompress, iSize)
-					                  : ZSTD_compressContinue(zcs, cDst, oSize,
-					                                          zcs->inBuff + zcs->inToCompress, iSize);
-					FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
-					zcs->frameEnded = lastBlock;
-					/* prepare next block */
-					zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
-					if(zcs->inBuffTarget > zcs->inBuffSize)
-						zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize;
-					DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u",
-					         (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize);
-					if(!lastBlock)
-						assert(zcs->inBuffTarget <= zcs->inBuffSize);
-					zcs->inToCompress = zcs->inBuffPos;
-				} else { /* !inputBuffered, hence ZSTD_bm_stable */
-					unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend);
-					cSize = lastBlock ? ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize);
-					/* Consume the input prior to error checking to mirror buffered mode. */
-					if(ip)
-						ip += iSize;
-					FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
-					zcs->frameEnded = lastBlock;
-					if(lastBlock)
-						assert(ip == iend);
-				}
-				if(cDst == op) { /* no need to flush */
-					op += cSize;
-					if(zcs->frameEnded) {
-						DEBUGLOG(5, "Frame completed directly in outBuffer");
-						someMoreWork = 0;
-						ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
-					}
-					break;
-				}
-				zcs->outBuffContentSize = cSize;
-				zcs->outBuffFlushedSize = 0;
-				zcs->streamStage = zcss_flush; /* pass-through to flush stage */
-			}
-			ZSTD_FALLTHROUGH;
-		case zcss_flush:
-			DEBUGLOG(5, "flush stage");
-			assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered);
-			{
-				size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
-				size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend - op),
-				                                      zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
-				DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u",
-				         (unsigned)toFlush, (unsigned)(oend - op), (unsigned)flushed);
-				if(flushed)
-					op += flushed;
-				zcs->outBuffFlushedSize += flushed;
-				if(toFlush != flushed) {
-					/* flush not fully completed, presumably because dst is too small */
-					assert(op == oend);
-					someMoreWork = 0;
-					break;
-				}
-				zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
-				if(zcs->frameEnded) {
-					DEBUGLOG(5, "Frame completed on flush");
-					someMoreWork = 0;
-					ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
-					break;
-				}
-				zcs->streamStage = zcss_load;
-				break;
-			}
-
-		default: /* impossible */
-			assert(0);
-		}
-	}
-
-	input->pos = ip - istart;
-	output->pos = op - ostart;
-	if(zcs->frameEnded)
-		return 0;
-	return ZSTD_nextInputSizeHint(zcs);
-}
-
-static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx) {
-#ifdef ZSTD_MULTITHREAD
-	if(cctx->appliedParams.nbWorkers >= 1) {
-		assert(cctx->mtctx != NULL);
-		return ZSTDMT_nextInputSizeHint(cctx->mtctx);
-	}
-#endif
-	return ZSTD_nextInputSizeHint(cctx);
-}
-
-size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) {
-	FORWARD_IF_ERROR(ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue), "");
-	return ZSTD_nextInputSizeHint_MTorST(zcs);
-}
-
-/* After a compression call set the expected input/output buffer.
- * This is validated at the start of the next compression call.
- */
-static void
-ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) {
-	DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)");
-	if(cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
-		cctx->expectedInBuffer = *input;
-	}
-	if(cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
-		cctx->expectedOutBufferSize = output->size - output->pos;
-	}
-}
-
-/* Validate that the input/output buffers match the expectations set by
- * ZSTD_setBufferExpectations.
- */
-static size_t ZSTD_checkBufferStability(ZSTD_CCtx const * cctx,
-                                        ZSTD_outBuffer const * output,
-                                        ZSTD_inBuffer const * input,
-                                        ZSTD_EndDirective endOp) {
-	if(cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
-		ZSTD_inBuffer const expect = cctx->expectedInBuffer;
-		if(expect.src != input->src || expect.pos != input->pos)
-			RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!");
-	}
-	(void)endOp;
-	if(cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
-		size_t const outBufferSize = output->size - output->pos;
-		if(cctx->expectedOutBufferSize != outBufferSize)
-			RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!");
-	}
-	return 0;
-}
-
-static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
-                                             ZSTD_EndDirective endOp,
-                                             size_t inSize) {
-	ZSTD_CCtx_params params = cctx->requestedParams;
-	ZSTD_prefixDict const prefixDict = cctx->prefixDict;
-	FORWARD_IF_ERROR(ZSTD_initLocalDict(cctx), "");              /* Init the local dict if present. */
-	ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */
-	assert(prefixDict.dict == NULL || cctx->cdict == NULL);      /* only one can be set */
-	if(cctx->cdict && !cctx->localDict.cdict) {
-		/* Let the cdict's compression level take priority over the requested params.
-		 * But do not take the cdict's compression level if the "cdict" is actually a localDict
-		 * generated from ZSTD_initLocalDict().
-		 */
-		params.compressionLevel = cctx->cdict->compressionLevel;
-	}
-	DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
-	if(endOp == ZSTD_e_end)
-		cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */
-
-	{
-		size_t const dictSize = prefixDict.dict
-		                            ? prefixDict.dictSize
-		                            : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
-		ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
-		params.cParams = ZSTD_getCParamsFromCCtxParams(
-		    &params, cctx->pledgedSrcSizePlusOne - 1,
-		    dictSize, mode);
-	}
-
-	params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
-	params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
-	params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
-	params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
-	params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
-	params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);
-
-#ifdef ZSTD_MULTITHREAD
-	/* If external matchfinder is enabled, make sure to fail before checking job size (for consistency) */
-	RETURN_ERROR_IF(
-	    params.useSequenceProducer == 1 && params.nbWorkers >= 1,
-	    parameter_combination_unsupported,
-	    "External sequence producer isn't supported with nbWorkers >= 1");
-
-	if((cctx->pledgedSrcSizePlusOne - 1) <= ZSTDMT_JOBSIZE_MIN) {
-		params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
-	}
-	if(params.nbWorkers > 0) {
-#if ZSTD_TRACE
-		cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0;
-#endif
-		/* mt context creation */
-		if(cctx->mtctx == NULL) {
-			DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u",
-			         params.nbWorkers);
-			cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem, cctx->pool);
-			RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!");
-		}
-		/* mt compression */
-		DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers);
-		FORWARD_IF_ERROR(ZSTDMT_initCStream_internal(
-		                     cctx->mtctx,
-		                     prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
-		                     cctx->cdict, params, cctx->pledgedSrcSizePlusOne - 1),
-		                 "");
-		cctx->dictID = cctx->cdict ? cctx->cdict->dictID : 0;
-		cctx->dictContentSize = cctx->cdict ? cctx->cdict->dictContentSize : prefixDict.dictSize;
-		cctx->consumedSrcSize = 0;
-		cctx->producedCSize = 0;
-		cctx->streamStage = zcss_load;
-		cctx->appliedParams = params;
-	} else
-#endif /* ZSTD_MULTITHREAD */
-	{
-		U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;
-		assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
-		FORWARD_IF_ERROR(ZSTD_compressBegin_internal(cctx,
-		                                             prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, ZSTD_dtlm_fast,
-		                                             cctx->cdict,
-		                                             &params, pledgedSrcSize,
-		                                             ZSTDb_buffered),
-		                 "");
-		assert(cctx->appliedParams.nbWorkers == 0);
-		cctx->inToCompress = 0;
-		cctx->inBuffPos = 0;
-		if(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered) {
-			/* for small input: avoid automatic flush on reaching end of block, since
-			 * it would require to add a 3-bytes null block to end frame
-			 */
-			cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize);
-		} else {
-			cctx->inBuffTarget = 0;
-		}
-		cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0;
-		cctx->streamStage = zcss_load;
-		cctx->frameEnded = 0;
-	}
-	return 0;
-}
-
-/* @return provides a minimum amount of data remaining to be flushed from internal buffers
- */
-size_t ZSTD_compressStream2(ZSTD_CCtx* cctx,
-                            ZSTD_outBuffer* output,
-                            ZSTD_inBuffer* input,
-                            ZSTD_EndDirective endOp) {
-	DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp);
-	/* check conditions */
-	RETURN_ERROR_IF(output->pos > output->size, dstSize_tooSmall, "invalid output buffer");
-	RETURN_ERROR_IF(input->pos > input->size, srcSize_wrong, "invalid input buffer");
-	RETURN_ERROR_IF((U32)endOp > (U32)ZSTD_e_end, parameter_outOfBound, "invalid endDirective");
-	assert(cctx != NULL);
-
-	/* transparent initialization stage */
-	if(cctx->streamStage == zcss_init) {
-		size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */
-		size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed;
-		if((cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */
-		   && (endOp == ZSTD_e_continue)                          /* no flush requested, more input to come */
-		   && (totalInputSize < ZSTD_BLOCKSIZE_MAX)) {            /* not even reached one block yet */
-			if(cctx->stableIn_notConsumed) {                      /* not the first time */
-				/* check stable source guarantees */
-				RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer");
-				RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos");
-			}
-			/* pretend input was consumed, to give a sense forward progress */
-			input->pos = input->size;
-			/* save stable inBuffer, for later control, and flush/end */
-			cctx->expectedInBuffer = *input;
-			/* but actually input wasn't consumed, so keep track of position from where compression shall resume */
-			cctx->stableIn_notConsumed += inputSize;
-			/* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */
-			return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */
-		}
-		FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed");
-		ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */
-	}
-	/* end of transparent initialization stage */
-
-	FORWARD_IF_ERROR(ZSTD_checkBufferStability(cctx, output, input, endOp), "invalid buffers");
-	/* compression stage */
-#ifdef ZSTD_MULTITHREAD
-	if(cctx->appliedParams.nbWorkers > 0) {
-		size_t flushMin;
-		if(cctx->cParamsChanged) {
-			ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams);
-			cctx->cParamsChanged = 0;
-		}
-		if(cctx->stableIn_notConsumed) {
-			assert(cctx->appliedParams.inBufferMode == ZSTD_bm_stable);
-			/* some early data was skipped - make it available for consumption */
-			assert(input->pos >= cctx->stableIn_notConsumed);
-			input->pos -= cctx->stableIn_notConsumed;
-			cctx->stableIn_notConsumed = 0;
-		}
-		for(;;) {
-			size_t const ipos = input->pos;
-			size_t const opos = output->pos;
-			flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
-			cctx->consumedSrcSize += (U64)(input->pos - ipos);
-			cctx->producedCSize += (U64)(output->pos - opos);
-			if(ZSTD_isError(flushMin) || (endOp == ZSTD_e_end && flushMin == 0)) { /* compression completed */
-				if(flushMin == 0)
-					ZSTD_CCtx_trace(cctx, 0);
-				ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
-			}
-			FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed");
-
-			if(endOp == ZSTD_e_continue) {
-				/* We only require some progress with ZSTD_e_continue, not maximal progress.
-				 * We're done if we've consumed or produced any bytes, or either buffer is
-				 * full.
-				 */
-				if(input->pos != ipos || output->pos != opos || input->pos == input->size || output->pos == output->size)
-					break;
-			} else {
-				assert(endOp == ZSTD_e_flush || endOp == ZSTD_e_end);
-				/* We require maximal progress. We're done when the flush is complete or the
-				 * output buffer is full.
-				 */
-				if(flushMin == 0 || output->pos == output->size)
-					break;
-			}
-		}
-		DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic");
-		/* Either we don't require maximum forward progress, we've finished the
-		 * flush, or we are out of output space.
-		 */
-		assert(endOp == ZSTD_e_continue || flushMin == 0 || output->pos == output->size);
-		ZSTD_setBufferExpectations(cctx, output, input);
-		return flushMin;
-	}
-#endif /* ZSTD_MULTITHREAD */
-	FORWARD_IF_ERROR(ZSTD_compressStream_generic(cctx, output, input, endOp), "");
-	DEBUGLOG(5, "completed ZSTD_compressStream2");
-	ZSTD_setBufferExpectations(cctx, output, input);
-	return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
-}
-
-size_t ZSTD_compressStream2_simpleArgs(
-    ZSTD_CCtx* cctx,
-    void* dst, size_t dstCapacity, size_t* dstPos,
-    const void* src, size_t srcSize, size_t* srcPos,
-    ZSTD_EndDirective endOp) {
-	ZSTD_outBuffer output;
-	ZSTD_inBuffer input;
-	output.dst = dst;
-	output.size = dstCapacity;
-	output.pos = *dstPos;
-	input.src = src;
-	input.size = srcSize;
-	input.pos = *srcPos;
-	/* ZSTD_compressStream2() will check validity of dstPos and srcPos */
-	{
-		size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
-		*dstPos = output.pos;
-		*srcPos = input.pos;
-		return cErr;
-	}
-}
-
-size_t ZSTD_compress2(ZSTD_CCtx* cctx,
-                      void* dst, size_t dstCapacity,
-                      const void* src, size_t srcSize) {
-	ZSTD_bufferMode_e const originalInBufferMode = cctx->requestedParams.inBufferMode;
-	ZSTD_bufferMode_e const originalOutBufferMode = cctx->requestedParams.outBufferMode;
-	DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize);
-	ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
-	/* Enable stable input/output buffers. */
-	cctx->requestedParams.inBufferMode = ZSTD_bm_stable;
-	cctx->requestedParams.outBufferMode = ZSTD_bm_stable;
-	{
-		size_t oPos = 0;
-		size_t iPos = 0;
-		size_t const result = ZSTD_compressStream2_simpleArgs(cctx,
-		                                                      dst, dstCapacity, &oPos,
-		                                                      src, srcSize, &iPos,
-		                                                      ZSTD_e_end);
-		/* Reset to the original values. */
-		cctx->requestedParams.inBufferMode = originalInBufferMode;
-		cctx->requestedParams.outBufferMode = originalOutBufferMode;
-
-		FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
-		if(result != 0) { /* compression not completed, due to lack of output space */
-			assert(oPos == dstCapacity);
-			RETURN_ERROR(dstSize_tooSmall, "");
-		}
-		assert(iPos == srcSize); /* all input is expected consumed */
-		return oPos;
-	}
-}
-
-/* ZSTD_validateSequence() :
- * @offCode : is presumed to follow format required by ZSTD_storeSeq()
- * @returns a ZSTD error code if sequence is not valid
- */
-static size_t
-ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch,
-                      size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) {
-	U32 const windowSize = 1u << windowLog;
-	/* posInSrc represents the amount of data the decoder would decode up to this point.
-	 * As long as the amount of data decoded is less than or equal to window size, offsets may be
-	 * larger than the total length of output decoded in order to reference the dict, even larger than
-	 * window size. After output surpasses windowSize, we're limited to windowSize offsets again.
-	 */
-	size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
-	size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4;
-	RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!");
-	/* Validate maxNbSeq is large enough for the given matchLength and minMatch */
-	RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch");
-	return 0;
-}
-
-/* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
-static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) {
-	U32 offBase = OFFSET_TO_OFFBASE(rawOffset);
-
-	if(!ll0 && rawOffset == rep[0]) {
-		offBase = REPCODE1_TO_OFFBASE;
-	} else if(rawOffset == rep[1]) {
-		offBase = REPCODE_TO_OFFBASE(2 - ll0);
-	} else if(rawOffset == rep[2]) {
-		offBase = REPCODE_TO_OFFBASE(3 - ll0);
-	} else if(ll0 && rawOffset == rep[0] - 1) {
-		offBase = REPCODE3_TO_OFFBASE;
-	}
-	return offBase;
-}
-
-size_t
-ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
-                                               ZSTD_sequencePosition* seqPos,
-                                               const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                               const void* src, size_t blockSize,
-                                               ZSTD_paramSwitch_e externalRepSearch) {
-	U32 idx = seqPos->idx;
-	U32 const startIdx = idx;
-	BYTE const * ip = (BYTE const *)(src);
-	const BYTE* const iend = ip + blockSize;
-	repcodes_t updatedRepcodes;
-	U32 dictSize;
-
-	DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize);
-
-	if(cctx->cdict) {
-		dictSize = (U32)cctx->cdict->dictContentSize;
-	} else if(cctx->prefixDict.dict) {
-		dictSize = (U32)cctx->prefixDict.dictSize;
-	} else {
-		dictSize = 0;
-	}
-	ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
-	for(; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
-		U32 const litLength = inSeqs[idx].litLength;
-		U32 const matchLength = inSeqs[idx].matchLength;
-		U32 offBase;
-
-		if(externalRepSearch == ZSTD_ps_disable) {
-			offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
-		} else {
-			U32 const ll0 = (litLength == 0);
-			offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
-			ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
-		}
-
-		DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
-		if(cctx->appliedParams.validateSequences) {
-			seqPos->posInSrc += litLength + matchLength;
-			FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
-			                                       cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
-			                 "Sequence validation failed");
-		}
-		RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
-		                "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-		ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
-		ip += matchLength + litLength;
-	}
-
-	/* If we skipped repcode search while parsing, we need to update repcodes now */
-	assert(externalRepSearch != ZSTD_ps_auto);
-	assert(idx >= startIdx);
-	if(externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
-		U32* const rep = updatedRepcodes.rep;
-		U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */
-
-		if(lastSeqIdx >= startIdx + 2) {
-			rep[2] = inSeqs[lastSeqIdx - 2].offset;
-			rep[1] = inSeqs[lastSeqIdx - 1].offset;
-			rep[0] = inSeqs[lastSeqIdx].offset;
-		} else if(lastSeqIdx == startIdx + 1) {
-			rep[2] = rep[0];
-			rep[1] = inSeqs[lastSeqIdx - 1].offset;
-			rep[0] = inSeqs[lastSeqIdx].offset;
-		} else {
-			assert(lastSeqIdx == startIdx);
-			rep[2] = rep[1];
-			rep[1] = rep[0];
-			rep[0] = inSeqs[lastSeqIdx].offset;
-		}
-	}
-
-	ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
-
-	if(inSeqs[idx].litLength) {
-		DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength);
-		ZSTD_storeLastLiterals(&cctx->seqStore, ip, inSeqs[idx].litLength);
-		ip += inSeqs[idx].litLength;
-		seqPos->posInSrc += inSeqs[idx].litLength;
-	}
-	RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!");
-	seqPos->idx = idx + 1;
-	return 0;
-}
-
-size_t
-ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
-                                         const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                         const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) {
-	U32 idx = seqPos->idx;
-	U32 startPosInSequence = seqPos->posInSequence;
-	U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize;
-	size_t dictSize;
-	BYTE const * ip = (BYTE const *)(src);
-	BYTE const * iend = ip + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */
-	repcodes_t updatedRepcodes;
-	U32 bytesAdjustment = 0;
-	U32 finalMatchSplit = 0;
-
-	/* TODO(embg) support fast parsing mode in noBlockDelim mode */
-	(void)externalRepSearch;
-
-	if(cctx->cdict) {
-		dictSize = cctx->cdict->dictContentSize;
-	} else if(cctx->prefixDict.dict) {
-		dictSize = cctx->prefixDict.dictSize;
-	} else {
-		dictSize = 0;
-	}
-	DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
-	DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
-	ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
-	while(endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
-		const ZSTD_Sequence currSeq = inSeqs[idx];
-		U32 litLength = currSeq.litLength;
-		U32 matchLength = currSeq.matchLength;
-		U32 const rawOffset = currSeq.offset;
-		U32 offBase;
-
-		/* Modify the sequence depending on where endPosInSequence lies */
-		if(endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
-			if(startPosInSequence >= litLength) {
-				startPosInSequence -= litLength;
-				litLength = 0;
-				matchLength -= startPosInSequence;
-			} else {
-				litLength -= startPosInSequence;
-			}
-			/* Move to the next sequence */
-			endPosInSequence -= currSeq.litLength + currSeq.matchLength;
-			startPosInSequence = 0;
-		} else {
-			/* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
-			   does not reach the end of the match. So, we have to split the sequence */
-			DEBUGLOG(6, "Require a split: diff: %u, idx: %u PIS: %u",
-			         currSeq.litLength + currSeq.matchLength - endPosInSequence, idx, endPosInSequence);
-			if(endPosInSequence > litLength) {
-				U32 firstHalfMatchLength;
-				litLength = startPosInSequence >= litLength ? 0 : litLength - startPosInSequence;
-				firstHalfMatchLength = endPosInSequence - startPosInSequence - litLength;
-				if(matchLength > blockSize && firstHalfMatchLength >= cctx->appliedParams.cParams.minMatch) {
-					/* Only ever split the match if it is larger than the block size */
-					U32 secondHalfMatchLength = currSeq.matchLength + currSeq.litLength - endPosInSequence;
-					if(secondHalfMatchLength < cctx->appliedParams.cParams.minMatch) {
-						/* Move the endPosInSequence backward so that it creates match of minMatch length */
-						endPosInSequence -= cctx->appliedParams.cParams.minMatch - secondHalfMatchLength;
-						bytesAdjustment = cctx->appliedParams.cParams.minMatch - secondHalfMatchLength;
-						firstHalfMatchLength -= bytesAdjustment;
-					}
-					matchLength = firstHalfMatchLength;
-					/* Flag that we split the last match - after storing the sequence, exit the loop,
-					   but keep the value of endPosInSequence */
-					finalMatchSplit = 1;
-				} else {
-					/* Move the position in sequence backwards so that we don't split match, and break to store
-					 * the last literals. We use the original currSeq.litLength as a marker for where endPosInSequence
-					 * should go. We prefer to do this whenever it is not necessary to split the match, or if doing so
-					 * would cause the first half of the match to be too small
-					 */
-					bytesAdjustment = endPosInSequence - currSeq.litLength;
-					endPosInSequence = currSeq.litLength;
-					break;
-				}
-			} else {
-				/* This sequence ends inside the literals, break to store the last literals */
-				break;
-			}
-		}
-		/* Check if this offset can be represented with a repcode */
-		{
-			U32 const ll0 = (litLength == 0);
-			offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
-			ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
-		}
-
-		if(cctx->appliedParams.validateSequences) {
-			seqPos->posInSrc += litLength + matchLength;
-			FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
-			                                       cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
-			                 "Sequence validation failed");
-		}
-		DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
-		RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
-		                "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-		ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
-		ip += matchLength + litLength;
-		if(!finalMatchSplit)
-			idx++; /* Next Sequence */
-	}
-	DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
-	assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
-	seqPos->idx = idx;
-	seqPos->posInSequence = endPosInSequence;
-	ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
-
-	iend -= bytesAdjustment;
-	if(ip != iend) {
-		/* Store any last literals */
-		U32 lastLLSize = (U32)(iend - ip);
-		assert(ip <= iend);
-		DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize);
-		ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize);
-		seqPos->posInSrc += lastLLSize;
-	}
-
-	return bytesAdjustment;
-}
-
-typedef size_t (*ZSTD_sequenceCopier)(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
-                                      const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                      const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
-static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) {
-	ZSTD_sequenceCopier sequenceCopier = NULL;
-	assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode));
-	if(mode == ZSTD_sf_explicitBlockDelimiters) {
-		return ZSTD_copySequencesToSeqStoreExplicitBlockDelim;
-	} else if(mode == ZSTD_sf_noBlockDelimiters) {
-		return ZSTD_copySequencesToSeqStoreNoBlockDelim;
-	}
-	assert(sequenceCopier != NULL);
-	return sequenceCopier;
-}
-
-/* Discover the size of next block by searching for the delimiter.
- * Note that a block delimiter **must** exist in this mode,
- * otherwise it's an input error.
- * The block size retrieved will be later compared to ensure it remains within bounds */
-static size_t
-blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) {
-	int end = 0;
-	size_t blockSize = 0;
-	size_t spos = seqPos.idx;
-	DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize);
-	assert(spos <= inSeqsSize);
-	while(spos < inSeqsSize) {
-		end = (inSeqs[spos].offset == 0);
-		blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength;
-		if(end) {
-			if(inSeqs[spos].matchLength != 0)
-				RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0");
-			break;
-		}
-		spos++;
-	}
-	if(!end)
-		RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter");
-	return blockSize;
-}
-
-/* More a "target" block size */
-static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) {
-	int const lastBlock = (remaining <= blockSize);
-	return lastBlock ? remaining : blockSize;
-}
-
-static size_t determine_blockSize(ZSTD_sequenceFormat_e mode,
-                                  size_t blockSize, size_t remaining,
-                                  const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) {
-	DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining);
-	if(mode == ZSTD_sf_noBlockDelimiters)
-		return blockSize_noDelimiter(blockSize, remaining);
-	{
-		size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos);
-		FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters");
-		if(explicitBlockSize > blockSize)
-			RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block");
-		if(explicitBlockSize > remaining)
-			RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source");
-		return explicitBlockSize;
-	}
-}
-
-/* Compress, block-by-block, all of the sequences given.
- *
- * Returns the cumulative size of all compressed blocks (including their headers),
- * otherwise a ZSTD error.
- */
-static size_t
-ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
-                                void* dst, size_t dstCapacity,
-                                const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
-                                const void* src, size_t srcSize) {
-	size_t cSize = 0;
-	size_t remaining = srcSize;
-	ZSTD_sequencePosition seqPos = {0, 0, 0};
-
-	BYTE const * ip = (BYTE const *)src;
-	BYTE* op = (BYTE*)dst;
-	ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters);
-
-	DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize);
-	/* Special case: empty frame */
-	if(remaining == 0) {
-		U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw) << 1);
-		RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "No room for empty frame block header");
-		MEM_writeLE32(op, cBlockHeader24);
-		op += ZSTD_blockHeaderSize;
-		dstCapacity -= ZSTD_blockHeaderSize;
-		cSize += ZSTD_blockHeaderSize;
-	}
-
-	while(remaining) {
-		size_t compressedSeqsSize;
-		size_t cBlockSize;
-		size_t additionalByteAdjustment;
-		size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters,
-		                                       cctx->blockSize, remaining,
-		                                       inSeqs, inSeqsSize, seqPos);
-		U32 const lastBlock = (blockSize == remaining);
-		FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size");
-		assert(blockSize <= remaining);
-		ZSTD_resetSeqStore(&cctx->seqStore);
-		DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);
-
-		additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
-		FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
-		blockSize -= additionalByteAdjustment;
-
-		/* If blocks are too small, emit as a nocompress block */
-		/* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
-		 * additional 1. We need to revisit and change this logic to be more consistent */
-		if(blockSize < MIN_CBLOCK_SIZE + ZSTD_blockHeaderSize + 1 + 1) {
-			cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
-			FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-			DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
-			cSize += cBlockSize;
-			ip += blockSize;
-			op += cBlockSize;
-			remaining -= blockSize;
-			dstCapacity -= cBlockSize;
-			continue;
-		}
-
-		RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block");
-		compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
-		                                                  &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
-		                                                  &cctx->appliedParams,
-		                                                  op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize,
-		                                                  blockSize,
-		                                                  cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
-		                                                  cctx->bmi2);
-		FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
-		DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize);
-
-		if(!cctx->isFirstBlock &&
-		   ZSTD_maybeRLE(&cctx->seqStore) &&
-		   ZSTD_isRLE(ip, blockSize)) {
-			/* We don't want to emit our first block as a RLE even if it qualifies because
-			 * doing so will cause the decoder (cli only) to throw a "should consume all input error."
-			 * This is only an issue for zstd <= v1.4.3
-			 */
-			compressedSeqsSize = 1;
-		}
-
-		if(compressedSeqsSize == 0) {
-			/* ZSTD_noCompressBlock writes the block header as well */
-			cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
-			FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed");
-			DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize);
-		} else if(compressedSeqsSize == 1) {
-			cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock);
-			FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed");
-			DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize);
-		} else {
-			U32 cBlockHeader;
-			/* Error checking and repcodes update */
-			ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState);
-			if(cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
-				cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-
-			/* Write block header into beginning of block*/
-			cBlockHeader = lastBlock + (((U32)bt_compressed) << 1) + (U32)(compressedSeqsSize << 3);
-			MEM_writeLE24(op, cBlockHeader);
-			cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
-			DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize);
-		}
-
-		cSize += cBlockSize;
-
-		if(lastBlock) {
-			break;
-		} else {
-			ip += blockSize;
-			op += cBlockSize;
-			remaining -= blockSize;
-			dstCapacity -= cBlockSize;
-			cctx->isFirstBlock = 0;
-		}
-		DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity);
-	}
-
-	DEBUGLOG(4, "cSize final total: %zu", cSize);
-	return cSize;
-}
-
-size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
-                              void* dst, size_t dstCapacity,
-                              const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
-                              const void* src, size_t srcSize) {
-	BYTE* op = (BYTE*)dst;
-	size_t cSize = 0;
-	size_t compressedBlocksSize = 0;
-	size_t frameHeaderSize = 0;
-
-	/* Transparent initialization stage, same as compressStream2() */
-	DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity);
-	assert(cctx != NULL);
-	FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
-	/* Begin writing output, starting with frame header */
-	frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID);
-	op += frameHeaderSize;
-	dstCapacity -= frameHeaderSize;
-	cSize += frameHeaderSize;
-	if(cctx->appliedParams.fParams.checksumFlag && srcSize) {
-		XXH64_update(&cctx->xxhState, src, srcSize);
-	}
-	/* cSize includes block header size and compressed sequences size */
-	compressedBlocksSize = ZSTD_compressSequences_internal(cctx,
-	                                                       op, dstCapacity,
-	                                                       inSeqs, inSeqsSize,
-	                                                       src, srcSize);
-	FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!");
-	cSize += compressedBlocksSize;
-	dstCapacity -= compressedBlocksSize;
-
-	if(cctx->appliedParams.fParams.checksumFlag) {
-		U32 const checksum = (U32)XXH64_digest(&cctx->xxhState);
-		RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "no room for checksum");
-		DEBUGLOG(4, "Write checksum : %08X", (unsigned)checksum);
-		MEM_writeLE32((char*)dst + cSize, checksum);
-		cSize += 4;
-	}
-
-	DEBUGLOG(4, "Final compressed size: %zu", cSize);
-	return cSize;
-}
-
-/*======   Finalize   ======*/
-
-static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) {
-	const ZSTD_inBuffer nullInput = {NULL, 0, 0};
-	const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
-	return stableInput ? zcs->expectedInBuffer : nullInput;
-}
-
-/*! ZSTD_flushStream() :
- * @return : amount of data remaining to flush */
-size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) {
-	ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
-	input.size = input.pos; /* do not ingest more input during flush */
-	return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
-}
-
-size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) {
-	ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
-	size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
-	FORWARD_IF_ERROR(remainingToFlush, "ZSTD_compressStream2(,,ZSTD_e_end) failed");
-	if(zcs->appliedParams.nbWorkers > 0)
-		return remainingToFlush; /* minimal estimation */
-	/* single thread mode : attempt to calculate remaining to flush more precisely */
-	{
-		size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
-		size_t const checksumSize = (size_t)(zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4);
-		size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize;
-		DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush);
-		return toFlush;
-	}
-}
-
-/*-=====  Pre-defined compression levels  =====-*/
-#include "clevels.h"
-
-int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
-int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
-int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; }
-
-static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) {
-	ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict);
-	switch(cParams.strategy) {
-	case ZSTD_fast:
-	case ZSTD_dfast:
-		break;
-	case ZSTD_greedy:
-	case ZSTD_lazy:
-	case ZSTD_lazy2:
-		cParams.hashLog += ZSTD_LAZY_DDSS_BUCKET_LOG;
-		break;
-	case ZSTD_btlazy2:
-	case ZSTD_btopt:
-	case ZSTD_btultra:
-	case ZSTD_btultra2:
-		break;
-	}
-	return cParams;
-}
-
-static int ZSTD_dedicatedDictSearch_isSupported(
-    ZSTD_compressionParameters const * cParams) {
-	return (cParams->strategy >= ZSTD_greedy) && (cParams->strategy <= ZSTD_lazy2) && (cParams->hashLog > cParams->chainLog) && (cParams->chainLog <= 24);
-}
-
-/**
- * Reverses the adjustment applied to cparams when enabling dedicated dict
- * search. This is used to recover the params set to be used in the working
- * context. (Otherwise, those tables would also grow.)
- */
-static void ZSTD_dedicatedDictSearch_revertCParams(
-    ZSTD_compressionParameters* cParams) {
-	switch(cParams->strategy) {
-	case ZSTD_fast:
-	case ZSTD_dfast:
-		break;
-	case ZSTD_greedy:
-	case ZSTD_lazy:
-	case ZSTD_lazy2:
-		cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG;
-		if(cParams->hashLog < ZSTD_HASHLOG_MIN) {
-			cParams->hashLog = ZSTD_HASHLOG_MIN;
-		}
-		break;
-	case ZSTD_btlazy2:
-	case ZSTD_btopt:
-	case ZSTD_btultra:
-	case ZSTD_btultra2:
-		break;
-	}
-}
-
-static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) {
-	switch(mode) {
-	case ZSTD_cpm_unknown:
-	case ZSTD_cpm_noAttachDict:
-	case ZSTD_cpm_createCDict:
-		break;
-	case ZSTD_cpm_attachDict:
-		dictSize = 0;
-		break;
-	default:
-		assert(0);
-		break;
-	}
-	{
-		int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN;
-		size_t const addedSize = unknown && dictSize > 0 ? 500 : 0;
-		return unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint + dictSize + addedSize;
-	}
-}
-
-/*! ZSTD_getCParams_internal() :
- * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
- *  Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown.
- *        Use dictSize == 0 for unknown or unused.
- *  Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */
-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) {
-	U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode);
-	U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);
-	int row;
-	DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel);
-
-	/* row */
-	if(compressionLevel == 0)
-		row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */
-	else if(compressionLevel < 0)
-		row = 0; /* entry 0 is baseline for fast mode */
-	else if(compressionLevel > ZSTD_MAX_CLEVEL)
-		row = ZSTD_MAX_CLEVEL;
-	else
-		row = compressionLevel;
-
-	{
-		ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
-		DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy);
-		/* acceleration factor */
-		if(compressionLevel < 0) {
-			int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel);
-			cp.targetLength = (unsigned)(-clampedCompressionLevel);
-		}
-		/* refine parameters based on srcSize & dictSize */
-		return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto);
-	}
-}
-
-/*! ZSTD_getCParams() :
- * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
- *  Size values are optional, provide 0 if not known or unused */
-ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
-	if(srcSizeHint == 0)
-		srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
-	return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
-}
-
-/*! ZSTD_getParams() :
- *  same idea as ZSTD_getCParams()
- * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
- *  Fields of `ZSTD_frameParameters` are set to default values */
-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) {
-	ZSTD_parameters params;
-	ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode);
-	DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
-	ZSTD_memset(&params, 0, sizeof(params));
-	params.cParams = cParams;
-	params.fParams.contentSizeFlag = 1;
-	return params;
-}
-
-/*! ZSTD_getParams() :
- *  same idea as ZSTD_getCParams()
- * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
- *  Fields of `ZSTD_frameParameters` are set to default values */
-ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
-	if(srcSizeHint == 0)
-		srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
-	return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
-}
-
-void ZSTD_registerSequenceProducer(
-    ZSTD_CCtx* zc, void* mState,
-    ZSTD_sequenceProducer_F* mFinder) {
-	if(mFinder != NULL) {
-		ZSTD_externalMatchCtx emctx;
-		emctx.mState = mState;
-		emctx.mFinder = mFinder;
-		emctx.seqBuffer = NULL;
-		emctx.seqBufferCapacity = 0;
-		zc->externalMatchCtx = emctx;
-		zc->requestedParams.useSequenceProducer = 1;
-	} else {
-		ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx));
-		zc->requestedParams.useSequenceProducer = 0;
-	}
-}
diff --git a/src/zstd/zstd_compress_internal.h b/src/zstd/zstd_compress_internal.h
deleted file mode 100644
index ee9d8f72d..000000000
--- a/src/zstd/zstd_compress_internal.h
+++ /dev/null
@@ -1,1491 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* This header contains definitions
- * that shall **only** be used by modules within lib/compress.
- */
-
-#ifndef ZSTD_COMPRESS_H
-#define ZSTD_COMPRESS_H
-
-/*-*************************************
- *  Dependencies
- ***************************************/
-#include "zstd_internal.h"
-#include "zstd_cwksp.h"
-#ifdef ZSTD_MULTITHREAD
-#include "zstdmt_compress.h"
-#endif
-#include "bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/*-*************************************
- *  Constants
- ***************************************/
-#define kSearchStrength 8
-#define HASH_READ_SIZE 8
-#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index ZSTD_DUBT_UNSORTED_MARK==1 means "unsorted".                                       \
-	                                 It could be confused for a real successor at index "1", if sorted as larger than its predecessor.              \
-	                                 It's not a big deal though : candidate will just be sorted again.                                              \
-	                                 Additionally, candidate position 1 will be lost.                                                               \
-	                                 But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.                                \
-	                                 The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. \
-	                                 This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
-
-/*-*************************************
- *  Context memory management
- ***************************************/
-typedef enum { ZSTDcs_created = 0,
-	           ZSTDcs_init,
-	           ZSTDcs_ongoing,
-	           ZSTDcs_ending } ZSTD_compressionStage_e;
-typedef enum { zcss_init = 0,
-	           zcss_load,
-	           zcss_flush } ZSTD_cStreamStage;
-
-typedef struct ZSTD_prefixDict_s {
-	const void* dict;
-	size_t dictSize;
-	ZSTD_dictContentType_e dictContentType;
-} ZSTD_prefixDict;
-
-typedef struct {
-	void* dictBuffer;
-	void const * dict;
-	size_t dictSize;
-	ZSTD_dictContentType_e dictContentType;
-	ZSTD_CDict* cdict;
-} ZSTD_localDict;
-
-typedef struct {
-	HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)];
-	HUF_repeat repeatMode;
-} ZSTD_hufCTables_t;
-
-typedef struct {
-	FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
-	FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
-	FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
-	FSE_repeat offcode_repeatMode;
-	FSE_repeat matchlength_repeatMode;
-	FSE_repeat litlength_repeatMode;
-} ZSTD_fseCTables_t;
-
-typedef struct {
-	ZSTD_hufCTables_t huf;
-	ZSTD_fseCTables_t fse;
-} ZSTD_entropyCTables_t;
-
-/***********************************************
- *  Entropy buffer statistics structs and funcs *
- ***********************************************/
-/** ZSTD_hufCTablesMetadata_t :
- *  Stores Literals Block Type for a super-block in hType, and
- *  huffman tree description in hufDesBuffer.
- *  hufDesSize refers to the size of huffman tree description in bytes.
- *  This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */
-typedef struct {
-	symbolEncodingType_e hType;
-	BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE];
-	size_t hufDesSize;
-} ZSTD_hufCTablesMetadata_t;
-
-/** ZSTD_fseCTablesMetadata_t :
- *  Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and
- *  fse tables in fseTablesBuffer.
- *  fseTablesSize refers to the size of fse tables in bytes.
- *  This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */
-typedef struct {
-	symbolEncodingType_e llType;
-	symbolEncodingType_e ofType;
-	symbolEncodingType_e mlType;
-	BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE];
-	size_t fseTablesSize;
-	size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
-} ZSTD_fseCTablesMetadata_t;
-
-typedef struct {
-	ZSTD_hufCTablesMetadata_t hufMetadata;
-	ZSTD_fseCTablesMetadata_t fseMetadata;
-} ZSTD_entropyCTablesMetadata_t;
-
-/** ZSTD_buildBlockEntropyStats() :
- *  Builds entropy for the block.
- *  @return : 0 on success or error code */
-size_t ZSTD_buildBlockEntropyStats(
-    const seqStore_t* seqStorePtr,
-    const ZSTD_entropyCTables_t* prevEntropy,
-    ZSTD_entropyCTables_t* nextEntropy,
-    const ZSTD_CCtx_params* cctxParams,
-    ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-    void* workspace, size_t wkspSize);
-
-/*********************************
- *  Compression internals structs *
- *********************************/
-
-typedef struct {
-	U32 off; /* Offset sumtype code for the match, using ZSTD_storeSeq() format */
-	U32 len; /* Raw length of match */
-} ZSTD_match_t;
-
-typedef struct {
-	U32 offset;      /* Offset of sequence */
-	U32 litLength;   /* Length of literals prior to match */
-	U32 matchLength; /* Raw length of match */
-} rawSeq;
-
-typedef struct {
-	rawSeq* seq;          /* The start of the sequences */
-	size_t pos;           /* The index in seq where reading stopped. pos <= size. */
-	size_t posInSequence; /* The position within the sequence at seq[pos] where reading
-	                         stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */
-	size_t size;          /* The number of sequences. <= capacity. */
-	size_t capacity;      /* The capacity starting from `seq` pointer */
-} rawSeqStore_t;
-
-typedef struct {
-	U32 idx;           /* Index in array of ZSTD_Sequence */
-	U32 posInSequence; /* Position within sequence at idx */
-	size_t posInSrc;   /* Number of bytes given by sequences provided so far */
-} ZSTD_sequencePosition;
-
-UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
-
-typedef struct {
-	int price;
-	U32 off;
-	U32 mlen;
-	U32 litlen;
-	U32 rep[ZSTD_REP_NUM];
-} ZSTD_optimal_t;
-
-typedef enum { zop_dynamic = 0,
-	           zop_predef } ZSTD_OptPrice_e;
-
-typedef struct {
-	/* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
-	unsigned* litFreq;          /* table of literals statistics, of size 256 */
-	unsigned* litLengthFreq;    /* table of litLength statistics, of size (MaxLL+1) */
-	unsigned* matchLengthFreq;  /* table of matchLength statistics, of size (MaxML+1) */
-	unsigned* offCodeFreq;      /* table of offCode statistics, of size (MaxOff+1) */
-	ZSTD_match_t* matchTable;   /* list of found matches, of size ZSTD_OPT_NUM+1 */
-	ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
-
-	U32 litSum;                               /* nb of literals */
-	U32 litLengthSum;                         /* nb of litLength codes */
-	U32 matchLengthSum;                       /* nb of matchLength codes */
-	U32 offCodeSum;                           /* nb of offset codes */
-	U32 litSumBasePrice;                      /* to compare to log2(litfreq) */
-	U32 litLengthSumBasePrice;                /* to compare to log2(llfreq)  */
-	U32 matchLengthSumBasePrice;              /* to compare to log2(mlfreq)  */
-	U32 offCodeSumBasePrice;                  /* to compare to log2(offreq)  */
-	ZSTD_OptPrice_e priceType;                /* prices can be determined dynamically, or follow a pre-defined cost structure */
-	const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */
-	ZSTD_paramSwitch_e literalCompressionMode;
-} optState_t;
-
-typedef struct {
-	ZSTD_entropyCTables_t entropy;
-	U32 rep[ZSTD_REP_NUM];
-} ZSTD_compressedBlockState_t;
-
-typedef struct {
-	BYTE const * nextSrc;      /* next block here to continue on current prefix */
-	BYTE const * base;         /* All regular indexes relative to this position */
-	BYTE const * dictBase;     /* extDict indexes relative to this position */
-	U32 dictLimit;             /* below that point, need extDict */
-	U32 lowLimit;              /* below that point, no more valid data */
-	U32 nbOverflowCorrections; /* Number of times overflow correction has run since
-	                            * ZSTD_window_init(). Useful for debugging coredumps
-	                            * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY.
-	                            */
-} ZSTD_window_t;
-
-#define ZSTD_WINDOW_START_INDEX 2
-
-typedef struct ZSTD_matchState_t ZSTD_matchState_t;
-
-#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */
-
-struct ZSTD_matchState_t {
-	ZSTD_window_t window; /* State for window round buffer management */
-	U32 loadedDictEnd;    /* index of end of dictionary, within context's referential.
-	                       * When loadedDictEnd != 0, a dictionary is in use, and still valid.
-	                       * This relies on a mechanism to set loadedDictEnd=0 when dictionary is no longer within distance.
-	                       * Such mechanism is provided within ZSTD_window_enforceMaxDist() and ZSTD_checkDictValidity().
-	                       * When dict referential is copied into active context (i.e. not attached),
-	                       * loadedDictEnd == dictSize, since referential starts from zero.
-	                       */
-	U32 nextToUpdate;     /* index from which to continue table update */
-	U32 hashLog3;         /* dispatch table for matches of len==3 : larger == faster, more memory */
-
-	U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
-	U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */
-	U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
-
-	U32* hashTable;
-	U32* hashTable3;
-	U32* chainTable;
-
-	U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */
-
-	int dedicatedDictSearch; /* Indicates whether this matchState is using the
-	                          * dedicated dictionary search structure.
-	                          */
-	optState_t opt;          /* optimal parser state */
-	const ZSTD_matchState_t* dictMatchState;
-	ZSTD_compressionParameters cParams;
-	const rawSeqStore_t* ldmSeqStore;
-
-	/* Controls prefetching in some dictMatchState matchfinders.
-	 * This behavior is controlled from the cctx ms.
-	 * This parameter has no effect in the cdict ms. */
-	int prefetchCDictTables;
-};
-
-typedef struct {
-	ZSTD_compressedBlockState_t* prevCBlock;
-	ZSTD_compressedBlockState_t* nextCBlock;
-	ZSTD_matchState_t matchState;
-} ZSTD_blockState_t;
-
-typedef struct {
-	U32 offset;
-	U32 checksum;
-} ldmEntry_t;
-
-typedef struct {
-	BYTE const * split;
-	U32 hash;
-	U32 checksum;
-	ldmEntry_t* bucket;
-} ldmMatchCandidate_t;
-
-#define LDM_BATCH_SIZE 64
-
-typedef struct {
-	ZSTD_window_t window; /* State for the window round buffer management */
-	ldmEntry_t* hashTable;
-	U32 loadedDictEnd;
-	BYTE* bucketOffsets; /* Next position in bucket to insert entry */
-	size_t splitIndices[LDM_BATCH_SIZE];
-	ldmMatchCandidate_t matchCandidates[LDM_BATCH_SIZE];
-} ldmState_t;
-
-typedef struct {
-	ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */
-	U32 hashLog;                  /* Log size of hashTable */
-	U32 bucketSizeLog;            /* Log bucket size for collision resolution, at most 8 */
-	U32 minMatchLength;           /* Minimum match length */
-	U32 hashRateLog;              /* Log number of entries to skip */
-	U32 windowLog;                /* Window log for the LDM */
-} ldmParams_t;
-
-typedef struct {
-	int collectSequences;
-	ZSTD_Sequence* seqStart;
-	size_t seqIndex;
-	size_t maxSequences;
-} SeqCollector;
-
-struct ZSTD_CCtx_params_s {
-	ZSTD_format_e format;
-	ZSTD_compressionParameters cParams;
-	ZSTD_frameParameters fParams;
-
-	int compressionLevel;
-	int forceWindow;         /* force back-references to respect limit of
-	                          * 1<<wLog, even for dictionary */
-	size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize.
-	                          * No target when targetCBlockSize == 0.
-	                          * There is no guarantee on compressed block size */
-	int srcSizeHint;         /* User's best guess of source size.
-	                          * Hint is not valid when srcSizeHint == 0.
-	                          * There is no guarantee that hint is close to actual source size */
-
-	ZSTD_dictAttachPref_e attachDictPref;
-	ZSTD_paramSwitch_e literalCompressionMode;
-
-	/* Multithreading: used to pass parameters to mtctx */
-	int nbWorkers;
-	size_t jobSize;
-	int overlapLog;
-	int rsyncable;
-
-	/* Long distance matching parameters */
-	ldmParams_t ldmParams;
-
-	/* Dedicated dict search algorithm trigger */
-	int enableDedicatedDictSearch;
-
-	/* Input/output buffer modes */
-	ZSTD_bufferMode_e inBufferMode;
-	ZSTD_bufferMode_e outBufferMode;
-
-	/* Sequence compression API */
-	ZSTD_sequenceFormat_e blockDelimiters;
-	int validateSequences;
-
-	/* Block splitting */
-	ZSTD_paramSwitch_e useBlockSplitter;
-
-	/* Param for deciding whether to use row-based matchfinder */
-	ZSTD_paramSwitch_e useRowMatchFinder;
-
-	/* Always load a dictionary in ext-dict mode (not prefix mode)? */
-	int deterministicRefPrefix;
-
-	/* Internal use, for createCCtxParams() and freeCCtxParams() only */
-	ZSTD_customMem customMem;
-
-	/* Controls prefetching in some dictMatchState matchfinders */
-	ZSTD_paramSwitch_e prefetchCDictTables;
-
-	/* Controls whether zstd will fall back to an internal matchfinder
-	 * if the external matchfinder returns an error code. */
-	int enableMatchFinderFallback;
-
-	/* Indicates whether an external matchfinder has been referenced.
-	 * Users can't set this externally.
-	 * It is set internally in ZSTD_registerSequenceProducer(). */
-	int useSequenceProducer;
-
-	/* Adjust the max block size*/
-	size_t maxBlockSize;
-
-	/* Controls repcode search in external sequence parsing */
-	ZSTD_paramSwitch_e searchForExternalRepcodes;
-}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
-
-#define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
-#define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE)
-
-/**
- * Indicates whether this compression proceeds directly from user-provided
- * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or
- * whether the context needs to buffer the input/output (ZSTDb_buffered).
- */
-typedef enum {
-	ZSTDb_not_buffered,
-	ZSTDb_buffered
-} ZSTD_buffered_policy_e;
-
-/**
- * Struct that contains all elements of block splitter that should be allocated
- * in a wksp.
- */
-#define ZSTD_MAX_NB_BLOCK_SPLITS 196
-typedef struct {
-	seqStore_t fullSeqStoreChunk;
-	seqStore_t firstHalfSeqStore;
-	seqStore_t secondHalfSeqStore;
-	seqStore_t currSeqStore;
-	seqStore_t nextSeqStore;
-
-	U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS];
-	ZSTD_entropyCTablesMetadata_t entropyMetadata;
-} ZSTD_blockSplitCtx;
-
-/* Context for block-level external matchfinder API */
-typedef struct {
-	void* mState;
-	ZSTD_sequenceProducer_F* mFinder;
-	ZSTD_Sequence* seqBuffer;
-	size_t seqBufferCapacity;
-} ZSTD_externalMatchCtx;
-
-struct ZSTD_CCtx_s {
-	ZSTD_compressionStage_e stage;
-	int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
-	int bmi2;           /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
-	ZSTD_CCtx_params requestedParams;
-	ZSTD_CCtx_params appliedParams;
-	ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. */
-	U32 dictID;
-	size_t dictContentSize;
-
-	ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */
-	size_t blockSize;
-	unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */
-	unsigned long long consumedSrcSize;
-	unsigned long long producedCSize;
-	XXH64_state_t xxhState;
-	ZSTD_customMem customMem;
-	ZSTD_threadPool* pool;
-	size_t staticSize;
-	SeqCollector seqCollector;
-	int isFirstBlock;
-	int initialized;
-
-	seqStore_t seqStore;  /* sequences storage ptrs */
-	ldmState_t ldmState;  /* long distance matching state */
-	rawSeq* ldmSequences; /* Storage for the ldm output sequences */
-	size_t maxNbLdmSequences;
-	rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */
-	ZSTD_blockState_t blockState;
-	U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */
-
-	/* Whether we are streaming or not */
-	ZSTD_buffered_policy_e bufferedPolicy;
-
-	/* streaming */
-	char* inBuff;
-	size_t inBuffSize;
-	size_t inToCompress;
-	size_t inBuffPos;
-	size_t inBuffTarget;
-	char* outBuff;
-	size_t outBuffSize;
-	size_t outBuffContentSize;
-	size_t outBuffFlushedSize;
-	ZSTD_cStreamStage streamStage;
-	U32 frameEnded;
-
-	/* Stable in/out buffer verification */
-	ZSTD_inBuffer expectedInBuffer;
-	size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */
-	size_t expectedOutBufferSize;
-
-	/* Dictionary */
-	ZSTD_localDict localDict;
-	const ZSTD_CDict* cdict;
-	ZSTD_prefixDict prefixDict; /* single-usage dictionary */
-
-	/* Multi-threading */
-#ifdef ZSTD_MULTITHREAD
-	ZSTDMT_CCtx* mtctx;
-#endif
-
-	/* Tracing */
-#if ZSTD_TRACE
-	ZSTD_TraceCtx traceCtx;
-#endif
-
-	/* Workspace for block splitter */
-	ZSTD_blockSplitCtx blockSplitCtx;
-
-	/* Workspace for external matchfinder */
-	ZSTD_externalMatchCtx externalMatchCtx;
-};
-
-typedef enum { ZSTD_dtlm_fast,
-	           ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
-typedef enum { ZSTD_tfp_forCCtx,
-	           ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;
-
-typedef enum {
-	ZSTD_noDict = 0,
-	ZSTD_extDict = 1,
-	ZSTD_dictMatchState = 2,
-	ZSTD_dedicatedDictSearch = 3
-} ZSTD_dictMode_e;
-
-typedef enum {
-	ZSTD_cpm_noAttachDict = 0, /* Compression with ZSTD_noDict or ZSTD_extDict.
-	                            * In this mode we use both the srcSize and the dictSize
-	                            * when selecting and adjusting parameters.
-	                            */
-	ZSTD_cpm_attachDict = 1,   /* Compression with ZSTD_dictMatchState or ZSTD_dedicatedDictSearch.
-	                            * In this mode we only take the srcSize into account when selecting
-	                            * and adjusting parameters.
-	                            */
-	ZSTD_cpm_createCDict = 2,  /* Creating a CDict.
-	                            * In this mode we take both the source size and the dictionary size
-	                            * into account when selecting and adjusting the parameters.
-	                            */
-	ZSTD_cpm_unknown = 3       /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
-	                            * We don't know what these parameters are for. We default to the legacy
-	                            * behavior of taking both the source size and the dict size into account
-	                            * when selecting and adjusting parameters.
-	                            */
-} ZSTD_cParamMode_e;
-
-typedef size_t (*ZSTD_blockCompressor)(
-    ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode);
-
-MEM_STATIC U32 ZSTD_LLcode(U32 litLength) {
-	static const BYTE LL_Code[64] = {0, 1, 2, 3, 4, 5, 6, 7,
-	                                 8, 9, 10, 11, 12, 13, 14, 15,
-	                                 16, 16, 17, 17, 18, 18, 19, 19,
-	                                 20, 20, 20, 20, 21, 21, 21, 21,
-	                                 22, 22, 22, 22, 22, 22, 22, 22,
-	                                 23, 23, 23, 23, 23, 23, 23, 23,
-	                                 24, 24, 24, 24, 24, 24, 24, 24,
-	                                 24, 24, 24, 24, 24, 24, 24, 24};
-	static const U32 LL_deltaCode = 19;
-	return (litLength > 63) ? ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
-}
-
-/* ZSTD_MLcode() :
- * note : mlBase = matchLength - MINMATCH;
- *        because it's the format it's stored in seqStore->sequences */
-MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) {
-	static const BYTE ML_Code[128] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-	                                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-	                                  32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
-	                                  38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
-	                                  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
-	                                  41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
-	                                  42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
-	                                  42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42};
-	static const U32 ML_deltaCode = 36;
-	return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase];
-}
-
-/* ZSTD_cParam_withinBounds:
- * @return 1 if value is within cParam bounds,
- * 0 otherwise */
-MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) {
-	ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);
-	if(ZSTD_isError(bounds.error))
-		return 0;
-	if(value < bounds.lowerBound)
-		return 0;
-	if(value > bounds.upperBound)
-		return 0;
-	return 1;
-}
-
-/* ZSTD_noCompressBlock() :
- * Writes uncompressed block to dst buffer from given src.
- * Returns the size of the block */
-MEM_STATIC size_t
-ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) {
-	U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw) << 1) + (U32)(srcSize << 3);
-	DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity);
-	RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
-	                dstSize_tooSmall, "dst buf too small for uncompressed block");
-	MEM_writeLE24(dst, cBlockHeader24);
-	ZSTD_memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
-	return ZSTD_blockHeaderSize + srcSize;
-}
-
-MEM_STATIC size_t
-ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) {
-	BYTE* const op = (BYTE*)dst;
-	U32 const cBlockHeader = lastBlock + (((U32)bt_rle) << 1) + (U32)(srcSize << 3);
-	RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "");
-	MEM_writeLE24(op, cBlockHeader);
-	op[3] = src;
-	return 4;
-}
-
-/* ZSTD_minGain() :
- * minimum compression required
- * to generate a compress block or a compressed literals section.
- * note : use same formula for both situations */
-MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) {
-	U32 const minlog = (strat >= ZSTD_btultra) ? (U32)(strat)-1 : 6;
-	ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
-	assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat));
-	return (srcSize >> minlog) + 2;
-}
-
-MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams) {
-	switch(cctxParams->literalCompressionMode) {
-	case ZSTD_ps_enable:
-		return 0;
-	case ZSTD_ps_disable:
-		return 1;
-	default:
-		assert(0 /* impossible: pre-validated */);
-		ZSTD_FALLTHROUGH;
-	case ZSTD_ps_auto:
-		return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
-	}
-}
-
-/*! ZSTD_safecopyLiterals() :
- *  memcpy() function that won't read beyond more than WILDCOPY_OVERLENGTH bytes past ilimit_w.
- *  Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single
- *  large copies.
- */
-static void
-ZSTD_safecopyLiterals(BYTE* op, BYTE const * ip, BYTE const * const iend, BYTE const * ilimit_w) {
-	assert(iend > ilimit_w);
-	if(ip <= ilimit_w) {
-		ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap);
-		op += ilimit_w - ip;
-		ip = ilimit_w;
-	}
-	while(ip < iend)
-		*op++ = *ip++;
-}
-
-#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
-#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
-#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
-#define REPCODE_TO_OFFBASE(r) (assert((r) >= 1), assert((r) <= ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
-#define OFFSET_TO_OFFBASE(o) (assert((o) > 0), o + ZSTD_REP_NUM)
-#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM)
-#define OFFBASE_IS_REPCODE(o) (1 <= (o) && (o) <= ZSTD_REP_NUM)
-#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o)-ZSTD_REP_NUM)
-#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */
-
-/*! ZSTD_storeSeq() :
- *  Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
- *  @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
- *  @matchLength : must be >= MINMATCH
- *  Allowed to over-read literals up to litLimit.
- */
-HINT_INLINE UNUSED_ATTR void
-ZSTD_storeSeq(seqStore_t* seqStorePtr,
-              size_t litLength, const BYTE* literals, const BYTE* litLimit,
-              U32 offBase,
-              size_t matchLength) {
-	BYTE const * const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
-	BYTE const * const litEnd = literals + litLength;
-#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6)
-	static const BYTE* g_start = NULL;
-	if(g_start == NULL)
-		g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */
-	{
-		U32 const pos = (U32)((const BYTE*)literals - g_start);
-		DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
-		         pos, (U32)litLength, (U32)matchLength, (U32)offBase);
-	}
-#endif
-	assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
-	/* copy Literals */
-	assert(seqStorePtr->maxNbLit <= 128 KB);
-	assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
-	assert(literals + litLength <= litLimit);
-	if(litEnd <= litLimit_w) {
-		/* Common case we can use wildcopy.
-		 * First copy 16 bytes, because literals are likely short.
-		 */
-		ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
-		ZSTD_copy16(seqStorePtr->lit, literals);
-		if(litLength > 16) {
-			ZSTD_wildcopy(seqStorePtr->lit + 16, literals + 16, (ptrdiff_t)litLength - 16, ZSTD_no_overlap);
-		}
-	} else {
-		ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w);
-	}
-	seqStorePtr->lit += litLength;
-
-	/* literal Length */
-	if(litLength > 0xFFFF) {
-		assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */
-		seqStorePtr->longLengthType = ZSTD_llt_literalLength;
-		seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
-	}
-	seqStorePtr->sequences[0].litLength = (U16)litLength;
-
-	/* match offset */
-	seqStorePtr->sequences[0].offBase = offBase;
-
-	/* match Length */
-	assert(matchLength >= MINMATCH);
-	{
-		size_t const mlBase = matchLength - MINMATCH;
-		if(mlBase > 0xFFFF) {
-			assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */
-			seqStorePtr->longLengthType = ZSTD_llt_matchLength;
-			seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
-		}
-		seqStorePtr->sequences[0].mlBase = (U16)mlBase;
-	}
-
-	seqStorePtr->sequences++;
-}
-
-/* ZSTD_updateRep() :
- * updates in-place @rep (array of repeat offsets)
- * @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
- */
-MEM_STATIC void
-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) {
-	if(OFFBASE_IS_OFFSET(offBase)) { /* full offset */
-		rep[2] = rep[1];
-		rep[1] = rep[0];
-		rep[0] = OFFBASE_TO_OFFSET(offBase);
-	} else { /* repcode */
-		U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;
-		if(repCode > 0) { /* note : if repCode==0, no change */
-			U32 const currentOffset = (repCode == ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
-			rep[2] = (repCode >= 2) ? rep[1] : rep[2];
-			rep[1] = rep[0];
-			rep[0] = currentOffset;
-		} else { /* repCode == 0 */
-			     /* nothing to do */
-		}
-	}
-}
-
-typedef struct repcodes_s {
-	U32 rep[3];
-} repcodes_t;
-
-MEM_STATIC repcodes_t
-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) {
-	repcodes_t newReps;
-	ZSTD_memcpy(&newReps, rep, sizeof(newReps));
-	ZSTD_updateRep(newReps.rep, offBase, ll0);
-	return newReps;
-}
-
-/*-*************************************
- *  Match length counter
- ***************************************/
-MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) {
-	const BYTE* const pStart = pIn;
-	const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t) - 1);
-
-	if(pIn < pInLoopLimit) {
-		{
-			size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
-			if(diff)
-				return ZSTD_NbCommonBytes(diff);
-		}
-		pIn += sizeof(size_t);
-		pMatch += sizeof(size_t);
-		while(pIn < pInLoopLimit) {
-			size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
-			if(!diff) {
-				pIn += sizeof(size_t);
-				pMatch += sizeof(size_t);
-				continue;
-			}
-			pIn += ZSTD_NbCommonBytes(diff);
-			return (size_t)(pIn - pStart);
-		}
-	}
-	if(MEM_64bits() && (pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) {
-		pIn += 4;
-		pMatch += 4;
-	}
-	if((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) {
-		pIn += 2;
-		pMatch += 2;
-	}
-	if((pIn < pInLimit) && (*pMatch == *pIn))
-		pIn++;
-	return (size_t)(pIn - pStart);
-}
-
-/** ZSTD_count_2segments() :
- *  can count match length with `ip` & `match` in 2 different segments.
- *  convention : on reaching mEnd, match count continue starting from iStart
- */
-MEM_STATIC size_t
-ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
-                     const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart) {
-	const BYTE* const vEnd = MIN(ip + (mEnd - match), iEnd);
-	size_t const matchLength = ZSTD_count(ip, match, vEnd);
-	if(match + matchLength != mEnd)
-		return matchLength;
-	DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength);
-	DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
-	DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
-	DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart);
-	DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip + matchLength, iStart, iEnd));
-	return matchLength + ZSTD_count(ip + matchLength, iStart, iEnd);
-}
-
-/*-*************************************
- *  Hashes
- ***************************************/
-static const U32 prime3bytes = 506832829U;
-static U32 ZSTD_hash3(U32 u, U32 h) {
-	assert(h <= 32);
-	return ((u << (32 - 24)) * prime3bytes) >> (32 - h);
-}
-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
-
-static const U32 prime4bytes = 2654435761U;
-static U32 ZSTD_hash4(U32 u, U32 h) {
-	assert(h <= 32);
-	return (u * prime4bytes) >> (32 - h);
-}
-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); }
-
-static const U64 prime5bytes = 889523592379ULL;
-static size_t ZSTD_hash5(U64 u, U32 h) {
-	assert(h <= 64);
-	return (size_t)(((u << (64 - 40)) * prime5bytes) >> (64 - h));
-}
-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
-
-static const U64 prime6bytes = 227718039650203ULL;
-static size_t ZSTD_hash6(U64 u, U32 h) {
-	assert(h <= 64);
-	return (size_t)(((u << (64 - 48)) * prime6bytes) >> (64 - h));
-}
-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
-
-static const U64 prime7bytes = 58295818150454627ULL;
-static size_t ZSTD_hash7(U64 u, U32 h) {
-	assert(h <= 64);
-	return (size_t)(((u << (64 - 56)) * prime7bytes) >> (64 - h));
-}
-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
-
-static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
-static size_t ZSTD_hash8(U64 u, U32 h) {
-	assert(h <= 64);
-	return (size_t)(((u)*prime8bytes) >> (64 - h));
-}
-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
-
-MEM_STATIC FORCE_INLINE_ATTR
-    size_t
-    ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) {
-	/* Although some of these hashes do support hBits up to 64, some do not.
-	 * To be on the safe side, always avoid hBits > 32. */
-	assert(hBits <= 32);
-
-	switch(mls) {
-	default:
-	case 4:
-		return ZSTD_hash4Ptr(p, hBits);
-	case 5:
-		return ZSTD_hash5Ptr(p, hBits);
-	case 6:
-		return ZSTD_hash6Ptr(p, hBits);
-	case 7:
-		return ZSTD_hash7Ptr(p, hBits);
-	case 8:
-		return ZSTD_hash8Ptr(p, hBits);
-	}
-}
-
-/** ZSTD_ipow() :
- * Return base^exponent.
- */
-static U64 ZSTD_ipow(U64 base, U64 exponent) {
-	U64 power = 1;
-	while(exponent) {
-		if(exponent & 1)
-			power *= base;
-		exponent >>= 1;
-		base *= base;
-	}
-	return power;
-}
-
-#define ZSTD_ROLL_HASH_CHAR_OFFSET 10
-
-/** ZSTD_rollingHash_append() :
- * Add the buffer to the hash value.
- */
-static U64 ZSTD_rollingHash_append(U64 hash, void const * buf, size_t size) {
-	BYTE const * istart = (BYTE const *)buf;
-	size_t pos;
-	for(pos = 0; pos < size; ++pos) {
-		hash *= prime8bytes;
-		hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET;
-	}
-	return hash;
-}
-
-/** ZSTD_rollingHash_compute() :
- * Compute the rolling hash value of the buffer.
- */
-MEM_STATIC U64 ZSTD_rollingHash_compute(void const * buf, size_t size) {
-	return ZSTD_rollingHash_append(0, buf, size);
-}
-
-/** ZSTD_rollingHash_primePower() :
- * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash
- * over a window of length bytes.
- */
-MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length) {
-	return ZSTD_ipow(prime8bytes, length - 1);
-}
-
-/** ZSTD_rollingHash_rotate() :
- * Rotate the rolling hash by one byte.
- */
-MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower) {
-	hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower;
-	hash *= prime8bytes;
-	hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET;
-	return hash;
-}
-
-/*-*************************************
- *  Round buffer management
- ***************************************/
-#if(ZSTD_WINDOWLOG_MAX_64 > 31)
-#error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX"
-#endif
-/* Max current allowed */
-#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
-/* Maximum chunk size before overflow correction needs to be called again */
-#define ZSTD_CHUNKSIZE_MAX                                  \
-	(((U32)-1)           /* Maximum ending current index */ \
-	 - ZSTD_CURRENT_MAX) /* Maximum beginning lowLimit */
-
-/**
- * ZSTD_window_clear():
- * Clears the window containing the history by simply setting it to empty.
- */
-MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window) {
-	size_t const endT = (size_t)(window->nextSrc - window->base);
-	U32 const end = (U32)endT;
-
-	window->lowLimit = end;
-	window->dictLimit = end;
-}
-
-MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) {
-	return window.dictLimit == ZSTD_WINDOW_START_INDEX &&
-	       window.lowLimit == ZSTD_WINDOW_START_INDEX &&
-	       (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX;
-}
-
-/**
- * ZSTD_window_hasExtDict():
- * Returns non-zero if the window has a non-empty extDict.
- */
-MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) {
-	return window.lowLimit < window.dictLimit;
-}
-
-/**
- * ZSTD_matchState_dictMode():
- * Inspects the provided matchState and figures out what dictMode should be
- * passed to the compressor.
- */
-MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t* ms) {
-	return ZSTD_window_hasExtDict(ms->window) ? ZSTD_extDict : ms->dictMatchState != NULL ? (ms->dictMatchState->dedicatedDictSearch ? ZSTD_dedicatedDictSearch : ZSTD_dictMatchState)
-	                                                                                      : ZSTD_noDict;
-}
-
-/* Defining this macro to non-zero tells zstd to run the overflow correction
- * code much more frequently. This is very inefficient, and should only be
- * used for tests and fuzzers.
- */
-#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-#define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1
-#else
-#define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0
-#endif
-#endif
-
-/**
- * ZSTD_window_canOverflowCorrect():
- * Returns non-zero if the indices are large enough for overflow correction
- * to work correctly without impacting compression ratio.
- */
-MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window,
-                                              U32 cycleLog,
-                                              U32 maxDist,
-                                              U32 loadedDictEnd,
-                                              void const * src) {
-	U32 const cycleSize = 1u << cycleLog;
-	U32 const curr = (U32)((BYTE const *)src - window.base);
-	U32 const minIndexToOverflowCorrect = cycleSize + MAX(maxDist, cycleSize) + ZSTD_WINDOW_START_INDEX;
-
-	/* Adjust the min index to backoff the overflow correction frequency,
-	 * so we don't waste too much CPU in overflow correction. If this
-	 * computation overflows we don't really care, we just need to make
-	 * sure it is at least minIndexToOverflowCorrect.
-	 */
-	U32 const adjustment = window.nbOverflowCorrections + 1;
-	U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment,
-	                              minIndexToOverflowCorrect);
-	U32 const indexLargeEnough = curr > adjustedIndex;
-
-	/* Only overflow correct early if the dictionary is invalidated already,
-	 * so we don't hurt compression ratio.
-	 */
-	U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd;
-
-	return indexLargeEnough && dictionaryInvalidated;
-}
-
-/**
- * ZSTD_window_needOverflowCorrection():
- * Returns non-zero if the indices are getting too large and need overflow
- * protection.
- */
-MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
-                                                  U32 cycleLog,
-                                                  U32 maxDist,
-                                                  U32 loadedDictEnd,
-                                                  void const * src,
-                                                  void const * srcEnd) {
-	U32 const curr = (U32)((BYTE const *)srcEnd - window.base);
-	if(ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) {
-		if(ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) {
-			return 1;
-		}
-	}
-	return curr > ZSTD_CURRENT_MAX;
-}
-
-/**
- * ZSTD_window_correctOverflow():
- * Reduces the indices to protect from index overflow.
- * Returns the correction made to the indices, which must be applied to every
- * stored index.
- *
- * The least significant cycleLog bits of the indices must remain the same,
- * which may be 0. Every index up to maxDist in the past must be valid.
- */
-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
-                                           U32 maxDist, void const * src) {
-	/* preemptive overflow correction:
-	 * 1. correction is large enough:
-	 *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
-	 *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
-	 *
-	 *    current - newCurrent
-	 *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
-	 *    > (3<<29) - (1<<chainLog)
-	 *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
-	 *    > 1<<29
-	 *
-	 * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
-	 *    After correction, current is less than (1<<chainLog + 1<<windowLog).
-	 *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
-	 *    In 32-bit mode we are safe, because (chainLog <= 29), so
-	 *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
-	 * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
-	 *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
-	 */
-	U32 const cycleSize = 1u << cycleLog;
-	U32 const cycleMask = cycleSize - 1;
-	U32 const curr = (U32)((BYTE const *)src - window->base);
-	U32 const currentCycle = curr & cycleMask;
-	/* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. */
-	U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX
-	                                       ? MAX(cycleSize, ZSTD_WINDOW_START_INDEX)
-	                                       : 0;
-	U32 const newCurrent = currentCycle + currentCycleCorrection + MAX(maxDist, cycleSize);
-	U32 const correction = curr - newCurrent;
-	/* maxDist must be a power of two so that:
-	 *   (newCurrent & cycleMask) == (curr & cycleMask)
-	 * This is required to not corrupt the chains / binary tree.
-	 */
-	assert((maxDist & (maxDist - 1)) == 0);
-	assert((curr & cycleMask) == (newCurrent & cycleMask));
-	assert(curr > newCurrent);
-	if(!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) {
-		/* Loose bound, should be around 1<<29 (see above) */
-		assert(correction > 1 << 28);
-	}
-
-	window->base += correction;
-	window->dictBase += correction;
-	if(window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) {
-		window->lowLimit = ZSTD_WINDOW_START_INDEX;
-	} else {
-		window->lowLimit -= correction;
-	}
-	if(window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) {
-		window->dictLimit = ZSTD_WINDOW_START_INDEX;
-	} else {
-		window->dictLimit -= correction;
-	}
-
-	/* Ensure we can still reference the full window. */
-	assert(newCurrent >= maxDist);
-	assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX);
-	/* Ensure that lowLimit and dictLimit didn't underflow. */
-	assert(window->lowLimit <= newCurrent);
-	assert(window->dictLimit <= newCurrent);
-
-	++window->nbOverflowCorrections;
-
-	DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,
-	         window->lowLimit);
-	return correction;
-}
-
-/**
- * ZSTD_window_enforceMaxDist():
- * Updates lowLimit so that:
- *    (srcEnd - base) - lowLimit == maxDist + loadedDictEnd
- *
- * It ensures index is valid as long as index >= lowLimit.
- * This must be called before a block compression call.
- *
- * loadedDictEnd is only defined if a dictionary is in use for current compression.
- * As the name implies, loadedDictEnd represents the index at end of dictionary.
- * The value lies within context's referential, it can be directly compared to blockEndIdx.
- *
- * If loadedDictEndPtr is NULL, no dictionary is in use, and we use loadedDictEnd == 0.
- * If loadedDictEndPtr is not NULL, we set it to zero after updating lowLimit.
- * This is because dictionaries are allowed to be referenced fully
- * as long as the last byte of the dictionary is in the window.
- * Once input has progressed beyond window size, dictionary cannot be referenced anymore.
- *
- * In normal dict mode, the dictionary lies between lowLimit and dictLimit.
- * In dictMatchState mode, lowLimit and dictLimit are the same,
- * and the dictionary is below them.
- * forceWindow and dictMatchState are therefore incompatible.
- */
-MEM_STATIC void
-ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
-                           const void* blockEnd,
-                           U32 maxDist,
-                           U32* loadedDictEndPtr,
-                           const ZSTD_matchState_t** dictMatchStatePtr) {
-	U32 const blockEndIdx = (U32)((BYTE const *)blockEnd - window->base);
-	U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0;
-	DEBUGLOG(5, "ZSTD_window_enforceMaxDist: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
-	         (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
-
-	/* - When there is no dictionary : loadedDictEnd == 0.
-	     In which case, the test (blockEndIdx > maxDist) is merely to avoid
-	     overflowing next operation `newLowLimit = blockEndIdx - maxDist`.
-	   - When there is a standard dictionary :
-	     Index referential is copied from the dictionary,
-	     which means it starts from 0.
-	     In which case, loadedDictEnd == dictSize,
-	     and it makes sense to compare `blockEndIdx > maxDist + dictSize`
-	     since `blockEndIdx` also starts from zero.
-	   - When there is an attached dictionary :
-	     loadedDictEnd is expressed within the referential of the context,
-	     so it can be directly compared against blockEndIdx.
-	*/
-	if(blockEndIdx > maxDist + loadedDictEnd) {
-		U32 const newLowLimit = blockEndIdx - maxDist;
-		if(window->lowLimit < newLowLimit)
-			window->lowLimit = newLowLimit;
-		if(window->dictLimit < window->lowLimit) {
-			DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u",
-			         (unsigned)window->dictLimit, (unsigned)window->lowLimit);
-			window->dictLimit = window->lowLimit;
-		}
-		/* On reaching window size, dictionaries are invalidated */
-		if(loadedDictEndPtr)
-			*loadedDictEndPtr = 0;
-		if(dictMatchStatePtr)
-			*dictMatchStatePtr = NULL;
-	}
-}
-
-/* Similar to ZSTD_window_enforceMaxDist(),
- * but only invalidates dictionary
- * when input progresses beyond window size.
- * assumption : loadedDictEndPtr and dictMatchStatePtr are valid (non NULL)
- *              loadedDictEnd uses same referential as window->base
- *              maxDist is the window size */
-MEM_STATIC void
-ZSTD_checkDictValidity(const ZSTD_window_t* window,
-                       const void* blockEnd,
-                       U32 maxDist,
-                       U32* loadedDictEndPtr,
-                       const ZSTD_matchState_t** dictMatchStatePtr) {
-	assert(loadedDictEndPtr != NULL);
-	assert(dictMatchStatePtr != NULL);
-	{
-		U32 const blockEndIdx = (U32)((BYTE const *)blockEnd - window->base);
-		U32 const loadedDictEnd = *loadedDictEndPtr;
-		DEBUGLOG(5, "ZSTD_checkDictValidity: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
-		         (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
-		assert(blockEndIdx >= loadedDictEnd);
-
-		if(blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) {
-			/* On reaching window size, dictionaries are invalidated.
-			 * For simplification, if window size is reached anywhere within next block,
-			 * the dictionary is invalidated for the full block.
-			 *
-			 * We also have to invalidate the dictionary if ZSTD_window_update() has detected
-			 * non-contiguous segments, which means that loadedDictEnd != window->dictLimit.
-			 * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use
-			 * dictMatchState, so setting it to NULL is not a problem.
-			 */
-			DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
-			*loadedDictEndPtr = 0;
-			*dictMatchStatePtr = NULL;
-		} else {
-			if(*loadedDictEndPtr != 0) {
-				DEBUGLOG(6, "dictionary considered valid for current block");
-			}
-		}
-	}
-}
-
-MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
-	ZSTD_memset(window, 0, sizeof(*window));
-	window->base = (BYTE const *)" ";
-	window->dictBase = (BYTE const *)" ";
-	ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */
-	window->dictLimit = ZSTD_WINDOW_START_INDEX;                           /* start from >0, so that 1st position is valid */
-	window->lowLimit = ZSTD_WINDOW_START_INDEX;                            /* it ensures first and later CCtx usages compress the same */
-	window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX;              /* see issue #1241 */
-	window->nbOverflowCorrections = 0;
-}
-
-/**
- * ZSTD_window_update():
- * Updates the window by appending [src, src + srcSize) to the window.
- * If it is not contiguous, the current prefix becomes the extDict, and we
- * forget about the extDict. Handles overlap of the prefix and extDict.
- * Returns non-zero if the segment is contiguous.
- */
-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
-                                  void const * src, size_t srcSize,
-                                  int forceNonContiguous) {
-	BYTE const * const ip = (BYTE const *)src;
-	U32 contiguous = 1;
-	DEBUGLOG(5, "ZSTD_window_update");
-	if(srcSize == 0)
-		return contiguous;
-	assert(window->base != NULL);
-	assert(window->dictBase != NULL);
-	/* Check if blocks follow each other */
-	if(src != window->nextSrc || forceNonContiguous) {
-		/* not contiguous */
-		size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
-		DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
-		window->lowLimit = window->dictLimit;
-		assert(distanceFromBase == (size_t)(U32)distanceFromBase); /* should never overflow */
-		window->dictLimit = (U32)distanceFromBase;
-		window->dictBase = window->base;
-		window->base = ip - distanceFromBase;
-		/* ms->nextToUpdate = window->dictLimit; */
-		if(window->dictLimit - window->lowLimit < HASH_READ_SIZE)
-			window->lowLimit = window->dictLimit; /* too small extDict */
-		contiguous = 0;
-	}
-	window->nextSrc = ip + srcSize;
-	/* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
-	if((ip + srcSize > window->dictBase + window->lowLimit) & (ip < window->dictBase + window->dictLimit)) {
-		ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
-		U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;
-		window->lowLimit = lowLimitMax;
-		DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit);
-	}
-	return contiguous;
-}
-
-/**
- * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix.
- */
-MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) {
-	U32 const maxDistance = 1U << windowLog;
-	U32 const lowestValid = ms->window.lowLimit;
-	U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
-	U32 const isDictionary = (ms->loadedDictEnd != 0);
-	/* When using a dictionary the entire dictionary is valid if a single byte of the dictionary
-	 * is within the window. We invalidate the dictionary (and set loadedDictEnd to 0) when it isn't
-	 * valid for the entire block. So this check is sufficient to find the lowest valid match index.
-	 */
-	U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
-	return matchLowest;
-}
-
-/**
- * Returns the lowest allowed match index in the prefix.
- */
-MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) {
-	U32 const maxDistance = 1U << windowLog;
-	U32 const lowestValid = ms->window.dictLimit;
-	U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
-	U32 const isDictionary = (ms->loadedDictEnd != 0);
-	/* When computing the lowest prefix index we need to take the dictionary into account to handle
-	 * the edge case where the dictionary and the source are contiguous in memory.
-	 */
-	U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
-	return matchLowest;
-}
-
-/* debug functions */
-#if(DEBUGLEVEL >= 2)
-
-MEM_STATIC double ZSTD_fWeight(U32 rawStat) {
-	U32 const fp_accuracy = 8;
-	U32 const fp_multiplier = (1 << fp_accuracy);
-	U32 const newStat = rawStat + 1;
-	U32 const hb = ZSTD_highbit32(newStat);
-	U32 const BWeight = hb * fp_multiplier;
-	U32 const FWeight = (newStat << fp_accuracy) >> hb;
-	U32 const weight = BWeight + FWeight;
-	assert(hb + fp_accuracy < 31);
-	return (double)weight / fp_multiplier;
-}
-
-/* display a table content,
- * listing each element, its frequency, and its predicted bit cost */
-MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) {
-	unsigned u, sum;
-	for(u = 0, sum = 0; u <= max; u++)
-		sum += table[u];
-	DEBUGLOG(2, "total nb elts: %u", sum);
-	for(u = 0; u <= max; u++) {
-		DEBUGLOG(2, "%2u: %5u  (%.2f)",
-		         u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]));
-	}
-}
-
-#endif
-
-/* Short Cache */
-
-/* Normally, zstd matchfinders follow this flow:
- *     1. Compute hash at ip
- *     2. Load index from hashTable[hash]
- *     3. Check if *ip == *(base + index)
- * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
- *
- * Short cache is an optimization which allows us to avoid step 3 most of the time
- * when the data doesn't actually match. With short cache, the flow becomes:
- *     1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
- *     2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
- *     3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
- *
- * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
- * dictMatchState matchfinders.
- */
-#define ZSTD_SHORT_CACHE_TAG_BITS 8
-#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
-
-/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
- * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */
-MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
-	size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
-	U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
-	assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);
-	hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
-}
-
-/* Helper function for short cache matchfinders.
- * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */
-MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
-	U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;
-	U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;
-	return tag1 == tag2;
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-/* ===============================================================
- * Shared internal declarations
- * These prototypes may be called from sources not in lib/compress
- * =============================================================== */
-
-/* ZSTD_loadCEntropy() :
- * dict : must point at beginning of a valid zstd dictionary.
- * return : size of dictionary header (size of magic number + dict ID + entropy tables)
- * assumptions : magic number supposed already checked
- *               and dictSize >= 8 */
-size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
-                         const void* const dict, size_t dictSize);
-
-void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
-
-/* ==============================================================
- * Private declarations
- * These prototypes shall only be called from within lib/compress
- * ============================================================== */
-
-/* ZSTD_getCParamsFromCCtxParams() :
- * cParams are built depending on compressionLevel, src size hints,
- * LDM and manually set compression parameters.
- * Note: srcSizeHint == 0 means 0!
- */
-ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
-    const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
-
-/*! ZSTD_initCStream_internal() :
- *  Private use only. Init streaming operation.
- *  expects params to be valid.
- *  must receive dict, or cdict, or none, but not both.
- *  @return : 0, or an error code */
-size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
-                                 const void* dict, size_t dictSize,
-                                 const ZSTD_CDict* cdict,
-                                 const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize);
-
-void ZSTD_resetSeqStore(seqStore_t* ssPtr);
-
-/*! ZSTD_getCParamsFromCDict() :
- *  as the name implies */
-ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
-
-/* ZSTD_compressBegin_advanced_internal() :
- * Private use only. To be called from zstdmt_compress.c. */
-size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
-                                            const void* dict, size_t dictSize,
-                                            ZSTD_dictContentType_e dictContentType,
-                                            ZSTD_dictTableLoadMethod_e dtlm,
-                                            const ZSTD_CDict* cdict,
-                                            const ZSTD_CCtx_params* params,
-                                            unsigned long long pledgedSrcSize);
-
-/* ZSTD_compress_advanced_internal() :
- * Private use only. To be called from zstdmt_compress.c. */
-size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
-                                       void* dst, size_t dstCapacity,
-                                       const void* src, size_t srcSize,
-                                       const void* dict, size_t dictSize,
-                                       const ZSTD_CCtx_params* params);
-
-/* ZSTD_writeLastEmptyBlock() :
- * output an empty Block with end-of-frame mark to complete a frame
- * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
- *           or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
- */
-size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
-
-/* ZSTD_referenceExternalSequences() :
- * Must be called before starting a compression operation.
- * seqs must parse a prefix of the source.
- * This cannot be used when long range matching is enabled.
- * Zstd will use these sequences, and pass the literals to a secondary block
- * compressor.
- * @return : An error code on failure.
- * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
- * access and data corruption.
- */
-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
-
-/** ZSTD_cycleLog() :
- *  condition for correct operation : hashLog > 1 */
-U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
-
-/** ZSTD_CCtx_trace() :
- *  Trace the end of a compression call.
- */
-void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
-
-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
- * Note that the block delimiter must include the last literals of the block.
- */
-size_t
-ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
-                                               ZSTD_sequencePosition* seqPos,
-                                               const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                               const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
-
-/* Returns the number of bytes to move the current read position back by.
- * Only non-zero if we ended up splitting a sequence.
- * Otherwise, it may return a ZSTD error if something went wrong.
- *
- * This function will attempt to scan through blockSize bytes
- * represented by the sequences in @inSeqs,
- * storing any (partial) sequences.
- *
- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
- * avoid splitting a match, or to avoid splitting a match such that it would produce a match
- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
- */
-size_t
-ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
-                                         const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                         const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
-
-#endif /* ZSTD_COMPRESS_H */
diff --git a/src/zstd/zstd_compress_literals.c b/src/zstd/zstd_compress_literals.c
deleted file mode 100644
index adfcc3b8b..000000000
--- a/src/zstd/zstd_compress_literals.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/*-*************************************
- *  Dependencies
- ***************************************/
-#include "zstd_compress_literals.h"
-
-/* **************************************************************
- *  Debug Traces
- ****************************************************************/
-#if DEBUGLEVEL >= 2
-
-static size_t showHexa(const void* src, size_t srcSize) {
-	const BYTE* const ip = (const BYTE*)src;
-	size_t u;
-	for(u = 0; u < srcSize; u++) {
-		RAWLOG(5, " %02X", ip[u]);
-		(void)ip;
-	}
-	RAWLOG(5, " \n");
-	return srcSize;
-}
-
-#endif
-
-/* **************************************************************
- *  Literals compression - special cases
- ****************************************************************/
-size_t ZSTD_noCompressLiterals(void* dst, size_t dstCapacity, const void* src, size_t srcSize) {
-	BYTE* const ostart = (BYTE*)dst;
-	U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095);
-
-	DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
-
-	RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
-
-	switch(flSize) {
-	case 1: /* 2 - 1 - 5 */
-		ostart[0] = (BYTE)((U32)set_basic + (srcSize << 3));
-		break;
-	case 2: /* 2 - 2 - 12 */
-		MEM_writeLE16(ostart, (U16)((U32)set_basic + (1 << 2) + (srcSize << 4)));
-		break;
-	case 3: /* 2 - 2 - 20 */
-		MEM_writeLE32(ostart, (U32)((U32)set_basic + (3 << 2) + (srcSize << 4)));
-		break;
-	default: /* not necessary : flSize is {1,2,3} */
-		assert(0);
-	}
-
-	ZSTD_memcpy(ostart + flSize, src, srcSize);
-	DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
-	return srcSize + flSize;
-}
-
-static int allBytesIdentical(const void* src, size_t srcSize) {
-	assert(srcSize >= 1);
-	assert(src != NULL);
-	{
-		const BYTE b = ((const BYTE*)src)[0];
-		size_t p;
-		for(p = 1; p < srcSize; p++) {
-			if(((const BYTE*)src)[p] != b)
-				return 0;
-		}
-		return 1;
-	}
-}
-
-size_t ZSTD_compressRleLiteralsBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize) {
-	BYTE* const ostart = (BYTE*)dst;
-	U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095);
-
-	assert(dstCapacity >= 4);
-	(void)dstCapacity;
-	assert(allBytesIdentical(src, srcSize));
-
-	switch(flSize) {
-	case 1: /* 2 - 1 - 5 */
-		ostart[0] = (BYTE)((U32)set_rle + (srcSize << 3));
-		break;
-	case 2: /* 2 - 2 - 12 */
-		MEM_writeLE16(ostart, (U16)((U32)set_rle + (1 << 2) + (srcSize << 4)));
-		break;
-	case 3: /* 2 - 2 - 20 */
-		MEM_writeLE32(ostart, (U32)((U32)set_rle + (3 << 2) + (srcSize << 4)));
-		break;
-	default: /* not necessary : flSize is {1,2,3} */
-		assert(0);
-	}
-
-	ostart[flSize] = *(const BYTE*)src;
-	DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
-	return flSize + 1;
-}
-
-/* ZSTD_minLiteralsToCompress() :
- * returns minimal amount of literals
- * for literal compression to even be attempted.
- * Minimum is made tighter as compression strategy increases.
- */
-static size_t
-ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) {
-	assert((int)strategy >= 0);
-	assert((int)strategy <= 9);
-	/* btultra2 : min 8 bytes;
-	 * then 2x larger for each successive compression strategy
-	 * max threshold 64 bytes */
-	{
-		int const shift = MIN(9 - (int)strategy, 3);
-		size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;
-		DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
-		return mintc;
-	}
-}
-
-size_t ZSTD_compressLiterals(
-    void* dst, size_t dstCapacity,
-    const void* src, size_t srcSize,
-    void* entropyWorkspace, size_t entropyWorkspaceSize,
-    const ZSTD_hufCTables_t* prevHuf,
-    ZSTD_hufCTables_t* nextHuf,
-    ZSTD_strategy strategy,
-    int disableLiteralCompression,
-    int suspectUncompressible,
-    int bmi2) {
-	size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
-	BYTE* const ostart = (BYTE*)dst;
-	U32 singleStream = srcSize < 256;
-	symbolEncodingType_e hType = set_compressed;
-	size_t cLitSize;
-
-	DEBUGLOG(5, "ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
-	         disableLiteralCompression, (U32)srcSize, dstCapacity);
-
-	DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
-
-	/* Prepare nextEntropy assuming reusing the existing table */
-	ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-
-	if(disableLiteralCompression)
-		return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-
-	/* if too small, don't even attempt compression (speed opt) */
-	if(srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
-		return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-
-	RETURN_ERROR_IF(dstCapacity < lhSize + 1, dstSize_tooSmall, "not enough space for compression");
-	{
-		HUF_repeat repeat = prevHuf->repeatMode;
-		int const flags = 0 | (bmi2 ? HUF_flags_bmi2 : 0) | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
-
-		typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
-		huf_compress_f huf_compress;
-		if(repeat == HUF_repeat_valid && lhSize == 3)
-			singleStream = 1;
-		huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
-		cLitSize = huf_compress(ostart + lhSize, dstCapacity - lhSize,
-		                        src, srcSize,
-		                        HUF_SYMBOLVALUE_MAX, LitHufLog,
-		                        entropyWorkspace, entropyWorkspaceSize,
-		                        (HUF_CElt*)nextHuf->CTable,
-		                        &repeat, flags);
-		DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
-		if(repeat != HUF_repeat_none) {
-			/* reused the existing table */
-			DEBUGLOG(5, "reusing statistics from previous huffman block");
-			hType = set_repeat;
-		}
-	}
-
-	{
-		size_t const minGain = ZSTD_minGain(srcSize, strategy);
-		if((cLitSize == 0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
-			ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-			return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-		}
-	}
-	if(cLitSize == 1) {
-		/* A return value of 1 signals that the alphabet consists of a single symbol.
-		 * However, in some rare circumstances, it could be the compressed size (a single byte).
-		 * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
-		 * (it's also necessary to not generate statistics).
-		 * Therefore, in such a case, actively check that all bytes are identical. */
-		if((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
-			ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-			return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
-		}
-	}
-
-	if(hType == set_compressed) {
-		/* using a newly constructed table */
-		nextHuf->repeatMode = HUF_repeat_check;
-	}
-
-	/* Build header */
-	switch(lhSize) {
-	case 3: /* 2 - 2 - 10 - 10 */
-		if(!singleStream)
-			assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
-		{
-			U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 14);
-			MEM_writeLE24(ostart, lhc);
-			break;
-		}
-	case 4: /* 2 - 2 - 14 - 14 */
-		assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
-		{
-			U32 const lhc = hType + (2 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 18);
-			MEM_writeLE32(ostart, lhc);
-			break;
-		}
-	case 5: /* 2 - 2 - 18 - 18 */
-		assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
-		{
-			U32 const lhc = hType + (3 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 22);
-			MEM_writeLE32(ostart, lhc);
-			ostart[4] = (BYTE)(cLitSize >> 10);
-			break;
-		}
-	default: /* not possible : lhSize is {3,4,5} */
-		assert(0);
-	}
-	DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize + cLitSize));
-	return lhSize + cLitSize;
-}
diff --git a/src/zstd/zstd_compress_sequences.c b/src/zstd/zstd_compress_sequences.c
deleted file mode 100644
index bd5268c76..000000000
--- a/src/zstd/zstd_compress_sequences.c
+++ /dev/null
@@ -1,673 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/*-*************************************
- *  Dependencies
- ***************************************/
-#include "zstd_compress_sequences.h"
-
-/**
- * -log2(x / 256) lookup table for x in [0, 256).
- * If x == 0: Return 0
- * Else: Return floor(-log2(x / 256) * 256)
- */
-static unsigned const kInverseProbabilityLog256[256] = {
-    0,
-    2048,
-    1792,
-    1642,
-    1536,
-    1453,
-    1386,
-    1329,
-    1280,
-    1236,
-    1197,
-    1162,
-    1130,
-    1100,
-    1073,
-    1047,
-    1024,
-    1001,
-    980,
-    960,
-    941,
-    923,
-    906,
-    889,
-    874,
-    859,
-    844,
-    830,
-    817,
-    804,
-    791,
-    779,
-    768,
-    756,
-    745,
-    734,
-    724,
-    714,
-    704,
-    694,
-    685,
-    676,
-    667,
-    658,
-    650,
-    642,
-    633,
-    626,
-    618,
-    610,
-    603,
-    595,
-    588,
-    581,
-    574,
-    567,
-    561,
-    554,
-    548,
-    542,
-    535,
-    529,
-    523,
-    517,
-    512,
-    506,
-    500,
-    495,
-    489,
-    484,
-    478,
-    473,
-    468,
-    463,
-    458,
-    453,
-    448,
-    443,
-    438,
-    434,
-    429,
-    424,
-    420,
-    415,
-    411,
-    407,
-    402,
-    398,
-    394,
-    390,
-    386,
-    382,
-    377,
-    373,
-    370,
-    366,
-    362,
-    358,
-    354,
-    350,
-    347,
-    343,
-    339,
-    336,
-    332,
-    329,
-    325,
-    322,
-    318,
-    315,
-    311,
-    308,
-    305,
-    302,
-    298,
-    295,
-    292,
-    289,
-    286,
-    282,
-    279,
-    276,
-    273,
-    270,
-    267,
-    264,
-    261,
-    258,
-    256,
-    253,
-    250,
-    247,
-    244,
-    241,
-    239,
-    236,
-    233,
-    230,
-    228,
-    225,
-    222,
-    220,
-    217,
-    215,
-    212,
-    209,
-    207,
-    204,
-    202,
-    199,
-    197,
-    194,
-    192,
-    190,
-    187,
-    185,
-    182,
-    180,
-    178,
-    175,
-    173,
-    171,
-    168,
-    166,
-    164,
-    162,
-    159,
-    157,
-    155,
-    153,
-    151,
-    149,
-    146,
-    144,
-    142,
-    140,
-    138,
-    136,
-    134,
-    132,
-    130,
-    128,
-    126,
-    123,
-    121,
-    119,
-    117,
-    115,
-    114,
-    112,
-    110,
-    108,
-    106,
-    104,
-    102,
-    100,
-    98,
-    96,
-    94,
-    93,
-    91,
-    89,
-    87,
-    85,
-    83,
-    82,
-    80,
-    78,
-    76,
-    74,
-    73,
-    71,
-    69,
-    67,
-    66,
-    64,
-    62,
-    61,
-    59,
-    57,
-    55,
-    54,
-    52,
-    50,
-    49,
-    47,
-    46,
-    44,
-    42,
-    41,
-    39,
-    37,
-    36,
-    34,
-    33,
-    31,
-    30,
-    28,
-    26,
-    25,
-    23,
-    22,
-    20,
-    19,
-    17,
-    16,
-    14,
-    13,
-    11,
-    10,
-    8,
-    7,
-    5,
-    4,
-    2,
-    1,
-};
-
-static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const * ctable) {
-	void const * ptr = ctable;
-	U16 const * u16ptr = (U16 const *)ptr;
-	U32 const maxSymbolValue = MEM_read16(u16ptr + 1);
-	return maxSymbolValue;
-}
-
-/**
- * Returns true if we should use ncount=-1 else we should
- * use ncount=1 for low probability symbols instead.
- */
-static unsigned ZSTD_useLowProbCount(size_t const nbSeq) {
-	/* Heuristic: This should cover most blocks <= 16K and
-	 * start to fade out after 16K to about 32K depending on
-	 * compressibility.
-	 */
-	return nbSeq >= 2048;
-}
-
-/**
- * Returns the cost in bytes of encoding the normalized count header.
- * Returns an error if any of the helper functions return an error.
- */
-static size_t ZSTD_NCountCost(unsigned const * count, unsigned const max,
-                              size_t const nbSeq, unsigned const FSELog) {
-	BYTE wksp[FSE_NCOUNTBOUND];
-	S16 norm[MaxSeq + 1];
-	const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
-	FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), "");
-	return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);
-}
-
-/**
- * Returns the cost in bits of encoding the distribution described by count
- * using the entropy bound.
- */
-static size_t ZSTD_entropyCost(unsigned const * count, unsigned const max, size_t const total) {
-	unsigned cost = 0;
-	unsigned s;
-
-	assert(total > 0);
-	for(s = 0; s <= max; ++s) {
-		unsigned norm = (unsigned)((256 * count[s]) / total);
-		if(count[s] != 0 && norm == 0)
-			norm = 1;
-		assert(count[s] < total);
-		cost += count[s] * kInverseProbabilityLog256[norm];
-	}
-	return cost >> 8;
-}
-
-/**
- * Returns the cost in bits of encoding the distribution in count using ctable.
- * Returns an error if ctable cannot represent all the symbols in count.
- */
-size_t ZSTD_fseBitCost(
-    FSE_CTable const * ctable,
-    unsigned const * count,
-    unsigned const max) {
-	unsigned const kAccuracyLog = 8;
-	size_t cost = 0;
-	unsigned s;
-	FSE_CState_t cstate;
-	FSE_initCState(&cstate, ctable);
-	if(ZSTD_getFSEMaxSymbolValue(ctable) < max) {
-		DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
-		         ZSTD_getFSEMaxSymbolValue(ctable), max);
-		return ERROR(GENERIC);
-	}
-	for(s = 0; s <= max; ++s) {
-		unsigned const tableLog = cstate.stateLog;
-		unsigned const badCost = (tableLog + 1) << kAccuracyLog;
-		unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog);
-		if(count[s] == 0)
-			continue;
-		if(bitCost >= badCost) {
-			DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s);
-			return ERROR(GENERIC);
-		}
-		cost += (size_t)count[s] * bitCost;
-	}
-	return cost >> kAccuracyLog;
-}
-
-/**
- * Returns the cost in bits of encoding the distribution in count using the
- * table described by norm. The max symbol support by norm is assumed >= max.
- * norm must be valid for every symbol with non-zero probability in count.
- */
-size_t ZSTD_crossEntropyCost(short const * norm, unsigned accuracyLog,
-                             unsigned const * count, unsigned const max) {
-	unsigned const shift = 8 - accuracyLog;
-	size_t cost = 0;
-	unsigned s;
-	assert(accuracyLog <= 8);
-	for(s = 0; s <= max; ++s) {
-		unsigned const normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1;
-		unsigned const norm256 = normAcc << shift;
-		assert(norm256 > 0);
-		assert(norm256 < 256);
-		cost += count[s] * kInverseProbabilityLog256[norm256];
-	}
-	return cost >> 8;
-}
-
-symbolEncodingType_e
-ZSTD_selectEncodingType(
-    FSE_repeat* repeatMode, unsigned const * count, unsigned const max,
-    size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
-    FSE_CTable const * prevCTable,
-    short const * defaultNorm, U32 defaultNormLog,
-    ZSTD_defaultPolicy_e const isDefaultAllowed,
-    ZSTD_strategy const strategy) {
-	ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);
-	if(mostFrequent == nbSeq) {
-		*repeatMode = FSE_repeat_none;
-		if(isDefaultAllowed && nbSeq <= 2) {
-			/* Prefer set_basic over set_rle when there are 2 or fewer symbols,
-			 * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
-			 * If basic encoding isn't possible, always choose RLE.
-			 */
-			DEBUGLOG(5, "Selected set_basic");
-			return set_basic;
-		}
-		DEBUGLOG(5, "Selected set_rle");
-		return set_rle;
-	}
-	if(strategy < ZSTD_lazy) {
-		if(isDefaultAllowed) {
-			size_t const staticFse_nbSeq_max = 1000;
-			size_t const mult = 10 - strategy;
-			size_t const baseLog = 3;
-			size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */
-			assert(defaultNormLog >= 5 && defaultNormLog <= 6);                                    /* xx_DEFAULTNORMLOG */
-			assert(mult <= 9 && mult >= 7);
-			if((*repeatMode == FSE_repeat_valid) && (nbSeq < staticFse_nbSeq_max)) {
-				DEBUGLOG(5, "Selected set_repeat");
-				return set_repeat;
-			}
-			if((nbSeq < dynamicFse_nbSeq_min) || (mostFrequent < (nbSeq >> (defaultNormLog - 1)))) {
-				DEBUGLOG(5, "Selected set_basic");
-				/* The format allows default tables to be repeated, but it isn't useful.
-				 * When using simple heuristics to select encoding type, we don't want
-				 * to confuse these tables with dictionaries. When running more careful
-				 * analysis, we don't need to waste time checking both repeating tables
-				 * and default tables.
-				 */
-				*repeatMode = FSE_repeat_none;
-				return set_basic;
-			}
-		}
-	} else {
-		size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);
-		size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
-		size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
-		size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);
-
-		if(isDefaultAllowed) {
-			assert(!ZSTD_isError(basicCost));
-			assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost)));
-		}
-		assert(!ZSTD_isError(NCountCost));
-		assert(compressedCost < ERROR(maxCode));
-		DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u",
-		         (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost);
-		if(basicCost <= repeatCost && basicCost <= compressedCost) {
-			DEBUGLOG(5, "Selected set_basic");
-			assert(isDefaultAllowed);
-			*repeatMode = FSE_repeat_none;
-			return set_basic;
-		}
-		if(repeatCost <= compressedCost) {
-			DEBUGLOG(5, "Selected set_repeat");
-			assert(!ZSTD_isError(repeatCost));
-			return set_repeat;
-		}
-		assert(compressedCost < basicCost && compressedCost < repeatCost);
-	}
-	DEBUGLOG(5, "Selected set_compressed");
-	*repeatMode = FSE_repeat_check;
-	return set_compressed;
-}
-
-typedef struct {
-	S16 norm[MaxSeq + 1];
-	U32 wksp[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(MaxSeq, MaxFSELog)];
-} ZSTD_BuildCTableWksp;
-
-size_t
-ZSTD_buildCTable(void* dst, size_t dstCapacity,
-                 FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
-                 unsigned* count, U32 max,
-                 const BYTE* codeTable, size_t nbSeq,
-                 const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
-                 const FSE_CTable* prevCTable, size_t prevCTableSize,
-                 void* entropyWorkspace, size_t entropyWorkspaceSize) {
-	BYTE* op = (BYTE*)dst;
-	const BYTE* const oend = op + dstCapacity;
-	DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity);
-
-	switch(type) {
-	case set_rle:
-		FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), "");
-		RETURN_ERROR_IF(dstCapacity == 0, dstSize_tooSmall, "not enough space");
-		*op = codeTable[0];
-		return 1;
-	case set_repeat:
-		ZSTD_memcpy(nextCTable, prevCTable, prevCTableSize);
-		return 0;
-	case set_basic:
-		FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */
-		return 0;
-	case set_compressed: {
-		ZSTD_BuildCTableWksp* wksp = (ZSTD_BuildCTableWksp*)entropyWorkspace;
-		size_t nbSeq_1 = nbSeq;
-		const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
-		if(count[codeTable[nbSeq - 1]] > 1) {
-			count[codeTable[nbSeq - 1]]--;
-			nbSeq_1--;
-		}
-		assert(nbSeq_1 > 1);
-		assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp));
-		(void)entropyWorkspaceSize;
-		FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed");
-		assert(oend >= op);
-		{
-			size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */
-			FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed");
-			FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed");
-			return NCountSize;
-		}
-	}
-	default:
-		assert(0);
-		RETURN_ERROR(GENERIC, "impossible to reach");
-	}
-}
-
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_encodeSequences_body(
-    void* dst, size_t dstCapacity,
-    FSE_CTable const * CTable_MatchLength, BYTE const * mlCodeTable,
-    FSE_CTable const * CTable_OffsetBits, BYTE const * ofCodeTable,
-    FSE_CTable const * CTable_LitLength, BYTE const * llCodeTable,
-    seqDef const * sequences, size_t nbSeq, int longOffsets) {
-	BIT_CStream_t blockStream;
-	FSE_CState_t stateMatchLength;
-	FSE_CState_t stateOffsetBits;
-	FSE_CState_t stateLitLength;
-
-	RETURN_ERROR_IF(
-	    ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)),
-	    dstSize_tooSmall, "not enough space remaining");
-	DEBUGLOG(6, "available space for bitstream : %i  (dstCapacity=%u)",
-	         (int)(blockStream.endPtr - blockStream.startPtr),
-	         (unsigned)dstCapacity);
-
-	/* first symbols */
-	FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq - 1]);
-	FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq - 1]);
-	FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq - 1]);
-	BIT_addBits(&blockStream, sequences[nbSeq - 1].litLength, LL_bits[llCodeTable[nbSeq - 1]]);
-	if(MEM_32bits())
-		BIT_flushBits(&blockStream);
-	BIT_addBits(&blockStream, sequences[nbSeq - 1].mlBase, ML_bits[mlCodeTable[nbSeq - 1]]);
-	if(MEM_32bits())
-		BIT_flushBits(&blockStream);
-	if(longOffsets) {
-		U32 const ofBits = ofCodeTable[nbSeq - 1];
-		unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1);
-		if(extraBits) {
-			BIT_addBits(&blockStream, sequences[nbSeq - 1].offBase, extraBits);
-			BIT_flushBits(&blockStream);
-		}
-		BIT_addBits(&blockStream, sequences[nbSeq - 1].offBase >> extraBits,
-		            ofBits - extraBits);
-	} else {
-		BIT_addBits(&blockStream, sequences[nbSeq - 1].offBase, ofCodeTable[nbSeq - 1]);
-	}
-	BIT_flushBits(&blockStream);
-
-	{
-		size_t n;
-		for(n = nbSeq - 2; n < nbSeq; n--) { /* intentional underflow */
-			BYTE const llCode = llCodeTable[n];
-			BYTE const ofCode = ofCodeTable[n];
-			BYTE const mlCode = mlCodeTable[n];
-			U32 const llBits = LL_bits[llCode];
-			U32 const ofBits = ofCode;
-			U32 const mlBits = ML_bits[mlCode];
-			DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
-			         (unsigned)sequences[n].litLength,
-			         (unsigned)sequences[n].mlBase + MINMATCH,
-			         (unsigned)sequences[n].offBase);
-			/* 32b*/                                                            /* 64b*/
-			/* (7)*/                                                            /* (7)*/
-			FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */  /* 15 */
-			FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */
-			if(MEM_32bits())
-				BIT_flushBits(&blockStream);                                  /* (7)*/
-			FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */
-			if(MEM_32bits() || (ofBits + mlBits + llBits >= 64 - 7 - (LLFSELog + MLFSELog + OffFSELog)))
-				BIT_flushBits(&blockStream); /* (7)*/
-			BIT_addBits(&blockStream, sequences[n].litLength, llBits);
-			if(MEM_32bits() && ((llBits + mlBits) > 24))
-				BIT_flushBits(&blockStream);
-			BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);
-			if(MEM_32bits() || (ofBits + mlBits + llBits > 56))
-				BIT_flushBits(&blockStream);
-			if(longOffsets) {
-				unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1);
-				if(extraBits) {
-					BIT_addBits(&blockStream, sequences[n].offBase, extraBits);
-					BIT_flushBits(&blockStream); /* (7)*/
-				}
-				BIT_addBits(&blockStream, sequences[n].offBase >> extraBits,
-				            ofBits - extraBits); /* 31 */
-			} else {
-				BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */
-			}
-			BIT_flushBits(&blockStream); /* (7)*/
-			DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
-		}
-	}
-
-	DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
-	FSE_flushCState(&blockStream, &stateMatchLength);
-	DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
-	FSE_flushCState(&blockStream, &stateOffsetBits);
-	DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
-	FSE_flushCState(&blockStream, &stateLitLength);
-
-	{
-		size_t const streamSize = BIT_closeCStream(&blockStream);
-		RETURN_ERROR_IF(streamSize == 0, dstSize_tooSmall, "not enough space");
-		return streamSize;
-	}
-}
-
-static size_t
-ZSTD_encodeSequences_default(
-    void* dst, size_t dstCapacity,
-    FSE_CTable const * CTable_MatchLength, BYTE const * mlCodeTable,
-    FSE_CTable const * CTable_OffsetBits, BYTE const * ofCodeTable,
-    FSE_CTable const * CTable_LitLength, BYTE const * llCodeTable,
-    seqDef const * sequences, size_t nbSeq, int longOffsets) {
-	return ZSTD_encodeSequences_body(dst, dstCapacity,
-	                                 CTable_MatchLength, mlCodeTable,
-	                                 CTable_OffsetBits, ofCodeTable,
-	                                 CTable_LitLength, llCodeTable,
-	                                 sequences, nbSeq, longOffsets);
-}
-
-#if DYNAMIC_BMI2
-
-static BMI2_TARGET_ATTRIBUTE size_t
-ZSTD_encodeSequences_bmi2(
-    void* dst, size_t dstCapacity,
-    FSE_CTable const * CTable_MatchLength, BYTE const * mlCodeTable,
-    FSE_CTable const * CTable_OffsetBits, BYTE const * ofCodeTable,
-    FSE_CTable const * CTable_LitLength, BYTE const * llCodeTable,
-    seqDef const * sequences, size_t nbSeq, int longOffsets) {
-	return ZSTD_encodeSequences_body(dst, dstCapacity,
-	                                 CTable_MatchLength, mlCodeTable,
-	                                 CTable_OffsetBits, ofCodeTable,
-	                                 CTable_LitLength, llCodeTable,
-	                                 sequences, nbSeq, longOffsets);
-}
-
-#endif
-
-size_t ZSTD_encodeSequences(
-    void* dst, size_t dstCapacity,
-    FSE_CTable const * CTable_MatchLength, BYTE const * mlCodeTable,
-    FSE_CTable const * CTable_OffsetBits, BYTE const * ofCodeTable,
-    FSE_CTable const * CTable_LitLength, BYTE const * llCodeTable,
-    seqDef const * sequences, size_t nbSeq, int longOffsets, int bmi2) {
-	DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity);
-#if DYNAMIC_BMI2
-	if(bmi2) {
-		return ZSTD_encodeSequences_bmi2(dst, dstCapacity,
-		                                 CTable_MatchLength, mlCodeTable,
-		                                 CTable_OffsetBits, ofCodeTable,
-		                                 CTable_LitLength, llCodeTable,
-		                                 sequences, nbSeq, longOffsets);
-	}
-#endif
-	(void)bmi2;
-	return ZSTD_encodeSequences_default(dst, dstCapacity,
-	                                    CTable_MatchLength, mlCodeTable,
-	                                    CTable_OffsetBits, ofCodeTable,
-	                                    CTable_LitLength, llCodeTable,
-	                                    sequences, nbSeq, longOffsets);
-}
diff --git a/src/zstd/zstd_compress_sequences.h b/src/zstd/zstd_compress_sequences.h
deleted file mode 100644
index d832be4d0..000000000
--- a/src/zstd/zstd_compress_sequences.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_COMPRESS_SEQUENCES_H
-#define ZSTD_COMPRESS_SEQUENCES_H
-
-#include "fse.h"           /* FSE_repeat, FSE_CTable */
-#include "zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */
-
-typedef enum {
-	ZSTD_defaultDisallowed = 0,
-	ZSTD_defaultAllowed = 1
-} ZSTD_defaultPolicy_e;
-
-symbolEncodingType_e
-ZSTD_selectEncodingType(
-    FSE_repeat* repeatMode, unsigned const * count, unsigned const max,
-    size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
-    FSE_CTable const * prevCTable,
-    short const * defaultNorm, U32 defaultNormLog,
-    ZSTD_defaultPolicy_e const isDefaultAllowed,
-    ZSTD_strategy const strategy);
-
-size_t
-ZSTD_buildCTable(void* dst, size_t dstCapacity,
-                 FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
-                 unsigned* count, U32 max,
-                 const BYTE* codeTable, size_t nbSeq,
-                 const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
-                 const FSE_CTable* prevCTable, size_t prevCTableSize,
-                 void* entropyWorkspace, size_t entropyWorkspaceSize);
-
-size_t ZSTD_encodeSequences(
-    void* dst, size_t dstCapacity,
-    FSE_CTable const * CTable_MatchLength, BYTE const * mlCodeTable,
-    FSE_CTable const * CTable_OffsetBits, BYTE const * ofCodeTable,
-    FSE_CTable const * CTable_LitLength, BYTE const * llCodeTable,
-    seqDef const * sequences, size_t nbSeq, int longOffsets, int bmi2);
-
-size_t ZSTD_fseBitCost(
-    FSE_CTable const * ctable,
-    unsigned const * count,
-    unsigned const max);
-
-size_t ZSTD_crossEntropyCost(short const * norm, unsigned accuracyLog,
-                             unsigned const * count, unsigned const max);
-#endif /* ZSTD_COMPRESS_SEQUENCES_H */
diff --git a/src/zstd/zstd_compress_superblock.c b/src/zstd/zstd_compress_superblock.c
deleted file mode 100644
index 3f1dc0cf0..000000000
--- a/src/zstd/zstd_compress_superblock.c
+++ /dev/null
@@ -1,589 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/*-*************************************
- *  Dependencies
- ***************************************/
-#include "zstd_compress_superblock.h"
-
-#include "zstd_internal.h"          /* ZSTD_getSequenceLength */
-#include "hist.h"                   /* HIST_countFast_wksp */
-#include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */
-#include "zstd_compress_sequences.h"
-#include "zstd_compress_literals.h"
-
-/** ZSTD_compressSubBlock_literal() :
- *  Compresses literals section for a sub-block.
- *  When we have to write the Huffman table we will sometimes choose a header
- *  size larger than necessary. This is because we have to pick the header size
- *  before we know the table size + compressed size, so we have a bound on the
- *  table size. If we guessed incorrectly, we fall back to uncompressed literals.
- *
- *  We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded
- *  in writing the header, otherwise it is set to 0.
- *
- *  hufMetadata->hType has literals block type info.
- *      If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block.
- *      If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block.
- *      If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block
- *      If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
- *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
- *  @return : compressed size of literals section of a sub-block
- *            Or 0 if unable to compress.
- *            Or error code */
-static size_t
-ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
-                              const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                              const BYTE* literals, size_t litSize,
-                              void* dst, size_t dstSize,
-                              const int bmi2, int writeEntropy, int* entropyWritten) {
-	size_t const header = writeEntropy ? 200 : 0;
-	size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstSize;
-	BYTE* op = ostart + lhSize;
-	U32 const singleStream = lhSize == 3;
-	symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
-	size_t cLitSize = 0;
-
-	DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
-
-	*entropyWritten = 0;
-	if(litSize == 0 || hufMetadata->hType == set_basic) {
-		DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
-		return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
-	} else if(hufMetadata->hType == set_rle) {
-		DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal");
-		return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize);
-	}
-
-	assert(litSize > 0);
-	assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat);
-
-	if(writeEntropy && hufMetadata->hType == set_compressed) {
-		ZSTD_memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize);
-		op += hufMetadata->hufDesSize;
-		cLitSize += hufMetadata->hufDesSize;
-		DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
-	}
-
-	{
-		int const flags = bmi2 ? HUF_flags_bmi2 : 0;
-		const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend - op, literals, litSize, hufTable, flags)
-		                                  : HUF_compress4X_usingCTable(op, oend - op, literals, litSize, hufTable, flags);
-		op += cSize;
-		cLitSize += cSize;
-		if(cSize == 0 || ERR_isError(cSize)) {
-			DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize));
-			return 0;
-		}
-		/* If we expand and we aren't writing a header then emit uncompressed */
-		if(!writeEntropy && cLitSize >= litSize) {
-			DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible");
-			return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
-		}
-		/* If we are writing headers then allow expansion that doesn't change our header size. */
-		if(lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) {
-			assert(cLitSize > litSize);
-			DEBUGLOG(5, "Literals expanded beyond allowed header size");
-			return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
-		}
-		DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize);
-	}
-
-	/* Build header */
-	switch(lhSize) {
-	case 3: /* 2 - 2 - 10 - 10 */
-	{
-		U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize << 4) + ((U32)cLitSize << 14);
-		MEM_writeLE24(ostart, lhc);
-		break;
-	}
-	case 4: /* 2 - 2 - 14 - 14 */
-	{
-		U32 const lhc = hType + (2 << 2) + ((U32)litSize << 4) + ((U32)cLitSize << 18);
-		MEM_writeLE32(ostart, lhc);
-		break;
-	}
-	case 5: /* 2 - 2 - 18 - 18 */
-	{
-		U32 const lhc = hType + (3 << 2) + ((U32)litSize << 4) + ((U32)cLitSize << 22);
-		MEM_writeLE32(ostart, lhc);
-		ostart[4] = (BYTE)(cLitSize >> 10);
-		break;
-	}
-	default: /* not possible : lhSize is {3,4,5} */
-		assert(0);
-	}
-	*entropyWritten = 1;
-	DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op - ostart));
-	return op - ostart;
-}
-
-static size_t
-ZSTD_seqDecompressedSize(seqStore_t const * seqStore,
-                         const seqDef* sequences, size_t nbSeq,
-                         size_t litSize, int lastSequence) {
-	const seqDef* const sstart = sequences;
-	const seqDef* const send = sequences + nbSeq;
-	const seqDef* sp = sstart;
-	size_t matchLengthSum = 0;
-	size_t litLengthSum = 0;
-	(void)(litLengthSum); /* suppress unused variable warning on some environments */
-	while(send - sp > 0) {
-		ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
-		litLengthSum += seqLen.litLength;
-		matchLengthSum += seqLen.matchLength;
-		sp++;
-	}
-	assert(litLengthSum <= litSize);
-	if(!lastSequence) {
-		assert(litLengthSum == litSize);
-	}
-	return matchLengthSum + litSize;
-}
-
-/** ZSTD_compressSubBlock_sequences() :
- *  Compresses sequences section for a sub-block.
- *  fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
- *  symbol compression modes for the super-block.
- *  The first successfully compressed block will have these in its header.
- *  We set entropyWritten=1 when we succeed in compressing the sequences.
- *  The following sub-blocks will always have repeat mode.
- *  @return : compressed size of sequences section of a sub-block
- *            Or 0 if it is unable to compress
- *            Or error code. */
-static size_t
-ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
-                                const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                const seqDef* sequences, size_t nbSeq,
-                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
-                                const ZSTD_CCtx_params* cctxParams,
-                                void* dst, size_t dstCapacity,
-                                const int bmi2, int writeEntropy, int* entropyWritten) {
-	const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstCapacity;
-	BYTE* op = ostart;
-	BYTE* seqHead;
-
-	DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
-
-	*entropyWritten = 0;
-	/* Sequences Header */
-	RETURN_ERROR_IF((oend - op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
-	                dstSize_tooSmall, "");
-	if(nbSeq < 0x7F)
-		*op++ = (BYTE)nbSeq;
-	else if(nbSeq < LONGNBSEQ)
-		op[0] = (BYTE)((nbSeq >> 8) + 0x80), op[1] = (BYTE)nbSeq, op += 2;
-	else
-		op[0] = 0xFF, MEM_writeLE16(op + 1, (U16)(nbSeq - LONGNBSEQ)), op += 3;
-	if(nbSeq == 0) {
-		return op - ostart;
-	}
-
-	/* seqHead : flags for FSE encoding type */
-	seqHead = op++;
-
-	DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op - ostart));
-
-	if(writeEntropy) {
-		const U32 LLtype = fseMetadata->llType;
-		const U32 Offtype = fseMetadata->ofType;
-		const U32 MLtype = fseMetadata->mlType;
-		DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize);
-		*seqHead = (BYTE)((LLtype << 6) + (Offtype << 4) + (MLtype << 2));
-		ZSTD_memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize);
-		op += fseMetadata->fseTablesSize;
-	} else {
-		const U32 repeat = set_repeat;
-		*seqHead = (BYTE)((repeat << 6) + (repeat << 4) + (repeat << 2));
-	}
-
-	{
-		size_t const bitstreamSize = ZSTD_encodeSequences(
-		    op, oend - op,
-		    fseTables->matchlengthCTable, mlCode,
-		    fseTables->offcodeCTable, ofCode,
-		    fseTables->litlengthCTable, llCode,
-		    sequences, nbSeq,
-		    longOffsets, bmi2);
-		FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
-		op += bitstreamSize;
-		/* zstd versions <= 1.3.4 mistakenly report corruption when
-		 * FSE_readNCount() receives a buffer < 4 bytes.
-		 * Fixed by https://github.com/facebook/zstd/pull/1146.
-		 * This can happen when the last set_compressed table present is 2
-		 * bytes and the bitstream is only one byte.
-		 * In this exceedingly rare case, we will simply emit an uncompressed
-		 * block, since it isn't worth optimizing.
-		 */
-#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-		if(writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
-			/* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
-			assert(fseMetadata->lastCountSize + bitstreamSize == 3);
-			DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
-			            "emitting an uncompressed block.");
-			return 0;
-		}
-#endif
-		DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
-	}
-
-	/* zstd versions <= 1.4.0 mistakenly report error when
-	 * sequences section body size is less than 3 bytes.
-	 * Fixed by https://github.com/facebook/zstd/pull/1664.
-	 * This can happen when the previous sequences section block is compressed
-	 * with rle mode and the current block's sequences section is compressed
-	 * with repeat mode where sequences section body size can be 1 byte.
-	 */
-#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-	if(op - seqHead < 4) {
-		DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
-		            "an uncompressed block when sequences are < 4 bytes");
-		return 0;
-	}
-#endif
-
-	*entropyWritten = 1;
-	return op - ostart;
-}
-
-/** ZSTD_compressSubBlock() :
- *  Compresses a single sub-block.
- *  @return : compressed size of the sub-block
- *            Or 0 if it failed to compress. */
-static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
-                                    const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                    const seqDef* sequences, size_t nbSeq,
-                                    const BYTE* literals, size_t litSize,
-                                    const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
-                                    const ZSTD_CCtx_params* cctxParams,
-                                    void* dst, size_t dstCapacity,
-                                    const int bmi2,
-                                    int writeLitEntropy, int writeSeqEntropy,
-                                    int* litEntropyWritten, int* seqEntropyWritten,
-                                    U32 lastBlock) {
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstCapacity;
-	BYTE* op = ostart + ZSTD_blockHeaderSize;
-	DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
-	         litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
-	{
-		size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
-		                                                &entropyMetadata->hufMetadata, literals, litSize,
-		                                                op, oend - op, bmi2, writeLitEntropy, litEntropyWritten);
-		FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
-		if(cLitSize == 0)
-			return 0;
-		op += cLitSize;
-	}
-	{
-		size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse,
-		                                                  &entropyMetadata->fseMetadata,
-		                                                  sequences, nbSeq,
-		                                                  llCode, mlCode, ofCode,
-		                                                  cctxParams,
-		                                                  op, oend - op,
-		                                                  bmi2, writeSeqEntropy, seqEntropyWritten);
-		FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
-		if(cSeqSize == 0)
-			return 0;
-		op += cSeqSize;
-	}
-	/* Write block header */
-	{
-		size_t cSize = (op - ostart) - ZSTD_blockHeaderSize;
-		U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed) << 1) + (U32)(cSize << 3);
-		MEM_writeLE24(ostart, cBlockHeader24);
-	}
-	return op - ostart;
-}
-
-static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
-                                                const ZSTD_hufCTables_t* huf,
-                                                const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                                void* workspace, size_t wkspSize,
-                                                int writeEntropy) {
-	unsigned* const countWksp = (unsigned*)workspace;
-	unsigned maxSymbolValue = 255;
-	size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
-
-	if(hufMetadata->hType == set_basic)
-		return litSize;
-	else if(hufMetadata->hType == set_rle)
-		return 1;
-	else if(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) {
-		size_t const largest = HIST_count_wksp(countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
-		if(ZSTD_isError(largest))
-			return litSize;
-		{
-			size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
-			if(writeEntropy)
-				cLitSizeEstimate += hufMetadata->hufDesSize;
-			return cLitSizeEstimate + literalSectionHeaderSize;
-		}
-	}
-	assert(0); /* impossible */
-	return 0;
-}
-
-static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,
-                                                   const BYTE* codeTable, unsigned maxCode,
-                                                   size_t nbSeq, const FSE_CTable* fseCTable,
-                                                   const U8* additionalBits,
-                                                   short const * defaultNorm, U32 defaultNormLog, U32 defaultMax,
-                                                   void* workspace, size_t wkspSize) {
-	unsigned* const countWksp = (unsigned*)workspace;
-	const BYTE* ctp = codeTable;
-	const BYTE* const ctStart = ctp;
-	const BYTE* const ctEnd = ctStart + nbSeq;
-	size_t cSymbolTypeSizeEstimateInBits = 0;
-	unsigned max = maxCode;
-
-	HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */
-	if(type == set_basic) {
-		/* We selected this encoding type, so it must be valid. */
-		assert(max <= defaultMax);
-		cSymbolTypeSizeEstimateInBits = max <= defaultMax
-		                                    ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max)
-		                                    : ERROR(GENERIC);
-	} else if(type == set_rle) {
-		cSymbolTypeSizeEstimateInBits = 0;
-	} else if(type == set_compressed || type == set_repeat) {
-		cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max);
-	}
-	if(ZSTD_isError(cSymbolTypeSizeEstimateInBits))
-		return nbSeq * 10;
-	while(ctp < ctEnd) {
-		if(additionalBits)
-			cSymbolTypeSizeEstimateInBits += additionalBits[*ctp];
-		else
-			cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */
-		ctp++;
-	}
-	return cSymbolTypeSizeEstimateInBits / 8;
-}
-
-static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
-                                                  const BYTE* llCodeTable,
-                                                  const BYTE* mlCodeTable,
-                                                  size_t nbSeq,
-                                                  const ZSTD_fseCTables_t* fseTables,
-                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                                  void* workspace, size_t wkspSize,
-                                                  int writeEntropy) {
-	size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
-	size_t cSeqSizeEstimate = 0;
-	if(nbSeq == 0)
-		return sequencesSectionHeaderSize;
-	cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff,
-	                                                         nbSeq, fseTables->offcodeCTable, NULL,
-	                                                         OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-	                                                         workspace, wkspSize);
-	cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL,
-	                                                         nbSeq, fseTables->litlengthCTable, LL_bits,
-	                                                         LL_defaultNorm, LL_defaultNormLog, MaxLL,
-	                                                         workspace, wkspSize);
-	cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML,
-	                                                         nbSeq, fseTables->matchlengthCTable, ML_bits,
-	                                                         ML_defaultNorm, ML_defaultNormLog, MaxML,
-	                                                         workspace, wkspSize);
-	if(writeEntropy)
-		cSeqSizeEstimate += fseMetadata->fseTablesSize;
-	return cSeqSizeEstimate + sequencesSectionHeaderSize;
-}
-
-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
-                                        const BYTE* ofCodeTable,
-                                        const BYTE* llCodeTable,
-                                        const BYTE* mlCodeTable,
-                                        size_t nbSeq,
-                                        const ZSTD_entropyCTables_t* entropy,
-                                        const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                        void* workspace, size_t wkspSize,
-                                        int writeLitEntropy, int writeSeqEntropy) {
-	size_t cSizeEstimate = 0;
-	cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
-	                                                   &entropy->huf, &entropyMetadata->hufMetadata,
-	                                                   workspace, wkspSize, writeLitEntropy);
-	cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
-	                                                     nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
-	                                                     workspace, wkspSize, writeSeqEntropy);
-	return cSizeEstimate + ZSTD_blockHeaderSize;
-}
-
-static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const * fseMetadata) {
-	if(fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle)
-		return 1;
-	if(fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle)
-		return 1;
-	if(fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle)
-		return 1;
-	return 0;
-}
-
-/** ZSTD_compressSubBlock_multi() :
- *  Breaks super-block into multiple sub-blocks and compresses them.
- *  Entropy will be written to the first block.
- *  The following blocks will use repeat mode to compress.
- *  All sub-blocks are compressed blocks (no raw or rle blocks).
- *  @return : compressed size of the super block (which is multiple ZSTD blocks)
- *            Or 0 if it failed to compress. */
-static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
-                                          const ZSTD_compressedBlockState_t* prevCBlock,
-                                          ZSTD_compressedBlockState_t* nextCBlock,
-                                          const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                          const ZSTD_CCtx_params* cctxParams,
-                                          void* dst, size_t dstCapacity,
-                                          const void* src, size_t srcSize,
-                                          const int bmi2, U32 lastBlock,
-                                          void* workspace, size_t wkspSize) {
-	const seqDef* const sstart = seqStorePtr->sequencesStart;
-	const seqDef* const send = seqStorePtr->sequences;
-	const seqDef* sp = sstart;
-	const BYTE* const lstart = seqStorePtr->litStart;
-	const BYTE* const lend = seqStorePtr->lit;
-	const BYTE* lp = lstart;
-	BYTE const * ip = (BYTE const *)src;
-	BYTE const * const iend = ip + srcSize;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + dstCapacity;
-	BYTE* op = ostart;
-	const BYTE* llCodePtr = seqStorePtr->llCode;
-	const BYTE* mlCodePtr = seqStorePtr->mlCode;
-	const BYTE* ofCodePtr = seqStorePtr->ofCode;
-	size_t targetCBlockSize = cctxParams->targetCBlockSize;
-	size_t litSize, seqCount;
-	int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
-	int writeSeqEntropy = 1;
-	int lastSequence = 0;
-
-	DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
-	         (unsigned)(lend - lp), (unsigned)(send - sstart));
-
-	litSize = 0;
-	seqCount = 0;
-	do {
-		size_t cBlockSizeEstimate = 0;
-		if(sstart == send) {
-			lastSequence = 1;
-		} else {
-			const seqDef* const sequence = sp + seqCount;
-			lastSequence = sequence == send - 1;
-			litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
-			seqCount++;
-		}
-		if(lastSequence) {
-			assert(lp <= lend);
-			assert(litSize <= (size_t)(lend - lp));
-			litSize = (size_t)(lend - lp);
-		}
-		/* I think there is an optimization opportunity here.
-		 * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
-		 * since it recalculates estimate from scratch.
-		 * For example, it would recount literal distribution and symbol codes every time.
-		 */
-		cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
-		                                               &nextCBlock->entropy, entropyMetadata,
-		                                               workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
-		if(cBlockSizeEstimate > targetCBlockSize || lastSequence) {
-			int litEntropyWritten = 0;
-			int seqEntropyWritten = 0;
-			const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
-			const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
-			                                           sp, seqCount,
-			                                           lp, litSize,
-			                                           llCodePtr, mlCodePtr, ofCodePtr,
-			                                           cctxParams,
-			                                           op, oend - op,
-			                                           bmi2, writeLitEntropy, writeSeqEntropy,
-			                                           &litEntropyWritten, &seqEntropyWritten,
-			                                           lastBlock && lastSequence);
-			FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
-			if(cSize > 0 && cSize < decompressedSize) {
-				DEBUGLOG(5, "Committed the sub-block");
-				assert(ip + decompressedSize <= iend);
-				ip += decompressedSize;
-				sp += seqCount;
-				lp += litSize;
-				op += cSize;
-				llCodePtr += seqCount;
-				mlCodePtr += seqCount;
-				ofCodePtr += seqCount;
-				litSize = 0;
-				seqCount = 0;
-				/* Entropy only needs to be written once */
-				if(litEntropyWritten) {
-					writeLitEntropy = 0;
-				}
-				if(seqEntropyWritten) {
-					writeSeqEntropy = 0;
-				}
-			}
-		}
-	} while(!lastSequence);
-	if(writeLitEntropy) {
-		DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
-		ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
-	}
-	if(writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
-		/* If we haven't written our entropy tables, then we've violated our contract and
-		 * must emit an uncompressed block.
-		 */
-		DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
-		return 0;
-	}
-	if(ip < iend) {
-		size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
-		DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
-		FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
-		assert(cSize != 0);
-		op += cSize;
-		/* We have to regenerate the repcodes because we've skipped some sequences */
-		if(sp < send) {
-			seqDef const * seq;
-			repcodes_t rep;
-			ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
-			for(seq = sstart; seq < sp; ++seq) {
-				ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
-			}
-			ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
-		}
-	}
-	DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
-	return op - ostart;
-}
-
-size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
-                               void* dst, size_t dstCapacity,
-                               void const * src, size_t srcSize,
-                               unsigned lastBlock) {
-	ZSTD_entropyCTablesMetadata_t entropyMetadata;
-
-	FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore,
-	                                             &zc->blockState.prevCBlock->entropy,
-	                                             &zc->blockState.nextCBlock->entropy,
-	                                             &zc->appliedParams,
-	                                             &entropyMetadata,
-	                                             zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */),
-	                 "");
-
-	return ZSTD_compressSubBlock_multi(&zc->seqStore,
-	                                   zc->blockState.prevCBlock,
-	                                   zc->blockState.nextCBlock,
-	                                   &entropyMetadata,
-	                                   &zc->appliedParams,
-	                                   dst, dstCapacity,
-	                                   src, srcSize,
-	                                   zc->bmi2, lastBlock,
-	                                   zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */);
-}
diff --git a/src/zstd/zstd_cwksp.h b/src/zstd/zstd_cwksp.h
deleted file mode 100644
index 9926044a1..000000000
--- a/src/zstd/zstd_cwksp.h
+++ /dev/null
@@ -1,666 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_CWKSP_H
-#define ZSTD_CWKSP_H
-
-/*-*************************************
- *  Dependencies
- ***************************************/
-#include "zstd_internal.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/*-*************************************
- *  Constants
- ***************************************/
-
-/* Since the workspace is effectively its own little malloc implementation /
- * arena, when we run under ASAN, we should similarly insert redzones between
- * each internal element of the workspace, so ASAN will catch overruns that
- * reach outside an object but that stay inside the workspace.
- *
- * This defines the size of that redzone.
- */
-#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE
-#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128
-#endif
-
-/* Set our tables and aligneds to align by 64 bytes */
-#define ZSTD_CWKSP_ALIGNMENT_BYTES 64
-
-/*-*************************************
- *  Structures
- ***************************************/
-typedef enum {
-	ZSTD_cwksp_alloc_objects,
-	ZSTD_cwksp_alloc_buffers,
-	ZSTD_cwksp_alloc_aligned
-} ZSTD_cwksp_alloc_phase_e;
-
-/**
- * Used to describe whether the workspace is statically allocated (and will not
- * necessarily ever be freed), or if it's dynamically allocated and we can
- * expect a well-formed caller to free this.
- */
-typedef enum {
-	ZSTD_cwksp_dynamic_alloc,
-	ZSTD_cwksp_static_alloc
-} ZSTD_cwksp_static_alloc_e;
-
-/**
- * Zstd fits all its internal datastructures into a single continuous buffer,
- * so that it only needs to perform a single OS allocation (or so that a buffer
- * can be provided to it and it can perform no allocations at all). This buffer
- * is called the workspace.
- *
- * Several optimizations complicate that process of allocating memory ranges
- * from this workspace for each internal datastructure:
- *
- * - These different internal datastructures have different setup requirements:
- *
- *   - The static objects need to be cleared once and can then be trivially
- *     reused for each compression.
- *
- *   - Various buffers don't need to be initialized at all--they are always
- *     written into before they're read.
- *
- *   - The matchstate tables have a unique requirement that they don't need
- *     their memory to be totally cleared, but they do need the memory to have
- *     some bound, i.e., a guarantee that all values in the memory they've been
- *     allocated is less than some maximum value (which is the starting value
- *     for the indices that they will then use for compression). When this
- *     guarantee is provided to them, they can use the memory without any setup
- *     work. When it can't, they have to clear the area.
- *
- * - These buffers also have different alignment requirements.
- *
- * - We would like to reuse the objects in the workspace for multiple
- *   compressions without having to perform any expensive reallocation or
- *   reinitialization work.
- *
- * - We would like to be able to efficiently reuse the workspace across
- *   multiple compressions **even when the compression parameters change** and
- *   we need to resize some of the objects (where possible).
- *
- * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp
- * abstraction was created. It works as follows:
- *
- * Workspace Layout:
- *
- * [                        ... workspace ...                         ]
- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
- *
- * The various objects that live in the workspace are divided into the
- * following categories, and are allocated separately:
- *
- * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict,
- *   so that literally everything fits in a single buffer. Note: if present,
- *   this must be the first object in the workspace, since ZSTD_customFree{CCtx,
- *   CDict}() rely on a pointer comparison to see whether one or two frees are
- *   required.
- *
- * - Fixed size objects: these are fixed-size, fixed-count objects that are
- *   nonetheless "dynamically" allocated in the workspace so that we can
- *   control how they're initialized separately from the broader ZSTD_CCtx.
- *   Examples:
- *   - Entropy Workspace
- *   - 2 x ZSTD_compressedBlockState_t
- *   - CDict dictionary contents
- *
- * - Tables: these are any of several different datastructures (hash tables,
- *   chain tables, binary trees) that all respect a common format: they are
- *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
- *   Their sizes depend on the cparams. These tables are 64-byte aligned.
- *
- * - Aligned: these buffers are used for various purposes that require 4 byte
- *   alignment, but don't require any initialization before they're used. These
- *   buffers are each aligned to 64 bytes.
- *
- * - Buffers: these buffers are used for various purposes that don't require
- *   any alignment or initialization before they're used. This means they can
- *   be moved around at no cost for a new compression.
- *
- * Allocating Memory:
- *
- * The various types of objects must be allocated in order, so they can be
- * correctly packed into the workspace buffer. That order is:
- *
- * 1. Objects
- * 2. Buffers
- * 3. Aligned/Tables
- *
- * Attempts to reserve objects of different types out of order will fail.
- */
-typedef struct {
-	void* workspace;
-	void* workspaceEnd;
-
-	void* objectEnd;
-	void* tableEnd;
-	void* tableValidEnd;
-	void* allocStart;
-
-	BYTE allocFailed;
-	int workspaceOversizedDuration;
-	ZSTD_cwksp_alloc_phase_e phase;
-	ZSTD_cwksp_static_alloc_e isStatic;
-} ZSTD_cwksp;
-
-/*-*************************************
- *  Functions
- ***************************************/
-
-MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
-
-MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
-	(void)ws;
-	assert(ws->workspace <= ws->objectEnd);
-	assert(ws->objectEnd <= ws->tableEnd);
-	assert(ws->objectEnd <= ws->tableValidEnd);
-	assert(ws->tableEnd <= ws->allocStart);
-	assert(ws->tableValidEnd <= ws->allocStart);
-	assert(ws->allocStart <= ws->workspaceEnd);
-}
-
-/**
- * Align must be a power of 2.
- */
-MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {
-	size_t const mask = align - 1;
-	assert((align & mask) == 0);
-	return (size + mask) & ~mask;
-}
-
-/**
- * Use this to determine how much space in the workspace we will consume to
- * allocate this object. (Normally it should be exactly the size of the object,
- * but under special conditions, like ASAN, where we pad each object, it might
- * be larger.)
- *
- * Since tables aren't currently redzoned, you don't need to call through this
- * to figure out how much space you need for the matchState tables. Everything
- * else is though.
- *
- * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size().
- */
-MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
-	if(size == 0)
-		return 0;
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
-#else
-	return size;
-#endif
-}
-
-/**
- * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes.
- * Used to determine the number of bytes required for a given "aligned".
- */
-MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
-	return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES));
-}
-
-/**
- * Returns the amount of additional space the cwksp must allocate
- * for internal purposes (currently only alignment).
- */
-MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
-	/* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
-	 * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
-	 * to align the beginning of the aligned section.
-	 *
-	 * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
-	 * aligneds being sized in multiples of 64 bytes.
-	 */
-	size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
-	return slackSpace;
-}
-
-/**
- * Return the number of additional bytes required to align a pointer to the given number of bytes.
- * alignBytes must be a power of two.
- */
-MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) {
-	size_t const alignBytesMask = alignBytes - 1;
-	size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
-	assert((alignBytes & alignBytesMask) == 0);
-	assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
-	return bytes;
-}
-
-/**
- * Internal function. Do not use directly.
- * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
- * which counts from the end of the wksp (as opposed to the object/table segment).
- *
- * Returns a pointer to the beginning of that space.
- */
-MEM_STATIC void*
-ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) {
-	void* const alloc = (BYTE*)ws->allocStart - bytes;
-	void* const bottom = ws->tableEnd;
-	DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
-	         alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
-	ZSTD_cwksp_assert_internal_consistency(ws);
-	assert(alloc >= bottom);
-	if(alloc < bottom) {
-		DEBUGLOG(4, "cwksp: alloc failed!");
-		ws->allocFailed = 1;
-		return NULL;
-	}
-	/* the area is reserved from the end of wksp.
-	 * If it overlaps with tableValidEnd, it voids guarantees on values' range */
-	if(alloc < ws->tableValidEnd) {
-		ws->tableValidEnd = alloc;
-	}
-	ws->allocStart = alloc;
-	return alloc;
-}
-
-/**
- * Moves the cwksp to the next phase, and does any necessary allocations.
- * cwksp initialization must necessarily go through each phase in order.
- * Returns a 0 on success, or zstd error
- */
-MEM_STATIC size_t
-ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) {
-	assert(phase >= ws->phase);
-	if(phase > ws->phase) {
-		/* Going from allocating objects to allocating buffers */
-		if(ws->phase < ZSTD_cwksp_alloc_buffers &&
-		   phase >= ZSTD_cwksp_alloc_buffers) {
-			ws->tableValidEnd = ws->objectEnd;
-		}
-
-		/* Going from allocating buffers to allocating aligneds/tables */
-		if(ws->phase < ZSTD_cwksp_alloc_aligned &&
-		   phase >= ZSTD_cwksp_alloc_aligned) {
-			{ /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
-				size_t const bytesToAlign =
-				    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
-				DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
-				ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
-				RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
-				                memory_allocation, "aligned phase - alignment initial allocation failed!");
-			}
-			{ /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
-				void* const alloc = ws->objectEnd;
-				size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
-				void* const objectEnd = (BYTE*)alloc + bytesToAlign;
-				DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
-				RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
-				                "table phase - alignment initial allocation failed!");
-				ws->objectEnd = objectEnd;
-				ws->tableEnd = objectEnd; /* table area starts being empty */
-				if(ws->tableValidEnd < ws->tableEnd) {
-					ws->tableValidEnd = ws->tableEnd;
-				}
-			}
-		}
-		ws->phase = phase;
-		ZSTD_cwksp_assert_internal_consistency(ws);
-	}
-	return 0;
-}
-
-/**
- * Returns whether this object/buffer/etc was allocated in this workspace.
- */
-MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) {
-	return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
-}
-
-/**
- * Internal function. Do not use directly.
- */
-MEM_STATIC void*
-ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) {
-	void* alloc;
-	if(ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {
-		return NULL;
-	}
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	/* over-reserve space */
-	bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
-#endif
-
-	alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes);
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	/* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
-	 * either size. */
-	if(alloc) {
-		alloc = (BYTE*)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
-		if(ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
-			/* We need to keep the redzone poisoned while unpoisoning the bytes that
-			 * are actually allocated. */
-			__asan_unpoison_memory_region(alloc, bytes - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE);
-		}
-	}
-#endif
-
-	return alloc;
-}
-
-/**
- * Reserves and returns unaligned memory.
- */
-MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) {
-	return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
-}
-
-/**
- * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
- */
-MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) {
-	void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES),
-	                                        ZSTD_cwksp_alloc_aligned);
-	assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0);
-	return ptr;
-}
-
-/**
- * Aligned on 64 bytes. These buffers have the special property that
- * their values remain constrained, allowing us to re-use them without
- * memset()-ing them.
- */
-MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
-	const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
-	void* alloc;
-	void* end;
-	void* top;
-
-	if(ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
-		return NULL;
-	}
-	alloc = ws->tableEnd;
-	end = (BYTE*)alloc + bytes;
-	top = ws->allocStart;
-
-	DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
-	         alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
-	assert((bytes & (sizeof(U32) - 1)) == 0);
-	ZSTD_cwksp_assert_internal_consistency(ws);
-	assert(end <= top);
-	if(end > top) {
-		DEBUGLOG(4, "cwksp: table alloc failed!");
-		ws->allocFailed = 1;
-		return NULL;
-	}
-	ws->tableEnd = end;
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	if(ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
-		__asan_unpoison_memory_region(alloc, bytes);
-	}
-#endif
-
-	assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0);
-	assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0);
-	return alloc;
-}
-
-/**
- * Aligned on sizeof(void*).
- * Note : should happen only once, at workspace first initialization
- */
-MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {
-	size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
-	void* alloc = ws->objectEnd;
-	void* end = (BYTE*)alloc + roundedBytes;
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	/* over-reserve space */
-	end = (BYTE*)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
-#endif
-
-	DEBUGLOG(4,
-	         "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",
-	         alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes);
-	assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0);
-	assert(bytes % ZSTD_ALIGNOF(void*) == 0);
-	ZSTD_cwksp_assert_internal_consistency(ws);
-	/* we must be in the first phase, no advance is possible */
-	if(ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) {
-		DEBUGLOG(3, "cwksp: object alloc failed!");
-		ws->allocFailed = 1;
-		return NULL;
-	}
-	ws->objectEnd = end;
-	ws->tableEnd = end;
-	ws->tableValidEnd = end;
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	/* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
-	 * either size. */
-	alloc = (BYTE*)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
-	if(ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
-		__asan_unpoison_memory_region(alloc, bytes);
-	}
-#endif
-
-	return alloc;
-}
-
-MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) {
-	DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
-
-#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
-	/* To validate that the table re-use logic is sound, and that we don't
-	 * access table space that we haven't cleaned, we re-"poison" the table
-	 * space every time we mark it dirty. */
-	{
-		size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
-		assert(__msan_test_shadow(ws->objectEnd, size) == -1);
-		__msan_poison(ws->objectEnd, size);
-	}
-#endif
-
-	assert(ws->tableValidEnd >= ws->objectEnd);
-	assert(ws->tableValidEnd <= ws->allocStart);
-	ws->tableValidEnd = ws->objectEnd;
-	ZSTD_cwksp_assert_internal_consistency(ws);
-}
-
-MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) {
-	DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean");
-	assert(ws->tableValidEnd >= ws->objectEnd);
-	assert(ws->tableValidEnd <= ws->allocStart);
-	if(ws->tableValidEnd < ws->tableEnd) {
-		ws->tableValidEnd = ws->tableEnd;
-	}
-	ZSTD_cwksp_assert_internal_consistency(ws);
-}
-
-/**
- * Zero the part of the allocated tables not already marked clean.
- */
-MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
-	DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables");
-	assert(ws->tableValidEnd >= ws->objectEnd);
-	assert(ws->tableValidEnd <= ws->allocStart);
-	if(ws->tableValidEnd < ws->tableEnd) {
-		ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
-	}
-	ZSTD_cwksp_mark_tables_clean(ws);
-}
-
-/**
- * Invalidates table allocations.
- * All other allocations remain valid.
- */
-MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) {
-	DEBUGLOG(4, "cwksp: clearing tables!");
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	/* We don't do this when the workspace is statically allocated, because
-	 * when that is the case, we have no capability to hook into the end of the
-	 * workspace's lifecycle to unpoison the memory.
-	 */
-	if(ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
-		size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
-		__asan_poison_memory_region(ws->objectEnd, size);
-	}
-#endif
-
-	ws->tableEnd = ws->objectEnd;
-	ZSTD_cwksp_assert_internal_consistency(ws);
-}
-
-/**
- * Invalidates all buffer, aligned, and table allocations.
- * Object allocations remain valid.
- */
-MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
-	DEBUGLOG(4, "cwksp: clearing!");
-
-#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
-	/* To validate that the context re-use logic is sound, and that we don't
-	 * access stuff that this compression hasn't initialized, we re-"poison"
-	 * the workspace (or at least the non-static, non-table parts of it)
-	 * every time we start a new compression. */
-	{
-		size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd;
-		__msan_poison(ws->tableValidEnd, size);
-	}
-#endif
-
-#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
-	/* We don't do this when the workspace is statically allocated, because
-	 * when that is the case, we have no capability to hook into the end of the
-	 * workspace's lifecycle to unpoison the memory.
-	 */
-	if(ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
-		size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd;
-		__asan_poison_memory_region(ws->objectEnd, size);
-	}
-#endif
-
-	ws->tableEnd = ws->objectEnd;
-	ws->allocStart = ws->workspaceEnd;
-	ws->allocFailed = 0;
-	if(ws->phase > ZSTD_cwksp_alloc_buffers) {
-		ws->phase = ZSTD_cwksp_alloc_buffers;
-	}
-	ZSTD_cwksp_assert_internal_consistency(ws);
-}
-
-/**
- * The provided workspace takes ownership of the buffer [start, start+size).
- * Any existing values in the workspace are ignored (the previously managed
- * buffer, if present, must be separately freed).
- */
-MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_cwksp_static_alloc_e isStatic) {
-	DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size);
-	assert(((size_t)start & (sizeof(void*) - 1)) == 0); /* ensure correct alignment */
-	ws->workspace = start;
-	ws->workspaceEnd = (BYTE*)start + size;
-	ws->objectEnd = ws->workspace;
-	ws->tableValidEnd = ws->objectEnd;
-	ws->phase = ZSTD_cwksp_alloc_objects;
-	ws->isStatic = isStatic;
-	ZSTD_cwksp_clear(ws);
-	ws->workspaceOversizedDuration = 0;
-	ZSTD_cwksp_assert_internal_consistency(ws);
-}
-
-MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) {
-	void* workspace = ZSTD_customMalloc(size, customMem);
-	DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size);
-	RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!");
-	ZSTD_cwksp_init(ws, workspace, size, ZSTD_cwksp_dynamic_alloc);
-	return 0;
-}
-
-MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) {
-	void* ptr = ws->workspace;
-	DEBUGLOG(4, "cwksp: freeing workspace");
-	ZSTD_memset(ws, 0, sizeof(ZSTD_cwksp));
-	ZSTD_customFree(ptr, customMem);
-}
-
-/**
- * Moves the management of a workspace from one cwksp to another. The src cwksp
- * is left in an invalid state (src must be re-init()'ed before it's used again).
- */
-MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
-	*dst = *src;
-	ZSTD_memset(src, 0, sizeof(ZSTD_cwksp));
-}
-
-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
-	return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
-}
-
-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
-	return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
-}
-
-MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
-	return ws->allocFailed;
-}
-
-/*-*************************************
- *  Functions Checking Free Space
- ***************************************/
-
-/* ZSTD_alignmentSpaceWithinBounds() :
- * Returns if the estimated space needed for a wksp is within an acceptable limit of the
- * actual amount of space used.
- */
-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
-                                                        size_t const estimatedSpace, int resizedWorkspace) {
-	if(resizedWorkspace) {
-		/* Resized/newly allocated wksp should have exact bounds */
-		return ZSTD_cwksp_used(ws) == estimatedSpace;
-	} else {
-		/* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
-		 * than estimatedSpace. See the comments in zstd_cwksp.h for details.
-		 */
-		return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
-	}
-}
-
-MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
-	return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
-}
-
-MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
-	return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace;
-}
-
-MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
-	return ZSTD_cwksp_check_available(
-	    ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR);
-}
-
-MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
-	return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace) && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION;
-}
-
-MEM_STATIC void ZSTD_cwksp_bump_oversized_duration(
-    ZSTD_cwksp* ws, size_t additionalNeededSpace) {
-	if(ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) {
-		ws->workspaceOversizedDuration++;
-	} else {
-		ws->workspaceOversizedDuration = 0;
-	}
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* ZSTD_CWKSP_H */
diff --git a/src/zstd/zstd_ddict.c b/src/zstd/zstd_ddict.c
deleted file mode 100644
index f7d09a25c..000000000
--- a/src/zstd/zstd_ddict.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* zstd_ddict.c :
- * concentrates all logic that needs to know the internals of ZSTD_DDict object */
-
-/*-*******************************************************
- *  Dependencies
- *********************************************************/
-#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
-#include "cpu.h"       /* bmi2 */
-#include "mem.h"       /* low level memory routines */
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-#include "huf.h"
-#include "zstd_decompress_internal.h"
-#include "zstd_ddict.h"
-
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-#include "../legacy/zstd_legacy.h"
-#endif
-
-/*-*******************************************************
- *  Types
- *********************************************************/
-struct ZSTD_DDict_s {
-	void* dictBuffer;
-	const void* dictContent;
-	size_t dictSize;
-	ZSTD_entropyDTables_t entropy;
-	U32 dictID;
-	U32 entropyPresent;
-	ZSTD_customMem cMem;
-}; /* typedef'd to ZSTD_DDict within "zstd.h" */
-
-const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict) {
-	assert(ddict != NULL);
-	return ddict->dictContent;
-}
-
-size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict) {
-	assert(ddict != NULL);
-	return ddict->dictSize;
-}
-
-void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) {
-	DEBUGLOG(4, "ZSTD_copyDDictParameters");
-	assert(dctx != NULL);
-	assert(ddict != NULL);
-	dctx->dictID = ddict->dictID;
-	dctx->prefixStart = ddict->dictContent;
-	dctx->virtualStart = ddict->dictContent;
-	dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
-	dctx->previousDstEnd = dctx->dictEnd;
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-	dctx->dictContentBeginForFuzzing = dctx->prefixStart;
-	dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
-#endif
-	if(ddict->entropyPresent) {
-		dctx->litEntropy = 1;
-		dctx->fseEntropy = 1;
-		dctx->LLTptr = ddict->entropy.LLTable;
-		dctx->MLTptr = ddict->entropy.MLTable;
-		dctx->OFTptr = ddict->entropy.OFTable;
-		dctx->HUFptr = ddict->entropy.hufTable;
-		dctx->entropy.rep[0] = ddict->entropy.rep[0];
-		dctx->entropy.rep[1] = ddict->entropy.rep[1];
-		dctx->entropy.rep[2] = ddict->entropy.rep[2];
-	} else {
-		dctx->litEntropy = 0;
-		dctx->fseEntropy = 0;
-	}
-}
-
-static size_t
-ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict,
-                           ZSTD_dictContentType_e dictContentType) {
-	ddict->dictID = 0;
-	ddict->entropyPresent = 0;
-	if(dictContentType == ZSTD_dct_rawContent)
-		return 0;
-
-	if(ddict->dictSize < 8) {
-		if(dictContentType == ZSTD_dct_fullDict)
-			return ERROR(dictionary_corrupted); /* only accept specified dictionaries */
-		return 0;                               /* pure content mode */
-	}
-	{
-		U32 const magic = MEM_readLE32(ddict->dictContent);
-		if(magic != ZSTD_MAGIC_DICTIONARY) {
-			if(dictContentType == ZSTD_dct_fullDict)
-				return ERROR(dictionary_corrupted); /* only accept specified dictionaries */
-			return 0;                               /* pure content mode */
-		}
-	}
-	ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);
-
-	/* load entropy tables */
-	RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy(
-	                    &ddict->entropy, ddict->dictContent, ddict->dictSize)),
-	                dictionary_corrupted, "");
-	ddict->entropyPresent = 1;
-	return 0;
-}
-
-static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
-                                      const void* dict, size_t dictSize,
-                                      ZSTD_dictLoadMethod_e dictLoadMethod,
-                                      ZSTD_dictContentType_e dictContentType) {
-	if((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
-		ddict->dictBuffer = NULL;
-		ddict->dictContent = dict;
-		if(!dict)
-			dictSize = 0;
-	} else {
-		void* const internalBuffer = ZSTD_customMalloc(dictSize, ddict->cMem);
-		ddict->dictBuffer = internalBuffer;
-		ddict->dictContent = internalBuffer;
-		if(!internalBuffer)
-			return ERROR(memory_allocation);
-		ZSTD_memcpy(internalBuffer, dict, dictSize);
-	}
-	ddict->dictSize = dictSize;
-	ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
-
-	/* parse dictionary content */
-	FORWARD_IF_ERROR(ZSTD_loadEntropy_intoDDict(ddict, dictContentType), "");
-
-	return 0;
-}
-
-ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
-                                      ZSTD_dictLoadMethod_e dictLoadMethod,
-                                      ZSTD_dictContentType_e dictContentType,
-                                      ZSTD_customMem customMem) {
-	if((!customMem.customAlloc) ^ (!customMem.customFree))
-		return NULL;
-
-	{
-		ZSTD_DDict* const ddict = (ZSTD_DDict*)ZSTD_customMalloc(sizeof(ZSTD_DDict), customMem);
-		if(ddict == NULL)
-			return NULL;
-		ddict->cMem = customMem;
-		{
-			size_t const initResult = ZSTD_initDDict_internal(ddict,
-			                                                  dict, dictSize,
-			                                                  dictLoadMethod, dictContentType);
-			if(ZSTD_isError(initResult)) {
-				ZSTD_freeDDict(ddict);
-				return NULL;
-			}
-		}
-		return ddict;
-	}
-}
-
-/*! ZSTD_createDDict() :
- *   Create a digested dictionary, to start decompression without startup delay.
- *   `dict` content is copied inside DDict.
- *   Consequently, `dict` can be released after `ZSTD_DDict` creation */
-ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) {
-	ZSTD_customMem const allocator = {NULL, NULL, NULL};
-	return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator);
-}
-
-/*! ZSTD_createDDict_byReference() :
- *  Create a digested dictionary, to start decompression without startup delay.
- *  Dictionary content is simply referenced, it will be accessed during decompression.
- *  Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */
-ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) {
-	ZSTD_customMem const allocator = {NULL, NULL, NULL};
-	return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator);
-}
-
-const ZSTD_DDict* ZSTD_initStaticDDict(
-    void* sBuffer, size_t sBufferSize,
-    const void* dict, size_t dictSize,
-    ZSTD_dictLoadMethod_e dictLoadMethod,
-    ZSTD_dictContentType_e dictContentType) {
-	size_t const neededSpace = sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
-	ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
-	assert(sBuffer != NULL);
-	assert(dict != NULL);
-	if((size_t)sBuffer & 7)
-		return NULL; /* 8-aligned */
-	if(sBufferSize < neededSpace)
-		return NULL;
-	if(dictLoadMethod == ZSTD_dlm_byCopy) {
-		ZSTD_memcpy(ddict + 1, dict, dictSize); /* local copy */
-		dict = ddict + 1;
-	}
-	if(ZSTD_isError(ZSTD_initDDict_internal(ddict,
-	                                        dict, dictSize,
-	                                        ZSTD_dlm_byRef, dictContentType)))
-		return NULL;
-	return ddict;
-}
-
-size_t ZSTD_freeDDict(ZSTD_DDict* ddict) {
-	if(ddict == NULL)
-		return 0; /* support free on NULL */
-	{
-		ZSTD_customMem const cMem = ddict->cMem;
-		ZSTD_customFree(ddict->dictBuffer, cMem);
-		ZSTD_customFree(ddict, cMem);
-		return 0;
-	}
-}
-
-/*! ZSTD_estimateDDictSize() :
- *  Estimate amount of memory that will be needed to create a dictionary for decompression.
- *  Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */
-size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod) {
-	return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
-}
-
-size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) {
-	if(ddict == NULL)
-		return 0; /* support sizeof on NULL */
-	return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0);
-}
-
-/*! ZSTD_getDictID_fromDDict() :
- *  Provides the dictID of the dictionary loaded into `ddict`.
- *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
- *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
-unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) {
-	if(ddict == NULL)
-		return 0;
-	return ddict->dictID;
-}
diff --git a/src/zstd/zstd_decompress.c b/src/zstd/zstd_decompress.c
deleted file mode 100644
index b59010f86..000000000
--- a/src/zstd/zstd_decompress.c
+++ /dev/null
@@ -1,2350 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* ***************************************************************
- *  Tuning parameters
- *****************************************************************/
-/*!
- * HEAPMODE :
- * Select how default decompression function ZSTD_decompress() allocates its context,
- * on stack (0), or into heap (1, default; requires malloc()).
- * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected.
- */
-#ifndef ZSTD_HEAPMODE
-#define ZSTD_HEAPMODE 1
-#endif
-
-/*!
- *  LEGACY_SUPPORT :
- *  if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
- */
-#ifndef ZSTD_LEGACY_SUPPORT
-#define ZSTD_LEGACY_SUPPORT 0
-#endif
-
-/*!
- *  MAXWINDOWSIZE_DEFAULT :
- *  maximum window size accepted by DStream __by default__.
- *  Frames requiring more memory will be rejected.
- *  It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize().
- */
-#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
-#define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1)
-#endif
-
-/*!
- *  NO_FORWARD_PROGRESS_MAX :
- *  maximum allowed nb of calls to ZSTD_decompressStream()
- *  without any forward progress
- *  (defined as: no byte read from input, and no byte flushed to output)
- *  before triggering an error.
- */
-#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX
-#define ZSTD_NO_FORWARD_PROGRESS_MAX 16
-#endif
-
-/*-*******************************************************
- *  Dependencies
- *********************************************************/
-#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
-#include "mem.h"       /* low level memory routines */
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-#include "huf.h"
-#include "xxhash.h"                   /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */
-#include "zstd_internal.h"            /* blockProperties_t */
-#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
-#include "zstd_ddict.h"               /* ZSTD_DDictDictContent */
-#include "zstd_decompress_block.h"    /* ZSTD_decompressBlock_internal */
-#include "bits.h"                     /* ZSTD_highbit32 */
-
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-#include "../legacy/zstd_legacy.h"
-#endif
-
-/*************************************
- * Multiple DDicts Hashset internals *
- *************************************/
-
-#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. \
-	                                               * Currently, that means a 0.75 load factor.                                             \
-	                                               * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded                \
-	                                               * the load factor of the ddict hash set.                                                \
-	                                               */
-
-#define DDICT_HASHSET_TABLE_BASE_SIZE 64
-#define DDICT_HASHSET_RESIZE_FACTOR 2
-
-/* Hash function to determine starting position of dict insertion within the table
- * Returns an index between [0, hashSet->ddictPtrTableSize]
- */
-static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) {
-	const U64 hash = XXH64(&dictID, sizeof(U32), 0);
-	/* DDict ptr table size is a multiple of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */
-	return hash & (hashSet->ddictPtrTableSize - 1);
-}
-
-/* Adds DDict to a hashset without resizing it.
- * If inserting a DDict with a dictID that already exists in the set, replaces the one in the set.
- * Returns 0 if successful, or a zstd error code if something went wrong.
- */
-static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) {
-	const U32 dictID = ZSTD_getDictID_fromDDict(ddict);
-	size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID);
-	const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1;
-	RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!");
-	DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx);
-	while(hashSet->ddictPtrTable[idx] != NULL) {
-		/* Replace existing ddict if inserting ddict with same dictID */
-		if(ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) {
-			DEBUGLOG(4, "DictID already exists, replacing rather than adding");
-			hashSet->ddictPtrTable[idx] = ddict;
-			return 0;
-		}
-		idx &= idxRangeMask;
-		idx++;
-	}
-	DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx);
-	hashSet->ddictPtrTable[idx] = ddict;
-	hashSet->ddictPtrCount++;
-	return 0;
-}
-
-/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and
- * rehashes all values, allocates new table, frees old table.
- * Returns 0 on success, otherwise a zstd error code.
- */
-static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) {
-	size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR;
-	const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem);
-	const ZSTD_DDict** oldTable = hashSet->ddictPtrTable;
-	size_t oldTableSize = hashSet->ddictPtrTableSize;
-	size_t i;
-
-	DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize);
-	RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!");
-	hashSet->ddictPtrTable = newTable;
-	hashSet->ddictPtrTableSize = newTableSize;
-	hashSet->ddictPtrCount = 0;
-	for(i = 0; i < oldTableSize; ++i) {
-		if(oldTable[i] != NULL) {
-			FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), "");
-		}
-	}
-	ZSTD_customFree((void*)oldTable, customMem);
-	DEBUGLOG(4, "Finished re-hash");
-	return 0;
-}
-
-/* Fetches a DDict with the given dictID
- * Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL.
- */
-static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) {
-	size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID);
-	const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1;
-	DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx);
-	for(;;) {
-		size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]);
-		if(currDictID == dictID || currDictID == 0) {
-			/* currDictID == 0 implies a NULL ddict entry */
-			break;
-		} else {
-			idx &= idxRangeMask; /* Goes to start of table when we reach the end */
-			idx++;
-		}
-	}
-	DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx);
-	return hashSet->ddictPtrTable[idx];
-}
-
-/* Allocates space for and returns a ddict hash set
- * The hash set's ZSTD_DDict* table has all values automatically set to NULL to begin with.
- * Returns NULL if allocation failed.
- */
-static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) {
-	ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem);
-	DEBUGLOG(4, "Allocating new hash set");
-	if(!ret)
-		return NULL;
-	ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem);
-	if(!ret->ddictPtrTable) {
-		ZSTD_customFree(ret, customMem);
-		return NULL;
-	}
-	ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE;
-	ret->ddictPtrCount = 0;
-	return ret;
-}
-
-/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself.
- * Note: The ZSTD_DDict* within the table are NOT freed.
- */
-static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) {
-	DEBUGLOG(4, "Freeing ddict hash set");
-	if(hashSet && hashSet->ddictPtrTable) {
-		ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem);
-	}
-	if(hashSet) {
-		ZSTD_customFree(hashSet, customMem);
-	}
-}
-
-/* Public function: Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set.
- * Returns 0 on success, or a ZSTD error.
- */
-static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) {
-	DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize);
-	if(hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) {
-		FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), "");
-	}
-	FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), "");
-	return 0;
-}
-
-/*-*************************************************************
- *   Context management
- ***************************************************************/
-size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx) {
-	if(dctx == NULL)
-		return 0; /* support sizeof NULL */
-	return sizeof(*dctx) + ZSTD_sizeof_DDict(dctx->ddictLocal) + dctx->inBuffSize + dctx->outBuffSize;
-}
-
-size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
-
-static size_t ZSTD_startingInputLength(ZSTD_format_e format) {
-	size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format);
-	/* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
-	assert((format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless));
-	return startingInputLength;
-}
-
-static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) {
-	assert(dctx->streamStage == zdss_init);
-	dctx->format = ZSTD_f_zstd1;
-	dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
-	dctx->outBufferMode = ZSTD_bm_buffered;
-	dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
-	dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
-	dctx->disableHufAsm = 0;
-}
-
-static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) {
-	dctx->staticSize = 0;
-	dctx->ddict = NULL;
-	dctx->ddictLocal = NULL;
-	dctx->dictEnd = NULL;
-	dctx->ddictIsCold = 0;
-	dctx->dictUses = ZSTD_dont_use;
-	dctx->inBuff = NULL;
-	dctx->inBuffSize = 0;
-	dctx->outBuffSize = 0;
-	dctx->streamStage = zdss_init;
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-	dctx->legacyContext = NULL;
-	dctx->previousLegacyVersion = 0;
-#endif
-	dctx->noForwardProgress = 0;
-	dctx->oversizedDuration = 0;
-#if DYNAMIC_BMI2
-	dctx->bmi2 = ZSTD_cpuSupportsBmi2();
-#endif
-	dctx->ddictSet = NULL;
-	ZSTD_DCtx_resetParameters(dctx);
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-	dctx->dictContentEndForFuzzing = NULL;
-#endif
-}
-
-ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize) {
-	ZSTD_DCtx* const dctx = (ZSTD_DCtx*)workspace;
-
-	if((size_t)workspace & 7)
-		return NULL; /* 8-aligned */
-	if(workspaceSize < sizeof(ZSTD_DCtx))
-		return NULL; /* minimum size */
-
-	ZSTD_initDCtx_internal(dctx);
-	dctx->staticSize = workspaceSize;
-	dctx->inBuff = (char*)(dctx + 1);
-	return dctx;
-}
-
-static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) {
-	if((!customMem.customAlloc) ^ (!customMem.customFree))
-		return NULL;
-
-	{
-		ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem);
-		if(!dctx)
-			return NULL;
-		dctx->customMem = customMem;
-		ZSTD_initDCtx_internal(dctx);
-		return dctx;
-	}
-}
-
-ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) {
-	return ZSTD_createDCtx_internal(customMem);
-}
-
-ZSTD_DCtx* ZSTD_createDCtx(void) {
-	DEBUGLOG(3, "ZSTD_createDCtx");
-	return ZSTD_createDCtx_internal(ZSTD_defaultCMem);
-}
-
-static void ZSTD_clearDict(ZSTD_DCtx* dctx) {
-	ZSTD_freeDDict(dctx->ddictLocal);
-	dctx->ddictLocal = NULL;
-	dctx->ddict = NULL;
-	dctx->dictUses = ZSTD_dont_use;
-}
-
-size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) {
-	if(dctx == NULL)
-		return 0; /* support free on NULL */
-	RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx");
-	{
-		ZSTD_customMem const cMem = dctx->customMem;
-		ZSTD_clearDict(dctx);
-		ZSTD_customFree(dctx->inBuff, cMem);
-		dctx->inBuff = NULL;
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-		if(dctx->legacyContext)
-			ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion);
-#endif
-		if(dctx->ddictSet) {
-			ZSTD_freeDDictHashSet(dctx->ddictSet, cMem);
-			dctx->ddictSet = NULL;
-		}
-		ZSTD_customFree(dctx, cMem);
-		return 0;
-	}
-}
-
-/* no longer useful */
-void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) {
-	size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx);
-	ZSTD_memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */
-}
-
-/* Given a dctx with a digested frame params, re-selects the correct ZSTD_DDict based on
- * the requested dict ID from the frame. If there exists a reference to the correct ZSTD_DDict, then
- * accordingly sets the ddict to be used to decompress the frame.
- *
- * If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is.
- *
- * ZSTD_d_refMultipleDDicts must be enabled for this function to be called.
- */
-static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) {
-	assert(dctx->refMultipleDDicts && dctx->ddictSet);
-	DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame");
-	if(dctx->ddict) {
-		const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID);
-		if(frameDDict) {
-			DEBUGLOG(4, "DDict found!");
-			ZSTD_clearDict(dctx);
-			dctx->dictID = dctx->fParams.dictID;
-			dctx->ddict = frameDDict;
-			dctx->dictUses = ZSTD_use_indefinitely;
-		}
-	}
-}
-
-/*-*************************************************************
- *   Frame header decoding
- ***************************************************************/
-
-/*! ZSTD_isFrame() :
- *  Tells if the content of `buffer` starts with a valid Frame Identifier.
- *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
- *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
- *  Note 3 : Skippable Frame Identifiers are considered valid. */
-unsigned ZSTD_isFrame(const void* buffer, size_t size) {
-	if(size < ZSTD_FRAMEIDSIZE)
-		return 0;
-	{
-		U32 const magic = MEM_readLE32(buffer);
-		if(magic == ZSTD_MAGICNUMBER)
-			return 1;
-		if((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START)
-			return 1;
-	}
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-	if(ZSTD_isLegacy(buffer, size))
-		return 1;
-#endif
-	return 0;
-}
-
-/*! ZSTD_isSkippableFrame() :
- *  Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame.
- *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
- */
-unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) {
-	if(size < ZSTD_FRAMEIDSIZE)
-		return 0;
-	{
-		U32 const magic = MEM_readLE32(buffer);
-		if((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START)
-			return 1;
-	}
-	return 0;
-}
-
-/** ZSTD_frameHeaderSize_internal() :
- *  srcSize must be large enough to reach header size fields.
- *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
- * @return : size of the Frame Header
- *           or an error code, which can be tested with ZSTD_isError() */
-static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) {
-	size_t const minInputSize = ZSTD_startingInputLength(format);
-	RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, "");
-
-	{
-		BYTE const fhd = ((const BYTE*)src)[minInputSize - 1];
-		U32 const dictID = fhd & 3;
-		U32 const singleSegment = (fhd >> 5) & 1;
-		U32 const fcsId = fhd >> 6;
-		return minInputSize + !singleSegment + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + (singleSegment && !fcsId);
-	}
-}
-
-/** ZSTD_frameHeaderSize() :
- *  srcSize must be >= ZSTD_frameHeaderSize_prefix.
- * @return : size of the Frame Header,
- *           or an error code (if srcSize is too small) */
-size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) {
-	return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1);
-}
-
-/** ZSTD_getFrameHeader_advanced() :
- *  decode Frame Header, or require larger `srcSize`.
- *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
- * @return : 0, `zfhPtr` is correctly filled,
- *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- **           or an error code, which can be tested using ZSTD_isError() */
-size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) {
-	const BYTE* ip = (const BYTE*)src;
-	size_t const minInputSize = ZSTD_startingInputLength(format);
-
-	DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
-
-	if(srcSize > 0) {
-		/* note : technically could be considered an assert(), since it's an invalid entry */
-		RETURN_ERROR_IF(src == NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
-	}
-	if(srcSize < minInputSize) {
-		if(srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
-			/* when receiving less than @minInputSize bytes,
-			 * control these bytes at least correspond to a supported magic number
-			 * in order to error out early if they don't.
-			 **/
-			size_t const toCopy = MIN(4, srcSize);
-			unsigned char hbuf[4];
-			MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);
-			assert(src != NULL);
-			ZSTD_memcpy(hbuf, src, toCopy);
-			if(MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER) {
-				/* not a zstd frame : let's check if it's a skippable frame */
-				MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);
-				ZSTD_memcpy(hbuf, src, toCopy);
-				if((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
-					RETURN_ERROR(prefix_unknown,
-					             "first bytes don't correspond to any supported magic number");
-				}
-			}
-		}
-		return minInputSize;
-	}
-
-	ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
-	if((format != ZSTD_f_zstd1_magicless) && (MEM_readLE32(src) != ZSTD_MAGICNUMBER)) {
-		if((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
-			/* skippable frame */
-			if(srcSize < ZSTD_SKIPPABLEHEADERSIZE)
-				return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */
-			ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));
-			zfhPtr->frameContentSize = MEM_readLE32((const char*)src + ZSTD_FRAMEIDSIZE);
-			zfhPtr->frameType = ZSTD_skippableFrame;
-			return 0;
-		}
-		RETURN_ERROR(prefix_unknown, "");
-	}
-
-	/* ensure there is enough `srcSize` to fully read/decode frame header */
-	{
-		size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format);
-		if(srcSize < fhsize)
-			return fhsize;
-		zfhPtr->headerSize = (U32)fhsize;
-	}
-
-	{
-		BYTE const fhdByte = ip[minInputSize - 1];
-		size_t pos = minInputSize;
-		U32 const dictIDSizeCode = fhdByte & 3;
-		U32 const checksumFlag = (fhdByte >> 2) & 1;
-		U32 const singleSegment = (fhdByte >> 5) & 1;
-		U32 const fcsID = fhdByte >> 6;
-		U64 windowSize = 0;
-		U32 dictID = 0;
-		U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN;
-		RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported,
-		                "reserved bits, must be zero");
-
-		if(!singleSegment) {
-			BYTE const wlByte = ip[pos++];
-			U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
-			RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, "");
-			windowSize = (1ULL << windowLog);
-			windowSize += (windowSize >> 3) * (wlByte & 7);
-		}
-		switch(dictIDSizeCode) {
-		default:
-			assert(0); /* impossible */
-			ZSTD_FALLTHROUGH;
-		case 0:
-			break;
-		case 1:
-			dictID = ip[pos];
-			pos++;
-			break;
-		case 2:
-			dictID = MEM_readLE16(ip + pos);
-			pos += 2;
-			break;
-		case 3:
-			dictID = MEM_readLE32(ip + pos);
-			pos += 4;
-			break;
-		}
-		switch(fcsID) {
-		default:
-			assert(0); /* impossible */
-			ZSTD_FALLTHROUGH;
-		case 0:
-			if(singleSegment)
-				frameContentSize = ip[pos];
-			break;
-		case 1:
-			frameContentSize = MEM_readLE16(ip + pos) + 256;
-			break;
-		case 2:
-			frameContentSize = MEM_readLE32(ip + pos);
-			break;
-		case 3:
-			frameContentSize = MEM_readLE64(ip + pos);
-			break;
-		}
-		if(singleSegment)
-			windowSize = frameContentSize;
-
-		zfhPtr->frameType = ZSTD_frame;
-		zfhPtr->frameContentSize = frameContentSize;
-		zfhPtr->windowSize = windowSize;
-		zfhPtr->blockSizeMax = (unsigned)MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
-		zfhPtr->dictID = dictID;
-		zfhPtr->checksumFlag = checksumFlag;
-	}
-	return 0;
-}
-
-/** ZSTD_getFrameHeader() :
- *  decode Frame Header, or require larger `srcSize`.
- *  note : this function does not consume input, it only reads it.
- * @return : 0, `zfhPtr` is correctly filled,
- *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
-size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) {
-	return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1);
-}
-
-/** ZSTD_getFrameContentSize() :
- *  compatible with legacy mode
- * @return : decompressed size of the single frame pointed to be `src` if known, otherwise
- *         - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
- *         - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
-unsigned long long ZSTD_getFrameContentSize(const void* src, size_t srcSize) {
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-	if(ZSTD_isLegacy(src, srcSize)) {
-		unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize);
-		return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret;
-	}
-#endif
-	{
-		ZSTD_frameHeader zfh;
-		if(ZSTD_getFrameHeader(&zfh, src, srcSize) != 0)
-			return ZSTD_CONTENTSIZE_ERROR;
-		if(zfh.frameType == ZSTD_skippableFrame) {
-			return 0;
-		} else {
-			return zfh.frameContentSize;
-		}
-	}
-}
-
-static size_t readSkippableFrameSize(void const * src, size_t srcSize) {
-	size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE;
-	U32 sizeU32;
-
-	RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
-
-	sizeU32 = MEM_readLE32((BYTE const *)src + ZSTD_FRAMEIDSIZE);
-	RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
-	                frameParameter_unsupported, "");
-	{
-		size_t const skippableSize = skippableHeaderSize + sizeU32;
-		RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
-		return skippableSize;
-	}
-}
-
-/*! ZSTD_readSkippableFrame() :
- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer.
- *
- * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written,
- * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START.  This can be NULL if the caller is not interested
- * in the magicVariant.
- *
- * Returns an error if destination buffer is not large enough, or if the frame is not skippable.
- *
- * @return : number of bytes written or a ZSTD error.
- */
-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant,
-                                           const void* src, size_t srcSize) {
-	U32 const magicNumber = MEM_readLE32(src);
-	size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
-	size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
-
-	/* check input validity */
-	RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
-	RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
-	RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
-
-	/* deliver payload */
-	if(skippableContentSize > 0 && dst != NULL)
-		ZSTD_memcpy(dst, (const BYTE*)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
-	if(magicVariant != NULL)
-		*magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
-	return skippableContentSize;
-}
-
-/** ZSTD_findDecompressedSize() :
- *  compatible with legacy mode
- *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
- *      skippable frames
- *  @return : decompressed size of the frames contained */
-unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) {
-	unsigned long long totalDstSize = 0;
-
-	while(srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) {
-		U32 const magicNumber = MEM_readLE32(src);
-
-		if((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
-			size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-			if(ZSTD_isError(skippableSize)) {
-				return ZSTD_CONTENTSIZE_ERROR;
-			}
-			assert(skippableSize <= srcSize);
-
-			src = (const BYTE*)src + skippableSize;
-			srcSize -= skippableSize;
-			continue;
-		}
-
-		{
-			unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
-			if(ret >= ZSTD_CONTENTSIZE_ERROR)
-				return ret;
-
-			/* check for overflow */
-			if(totalDstSize + ret < totalDstSize)
-				return ZSTD_CONTENTSIZE_ERROR;
-			totalDstSize += ret;
-		}
-		{
-			size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
-			if(ZSTD_isError(frameSrcSize)) {
-				return ZSTD_CONTENTSIZE_ERROR;
-			}
-
-			src = (const BYTE*)src + frameSrcSize;
-			srcSize -= frameSrcSize;
-		}
-	} /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
-
-	if(srcSize)
-		return ZSTD_CONTENTSIZE_ERROR;
-
-	return totalDstSize;
-}
-
-/** ZSTD_getDecompressedSize() :
- *  compatible with legacy mode
- * @return : decompressed size if known, 0 otherwise
-             note : 0 can mean any of the following :
-                   - frame content is empty
-                   - decompressed size field is not present in frame header
-                   - frame header unknown / not supported
-                   - frame header not complete (`srcSize` too small) */
-unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) {
-	unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
-	ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN);
-	return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret;
-}
-
-/** ZSTD_decodeFrameHeader() :
- * `headerSize` must be the size provided by ZSTD_frameHeaderSize().
- * If multiple DDict references are enabled, also will choose the correct DDict to use.
- * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
-static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) {
-	size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format);
-	if(ZSTD_isError(result))
-		return result; /* invalid header */
-	RETURN_ERROR_IF(result > 0, srcSize_wrong, "headerSize too small");
-
-	/* Reference DDict requested by frame if dctx references multiple ddicts */
-	if(dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) {
-		ZSTD_DCtx_selectFrameDDict(dctx);
-	}
-
-#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-	/* Skip the dictID check in fuzzing mode, because it makes the search
-	 * harder.
-	 */
-	RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID),
-	                dictionary_wrong, "");
-#endif
-	dctx->validateChecksum = (dctx->fParams.checksumFlag && !dctx->forceIgnoreChecksum) ? 1 : 0;
-	if(dctx->validateChecksum)
-		XXH64_reset(&dctx->xxhState, 0);
-	dctx->processedCSize += headerSize;
-	return 0;
-}
-
-static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) {
-	ZSTD_frameSizeInfo frameSizeInfo;
-	frameSizeInfo.compressedSize = ret;
-	frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
-	return frameSizeInfo;
-}
-
-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) {
-	ZSTD_frameSizeInfo frameSizeInfo;
-	ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo));
-
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-	if(ZSTD_isLegacy(src, srcSize))
-		return ZSTD_findFrameSizeInfoLegacy(src, srcSize);
-#endif
-
-	if((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
-		frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize);
-		assert(ZSTD_isError(frameSizeInfo.compressedSize) ||
-		       frameSizeInfo.compressedSize <= srcSize);
-		return frameSizeInfo;
-	} else {
-		const BYTE* ip = (const BYTE*)src;
-		const BYTE* const ipstart = ip;
-		size_t remainingSize = srcSize;
-		size_t nbBlocks = 0;
-		ZSTD_frameHeader zfh;
-
-		/* Extract Frame Header */
-		{
-			size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
-			if(ZSTD_isError(ret))
-				return ZSTD_errorFrameSizeInfo(ret);
-			if(ret > 0)
-				return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
-		}
-
-		ip += zfh.headerSize;
-		remainingSize -= zfh.headerSize;
-
-		/* Iterate over each block */
-		while(1) {
-			blockProperties_t blockProperties;
-			size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
-			if(ZSTD_isError(cBlockSize))
-				return ZSTD_errorFrameSizeInfo(cBlockSize);
-
-			if(ZSTD_blockHeaderSize + cBlockSize > remainingSize)
-				return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
-
-			ip += ZSTD_blockHeaderSize + cBlockSize;
-			remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
-			nbBlocks++;
-
-			if(blockProperties.lastBlock)
-				break;
-		}
-
-		/* Final frame content checksum */
-		if(zfh.checksumFlag) {
-			if(remainingSize < 4)
-				return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
-			ip += 4;
-		}
-
-		frameSizeInfo.nbBlocks = nbBlocks;
-		frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
-		frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
-		                                      ? zfh.frameContentSize
-		                                      : (unsigned long long)nbBlocks * zfh.blockSizeMax;
-		return frameSizeInfo;
-	}
-}
-
-/** ZSTD_findFrameCompressedSize() :
- *  compatible with legacy mode
- *  `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
- *  `srcSize` must be at least as large as the frame contained
- *  @return : the compressed size of the frame starting at `src` */
-size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) {
-	ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
-	return frameSizeInfo.compressedSize;
-}
-
-/** ZSTD_decompressBound() :
- *  compatible with legacy mode
- *  `src` must point to the start of a ZSTD frame or a skippeable frame
- *  `srcSize` must be at least as large as the frame contained
- *  @return : the maximum decompressed size of the compressed source
- */
-unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) {
-	unsigned long long bound = 0;
-	/* Iterate over each frame */
-	while(srcSize > 0) {
-		ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
-		size_t const compressedSize = frameSizeInfo.compressedSize;
-		unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
-		if(ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
-			return ZSTD_CONTENTSIZE_ERROR;
-		assert(srcSize >= compressedSize);
-		src = (const BYTE*)src + compressedSize;
-		srcSize -= compressedSize;
-		bound += decompressedBound;
-	}
-	return bound;
-}
-
-size_t ZSTD_decompressionMargin(void const * src, size_t srcSize) {
-	size_t margin = 0;
-	unsigned maxBlockSize = 0;
-
-	/* Iterate over each frame */
-	while(srcSize > 0) {
-		ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
-		size_t const compressedSize = frameSizeInfo.compressedSize;
-		unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
-		ZSTD_frameHeader zfh;
-
-		FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");
-		if(ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
-			return ERROR(corruption_detected);
-
-		if(zfh.frameType == ZSTD_frame) {
-			/* Add the frame header to our margin */
-			margin += zfh.headerSize;
-			/* Add the checksum to our margin */
-			margin += zfh.checksumFlag ? 4 : 0;
-			/* Add 3 bytes per block */
-			margin += 3 * frameSizeInfo.nbBlocks;
-
-			/* Compute the max block size */
-			maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
-		} else {
-			assert(zfh.frameType == ZSTD_skippableFrame);
-			/* Add the entire skippable frame size to our margin. */
-			margin += compressedSize;
-		}
-
-		assert(srcSize >= compressedSize);
-		src = (const BYTE*)src + compressedSize;
-		srcSize -= compressedSize;
-	}
-
-	/* Add the max block size back to the margin. */
-	margin += maxBlockSize;
-
-	return margin;
-}
-
-/*-*************************************************************
- *   Frame decoding
- ***************************************************************/
-
-/** ZSTD_insertBlock() :
- *  insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
-size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize) {
-	DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize);
-	ZSTD_checkContinuity(dctx, blockStart, blockSize);
-	dctx->previousDstEnd = (const char*)blockStart + blockSize;
-	return blockSize;
-}
-
-static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
-                                const void* src, size_t srcSize) {
-	DEBUGLOG(5, "ZSTD_copyRawBlock");
-	RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, "");
-	if(dst == NULL) {
-		if(srcSize == 0)
-			return 0;
-		RETURN_ERROR(dstBuffer_null, "");
-	}
-	ZSTD_memmove(dst, src, srcSize);
-	return srcSize;
-}
-
-static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
-                               BYTE b,
-                               size_t regenSize) {
-	RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, "");
-	if(dst == NULL) {
-		if(regenSize == 0)
-			return 0;
-		RETURN_ERROR(dstBuffer_null, "");
-	}
-	ZSTD_memset(dst, b, regenSize);
-	return regenSize;
-}
-
-static void ZSTD_DCtx_trace_end(ZSTD_DCtx const * dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) {
-#if ZSTD_TRACE
-	if(dctx->traceCtx && ZSTD_trace_decompress_end != NULL) {
-		ZSTD_Trace trace;
-		ZSTD_memset(&trace, 0, sizeof(trace));
-		trace.version = ZSTD_VERSION_NUMBER;
-		trace.streaming = streaming;
-		if(dctx->ddict) {
-			trace.dictionaryID = ZSTD_getDictID_fromDDict(dctx->ddict);
-			trace.dictionarySize = ZSTD_DDict_dictSize(dctx->ddict);
-			trace.dictionaryIsCold = dctx->ddictIsCold;
-		}
-		trace.uncompressedSize = (size_t)uncompressedSize;
-		trace.compressedSize = (size_t)compressedSize;
-		trace.dctx = dctx;
-		ZSTD_trace_decompress_end(dctx->traceCtx, &trace);
-	}
-#else
-	(void)dctx;
-	(void)uncompressedSize;
-	(void)compressedSize;
-	(void)streaming;
-#endif
-}
-
-/*! ZSTD_decompressFrame() :
- * @dctx must be properly initialized
- *  will update *srcPtr and *srcSizePtr,
- *  to make *srcPtr progress by one frame. */
-static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
-                                   void* dst, size_t dstCapacity,
-                                   const void** srcPtr, size_t* srcSizePtr) {
-	const BYTE* const istart = (const BYTE*)(*srcPtr);
-	const BYTE* ip = istart;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart;
-	BYTE* op = ostart;
-	size_t remainingSrcSize = *srcSizePtr;
-
-	DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr);
-
-	/* check */
-	RETURN_ERROR_IF(
-	    remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format) + ZSTD_blockHeaderSize,
-	    srcSize_wrong, "");
-
-	/* Frame Header */
-	{
-		size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal(
-		    ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format);
-		if(ZSTD_isError(frameHeaderSize))
-			return frameHeaderSize;
-		RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize + ZSTD_blockHeaderSize,
-		                srcSize_wrong, "");
-		FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize), "");
-		ip += frameHeaderSize;
-		remainingSrcSize -= frameHeaderSize;
-	}
-
-	/* Loop on each block */
-	while(1) {
-		BYTE* oBlockEnd = oend;
-		size_t decodedSize;
-		blockProperties_t blockProperties;
-		size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties);
-		if(ZSTD_isError(cBlockSize))
-			return cBlockSize;
-
-		ip += ZSTD_blockHeaderSize;
-		remainingSrcSize -= ZSTD_blockHeaderSize;
-		RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, "");
-
-		if(ip >= op && ip < oBlockEnd) {
-			/* We are decompressing in-place. Limit the output pointer so that we
-			 * don't overwrite the block that we are currently reading. This will
-			 * fail decompression if the input & output pointers aren't spaced
-			 * far enough apart.
-			 *
-			 * This is important to set, even when the pointers are far enough
-			 * apart, because ZSTD_decompressBlock_internal() can decide to store
-			 * literals in the output buffer, after the block it is decompressing.
-			 * Since we don't want anything to overwrite our input, we have to tell
-			 * ZSTD_decompressBlock_internal to never write past ip.
-			 *
-			 * See ZSTD_allocateLiteralsBuffer() for reference.
-			 */
-			oBlockEnd = op + (ip - op);
-		}
-
-		switch(blockProperties.blockType) {
-		case bt_compressed:
-			decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd - op), ip, cBlockSize, /* frame */ 1, not_streaming);
-			break;
-		case bt_raw:
-			/* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */
-			decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend - op), ip, cBlockSize);
-			break;
-		case bt_rle:
-			decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd - op), *ip, blockProperties.origSize);
-			break;
-		case bt_reserved:
-		default:
-			RETURN_ERROR(corruption_detected, "invalid block type");
-		}
-
-		if(ZSTD_isError(decodedSize))
-			return decodedSize;
-		if(dctx->validateChecksum)
-			XXH64_update(&dctx->xxhState, op, decodedSize);
-		if(decodedSize != 0)
-			op += decodedSize;
-		assert(ip != NULL);
-		ip += cBlockSize;
-		remainingSrcSize -= cBlockSize;
-		if(blockProperties.lastBlock)
-			break;
-	}
-
-	if(dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {
-		RETURN_ERROR_IF((U64)(op - ostart) != dctx->fParams.frameContentSize,
-		                corruption_detected, "");
-	}
-	if(dctx->fParams.checksumFlag) { /* Frame content checksum verification */
-		RETURN_ERROR_IF(remainingSrcSize < 4, checksum_wrong, "");
-		if(!dctx->forceIgnoreChecksum) {
-			U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState);
-			U32 checkRead;
-			checkRead = MEM_readLE32(ip);
-			RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, "");
-		}
-		ip += 4;
-		remainingSrcSize -= 4;
-	}
-	ZSTD_DCtx_trace_end(dctx, (U64)(op - ostart), (U64)(ip - istart), /* streaming */ 0);
-	/* Allow caller to get size read */
-	DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op - ostart, ip - (const BYTE*)*srcPtr);
-	*srcPtr = ip;
-	*srcSizePtr = remainingSrcSize;
-	return (size_t)(op - ostart);
-}
-
-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
-                                        void* dst, size_t dstCapacity,
-                                        const void* src, size_t srcSize,
-                                        const void* dict, size_t dictSize,
-                                        const ZSTD_DDict* ddict) {
-	void* const dststart = dst;
-	int moreThan1Frame = 0;
-
-	DEBUGLOG(5, "ZSTD_decompressMultiFrame");
-	assert(dict == NULL || ddict == NULL); /* either dict or ddict set, not both */
-
-	if(ddict) {
-		dict = ZSTD_DDict_dictContent(ddict);
-		dictSize = ZSTD_DDict_dictSize(ddict);
-	}
-
-	while(srcSize >= ZSTD_startingInputLength(dctx->format)) {
-
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-		if(ZSTD_isLegacy(src, srcSize)) {
-			size_t decodedSize;
-			size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
-			if(ZSTD_isError(frameSize))
-				return frameSize;
-			RETURN_ERROR_IF(dctx->staticSize, memory_allocation,
-			                "legacy support is not compatible with static dctx");
-
-			decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
-			if(ZSTD_isError(decodedSize))
-				return decodedSize;
-
-			assert(decodedSize <= dstCapacity);
-			dst = (BYTE*)dst + decodedSize;
-			dstCapacity -= decodedSize;
-
-			src = (const BYTE*)src + frameSize;
-			srcSize -= frameSize;
-
-			continue;
-		}
-#endif
-
-		{
-			U32 const magicNumber = MEM_readLE32(src);
-			DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
-			         (unsigned)magicNumber, ZSTD_MAGICNUMBER);
-			if((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
-				size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-				FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
-				assert(skippableSize <= srcSize);
-
-				src = (const BYTE*)src + skippableSize;
-				srcSize -= skippableSize;
-				continue;
-			}
-		}
-
-		if(ddict) {
-			/* we were called from ZSTD_decompress_usingDDict */
-			FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), "");
-		} else {
-			/* this will initialize correctly with no dict if dict == NULL, so
-			 * use this in all cases but ddict */
-			FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), "");
-		}
-		ZSTD_checkContinuity(dctx, dst, dstCapacity);
-
-		{
-			const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
-			                                        &src, &srcSize);
-			RETURN_ERROR_IF(
-			    (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown) && (moreThan1Frame == 1),
-			    srcSize_wrong,
-			    "At least one frame successfully completed, "
-			    "but following bytes are garbage: "
-			    "it's more likely to be a srcSize error, "
-			    "specifying more input bytes than size of frame(s). "
-			    "Note: one could be unlucky, it might be a corruption error instead, "
-			    "happening right at the place where we expect zstd magic bytes. "
-			    "But this is _much_ less likely than a srcSize field error.");
-			if(ZSTD_isError(res))
-				return res;
-			assert(res <= dstCapacity);
-			if(res != 0)
-				dst = (BYTE*)dst + res;
-			dstCapacity -= res;
-		}
-		moreThan1Frame = 1;
-	} /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
-
-	RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed");
-
-	return (size_t)((BYTE*)dst - (BYTE*)dststart);
-}
-
-size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
-                                 void* dst, size_t dstCapacity,
-                                 const void* src, size_t srcSize,
-                                 const void* dict, size_t dictSize) {
-	return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL);
-}
-
-static ZSTD_DDict const * ZSTD_getDDict(ZSTD_DCtx* dctx) {
-	switch(dctx->dictUses) {
-	default:
-		assert(0 /* Impossible */);
-		ZSTD_FALLTHROUGH;
-	case ZSTD_dont_use:
-		ZSTD_clearDict(dctx);
-		return NULL;
-	case ZSTD_use_indefinitely:
-		return dctx->ddict;
-	case ZSTD_use_once:
-		dctx->dictUses = ZSTD_dont_use;
-		return dctx->ddict;
-	}
-}
-
-size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) {
-	return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ZSTD_getDDict(dctx));
-}
-
-size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize) {
-#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE >= 1)
-	size_t regenSize;
-	ZSTD_DCtx* const dctx = ZSTD_createDCtx_internal(ZSTD_defaultCMem);
-	RETURN_ERROR_IF(dctx == NULL, memory_allocation, "NULL pointer!");
-	regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
-	ZSTD_freeDCtx(dctx);
-	return regenSize;
-#else /* stack mode */
-	ZSTD_DCtx dctx;
-	ZSTD_initDCtx_internal(&dctx);
-	return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
-#endif
-}
-
-/*-**************************************
- *   Advanced Streaming Decompression API
- *   Bufferless and synchronous
- ****************************************/
-size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
-
-/**
- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we
- * allow taking a partial block as the input. Currently only raw uncompressed blocks can
- * be streamed.
- *
- * For blocks that can be streamed, this allows us to reduce the latency until we produce
- * output, and avoid copying the input.
- *
- * @param inputSize - The total amount of input that the caller currently has.
- */
-static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) {
-	if(!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock))
-		return dctx->expected;
-	if(dctx->bType != bt_raw)
-		return dctx->expected;
-	return BOUNDED(1, inputSize, dctx->expected);
-}
-
-ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
-	switch(dctx->stage) {
-	default: /* should not happen */
-		assert(0);
-		ZSTD_FALLTHROUGH;
-	case ZSTDds_getFrameHeaderSize:
-		ZSTD_FALLTHROUGH;
-	case ZSTDds_decodeFrameHeader:
-		return ZSTDnit_frameHeader;
-	case ZSTDds_decodeBlockHeader:
-		return ZSTDnit_blockHeader;
-	case ZSTDds_decompressBlock:
-		return ZSTDnit_block;
-	case ZSTDds_decompressLastBlock:
-		return ZSTDnit_lastBlock;
-	case ZSTDds_checkChecksum:
-		return ZSTDnit_checksum;
-	case ZSTDds_decodeSkippableHeader:
-		ZSTD_FALLTHROUGH;
-	case ZSTDds_skipFrame:
-		return ZSTDnit_skippableFrame;
-	}
-}
-
-static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; }
-
-/** ZSTD_decompressContinue() :
- *  srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress())
- *  @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity)
- *            or an error code, which can be tested using ZSTD_isError() */
-size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) {
-	DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize);
-	/* Sanity check */
-	RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed");
-	ZSTD_checkContinuity(dctx, dst, dstCapacity);
-
-	dctx->processedCSize += srcSize;
-
-	switch(dctx->stage) {
-	case ZSTDds_getFrameHeaderSize:
-		assert(src != NULL);
-		if(dctx->format == ZSTD_f_zstd1) {                                                      /* allows header */
-			assert(srcSize >= ZSTD_FRAMEIDSIZE);                                                /* to read skippable magic number */
-			if((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
-				ZSTD_memcpy(dctx->headerBuffer, src, srcSize);
-				dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */
-				dctx->stage = ZSTDds_decodeSkippableHeader;
-				return 0;
-			}
-		}
-		dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format);
-		if(ZSTD_isError(dctx->headerSize))
-			return dctx->headerSize;
-		ZSTD_memcpy(dctx->headerBuffer, src, srcSize);
-		dctx->expected = dctx->headerSize - srcSize;
-		dctx->stage = ZSTDds_decodeFrameHeader;
-		return 0;
-
-	case ZSTDds_decodeFrameHeader:
-		assert(src != NULL);
-		ZSTD_memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize);
-		FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), "");
-		dctx->expected = ZSTD_blockHeaderSize;
-		dctx->stage = ZSTDds_decodeBlockHeader;
-		return 0;
-
-	case ZSTDds_decodeBlockHeader: {
-		blockProperties_t bp;
-		size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
-		if(ZSTD_isError(cBlockSize))
-			return cBlockSize;
-		RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum");
-		dctx->expected = cBlockSize;
-		dctx->bType = bp.blockType;
-		dctx->rleSize = bp.origSize;
-		if(cBlockSize) {
-			dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
-			return 0;
-		}
-		/* empty block */
-		if(bp.lastBlock) {
-			if(dctx->fParams.checksumFlag) {
-				dctx->expected = 4;
-				dctx->stage = ZSTDds_checkChecksum;
-			} else {
-				dctx->expected = 0; /* end of frame */
-				dctx->stage = ZSTDds_getFrameHeaderSize;
-			}
-		} else {
-			dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */
-			dctx->stage = ZSTDds_decodeBlockHeader;
-		}
-		return 0;
-	}
-
-	case ZSTDds_decompressLastBlock:
-	case ZSTDds_decompressBlock:
-		DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock");
-		{
-			size_t rSize;
-			switch(dctx->bType) {
-			case bt_compressed:
-				DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
-				rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming);
-				dctx->expected = 0; /* Streaming not supported */
-				break;
-			case bt_raw:
-				assert(srcSize <= dctx->expected);
-				rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize);
-				FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed");
-				assert(rSize == srcSize);
-				dctx->expected -= rSize;
-				break;
-			case bt_rle:
-				rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize);
-				dctx->expected = 0; /* Streaming not supported */
-				break;
-			case bt_reserved: /* should never happen */
-			default:
-				RETURN_ERROR(corruption_detected, "invalid block type");
-			}
-			FORWARD_IF_ERROR(rSize, "");
-			RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum");
-			DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize);
-			dctx->decodedSize += rSize;
-			if(dctx->validateChecksum)
-				XXH64_update(&dctx->xxhState, dst, rSize);
-			dctx->previousDstEnd = (char*)dst + rSize;
-
-			/* Stay on the same stage until we are finished streaming the block. */
-			if(dctx->expected > 0) {
-				return rSize;
-			}
-
-			if(dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */
-				DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize);
-				RETURN_ERROR_IF(
-				    dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN && dctx->decodedSize != dctx->fParams.frameContentSize,
-				    corruption_detected, "");
-				if(dctx->fParams.checksumFlag) { /* another round for frame checksum */
-					dctx->expected = 4;
-					dctx->stage = ZSTDds_checkChecksum;
-				} else {
-					ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1);
-					dctx->expected = 0; /* ends here */
-					dctx->stage = ZSTDds_getFrameHeaderSize;
-				}
-			} else {
-				dctx->stage = ZSTDds_decodeBlockHeader;
-				dctx->expected = ZSTD_blockHeaderSize;
-			}
-			return rSize;
-		}
-
-	case ZSTDds_checkChecksum:
-		assert(srcSize == 4); /* guaranteed by dctx->expected */
-		{
-			if(dctx->validateChecksum) {
-				U32 const h32 = (U32)XXH64_digest(&dctx->xxhState);
-				U32 const check32 = MEM_readLE32(src);
-				DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32);
-				RETURN_ERROR_IF(check32 != h32, checksum_wrong, "");
-			}
-			ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1);
-			dctx->expected = 0;
-			dctx->stage = ZSTDds_getFrameHeaderSize;
-			return 0;
-		}
-
-	case ZSTDds_decodeSkippableHeader:
-		assert(src != NULL);
-		assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE);
-		ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */
-		dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);                 /* note : dctx->expected can grow seriously large, beyond local buffer size */
-		dctx->stage = ZSTDds_skipFrame;
-		return 0;
-
-	case ZSTDds_skipFrame:
-		dctx->expected = 0;
-		dctx->stage = ZSTDds_getFrameHeaderSize;
-		return 0;
-
-	default:
-		assert(0);                                    /* impossible */
-		RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */
-	}
-}
-
-static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) {
-	dctx->dictEnd = dctx->previousDstEnd;
-	dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
-	dctx->prefixStart = dict;
-	dctx->previousDstEnd = (const char*)dict + dictSize;
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-	dctx->dictContentBeginForFuzzing = dctx->prefixStart;
-	dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
-#endif
-	return 0;
-}
-
-/*! ZSTD_loadDEntropy() :
- *  dict : must point at beginning of a valid zstd dictionary.
- * @return : size of entropy tables read */
-size_t
-ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
-                  const void* const dict, size_t const dictSize) {
-	const BYTE* dictPtr = (const BYTE*)dict;
-	const BYTE* const dictEnd = dictPtr + dictSize;
-
-	RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small");
-	assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */
-	dictPtr += 8;                                        /* skip header = magic + dictID */
-
-	ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
-	ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
-	ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
-	{
-		void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */
-		size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
-#ifdef HUF_FORCE_DECOMPRESS_X1
-		/* in minimal huffman, we always use X1 variants */
-		size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
-		                                           dictPtr, dictEnd - dictPtr,
-		                                           workspace, workspaceSize, /* flags */ 0);
-#else
-		size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
-		                                           dictPtr, (size_t)(dictEnd - dictPtr),
-		                                           workspace, workspaceSize, /* flags */ 0);
-#endif
-		RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
-		dictPtr += hSize;
-	}
-
-	{
-		short offcodeNCount[MaxOff + 1];
-		unsigned offcodeMaxValue = MaxOff, offcodeLog;
-		size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd - dictPtr));
-		RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
-		RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, "");
-		RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
-		ZSTD_buildFSETable(entropy->OFTable,
-		                   offcodeNCount, offcodeMaxValue,
-		                   OF_base, OF_bits,
-		                   offcodeLog,
-		                   entropy->workspace, sizeof(entropy->workspace),
-		                   /* bmi2 */ 0);
-		dictPtr += offcodeHeaderSize;
-	}
-
-	{
-		short matchlengthNCount[MaxML + 1];
-		unsigned matchlengthMaxValue = MaxML, matchlengthLog;
-		size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd - dictPtr));
-		RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
-		RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, "");
-		RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
-		ZSTD_buildFSETable(entropy->MLTable,
-		                   matchlengthNCount, matchlengthMaxValue,
-		                   ML_base, ML_bits,
-		                   matchlengthLog,
-		                   entropy->workspace, sizeof(entropy->workspace),
-		                   /* bmi2 */ 0);
-		dictPtr += matchlengthHeaderSize;
-	}
-
-	{
-		short litlengthNCount[MaxLL + 1];
-		unsigned litlengthMaxValue = MaxLL, litlengthLog;
-		size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd - dictPtr));
-		RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
-		RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, "");
-		RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
-		ZSTD_buildFSETable(entropy->LLTable,
-		                   litlengthNCount, litlengthMaxValue,
-		                   LL_base, LL_bits,
-		                   litlengthLog,
-		                   entropy->workspace, sizeof(entropy->workspace),
-		                   /* bmi2 */ 0);
-		dictPtr += litlengthHeaderSize;
-	}
-
-	RETURN_ERROR_IF(dictPtr + 12 > dictEnd, dictionary_corrupted, "");
-	{
-		int i;
-		size_t const dictContentSize = (size_t)(dictEnd - (dictPtr + 12));
-		for(i = 0; i < 3; i++) {
-			U32 const rep = MEM_readLE32(dictPtr);
-			dictPtr += 4;
-			RETURN_ERROR_IF(rep == 0 || rep > dictContentSize,
-			                dictionary_corrupted, "");
-			entropy->rep[i] = rep;
-		}
-	}
-
-	return (size_t)(dictPtr - (const BYTE*)dict);
-}
-
-static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) {
-	if(dictSize < 8)
-		return ZSTD_refDictContent(dctx, dict, dictSize);
-	{
-		U32 const magic = MEM_readLE32(dict);
-		if(magic != ZSTD_MAGIC_DICTIONARY) {
-			return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */
-		}
-	}
-	dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
-
-	/* load entropy tables */
-	{
-		size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize);
-		RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, "");
-		dict = (const char*)dict + eSize;
-		dictSize -= eSize;
-	}
-	dctx->litEntropy = dctx->fseEntropy = 1;
-
-	/* reference dictionary content */
-	return ZSTD_refDictContent(dctx, dict, dictSize);
-}
-
-size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) {
-	assert(dctx != NULL);
-#if ZSTD_TRACE
-	dctx->traceCtx = (ZSTD_trace_decompress_begin != NULL) ? ZSTD_trace_decompress_begin(dctx) : 0;
-#endif
-	dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */
-	dctx->stage = ZSTDds_getFrameHeaderSize;
-	dctx->processedCSize = 0;
-	dctx->decodedSize = 0;
-	dctx->previousDstEnd = NULL;
-	dctx->prefixStart = NULL;
-	dctx->virtualStart = NULL;
-	dctx->dictEnd = NULL;
-	dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
-	dctx->litEntropy = dctx->fseEntropy = 0;
-	dctx->dictID = 0;
-	dctx->bType = bt_reserved;
-	ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
-	ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */
-	dctx->LLTptr = dctx->entropy.LLTable;
-	dctx->MLTptr = dctx->entropy.MLTable;
-	dctx->OFTptr = dctx->entropy.OFTable;
-	dctx->HUFptr = dctx->entropy.hufTable;
-	return 0;
-}
-
-size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) {
-	FORWARD_IF_ERROR(ZSTD_decompressBegin(dctx), "");
-	if(dict && dictSize)
-		RETURN_ERROR_IF(
-		    ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)),
-		    dictionary_corrupted, "");
-	return 0;
-}
-
-/* ======   ZSTD_DDict   ====== */
-
-size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) {
-	DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
-	assert(dctx != NULL);
-	if(ddict) {
-		const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict);
-		size_t const dictSize = ZSTD_DDict_dictSize(ddict);
-		const void* const dictEnd = dictStart + dictSize;
-		dctx->ddictIsCold = (dctx->dictEnd != dictEnd);
-		DEBUGLOG(4, "DDict is %s",
-		         dctx->ddictIsCold ? "~cold~" : "hot!");
-	}
-	FORWARD_IF_ERROR(ZSTD_decompressBegin(dctx), "");
-	if(ddict) { /* NULL ddict is equivalent to no dictionary */
-		ZSTD_copyDDictParameters(dctx, ddict);
-	}
-	return 0;
-}
-
-/*! ZSTD_getDictID_fromDict() :
- *  Provides the dictID stored within dictionary.
- *  if @return == 0, the dictionary is not conformant with Zstandard specification.
- *  It can still be loaded, but as a content-only dictionary. */
-unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) {
-	if(dictSize < 8)
-		return 0;
-	if(MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY)
-		return 0;
-	return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
-}
-
-/*! ZSTD_getDictID_fromFrame() :
- *  Provides the dictID required to decompress frame stored within `src`.
- *  If @return == 0, the dictID could not be decoded.
- *  This could for one of the following reasons :
- *  - The frame does not require a dictionary (most common case).
- *  - The frame was built with dictID intentionally removed.
- *    Needed dictionary is a hidden piece of information.
- *    Note : this use case also happens when using a non-conformant dictionary.
- *  - `srcSize` is too small, and as a result, frame header could not be decoded.
- *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
- *  - This is not a Zstandard frame.
- *  When identifying the exact failure cause, it's possible to use
- *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
-unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) {
-	ZSTD_frameHeader zfp = {0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0};
-	size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
-	if(ZSTD_isError(hError))
-		return 0;
-	return zfp.dictID;
-}
-
-/*! ZSTD_decompress_usingDDict() :
- *   Decompression using a pre-digested Dictionary
- *   Use dictionary without significant overhead. */
-size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
-                                  void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize,
-                                  const ZSTD_DDict* ddict) {
-	/* pass content and size in case legacy frames are encountered */
-	return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize,
-	                                 NULL, 0,
-	                                 ddict);
-}
-
-/*=====================================
- *   Streaming decompression
- *====================================*/
-
-ZSTD_DStream* ZSTD_createDStream(void) {
-	DEBUGLOG(3, "ZSTD_createDStream");
-	return ZSTD_createDCtx_internal(ZSTD_defaultCMem);
-}
-
-ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize) {
-	return ZSTD_initStaticDCtx(workspace, workspaceSize);
-}
-
-ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) {
-	return ZSTD_createDCtx_internal(customMem);
-}
-
-size_t ZSTD_freeDStream(ZSTD_DStream* zds) {
-	return ZSTD_freeDCtx(zds);
-}
-
-/* ***  Initialization  *** */
-
-size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
-size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
-
-size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
-                                         const void* dict, size_t dictSize,
-                                         ZSTD_dictLoadMethod_e dictLoadMethod,
-                                         ZSTD_dictContentType_e dictContentType) {
-	RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
-	ZSTD_clearDict(dctx);
-	if(dict && dictSize != 0) {
-		dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem);
-		RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!");
-		dctx->ddict = dctx->ddictLocal;
-		dctx->dictUses = ZSTD_use_indefinitely;
-	}
-	return 0;
-}
-
-size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) {
-	return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
-}
-
-size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) {
-	return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
-}
-
-size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) {
-	FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), "");
-	dctx->dictUses = ZSTD_use_once;
-	return 0;
-}
-
-size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize) {
-	return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent);
-}
-
-/* ZSTD_initDStream_usingDict() :
- * return : expected size, aka ZSTD_startingInputLength().
- * this function cannot fail */
-size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize) {
-	DEBUGLOG(4, "ZSTD_initDStream_usingDict");
-	FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary(zds, dict, dictSize), "");
-	return ZSTD_startingInputLength(zds->format);
-}
-
-/* note : this variant can't fail */
-size_t ZSTD_initDStream(ZSTD_DStream* zds) {
-	DEBUGLOG(4, "ZSTD_initDStream");
-	FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
-	return ZSTD_startingInputLength(zds->format);
-}
-
-/* ZSTD_initDStream_usingDDict() :
- * ddict will just be referenced, and must outlive decompression session
- * this function cannot fail */
-size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) {
-	DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
-	FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
-	FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(dctx, ddict), "");
-	return ZSTD_startingInputLength(dctx->format);
-}
-
-/* ZSTD_resetDStream() :
- * return : expected size, aka ZSTD_startingInputLength().
- * this function cannot fail */
-size_t ZSTD_resetDStream(ZSTD_DStream* dctx) {
-	DEBUGLOG(4, "ZSTD_resetDStream");
-	FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
-	return ZSTD_startingInputLength(dctx->format);
-}
-
-size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) {
-	RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
-	ZSTD_clearDict(dctx);
-	if(ddict) {
-		dctx->ddict = ddict;
-		dctx->dictUses = ZSTD_use_indefinitely;
-		if(dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts) {
-			if(dctx->ddictSet == NULL) {
-				dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem);
-				if(!dctx->ddictSet) {
-					RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!");
-				}
-			}
-			assert(!dctx->staticSize); /* Impossible: ddictSet cannot have been allocated if static dctx */
-			FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), "");
-		}
-	}
-	return 0;
-}
-
-/* ZSTD_DCtx_setMaxWindowSize() :
- * note : no direct equivalence in ZSTD_DCtx_setParameter,
- * since this version sets windowSize, and the other sets windowLog */
-size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) {
-	ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax);
-	size_t const min = (size_t)1 << bounds.lowerBound;
-	size_t const max = (size_t)1 << bounds.upperBound;
-	RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
-	RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, "");
-	RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, "");
-	dctx->maxWindowSize = maxWindowSize;
-	return 0;
-}
-
-size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format) {
-	return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, (int)format);
-}
-
-ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) {
-	ZSTD_bounds bounds = {0, 0, 0};
-	switch(dParam) {
-	case ZSTD_d_windowLogMax:
-		bounds.lowerBound = ZSTD_WINDOWLOG_ABSOLUTEMIN;
-		bounds.upperBound = ZSTD_WINDOWLOG_MAX;
-		return bounds;
-	case ZSTD_d_format:
-		bounds.lowerBound = (int)ZSTD_f_zstd1;
-		bounds.upperBound = (int)ZSTD_f_zstd1_magicless;
-		ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
-		return bounds;
-	case ZSTD_d_stableOutBuffer:
-		bounds.lowerBound = (int)ZSTD_bm_buffered;
-		bounds.upperBound = (int)ZSTD_bm_stable;
-		return bounds;
-	case ZSTD_d_forceIgnoreChecksum:
-		bounds.lowerBound = (int)ZSTD_d_validateChecksum;
-		bounds.upperBound = (int)ZSTD_d_ignoreChecksum;
-		return bounds;
-	case ZSTD_d_refMultipleDDicts:
-		bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
-		bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
-		return bounds;
-	case ZSTD_d_disableHuffmanAssembly:
-		bounds.lowerBound = 0;
-		bounds.upperBound = 1;
-		return bounds;
-
-	default:;
-	}
-	bounds.error = ERROR(parameter_unsupported);
-	return bounds;
-}
-
-/* ZSTD_dParam_withinBounds:
- * @return 1 if value is within dParam bounds,
- * 0 otherwise */
-static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value) {
-	ZSTD_bounds const bounds = ZSTD_dParam_getBounds(dParam);
-	if(ZSTD_isError(bounds.error))
-		return 0;
-	if(value < bounds.lowerBound)
-		return 0;
-	if(value > bounds.upperBound)
-		return 0;
-	return 1;
-}
-
-#define CHECK_DBOUNDS(p, v)                                                         \
-	{                                                                               \
-		RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \
-	}
-
-size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value) {
-	switch(param) {
-	case ZSTD_d_windowLogMax:
-		*value = (int)ZSTD_highbit32((U32)dctx->maxWindowSize);
-		return 0;
-	case ZSTD_d_format:
-		*value = (int)dctx->format;
-		return 0;
-	case ZSTD_d_stableOutBuffer:
-		*value = (int)dctx->outBufferMode;
-		return 0;
-	case ZSTD_d_forceIgnoreChecksum:
-		*value = (int)dctx->forceIgnoreChecksum;
-		return 0;
-	case ZSTD_d_refMultipleDDicts:
-		*value = (int)dctx->refMultipleDDicts;
-		return 0;
-	case ZSTD_d_disableHuffmanAssembly:
-		*value = (int)dctx->disableHufAsm;
-		return 0;
-	default:;
-	}
-	RETURN_ERROR(parameter_unsupported, "");
-}
-
-size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value) {
-	RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
-	switch(dParam) {
-	case ZSTD_d_windowLogMax:
-		if(value == 0)
-			value = ZSTD_WINDOWLOG_LIMIT_DEFAULT;
-		CHECK_DBOUNDS(ZSTD_d_windowLogMax, value);
-		dctx->maxWindowSize = ((size_t)1) << value;
-		return 0;
-	case ZSTD_d_format:
-		CHECK_DBOUNDS(ZSTD_d_format, value);
-		dctx->format = (ZSTD_format_e)value;
-		return 0;
-	case ZSTD_d_stableOutBuffer:
-		CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value);
-		dctx->outBufferMode = (ZSTD_bufferMode_e)value;
-		return 0;
-	case ZSTD_d_forceIgnoreChecksum:
-		CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value);
-		dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value;
-		return 0;
-	case ZSTD_d_refMultipleDDicts:
-		CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value);
-		if(dctx->staticSize != 0) {
-			RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!");
-		}
-		dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
-		return 0;
-	case ZSTD_d_disableHuffmanAssembly:
-		CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
-		dctx->disableHufAsm = value != 0;
-		return 0;
-	default:;
-	}
-	RETURN_ERROR(parameter_unsupported, "");
-}
-
-size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) {
-	if((reset == ZSTD_reset_session_only) || (reset == ZSTD_reset_session_and_parameters)) {
-		dctx->streamStage = zdss_init;
-		dctx->noForwardProgress = 0;
-	}
-	if((reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters)) {
-		RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
-		ZSTD_clearDict(dctx);
-		ZSTD_DCtx_resetParameters(dctx);
-	}
-	return 0;
-}
-
-size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) {
-	return ZSTD_sizeof_DCtx(dctx);
-}
-
-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) {
-	size_t const blockSize = (size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
-	/* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/
-	unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2);
-	unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
-	size_t const minRBSize = (size_t)neededSize;
-	RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
-	                frameParameter_windowTooLarge, "");
-	return minRBSize;
-}
-
-size_t ZSTD_estimateDStreamSize(size_t windowSize) {
-	size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
-	size_t const inBuffSize = blockSize; /* no block can be larger */
-	size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN);
-	return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize;
-}
-
-size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) {
-	U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */
-	ZSTD_frameHeader zfh;
-	size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize);
-	if(ZSTD_isError(err))
-		return err;
-	RETURN_ERROR_IF(err > 0, srcSize_wrong, "");
-	RETURN_ERROR_IF(zfh.windowSize > windowSizeMax,
-	                frameParameter_windowTooLarge, "");
-	return ZSTD_estimateDStreamSize((size_t)zfh.windowSize);
-}
-
-/* *****   Decompression   ***** */
-
-static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) {
-	return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR;
-}
-
-static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) {
-	if(ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize))
-		zds->oversizedDuration++;
-	else
-		zds->oversizedDuration = 0;
-}
-
-static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds) {
-	return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION;
-}
-
-/* Checks that the output buffer hasn't changed if ZSTD_obm_stable is used. */
-static size_t ZSTD_checkOutBuffer(ZSTD_DStream const * zds, ZSTD_outBuffer const * output) {
-	ZSTD_outBuffer const expect = zds->expectedOutBuffer;
-	/* No requirement when ZSTD_obm_stable is not enabled. */
-	if(zds->outBufferMode != ZSTD_bm_stable)
-		return 0;
-	/* Any buffer is allowed in zdss_init, this must be the same for every other call until
-	 * the context is reset.
-	 */
-	if(zds->streamStage == zdss_init)
-		return 0;
-	/* The buffer must match our expectation exactly. */
-	if(expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size)
-		return 0;
-	RETURN_ERROR(dstBuffer_wrong, "ZSTD_d_stableOutBuffer enabled but output differs!");
-}
-
-/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream()
- * and updates the stage and the output buffer state. This call is extracted so it can be
- * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode.
- * NOTE: You must break after calling this function since the streamStage is modified.
- */
-static size_t ZSTD_decompressContinueStream(
-    ZSTD_DStream* zds, char** op, char* oend,
-    void const * src, size_t srcSize) {
-	int const isSkipFrame = ZSTD_isSkipFrame(zds);
-	if(zds->outBufferMode == ZSTD_bm_buffered) {
-		size_t const dstSize = isSkipFrame ? 0 : zds->outBuffSize - zds->outStart;
-		size_t const decodedSize = ZSTD_decompressContinue(zds,
-		                                                   zds->outBuff + zds->outStart, dstSize, src, srcSize);
-		FORWARD_IF_ERROR(decodedSize, "");
-		if(!decodedSize && !isSkipFrame) {
-			zds->streamStage = zdss_read;
-		} else {
-			zds->outEnd = zds->outStart + decodedSize;
-			zds->streamStage = zdss_flush;
-		}
-	} else {
-		/* Write directly into the output buffer */
-		size_t const dstSize = isSkipFrame ? 0 : (size_t)(oend - *op);
-		size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize);
-		FORWARD_IF_ERROR(decodedSize, "");
-		*op += decodedSize;
-		/* Flushing is not needed. */
-		zds->streamStage = zdss_read;
-		assert(*op <= oend);
-		assert(zds->outBufferMode == ZSTD_bm_stable);
-	}
-	return 0;
-}
-
-size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input) {
-	const char* const src = (const char*)input->src;
-	const char* const istart = input->pos != 0 ? src + input->pos : src;
-	const char* const iend = input->size != 0 ? src + input->size : src;
-	const char* ip = istart;
-	char* const dst = (char*)output->dst;
-	char* const ostart = output->pos != 0 ? dst + output->pos : dst;
-	char* const oend = output->size != 0 ? dst + output->size : dst;
-	char* op = ostart;
-	U32 someMoreWork = 1;
-
-	DEBUGLOG(5, "ZSTD_decompressStream");
-	RETURN_ERROR_IF(
-	    input->pos > input->size,
-	    srcSize_wrong,
-	    "forbidden. in: pos: %u   vs size: %u",
-	    (U32)input->pos, (U32)input->size);
-	RETURN_ERROR_IF(
-	    output->pos > output->size,
-	    dstSize_tooSmall,
-	    "forbidden. out: pos: %u   vs size: %u",
-	    (U32)output->pos, (U32)output->size);
-	DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos));
-	FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), "");
-
-	while(someMoreWork) {
-		switch(zds->streamStage) {
-		case zdss_init:
-			DEBUGLOG(5, "stage zdss_init => transparent reset ");
-			zds->streamStage = zdss_loadHeader;
-			zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-			zds->legacyVersion = 0;
-#endif
-			zds->hostageByte = 0;
-			zds->expectedOutBuffer = *output;
-			ZSTD_FALLTHROUGH;
-
-		case zdss_loadHeader:
-			DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip));
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-			if(zds->legacyVersion) {
-				RETURN_ERROR_IF(zds->staticSize, memory_allocation,
-				                "legacy support is incompatible with static dctx");
-				{
-					size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
-					if(hint == 0)
-						zds->streamStage = zdss_init;
-					return hint;
-				}
-			}
-#endif
-			{
-				size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
-				if(zds->refMultipleDDicts && zds->ddictSet) {
-					ZSTD_DCtx_selectFrameDDict(zds);
-				}
-				if(ZSTD_isError(hSize)) {
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-					U32 const legacyVersion = ZSTD_isLegacy(istart, iend - istart);
-					if(legacyVersion) {
-						ZSTD_DDict const * const ddict = ZSTD_getDDict(zds);
-						const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL;
-						size_t const dictSize = ddict ? ZSTD_DDict_dictSize(ddict) : 0;
-						DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion);
-						RETURN_ERROR_IF(zds->staticSize, memory_allocation,
-						                "legacy support is incompatible with static dctx");
-						FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext,
-						                                       zds->previousLegacyVersion, legacyVersion,
-						                                       dict, dictSize),
-						                 "");
-						zds->legacyVersion = zds->previousLegacyVersion = legacyVersion;
-						{
-							size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input);
-							if(hint == 0)
-								zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */
-							return hint;
-						}
-					}
-#endif
-					return hSize; /* error */
-				}
-				if(hSize != 0) {                               /* need more input */
-					size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */
-					size_t const remainingInput = (size_t)(iend - ip);
-					assert(iend >= ip);
-					if(toLoad > remainingInput) { /* not enough input to load full header */
-						if(remainingInput > 0) {
-							ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput);
-							zds->lhSize += remainingInput;
-						}
-						input->pos = input->size;
-						/* check first few bytes */
-						FORWARD_IF_ERROR(
-						    ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
-						    "First few bytes detected incorrect");
-						/* return hint input size */
-						return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */
-					}
-					assert(ip != NULL);
-					ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad);
-					zds->lhSize = hSize;
-					ip += toLoad;
-					break;
-				}
-			}
-
-			/* check for single-pass mode opportunity */
-			if(zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN && zds->fParams.frameType != ZSTD_skippableFrame && (U64)(size_t)(oend - op) >= zds->fParams.frameContentSize) {
-				size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend - istart));
-				if(cSize <= (size_t)(iend - istart)) {
-					/* shortcut : using single-pass mode */
-					size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend - op), istart, cSize, ZSTD_getDDict(zds));
-					if(ZSTD_isError(decompressedSize))
-						return decompressedSize;
-					DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
-					assert(istart != NULL);
-					ip = istart + cSize;
-					op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
-					zds->expected = 0;
-					zds->streamStage = zdss_init;
-					someMoreWork = 0;
-					break;
-				}
-			}
-
-			/* Check output buffer is large enough for ZSTD_odm_stable. */
-			if(zds->outBufferMode == ZSTD_bm_stable && zds->fParams.frameType != ZSTD_skippableFrame && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN && (U64)(size_t)(oend - op) < zds->fParams.frameContentSize) {
-				RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small");
-			}
-
-			/* Consume header (see ZSTDds_decodeFrameHeader) */
-			DEBUGLOG(4, "Consume header");
-			FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), "");
-
-			if((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
-				zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
-				zds->stage = ZSTDds_skipFrame;
-			} else {
-				FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), "");
-				zds->expected = ZSTD_blockHeaderSize;
-				zds->stage = ZSTDds_decodeBlockHeader;
-			}
-
-			/* control buffer memory usage */
-			DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)",
-			         (U32)(zds->fParams.windowSize >> 10),
-			         (U32)(zds->maxWindowSize >> 10));
-			zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
-			RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize,
-			                frameParameter_windowTooLarge, "");
-
-			/* Adapt buffer sizes to frame header instructions */
-			{
-				size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
-				size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered
-				                                     ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize)
-				                                     : 0;
-
-				ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize);
-
-				{
-					int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize);
-					int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds);
-
-					if(tooSmall || tooLarge) {
-						size_t const bufferSize = neededInBuffSize + neededOutBuffSize;
-						DEBUGLOG(4, "inBuff  : from %u to %u",
-						         (U32)zds->inBuffSize, (U32)neededInBuffSize);
-						DEBUGLOG(4, "outBuff : from %u to %u",
-						         (U32)zds->outBuffSize, (U32)neededOutBuffSize);
-						if(zds->staticSize) { /* static DCtx */
-							DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize);
-							assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */
-							RETURN_ERROR_IF(
-							    bufferSize > zds->staticSize - sizeof(ZSTD_DCtx),
-							    memory_allocation, "");
-						} else {
-							ZSTD_customFree(zds->inBuff, zds->customMem);
-							zds->inBuffSize = 0;
-							zds->outBuffSize = 0;
-							zds->inBuff = (char*)ZSTD_customMalloc(bufferSize, zds->customMem);
-							RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, "");
-						}
-						zds->inBuffSize = neededInBuffSize;
-						zds->outBuff = zds->inBuff + zds->inBuffSize;
-						zds->outBuffSize = neededOutBuffSize;
-					}
-				}
-			}
-			zds->streamStage = zdss_read;
-			ZSTD_FALLTHROUGH;
-
-		case zdss_read:
-			DEBUGLOG(5, "stage zdss_read");
-			{
-				size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip));
-				DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize);
-				if(neededInSize == 0) { /* end of frame */
-					zds->streamStage = zdss_init;
-					someMoreWork = 0;
-					break;
-				}
-				if((size_t)(iend - ip) >= neededInSize) { /* decode directly from src */
-					FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
-					assert(ip != NULL);
-					ip += neededInSize;
-					/* Function modifies the stage so we must break */
-					break;
-				}
-			}
-			if(ip == iend) {
-				someMoreWork = 0;
-				break;
-			} /* no more input */
-			zds->streamStage = zdss_load;
-			ZSTD_FALLTHROUGH;
-
-		case zdss_load: {
-			size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
-			size_t const toLoad = neededInSize - zds->inPos;
-			int const isSkipFrame = ZSTD_isSkipFrame(zds);
-			size_t loadedSize;
-			/* At this point we shouldn't be decompressing a block that we can stream. */
-			assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
-			if(isSkipFrame) {
-				loadedSize = MIN(toLoad, (size_t)(iend - ip));
-			} else {
-				RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos,
-				                corruption_detected,
-				                "should never happen");
-				loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend - ip));
-			}
-			if(loadedSize != 0) {
-				/* ip may be NULL */
-				ip += loadedSize;
-				zds->inPos += loadedSize;
-			}
-			if(loadedSize < toLoad) {
-				someMoreWork = 0;
-				break;
-			} /* not enough input, wait for more */
-
-			/* decode loaded input */
-			zds->inPos = 0; /* input is consumed */
-			FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), "");
-			/* Function modifies the stage so we must break */
-			break;
-		}
-		case zdss_flush: {
-			size_t const toFlushSize = zds->outEnd - zds->outStart;
-			size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend - op), zds->outBuff + zds->outStart, toFlushSize);
-
-			op = op ? op + flushedSize : op;
-
-			zds->outStart += flushedSize;
-			if(flushedSize == toFlushSize) { /* flush completed */
-				zds->streamStage = zdss_read;
-				if((zds->outBuffSize < zds->fParams.frameContentSize) && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize)) {
-					DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
-					         (int)(zds->outBuffSize - zds->outStart),
-					         (U32)zds->fParams.blockSizeMax);
-					zds->outStart = zds->outEnd = 0;
-				}
-				break;
-			}
-		}
-			/* cannot complete flush */
-			someMoreWork = 0;
-			break;
-
-		default:
-			assert(0);                                    /* impossible */
-			RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */
-		}
-	}
-
-	/* result */
-	input->pos = (size_t)(ip - (const char*)(input->src));
-	output->pos = (size_t)(op - (char*)(output->dst));
-
-	/* Update the expected output buffer for ZSTD_obm_stable. */
-	zds->expectedOutBuffer = *output;
-
-	if((ip == istart) && (op == ostart)) { /* no forward progress */
-		zds->noForwardProgress++;
-		if(zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
-			RETURN_ERROR_IF(op == oend, noForwardProgress_destFull, "");
-			RETURN_ERROR_IF(ip == iend, noForwardProgress_inputEmpty, "");
-			assert(0);
-		}
-	} else {
-		zds->noForwardProgress = 0;
-	}
-	{
-		size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
-		if(!nextSrcSizeHint) {                 /* frame fully decoded */
-			if(zds->outEnd == zds->outStart) { /* output fully flushed */
-				if(zds->hostageByte) {
-					if(input->pos >= input->size) {
-						/* can't release hostage (not present) */
-						zds->streamStage = zdss_read;
-						return 1;
-					}
-					input->pos++; /* release hostage */
-				}                 /* zds->hostageByte */
-				return 0;
-			}                       /* zds->outEnd == zds->outStart */
-			if(!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
-				input->pos--;       /* note : pos > 0, otherwise, impossible to finish reading last block */
-				zds->hostageByte = 1;
-			}
-			return 1;
-		}                                                                                     /* nextSrcSizeHint==0 */
-		nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */
-		assert(zds->inPos <= nextSrcSizeHint);
-		nextSrcSizeHint -= zds->inPos; /* part already loaded*/
-		return nextSrcSizeHint;
-	}
-}
-
-size_t ZSTD_decompressStream_simpleArgs(
-    ZSTD_DCtx* dctx,
-    void* dst, size_t dstCapacity, size_t* dstPos,
-    const void* src, size_t srcSize, size_t* srcPos) {
-	ZSTD_outBuffer output;
-	ZSTD_inBuffer input;
-	output.dst = dst;
-	output.size = dstCapacity;
-	output.pos = *dstPos;
-	input.src = src;
-	input.size = srcSize;
-	input.pos = *srcPos;
-	{
-		size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
-		*dstPos = output.pos;
-		*srcPos = input.pos;
-		return cErr;
-	}
-}
diff --git a/src/zstd/zstd_decompress_block.c b/src/zstd/zstd_decompress_block.c
deleted file mode 100644
index fae6a08d0..000000000
--- a/src/zstd/zstd_decompress_block.c
+++ /dev/null
@@ -1,2275 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* zstd_decompress_block :
- * this module takes care of decompressing _compressed_ block */
-
-/*-*******************************************************
- *  Dependencies
- *********************************************************/
-#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
-#include "compiler.h"  /* prefetch */
-#include "cpu.h"       /* bmi2 */
-#include "mem.h"       /* low level memory routines */
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-#include "huf.h"
-#include "zstd_internal.h"
-#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
-#include "zstd_ddict.h"               /* ZSTD_DDictDictContent */
-#include "zstd_decompress_block.h"
-#include "bits.h" /* ZSTD_highbit32 */
-
-/*_*******************************************************
- *  Macros
- **********************************************************/
-
-/* These two optional macros force the use one way or another of the two
- * ZSTD_decompressSequences implementations. You can't force in both directions
- * at the same time.
- */
-#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
-#endif
-
-/*_*******************************************************
- *  Memory operations
- **********************************************************/
-static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
-
-/*-*************************************************************
- *   Block decoding
- ***************************************************************/
-
-/*! ZSTD_getcBlockSize() :
- *  Provides the size of compressed block from block header `src` */
-size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
-                          blockProperties_t* bpPtr) {
-	RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");
-
-	{
-		U32 const cBlockHeader = MEM_readLE24(src);
-		U32 const cSize = cBlockHeader >> 3;
-		bpPtr->lastBlock = cBlockHeader & 1;
-		bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
-		bpPtr->origSize = cSize; /* only useful for RLE */
-		if(bpPtr->blockType == bt_rle)
-			return 1;
-		RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
-		return cSize;
-	}
-}
-
-/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
-static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
-                                        const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) {
-	if(streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
-		/* room for litbuffer to fit without read faulting */
-		dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
-		dctx->litBufferEnd = dctx->litBuffer + litSize;
-		dctx->litBufferLocation = ZSTD_in_dst;
-	} else if(litSize > ZSTD_LITBUFFEREXTRASIZE) {
-		/* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
-		if(splitImmediately) {
-			/* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
-			dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
-			dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
-		} else {
-			/* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
-			dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
-			dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
-		}
-		dctx->litBufferLocation = ZSTD_split;
-	} else {
-		/* fits entirely within litExtraBuffer, so no split is necessary */
-		dctx->litBuffer = dctx->litExtraBuffer;
-		dctx->litBufferEnd = dctx->litBuffer + litSize;
-		dctx->litBufferLocation = ZSTD_not_in_dst;
-	}
-}
-
-/* Hidden declaration for fullbench */
-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
-                                const void* src, size_t srcSize,
-                                void* dst, size_t dstCapacity, const streaming_operation streaming);
-/*! ZSTD_decodeLiteralsBlock() :
- * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
- * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
- * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
- * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
- *
- * @return : nb of bytes read from src (< srcSize )
- *  note : symbol not declared but exposed for fullbench */
-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
-                                const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */
-                                void* dst, size_t dstCapacity, const streaming_operation streaming) {
-	DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
-	RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
-
-	{
-		const BYTE* const istart = (const BYTE*)src;
-		symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
-
-		switch(litEncType) {
-		case set_repeat:
-			DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
-			RETURN_ERROR_IF(dctx->litEntropy == 0, dictionary_corrupted, "");
-			ZSTD_FALLTHROUGH;
-
-		case set_compressed:
-			RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
-			{
-				size_t lhSize, litSize, litCSize;
-				U32 singleStream = 0;
-				U32 const lhlCode = (istart[0] >> 2) & 3;
-				U32 const lhc = MEM_readLE32(istart);
-				size_t hufSuccess;
-				size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
-				int const flags = 0 | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
-				switch(lhlCode) {
-				case 0:
-				case 1:
-				default: /* note : default is impossible, since lhlCode into [0..3] */
-					/* 2 - 2 - 10 - 10 */
-					singleStream = !lhlCode;
-					lhSize = 3;
-					litSize = (lhc >> 4) & 0x3FF;
-					litCSize = (lhc >> 14) & 0x3FF;
-					break;
-				case 2:
-					/* 2 - 2 - 14 - 14 */
-					lhSize = 4;
-					litSize = (lhc >> 4) & 0x3FFF;
-					litCSize = lhc >> 18;
-					break;
-				case 3:
-					/* 2 - 2 - 18 - 18 */
-					lhSize = 5;
-					litSize = (lhc >> 4) & 0x3FFFF;
-					litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
-					break;
-				}
-				RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
-				RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
-				if(!singleStream)
-					RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
-					                "Not enough literals (%zu) for the 4-streams mode (min %u)",
-					                litSize, MIN_LITERALS_FOR_4_STREAMS);
-				RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
-				RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
-				ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
-
-				/* prefetch huffman table if cold */
-				if(dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
-					PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
-				}
-
-				if(litEncType == set_repeat) {
-					if(singleStream) {
-						hufSuccess = HUF_decompress1X_usingDTable(
-						    dctx->litBuffer, litSize, istart + lhSize, litCSize,
-						    dctx->HUFptr, flags);
-					} else {
-						assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
-						hufSuccess = HUF_decompress4X_usingDTable(
-						    dctx->litBuffer, litSize, istart + lhSize, litCSize,
-						    dctx->HUFptr, flags);
-					}
-				} else {
-					if(singleStream) {
-#if defined(HUF_FORCE_DECOMPRESS_X2)
-						hufSuccess = HUF_decompress1X_DCtx_wksp(
-						    dctx->entropy.hufTable, dctx->litBuffer, litSize,
-						    istart + lhSize, litCSize, dctx->workspace,
-						    sizeof(dctx->workspace), flags);
-#else
-						hufSuccess = HUF_decompress1X1_DCtx_wksp(
-						    dctx->entropy.hufTable, dctx->litBuffer, litSize,
-						    istart + lhSize, litCSize, dctx->workspace,
-						    sizeof(dctx->workspace), flags);
-#endif
-					} else {
-						hufSuccess = HUF_decompress4X_hufOnly_wksp(
-						    dctx->entropy.hufTable, dctx->litBuffer, litSize,
-						    istart + lhSize, litCSize, dctx->workspace,
-						    sizeof(dctx->workspace), flags);
-					}
-				}
-				if(dctx->litBufferLocation == ZSTD_split) {
-					ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
-					ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
-					dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
-					dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
-				}
-
-				RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
-
-				dctx->litPtr = dctx->litBuffer;
-				dctx->litSize = litSize;
-				dctx->litEntropy = 1;
-				if(litEncType == set_compressed)
-					dctx->HUFptr = dctx->entropy.hufTable;
-				return litCSize + lhSize;
-			}
-
-		case set_basic: {
-			size_t litSize, lhSize;
-			U32 const lhlCode = ((istart[0]) >> 2) & 3;
-			size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
-			switch(lhlCode) {
-			case 0:
-			case 2:
-			default: /* note : default is impossible, since lhlCode into [0..3] */
-				lhSize = 1;
-				litSize = istart[0] >> 3;
-				break;
-			case 1:
-				lhSize = 2;
-				litSize = MEM_readLE16(istart) >> 4;
-				break;
-			case 3:
-				lhSize = 3;
-				RETURN_ERROR_IF(srcSize < 3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
-				litSize = MEM_readLE24(istart) >> 4;
-				break;
-			}
-
-			RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
-			RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
-			ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
-			if(lhSize + litSize + WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
-				RETURN_ERROR_IF(litSize + lhSize > srcSize, corruption_detected, "");
-				if(dctx->litBufferLocation == ZSTD_split) {
-					ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
-					ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
-				} else {
-					ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
-				}
-				dctx->litPtr = dctx->litBuffer;
-				dctx->litSize = litSize;
-				return lhSize + litSize;
-			}
-			/* direct reference into compressed stream */
-			dctx->litPtr = istart + lhSize;
-			dctx->litSize = litSize;
-			dctx->litBufferEnd = dctx->litPtr + litSize;
-			dctx->litBufferLocation = ZSTD_not_in_dst;
-			return lhSize + litSize;
-		}
-
-		case set_rle: {
-			U32 const lhlCode = ((istart[0]) >> 2) & 3;
-			size_t litSize, lhSize;
-			size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
-			switch(lhlCode) {
-			case 0:
-			case 2:
-			default: /* note : default is impossible, since lhlCode into [0..3] */
-				lhSize = 1;
-				litSize = istart[0] >> 3;
-				break;
-			case 1:
-				lhSize = 2;
-				RETURN_ERROR_IF(srcSize < 3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
-				litSize = MEM_readLE16(istart) >> 4;
-				break;
-			case 3:
-				lhSize = 3;
-				RETURN_ERROR_IF(srcSize < 4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
-				litSize = MEM_readLE24(istart) >> 4;
-				break;
-			}
-			RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
-			RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
-			RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
-			ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
-			if(dctx->litBufferLocation == ZSTD_split) {
-				ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
-				ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
-			} else {
-				ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
-			}
-			dctx->litPtr = dctx->litBuffer;
-			dctx->litSize = litSize;
-			return lhSize + 1;
-		}
-		default:
-			RETURN_ERROR(corruption_detected, "impossible");
-		}
-	}
-}
-
-/* Default FSE distribution tables.
- * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
- * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
- * They were generated programmatically with following method :
- * - start from default distributions, present in /lib/common/zstd_internal.h
- * - generate tables normally, using ZSTD_buildFSETable()
- * - printout the content of tables
- * - pretify output, report below, test with fuzzer to ensure it's correct */
-
-/* Default FSE distribution table for Literal Lengths */
-static const ZSTD_seqSymbol LL_defaultDTable[(1 << LL_DEFAULTNORMLOG) + 1] = {
-    {1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
-    /* nextState, nbAddBits, nbBits, baseVal */
-    {0, 0, 4, 0},
-    {16, 0, 4, 0},
-    {32, 0, 5, 1},
-    {0, 0, 5, 3},
-    {0, 0, 5, 4},
-    {0, 0, 5, 6},
-    {0, 0, 5, 7},
-    {0, 0, 5, 9},
-    {0, 0, 5, 10},
-    {0, 0, 5, 12},
-    {0, 0, 6, 14},
-    {0, 1, 5, 16},
-    {0, 1, 5, 20},
-    {0, 1, 5, 22},
-    {0, 2, 5, 28},
-    {0, 3, 5, 32},
-    {0, 4, 5, 48},
-    {32, 6, 5, 64},
-    {0, 7, 5, 128},
-    {0, 8, 6, 256},
-    {0, 10, 6, 1024},
-    {0, 12, 6, 4096},
-    {32, 0, 4, 0},
-    {0, 0, 4, 1},
-    {0, 0, 5, 2},
-    {32, 0, 5, 4},
-    {0, 0, 5, 5},
-    {32, 0, 5, 7},
-    {0, 0, 5, 8},
-    {32, 0, 5, 10},
-    {0, 0, 5, 11},
-    {0, 0, 6, 13},
-    {32, 1, 5, 16},
-    {0, 1, 5, 18},
-    {32, 1, 5, 22},
-    {0, 2, 5, 24},
-    {32, 3, 5, 32},
-    {0, 3, 5, 40},
-    {0, 6, 4, 64},
-    {16, 6, 4, 64},
-    {32, 7, 5, 128},
-    {0, 9, 6, 512},
-    {0, 11, 6, 2048},
-    {48, 0, 4, 0},
-    {16, 0, 4, 1},
-    {32, 0, 5, 2},
-    {32, 0, 5, 3},
-    {32, 0, 5, 5},
-    {32, 0, 5, 6},
-    {32, 0, 5, 8},
-    {32, 0, 5, 9},
-    {32, 0, 5, 11},
-    {32, 0, 5, 12},
-    {0, 0, 6, 15},
-    {32, 1, 5, 18},
-    {32, 1, 5, 20},
-    {32, 2, 5, 24},
-    {32, 2, 5, 28},
-    {32, 3, 5, 40},
-    {32, 4, 5, 48},
-    {0, 16, 6, 65536},
-    {0, 15, 6, 32768},
-    {0, 14, 6, 16384},
-    {0, 13, 6, 8192},
-}; /* LL_defaultDTable */
-
-/* Default FSE distribution table for Offset Codes */
-static const ZSTD_seqSymbol OF_defaultDTable[(1 << OF_DEFAULTNORMLOG) + 1] = {
-    {1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
-    /* nextState, nbAddBits, nbBits, baseVal */
-    {0, 0, 5, 0},
-    {0, 6, 4, 61},
-    {0, 9, 5, 509},
-    {0, 15, 5, 32765},
-    {0, 21, 5, 2097149},
-    {0, 3, 5, 5},
-    {0, 7, 4, 125},
-    {0, 12, 5, 4093},
-    {0, 18, 5, 262141},
-    {0, 23, 5, 8388605},
-    {0, 5, 5, 29},
-    {0, 8, 4, 253},
-    {0, 14, 5, 16381},
-    {0, 20, 5, 1048573},
-    {0, 2, 5, 1},
-    {16, 7, 4, 125},
-    {0, 11, 5, 2045},
-    {0, 17, 5, 131069},
-    {0, 22, 5, 4194301},
-    {0, 4, 5, 13},
-    {16, 8, 4, 253},
-    {0, 13, 5, 8189},
-    {0, 19, 5, 524285},
-    {0, 1, 5, 1},
-    {16, 6, 4, 61},
-    {0, 10, 5, 1021},
-    {0, 16, 5, 65533},
-    {0, 28, 5, 268435453},
-    {0, 27, 5, 134217725},
-    {0, 26, 5, 67108861},
-    {0, 25, 5, 33554429},
-    {0, 24, 5, 16777213},
-}; /* OF_defaultDTable */
-
-/* Default FSE distribution table for Match Lengths */
-static const ZSTD_seqSymbol ML_defaultDTable[(1 << ML_DEFAULTNORMLOG) + 1] = {
-    {1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
-    /* nextState, nbAddBits, nbBits, baseVal */
-    {0, 0, 6, 3},
-    {0, 0, 4, 4},
-    {32, 0, 5, 5},
-    {0, 0, 5, 6},
-    {0, 0, 5, 8},
-    {0, 0, 5, 9},
-    {0, 0, 5, 11},
-    {0, 0, 6, 13},
-    {0, 0, 6, 16},
-    {0, 0, 6, 19},
-    {0, 0, 6, 22},
-    {0, 0, 6, 25},
-    {0, 0, 6, 28},
-    {0, 0, 6, 31},
-    {0, 0, 6, 34},
-    {0, 1, 6, 37},
-    {0, 1, 6, 41},
-    {0, 2, 6, 47},
-    {0, 3, 6, 59},
-    {0, 4, 6, 83},
-    {0, 7, 6, 131},
-    {0, 9, 6, 515},
-    {16, 0, 4, 4},
-    {0, 0, 4, 5},
-    {32, 0, 5, 6},
-    {0, 0, 5, 7},
-    {32, 0, 5, 9},
-    {0, 0, 5, 10},
-    {0, 0, 6, 12},
-    {0, 0, 6, 15},
-    {0, 0, 6, 18},
-    {0, 0, 6, 21},
-    {0, 0, 6, 24},
-    {0, 0, 6, 27},
-    {0, 0, 6, 30},
-    {0, 0, 6, 33},
-    {0, 1, 6, 35},
-    {0, 1, 6, 39},
-    {0, 2, 6, 43},
-    {0, 3, 6, 51},
-    {0, 4, 6, 67},
-    {0, 5, 6, 99},
-    {0, 8, 6, 259},
-    {32, 0, 4, 4},
-    {48, 0, 4, 4},
-    {16, 0, 4, 5},
-    {32, 0, 5, 7},
-    {32, 0, 5, 8},
-    {32, 0, 5, 10},
-    {32, 0, 5, 11},
-    {0, 0, 6, 14},
-    {0, 0, 6, 17},
-    {0, 0, 6, 20},
-    {0, 0, 6, 23},
-    {0, 0, 6, 26},
-    {0, 0, 6, 29},
-    {0, 0, 6, 32},
-    {0, 16, 6, 65539},
-    {0, 15, 6, 32771},
-    {0, 14, 6, 16387},
-    {0, 13, 6, 8195},
-    {0, 12, 6, 4099},
-    {0, 11, 6, 2051},
-    {0, 10, 6, 1027},
-}; /* ML_defaultDTable */
-
-static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits) {
-	void* ptr = dt;
-	ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
-	ZSTD_seqSymbol* const cell = dt + 1;
-
-	DTableH->tableLog = 0;
-	DTableH->fastMode = 0;
-
-	cell->nbBits = 0;
-	cell->nextState = 0;
-	assert(nbAddBits < 255);
-	cell->nbAdditionalBits = nbAddBits;
-	cell->baseValue = baseValue;
-}
-
-/* ZSTD_buildFSETable() :
- * generate FSE decoding table for one symbol (ll, ml or off)
- * cannot fail if input is valid =>
- * all inputs are presumed validated at this stage */
-FORCE_INLINE_TEMPLATE
-void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
-                             const short* normalizedCounter, unsigned maxSymbolValue,
-                             const U32* baseValue, const U8* nbAdditionalBits,
-                             unsigned tableLog, void* wksp, size_t wkspSize) {
-	ZSTD_seqSymbol* const tableDecode = dt + 1;
-	U32 const maxSV1 = maxSymbolValue + 1;
-	U32 const tableSize = 1 << tableLog;
-
-	U16* symbolNext = (U16*)wksp;
-	BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
-	U32 highThreshold = tableSize - 1;
-
-	/* Sanity Checks */
-	assert(maxSymbolValue <= MaxSeq);
-	assert(tableLog <= MaxFSELog);
-	assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
-	(void)wkspSize;
-	/* Init, lay down lowprob symbols */
-	{
-		ZSTD_seqSymbol_header DTableH;
-		DTableH.tableLog = tableLog;
-		DTableH.fastMode = 1;
-		{
-			S16 const largeLimit = (S16)(1 << (tableLog - 1));
-			U32 s;
-			for(s = 0; s < maxSV1; s++) {
-				if(normalizedCounter[s] == -1) {
-					tableDecode[highThreshold--].baseValue = s;
-					symbolNext[s] = 1;
-				} else {
-					if(normalizedCounter[s] >= largeLimit)
-						DTableH.fastMode = 0;
-					assert(normalizedCounter[s] >= 0);
-					symbolNext[s] = (U16)normalizedCounter[s];
-				}
-			}
-		}
-		ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
-	}
-
-	/* Spread symbols */
-	assert(tableSize <= 512);
-	/* Specialized symbol spreading for the case when there are
-	 * no low probability (-1 count) symbols. When compressing
-	 * small blocks we avoid low probability symbols to hit this
-	 * case, since header decoding speed matters more.
-	 */
-	if(highThreshold == tableSize - 1) {
-		size_t const tableMask = tableSize - 1;
-		size_t const step = FSE_TABLESTEP(tableSize);
-		/* First lay down the symbols in order.
-		 * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
-		 * misses since small blocks generally have small table logs, so nearly
-		 * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
-		 * our buffer to handle the over-write.
-		 */
-		{
-			U64 const add = 0x0101010101010101ull;
-			size_t pos = 0;
-			U64 sv = 0;
-			U32 s;
-			for(s = 0; s < maxSV1; ++s, sv += add) {
-				int i;
-				int const n = normalizedCounter[s];
-				MEM_write64(spread + pos, sv);
-				for(i = 8; i < n; i += 8) {
-					MEM_write64(spread + pos + i, sv);
-				}
-				assert(n >= 0);
-				pos += (size_t)n;
-			}
-		}
-		/* Now we spread those positions across the table.
-		 * The benefit of doing it in two stages is that we avoid the
-		 * variable size inner loop, which caused lots of branch misses.
-		 * Now we can run through all the positions without any branch misses.
-		 * We unroll the loop twice, since that is what empirically worked best.
-		 */
-		{
-			size_t position = 0;
-			size_t s;
-			size_t const unroll = 2;
-			assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
-			for(s = 0; s < (size_t)tableSize; s += unroll) {
-				size_t u;
-				for(u = 0; u < unroll; ++u) {
-					size_t const uPosition = (position + (u * step)) & tableMask;
-					tableDecode[uPosition].baseValue = spread[s + u];
-				}
-				position = (position + (unroll * step)) & tableMask;
-			}
-			assert(position == 0);
-		}
-	} else {
-		U32 const tableMask = tableSize - 1;
-		U32 const step = FSE_TABLESTEP(tableSize);
-		U32 s, position = 0;
-		for(s = 0; s < maxSV1; s++) {
-			int i;
-			int const n = normalizedCounter[s];
-			for(i = 0; i < n; i++) {
-				tableDecode[position].baseValue = s;
-				position = (position + step) & tableMask;
-				while(UNLIKELY(position > highThreshold))
-					position = (position + step) & tableMask; /* lowprob area */
-			}
-		}
-		assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
-	}
-
-	/* Build Decoding table */
-	{
-		U32 u;
-		for(u = 0; u < tableSize; u++) {
-			U32 const symbol = tableDecode[u].baseValue;
-			U32 const nextState = symbolNext[symbol]++;
-			tableDecode[u].nbBits = (BYTE)(tableLog - ZSTD_highbit32(nextState));
-			tableDecode[u].nextState = (U16)((nextState << tableDecode[u].nbBits) - tableSize);
-			assert(nbAdditionalBits[symbol] < 255);
-			tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
-			tableDecode[u].baseValue = baseValue[symbol];
-		}
-	}
-}
-
-/* Avoids the FORCE_INLINE of the _body() function. */
-static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
-                                            const short* normalizedCounter, unsigned maxSymbolValue,
-                                            const U32* baseValue, const U8* nbAdditionalBits,
-                                            unsigned tableLog, void* wksp, size_t wkspSize) {
-	ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
-	                        baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
-}
-
-#if DYNAMIC_BMI2
-BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
-                                                               const short* normalizedCounter, unsigned maxSymbolValue,
-                                                               const U32* baseValue, const U8* nbAdditionalBits,
-                                                               unsigned tableLog, void* wksp, size_t wkspSize) {
-	ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
-	                        baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
-}
-#endif
-
-void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
-                        const short* normalizedCounter, unsigned maxSymbolValue,
-                        const U32* baseValue, const U8* nbAdditionalBits,
-                        unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) {
-#if DYNAMIC_BMI2
-	if(bmi2) {
-		ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
-		                             baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
-		return;
-	}
-#endif
-	(void)bmi2;
-	ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
-	                                baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
-}
-
-/*! ZSTD_buildSeqTable() :
- * @return : nb bytes read from src,
- *           or an error code if it fails */
-static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
-                                 symbolEncodingType_e type, unsigned max, U32 maxLog,
-                                 const void* src, size_t srcSize,
-                                 const U32* baseValue, const U8* nbAdditionalBits,
-                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
-                                 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
-                                 int bmi2) {
-	switch(type) {
-	case set_rle:
-		RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
-		RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
-		{
-			U32 const symbol = *(const BYTE*)src;
-			U32 const baseline = baseValue[symbol];
-			U8 const nbBits = nbAdditionalBits[symbol];
-			ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
-		}
-		*DTablePtr = DTableSpace;
-		return 1;
-	case set_basic:
-		*DTablePtr = defaultTable;
-		return 0;
-	case set_repeat:
-		RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
-		/* prefetch FSE table if used */
-		if(ddictIsCold && (nbSeq > 24 /* heuristic */)) {
-			const void* const pStart = *DTablePtr;
-			size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
-			PREFETCH_AREA(pStart, pSize);
-		}
-		return 0;
-	case set_compressed: {
-		unsigned tableLog;
-		S16 norm[MaxSeq + 1];
-		size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
-		RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
-		RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
-		ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
-		*DTablePtr = DTableSpace;
-		return headerSize;
-	}
-	default:
-		assert(0);
-		RETURN_ERROR(GENERIC, "impossible");
-	}
-}
-
-size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
-                             const void* src, size_t srcSize) {
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* ip = istart;
-	int nbSeq;
-	DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
-
-	/* check */
-	RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");
-
-	/* SeqHead */
-	nbSeq = *ip++;
-	if(!nbSeq) {
-		*nbSeqPtr = 0;
-		RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
-		return 1;
-	}
-	if(nbSeq > 0x7F) {
-		if(nbSeq == 0xFF) {
-			RETURN_ERROR_IF(ip + 2 > iend, srcSize_wrong, "");
-			nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
-			ip += 2;
-		} else {
-			RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
-			nbSeq = ((nbSeq - 0x80) << 8) + *ip++;
-		}
-	}
-	*nbSeqPtr = nbSeq;
-
-	/* FSE table descriptors */
-	RETURN_ERROR_IF(ip + 1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
-	{
-		symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
-		symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
-		symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
-		ip++;
-
-		/* Build DTables */
-		{
-			size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
-			                                          LLtype, MaxLL, LLFSELog,
-			                                          ip, iend - ip,
-			                                          LL_base, LL_bits,
-			                                          LL_defaultDTable, dctx->fseEntropy,
-			                                          dctx->ddictIsCold, nbSeq,
-			                                          dctx->workspace, sizeof(dctx->workspace),
-			                                          ZSTD_DCtx_get_bmi2(dctx));
-			RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
-			ip += llhSize;
-		}
-
-		{
-			size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
-			                                          OFtype, MaxOff, OffFSELog,
-			                                          ip, iend - ip,
-			                                          OF_base, OF_bits,
-			                                          OF_defaultDTable, dctx->fseEntropy,
-			                                          dctx->ddictIsCold, nbSeq,
-			                                          dctx->workspace, sizeof(dctx->workspace),
-			                                          ZSTD_DCtx_get_bmi2(dctx));
-			RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
-			ip += ofhSize;
-		}
-
-		{
-			size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
-			                                          MLtype, MaxML, MLFSELog,
-			                                          ip, iend - ip,
-			                                          ML_base, ML_bits,
-			                                          ML_defaultDTable, dctx->fseEntropy,
-			                                          dctx->ddictIsCold, nbSeq,
-			                                          dctx->workspace, sizeof(dctx->workspace),
-			                                          ZSTD_DCtx_get_bmi2(dctx));
-			RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
-			ip += mlhSize;
-		}
-	}
-
-	return ip - istart;
-}
-
-typedef struct {
-	size_t litLength;
-	size_t matchLength;
-	size_t offset;
-} seq_t;
-
-typedef struct {
-	size_t state;
-	const ZSTD_seqSymbol* table;
-} ZSTD_fseState;
-
-typedef struct {
-	BIT_DStream_t DStream;
-	ZSTD_fseState stateLL;
-	ZSTD_fseState stateOffb;
-	ZSTD_fseState stateML;
-	size_t prevOffset[ZSTD_REP_NUM];
-} seqState_t;
-
-/*! ZSTD_overlapCopy8() :
- *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
- *  If the offset is < 8 then the offset is spread to at least 8 bytes.
- *
- *  Precondition: *ip <= *op
- *  Postcondition: *op - *op >= 8
- */
-HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const ** ip, size_t offset) {
-	assert(*ip <= *op);
-	if(offset < 8) {
-		/* close range match, overlap */
-		static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
-		static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */
-		int const sub2 = dec64table[offset];
-		(*op)[0] = (*ip)[0];
-		(*op)[1] = (*ip)[1];
-		(*op)[2] = (*ip)[2];
-		(*op)[3] = (*ip)[3];
-		*ip += dec32table[offset];
-		ZSTD_copy4(*op + 4, *ip);
-		*ip -= sub2;
-	} else {
-		ZSTD_copy8(*op, *ip);
-	}
-	*ip += 8;
-	*op += 8;
-	assert(*op - *ip >= 8);
-}
-
-/*! ZSTD_safecopy() :
- *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
- *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
- *  This function is only called in the uncommon case where the sequence is near the end of the block. It
- *  should be fast for a single long sequence, but can be slow for several short sequences.
- *
- *  @param ovtype controls the overlap detection
- *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
- *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
- *           The src buffer must be before the dst buffer.
- */
-static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const * ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
-	ptrdiff_t const diff = op - ip;
-	BYTE* const oend = op + length;
-
-	assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
-	       (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
-
-	if(length < 8) {
-		/* Handle short lengths. */
-		while(op < oend)
-			*op++ = *ip++;
-		return;
-	}
-	if(ovtype == ZSTD_overlap_src_before_dst) {
-		/* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
-		assert(length >= 8);
-		ZSTD_overlapCopy8(&op, &ip, diff);
-		length -= 8;
-		assert(op - ip >= 8);
-		assert(op <= oend);
-	}
-
-	if(oend <= oend_w) {
-		/* No risk of overwrite. */
-		ZSTD_wildcopy(op, ip, length, ovtype);
-		return;
-	}
-	if(op <= oend_w) {
-		/* Wildcopy until we get close to the end. */
-		assert(oend > oend_w);
-		ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
-		ip += oend_w - op;
-		op += oend_w - op;
-	}
-	/* Handle the leftovers. */
-	while(op < oend)
-		*op++ = *ip++;
-}
-
-/* ZSTD_safecopyDstBeforeSrc():
- * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
- * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const * ip, ptrdiff_t length) {
-	ptrdiff_t const diff = op - ip;
-	BYTE* const oend = op + length;
-
-	if(length < 8 || diff > -8) {
-		/* Handle short lengths, close overlaps, and dst not before src. */
-		while(op < oend)
-			*op++ = *ip++;
-		return;
-	}
-
-	if(op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
-		ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
-		ip += oend - WILDCOPY_OVERLENGTH - op;
-		op += oend - WILDCOPY_OVERLENGTH - op;
-	}
-
-	/* Handle the leftovers. */
-	while(op < oend)
-		*op++ = *ip++;
-}
-
-/* ZSTD_execSequenceEnd():
- * This version handles cases that are near the end of the output buffer. It requires
- * more careful checks to make sure there is no overflow. By separating out these hard
- * and unlikely cases, we can speed up the common cases.
- *
- * NOTE: This function needs to be fast for a single long sequence, but doesn't need
- * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
- */
-FORCE_NOINLINE
-size_t ZSTD_execSequenceEnd(BYTE* op,
-                            BYTE* const oend, seq_t sequence,
-                            const BYTE** litPtr, const BYTE* const litLimit,
-                            const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) {
-	BYTE* const oLitEnd = op + sequence.litLength;
-	size_t const sequenceLength = sequence.litLength + sequence.matchLength;
-	const BYTE* const iLitEnd = *litPtr + sequence.litLength;
-	const BYTE* match = oLitEnd - sequence.offset;
-	BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
-
-	/* bounds checks : careful of address space overflow in 32-bit mode */
-	RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
-	RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
-	assert(op < op + sequenceLength);
-	assert(oLitEnd < op + sequenceLength);
-
-	/* copy literals */
-	ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
-	op = oLitEnd;
-	*litPtr = iLitEnd;
-
-	/* copy Match */
-	if(sequence.offset > (size_t)(oLitEnd - prefixStart)) {
-		/* offset beyond prefix */
-		RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
-		match = dictEnd - (prefixStart - match);
-		if(match + sequence.matchLength <= dictEnd) {
-			ZSTD_memmove(oLitEnd, match, sequence.matchLength);
-			return sequenceLength;
-		}
-		/* span extDict & currentPrefixSegment */
-		{
-			size_t const length1 = dictEnd - match;
-			ZSTD_memmove(oLitEnd, match, length1);
-			op = oLitEnd + length1;
-			sequence.matchLength -= length1;
-			match = prefixStart;
-		}
-	}
-	ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
-	return sequenceLength;
-}
-
-/* ZSTD_execSequenceEndSplitLitBuffer():
- * This version is intended to be used during instances where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
- */
-FORCE_NOINLINE
-size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
-                                          BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
-                                          const BYTE** litPtr, const BYTE* const litLimit,
-                                          const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) {
-	BYTE* const oLitEnd = op + sequence.litLength;
-	size_t const sequenceLength = sequence.litLength + sequence.matchLength;
-	const BYTE* const iLitEnd = *litPtr + sequence.litLength;
-	const BYTE* match = oLitEnd - sequence.offset;
-
-	/* bounds checks : careful of address space overflow in 32-bit mode */
-	RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
-	RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
-	assert(op < op + sequenceLength);
-	assert(oLitEnd < op + sequenceLength);
-
-	/* copy literals */
-	RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
-	ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
-	op = oLitEnd;
-	*litPtr = iLitEnd;
-
-	/* copy Match */
-	if(sequence.offset > (size_t)(oLitEnd - prefixStart)) {
-		/* offset beyond prefix */
-		RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
-		match = dictEnd - (prefixStart - match);
-		if(match + sequence.matchLength <= dictEnd) {
-			ZSTD_memmove(oLitEnd, match, sequence.matchLength);
-			return sequenceLength;
-		}
-		/* span extDict & currentPrefixSegment */
-		{
-			size_t const length1 = dictEnd - match;
-			ZSTD_memmove(oLitEnd, match, length1);
-			op = oLitEnd + length1;
-			sequence.matchLength -= length1;
-			match = prefixStart;
-		}
-	}
-	ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
-	return sequenceLength;
-}
-
-HINT_INLINE
-size_t ZSTD_execSequence(BYTE* op,
-                         BYTE* const oend, seq_t sequence,
-                         const BYTE** litPtr, const BYTE* const litLimit,
-                         const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) {
-	BYTE* const oLitEnd = op + sequence.litLength;
-	size_t const sequenceLength = sequence.litLength + sequence.matchLength;
-	BYTE* const oMatchEnd = op + sequenceLength;     /* risk : address space overflow (32-bits) */
-	BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */
-	const BYTE* const iLitEnd = *litPtr + sequence.litLength;
-	const BYTE* match = oLitEnd - sequence.offset;
-
-	assert(op != NULL /* Precondition */);
-	assert(oend_w < oend /* No underflow */);
-
-#if defined(__aarch64__)
-	/* prefetch sequence starting from match that will be used for copy later */
-	PREFETCH_L1(match);
-#endif
-	/* Handle edge cases in a slow path:
-	 *   - Read beyond end of literals
-	 *   - Match end is within WILDCOPY_OVERLIMIT of oend
-	 *   - 32-bit mode and the match length overflows
-	 */
-	if(UNLIKELY(
-	       iLitEnd > litLimit ||
-	       oMatchEnd > oend_w ||
-	       (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
-		return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
-
-	/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
-	assert(op <= oLitEnd /* No overflow */);
-	assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
-	assert(oMatchEnd <= oend /* No underflow */);
-	assert(iLitEnd <= litLimit /* Literal length is in bounds */);
-	assert(oLitEnd <= oend_w /* Can wildcopy literals */);
-	assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
-
-	/* Copy Literals:
-	 * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
-	 * We likely don't need the full 32-byte wildcopy.
-	 */
-	assert(WILDCOPY_OVERLENGTH >= 16);
-	ZSTD_copy16(op, (*litPtr));
-	if(UNLIKELY(sequence.litLength > 16)) {
-		ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
-	}
-	op = oLitEnd;
-	*litPtr = iLitEnd; /* update for next sequence */
-
-	/* Copy Match */
-	if(sequence.offset > (size_t)(oLitEnd - prefixStart)) {
-		/* offset beyond prefix -> go into extDict */
-		RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
-		match = dictEnd + (match - prefixStart);
-		if(match + sequence.matchLength <= dictEnd) {
-			ZSTD_memmove(oLitEnd, match, sequence.matchLength);
-			return sequenceLength;
-		}
-		/* span extDict & currentPrefixSegment */
-		{
-			size_t const length1 = dictEnd - match;
-			ZSTD_memmove(oLitEnd, match, length1);
-			op = oLitEnd + length1;
-			sequence.matchLength -= length1;
-			match = prefixStart;
-		}
-	}
-	/* Match within prefix of 1 or more bytes */
-	assert(op <= oMatchEnd);
-	assert(oMatchEnd <= oend_w);
-	assert(match >= prefixStart);
-	assert(sequence.matchLength >= 1);
-
-	/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
-	 * without overlap checking.
-	 */
-	if(LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
-		/* We bet on a full wildcopy for matches, since we expect matches to be
-		 * longer than literals (in general). In silesia, ~10% of matches are longer
-		 * than 16 bytes.
-		 */
-		ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
-		return sequenceLength;
-	}
-	assert(sequence.offset < WILDCOPY_VECLEN);
-
-	/* Copy 8 bytes and spread the offset to be >= 8. */
-	ZSTD_overlapCopy8(&op, &match, sequence.offset);
-
-	/* If the match length is > 8 bytes, then continue with the wildcopy. */
-	if(sequence.matchLength > 8) {
-		assert(op < oMatchEnd);
-		ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
-	}
-	return sequenceLength;
-}
-
-HINT_INLINE
-size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
-                                       BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
-                                       const BYTE** litPtr, const BYTE* const litLimit,
-                                       const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) {
-	BYTE* const oLitEnd = op + sequence.litLength;
-	size_t const sequenceLength = sequence.litLength + sequence.matchLength;
-	BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
-	const BYTE* const iLitEnd = *litPtr + sequence.litLength;
-	const BYTE* match = oLitEnd - sequence.offset;
-
-	assert(op != NULL /* Precondition */);
-	assert(oend_w < oend /* No underflow */);
-	/* Handle edge cases in a slow path:
-	 *   - Read beyond end of literals
-	 *   - Match end is within WILDCOPY_OVERLIMIT of oend
-	 *   - 32-bit mode and the match length overflows
-	 */
-	if(UNLIKELY(
-	       iLitEnd > litLimit ||
-	       oMatchEnd > oend_w ||
-	       (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
-		return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
-
-	/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
-	assert(op <= oLitEnd /* No overflow */);
-	assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
-	assert(oMatchEnd <= oend /* No underflow */);
-	assert(iLitEnd <= litLimit /* Literal length is in bounds */);
-	assert(oLitEnd <= oend_w /* Can wildcopy literals */);
-	assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
-
-	/* Copy Literals:
-	 * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
-	 * We likely don't need the full 32-byte wildcopy.
-	 */
-	assert(WILDCOPY_OVERLENGTH >= 16);
-	ZSTD_copy16(op, (*litPtr));
-	if(UNLIKELY(sequence.litLength > 16)) {
-		ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
-	}
-	op = oLitEnd;
-	*litPtr = iLitEnd; /* update for next sequence */
-
-	/* Copy Match */
-	if(sequence.offset > (size_t)(oLitEnd - prefixStart)) {
-		/* offset beyond prefix -> go into extDict */
-		RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
-		match = dictEnd + (match - prefixStart);
-		if(match + sequence.matchLength <= dictEnd) {
-			ZSTD_memmove(oLitEnd, match, sequence.matchLength);
-			return sequenceLength;
-		}
-		/* span extDict & currentPrefixSegment */
-		{
-			size_t const length1 = dictEnd - match;
-			ZSTD_memmove(oLitEnd, match, length1);
-			op = oLitEnd + length1;
-			sequence.matchLength -= length1;
-			match = prefixStart;
-		}
-	}
-	/* Match within prefix of 1 or more bytes */
-	assert(op <= oMatchEnd);
-	assert(oMatchEnd <= oend_w);
-	assert(match >= prefixStart);
-	assert(sequence.matchLength >= 1);
-
-	/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
-	 * without overlap checking.
-	 */
-	if(LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
-		/* We bet on a full wildcopy for matches, since we expect matches to be
-		 * longer than literals (in general). In silesia, ~10% of matches are longer
-		 * than 16 bytes.
-		 */
-		ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
-		return sequenceLength;
-	}
-	assert(sequence.offset < WILDCOPY_VECLEN);
-
-	/* Copy 8 bytes and spread the offset to be >= 8. */
-	ZSTD_overlapCopy8(&op, &match, sequence.offset);
-
-	/* If the match length is > 8 bytes, then continue with the wildcopy. */
-	if(sequence.matchLength > 8) {
-		assert(op < oMatchEnd);
-		ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
-	}
-	return sequenceLength;
-}
-
-static void
-ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) {
-	const void* ptr = dt;
-	const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
-	DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
-	DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
-	         (U32)DStatePtr->state, DTableH->tableLog);
-	BIT_reloadDStream(bitD);
-	DStatePtr->table = dt + 1;
-}
-
-FORCE_INLINE_TEMPLATE void
-ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) {
-	size_t const lowBits = BIT_readBits(bitD, nbBits);
-	DStatePtr->state = nextState + lowBits;
-}
-
-/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
- * bits before reloading. This value is the maximum number of bytes we read
- * after reloading when we are decoding long offsets.
- */
-#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
-	(ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
-	     ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \
-	     : 0)
-
-typedef enum { ZSTD_lo_isRegularOffset,
-	           ZSTD_lo_isLongOffset = 1 } ZSTD_longOffset_e;
-
-FORCE_INLINE_TEMPLATE seq_t
-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) {
-	seq_t seq;
-	/*
-	 * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
-	 * loaded in one operation and extracted its fields by simply shifting or
-	 * bit-extracting on aarch64.
-	 * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
-	 * operations that cause performance drop. This can be avoided by using this
-	 * ZSTD_memcpy hack.
-	 */
-#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
-	ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
-	ZSTD_seqSymbol* const llDInfo = &llDInfoS;
-	ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
-	ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
-	ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
-	ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
-	ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
-#else
-	const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
-	const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
-	const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
-#endif
-	seq.matchLength = mlDInfo->baseValue;
-	seq.litLength = llDInfo->baseValue;
-	{
-		U32 const ofBase = ofDInfo->baseValue;
-		BYTE const llBits = llDInfo->nbAdditionalBits;
-		BYTE const mlBits = mlDInfo->nbAdditionalBits;
-		BYTE const ofBits = ofDInfo->nbAdditionalBits;
-		BYTE const totalBits = llBits + mlBits + ofBits;
-
-		U16 const llNext = llDInfo->nextState;
-		U16 const mlNext = mlDInfo->nextState;
-		U16 const ofNext = ofDInfo->nextState;
-		U32 const llnbBits = llDInfo->nbBits;
-		U32 const mlnbBits = mlDInfo->nbBits;
-		U32 const ofnbBits = ofDInfo->nbBits;
-
-		assert(llBits <= MaxLLBits);
-		assert(mlBits <= MaxMLBits);
-		assert(ofBits <= MaxOff);
-		/*
-		 * As gcc has better branch and block analyzers, sometimes it is only
-		 * valuable to mark likeliness for clang, it gives around 3-4% of
-		 * performance.
-		 */
-
-		/* sequence */
-		{
-			size_t offset;
-#if defined(__clang__)
-			if(LIKELY(ofBits > 1)) {
-#else
-			if(ofBits > 1) {
-#endif
-				ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
-				ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
-				ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
-				ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
-				if(MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
-					/* Always read extra bits, this keeps the logic simple,
-					 * avoids branches, and avoids accidentally reading 0 bits.
-					 */
-					U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
-					offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
-					BIT_reloadDStream(&seqState->DStream);
-					offset += BIT_readBitsFast(&seqState->DStream, extraBits);
-				} else {
-					offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits /*>0*/); /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
-					if(MEM_32bits())
-						BIT_reloadDStream(&seqState->DStream);
-				}
-				seqState->prevOffset[2] = seqState->prevOffset[1];
-				seqState->prevOffset[1] = seqState->prevOffset[0];
-				seqState->prevOffset[0] = offset;
-			} else {
-				U32 const ll0 = (llDInfo->baseValue == 0);
-				if(LIKELY((ofBits == 0))) {
-					offset = seqState->prevOffset[ll0];
-					seqState->prevOffset[1] = seqState->prevOffset[!ll0];
-					seqState->prevOffset[0] = offset;
-				} else {
-					offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
-					{
-						size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
-						temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
-						if(offset != 1)
-							seqState->prevOffset[2] = seqState->prevOffset[1];
-						seqState->prevOffset[1] = seqState->prevOffset[0];
-						seqState->prevOffset[0] = offset = temp;
-					}
-				}
-			}
-			seq.offset = offset;
-		}
-
-#if defined(__clang__)
-		if(UNLIKELY(mlBits > 0))
-#else
-		if(mlBits > 0)
-#endif
-			seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits /*>0*/);
-
-		if(MEM_32bits() && (mlBits + llBits >= STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32))
-			BIT_reloadDStream(&seqState->DStream);
-		if(MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64 - (LLFSELog + MLFSELog + OffFSELog)))
-			BIT_reloadDStream(&seqState->DStream);
-		/* Ensure there are enough bits to read the rest of data in 64-bit mode. */
-		ZSTD_STATIC_ASSERT(16 + LLFSELog + MLFSELog + OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
-#if defined(__clang__)
-		if(UNLIKELY(llBits > 0))
-#else
-		if(llBits > 0)
-#endif
-			seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits /*>0*/);
-
-		if(MEM_32bits())
-			BIT_reloadDStream(&seqState->DStream);
-
-		DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
-		         (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
-
-		ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <=  9 bits */
-		ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <=  9 bits */
-		if(MEM_32bits())
-			BIT_reloadDStream(&seqState->DStream);                                                /* <= 18 bits */
-		ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <=  8 bits */
-	}
-
-	return seq;
-}
-
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const * dctx, BYTE const * prefixStart, BYTE const * oLitEnd) {
-	size_t const windowSize = dctx->fParams.windowSize;
-	/* No dictionary used. */
-	if(dctx->dictContentEndForFuzzing == NULL)
-		return 0;
-	/* Dictionary is our prefix. */
-	if(prefixStart == dctx->dictContentBeginForFuzzing)
-		return 1;
-	/* Dictionary is not our ext-dict. */
-	if(dctx->dictEnd != dctx->dictContentEndForFuzzing)
-		return 0;
-	/* Dictionary is not within our window size. */
-	if((size_t)(oLitEnd - prefixStart) >= windowSize)
-		return 0;
-	/* Dictionary is active. */
-	return 1;
-}
-
-MEM_STATIC void ZSTD_assertValidSequence(
-    ZSTD_DCtx const * dctx,
-    BYTE const * op, BYTE const * oend,
-    seq_t const seq,
-    BYTE const * prefixStart, BYTE const * virtualStart) {
-#if DEBUGLEVEL >= 1
-	size_t const windowSize = dctx->fParams.windowSize;
-	size_t const sequenceSize = seq.litLength + seq.matchLength;
-	BYTE const * const oLitEnd = op + seq.litLength;
-	DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
-	         (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
-	assert(op <= oend);
-	assert((size_t)(oend - op) >= sequenceSize);
-	assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
-	if(ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
-		size_t const dictSize = (size_t)((char const *)dctx->dictContentEndForFuzzing - (char const *)dctx->dictContentBeginForFuzzing);
-		/* Offset must be within the dictionary. */
-		assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
-		assert(seq.offset <= windowSize + dictSize);
-	} else {
-		/* Offset must be within our window. */
-		assert(seq.offset <= windowSize);
-	}
-#else
-	(void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
-#endif
-}
-#endif
-
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
-
-FORCE_INLINE_TEMPLATE size_t
-    DONT_VECTORIZE
-    ZSTD_decompressSequences_bodySplitLitBuffer(ZSTD_DCtx* dctx,
-                                                void* dst, size_t maxDstSize,
-                                                const void* seqStart, size_t seqSize, int nbSeq,
-                                                const ZSTD_longOffset_e isLongOffset,
-                                                const int frame) {
-	const BYTE* ip = (const BYTE*)seqStart;
-	const BYTE* const iend = ip + seqSize;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = ostart + maxDstSize;
-	BYTE* op = ostart;
-	const BYTE* litPtr = dctx->litPtr;
-	const BYTE* litBufferEnd = dctx->litBufferEnd;
-	const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
-	const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
-	const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
-	DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
-	(void)frame;
-
-	/* Regen sequences */
-	if(nbSeq) {
-		seqState_t seqState;
-		dctx->fseEntropy = 1;
-		{
-			U32 i;
-			for(i = 0; i < ZSTD_REP_NUM; i++)
-				seqState.prevOffset[i] = dctx->entropy.rep[i];
-		}
-		RETURN_ERROR_IF(
-		    ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
-		    corruption_detected, "");
-		ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
-		ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
-		ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
-		assert(dst != NULL);
-
-		ZSTD_STATIC_ASSERT(
-		    BIT_DStream_unfinished < BIT_DStream_completed &&
-		    BIT_DStream_endOfBuffer < BIT_DStream_completed &&
-		    BIT_DStream_completed < BIT_DStream_overflow);
-
-		/* decompress without overrunning litPtr begins */
-		{
-			seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-			/* Align the decompression loop to 32 + 16 bytes.
-			 *
-			 * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
-			 * speed swings based on the alignment of the decompression loop. This
-			 * performance swing is caused by parts of the decompression loop falling
-			 * out of the DSB. The entire decompression loop should fit in the DSB,
-			 * when it can't we get much worse performance. You can measure if you've
-			 * hit the good case or the bad case with this perf command for some
-			 * compressed file test.zst:
-			 *
-			 *   perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
-			 *             -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
-			 *
-			 * If you see most cycles served out of the MITE you've hit the bad case.
-			 * If you see most cycles served out of the DSB you've hit the good case.
-			 * If it is pretty even then you may be in an okay case.
-			 *
-			 * This issue has been reproduced on the following CPUs:
-			 *   - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
-			 *               Use Instruments->Counters to get DSB/MITE cycles.
-			 *               I never got performance swings, but I was able to
-			 *               go from the good case of mostly DSB to half of the
-			 *               cycles served from MITE.
-			 *   - Coffeelake: Intel i9-9900k
-			 *   - Coffeelake: Intel i7-9700k
-			 *
-			 * I haven't been able to reproduce the instability or DSB misses on any
-			 * of the following CPUS:
-			 *   - Haswell
-			 *   - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
-			 *   - Skylake
-			 *
-			 * Alignment is done for each of the three major decompression loops:
-			 *   - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
-			 *   - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
-			 *   - ZSTD_decompressSequences_body
-			 * Alignment choices are made to minimize large swings on bad cases and influence on performance
-			 * from changes external to this code, rather than to overoptimize on the current commit.
-			 *
-			 * If you are seeing performance stability this script can help test.
-			 * It tests on 4 commits in zstd where I saw performance change.
-			 *
-			 *   https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
-			 */
-#if defined(__GNUC__) && defined(__x86_64__)
-			__asm__(".p2align 6");
-#if __GNUC__ >= 7
-			/* good for gcc-7, gcc-9, and gcc-11 */
-			__asm__("nop");
-			__asm__(".p2align 5");
-			__asm__("nop");
-			__asm__(".p2align 4");
-#if __GNUC__ == 8 || __GNUC__ == 10
-			/* good for gcc-8 and gcc-10 */
-			__asm__("nop");
-			__asm__(".p2align 3");
-#endif
-#endif
-#endif
-
-			/* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
-			for(; litPtr + sequence.litLength <= dctx->litBufferEnd;) {
-				size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-				assert(!ZSTD_isError(oneSeqSize));
-				if(frame)
-					ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
-#endif
-				if(UNLIKELY(ZSTD_isError(oneSeqSize)))
-					return oneSeqSize;
-				DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
-				op += oneSeqSize;
-				if(UNLIKELY(!--nbSeq))
-					break;
-				BIT_reloadDStream(&(seqState.DStream));
-				sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-			}
-
-			/* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
-			if(nbSeq > 0) {
-				const size_t leftoverLit = dctx->litBufferEnd - litPtr;
-				if(leftoverLit) {
-					RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
-					ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
-					sequence.litLength -= leftoverLit;
-					op += leftoverLit;
-				}
-				litPtr = dctx->litExtraBuffer;
-				litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
-				dctx->litBufferLocation = ZSTD_not_in_dst;
-				{
-					size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-					assert(!ZSTD_isError(oneSeqSize));
-					if(frame)
-						ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
-#endif
-					if(UNLIKELY(ZSTD_isError(oneSeqSize)))
-						return oneSeqSize;
-					DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
-					op += oneSeqSize;
-					if(--nbSeq)
-						BIT_reloadDStream(&(seqState.DStream));
-				}
-			}
-		}
-
-		if(nbSeq > 0) /* there is remaining lit from extra buffer */
-		{
-
-#if defined(__GNUC__) && defined(__x86_64__)
-			__asm__(".p2align 6");
-			__asm__("nop");
-#if __GNUC__ != 7
-			/* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
-			__asm__(".p2align 4");
-			__asm__("nop");
-			__asm__(".p2align 3");
-#elif __GNUC__ >= 11
-			__asm__(".p2align 3");
-#else
-			__asm__(".p2align 5");
-			__asm__("nop");
-			__asm__(".p2align 3");
-#endif
-#endif
-
-			for(;;) {
-				seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-				size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-				assert(!ZSTD_isError(oneSeqSize));
-				if(frame)
-					ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
-#endif
-				if(UNLIKELY(ZSTD_isError(oneSeqSize)))
-					return oneSeqSize;
-				DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
-				op += oneSeqSize;
-				if(UNLIKELY(!--nbSeq))
-					break;
-				BIT_reloadDStream(&(seqState.DStream));
-			}
-		}
-
-		/* check if reached exact end */
-		DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
-		RETURN_ERROR_IF(nbSeq, corruption_detected, "");
-		RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
-		/* save reps for next block */
-		{
-			U32 i;
-			for(i = 0; i < ZSTD_REP_NUM; i++)
-				dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]);
-		}
-	}
-
-	/* last literal segment */
-	if(dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
-	{
-		size_t const lastLLSize = litBufferEnd - litPtr;
-		RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
-		if(op != NULL) {
-			ZSTD_memmove(op, litPtr, lastLLSize);
-			op += lastLLSize;
-		}
-		litPtr = dctx->litExtraBuffer;
-		litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
-		dctx->litBufferLocation = ZSTD_not_in_dst;
-	}
-	{
-		size_t const lastLLSize = litBufferEnd - litPtr;
-		RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
-		if(op != NULL) {
-			ZSTD_memcpy(op, litPtr, lastLLSize);
-			op += lastLLSize;
-		}
-	}
-
-	return op - ostart;
-}
-
-FORCE_INLINE_TEMPLATE size_t
-    DONT_VECTORIZE
-    ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
-                                  void* dst, size_t maxDstSize,
-                                  const void* seqStart, size_t seqSize, int nbSeq,
-                                  const ZSTD_longOffset_e isLongOffset,
-                                  const int frame) {
-	const BYTE* ip = (const BYTE*)seqStart;
-	const BYTE* const iend = ip + seqSize;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
-	BYTE* op = ostart;
-	const BYTE* litPtr = dctx->litPtr;
-	const BYTE* const litEnd = litPtr + dctx->litSize;
-	const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
-	const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
-	const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
-	DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
-	(void)frame;
-
-	/* Regen sequences */
-	if(nbSeq) {
-		seqState_t seqState;
-		dctx->fseEntropy = 1;
-		{
-			U32 i;
-			for(i = 0; i < ZSTD_REP_NUM; i++)
-				seqState.prevOffset[i] = dctx->entropy.rep[i];
-		}
-		RETURN_ERROR_IF(
-		    ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
-		    corruption_detected, "");
-		ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
-		ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
-		ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
-		assert(dst != NULL);
-
-		ZSTD_STATIC_ASSERT(
-		    BIT_DStream_unfinished < BIT_DStream_completed &&
-		    BIT_DStream_endOfBuffer < BIT_DStream_completed &&
-		    BIT_DStream_completed < BIT_DStream_overflow);
-
-#if defined(__GNUC__) && defined(__x86_64__)
-		__asm__(".p2align 6");
-		__asm__("nop");
-#if __GNUC__ >= 7
-		__asm__(".p2align 5");
-		__asm__("nop");
-		__asm__(".p2align 3");
-#else
-		__asm__(".p2align 4");
-		__asm__("nop");
-		__asm__(".p2align 3");
-#endif
-#endif
-
-		for(;;) {
-			seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-			size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-			assert(!ZSTD_isError(oneSeqSize));
-			if(frame)
-				ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
-#endif
-			if(UNLIKELY(ZSTD_isError(oneSeqSize)))
-				return oneSeqSize;
-			DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
-			op += oneSeqSize;
-			if(UNLIKELY(!--nbSeq))
-				break;
-			BIT_reloadDStream(&(seqState.DStream));
-		}
-
-		/* check if reached exact end */
-		DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
-		RETURN_ERROR_IF(nbSeq, corruption_detected, "");
-		RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
-		/* save reps for next block */
-		{
-			U32 i;
-			for(i = 0; i < ZSTD_REP_NUM; i++)
-				dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]);
-		}
-	}
-
-	/* last literal segment */
-	{
-		size_t const lastLLSize = litEnd - litPtr;
-		RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
-		if(op != NULL) {
-			ZSTD_memcpy(op, litPtr, lastLLSize);
-			op += lastLLSize;
-		}
-	}
-
-	return op - ostart;
-}
-
-static size_t
-ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
-                                 void* dst, size_t maxDstSize,
-                                 const void* seqStart, size_t seqSize, int nbSeq,
-                                 const ZSTD_longOffset_e isLongOffset,
-                                 const int frame) {
-	return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-
-static size_t
-ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
-                                               void* dst, size_t maxDstSize,
-                                               const void* seqStart, size_t seqSize, int nbSeq,
-                                               const ZSTD_longOffset_e isLongOffset,
-                                               const int frame) {
-	return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
-
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
-                   const BYTE* const prefixStart, const BYTE* const dictEnd) {
-	prefetchPos += sequence.litLength;
-	{
-		const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
-		const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
-		                                                                      * No consequence though : memory address is only used for prefetching, not for dereferencing */
-		PREFETCH_L1(match);
-		PREFETCH_L1(match + CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
-	}
-	return prefetchPos + sequence.matchLength;
-}
-
-/* This decoding function employs prefetching
- * to reduce latency impact of cache misses.
- * It's generally employed when block contains a significant portion of long-distance matches
- * or when coupled with a "cold" dictionary */
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_decompressSequencesLong_body(
-    ZSTD_DCtx* dctx,
-    void* dst, size_t maxDstSize,
-    const void* seqStart, size_t seqSize, int nbSeq,
-    const ZSTD_longOffset_e isLongOffset,
-    const int frame) {
-	const BYTE* ip = (const BYTE*)seqStart;
-	const BYTE* const iend = ip + seqSize;
-	BYTE* const ostart = (BYTE*)dst;
-	BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
-	BYTE* op = ostart;
-	const BYTE* litPtr = dctx->litPtr;
-	const BYTE* litBufferEnd = dctx->litBufferEnd;
-	const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
-	const BYTE* const dictStart = (const BYTE*)(dctx->virtualStart);
-	const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
-	(void)frame;
-
-	/* Regen sequences */
-	if(nbSeq) {
-#define STORED_SEQS 8
-#define STORED_SEQS_MASK (STORED_SEQS - 1)
-#define ADVANCED_SEQS STORED_SEQS
-		seq_t sequences[STORED_SEQS];
-		int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
-		seqState_t seqState;
-		int seqNb;
-		size_t prefetchPos = (size_t)(op - prefixStart); /* track position relative to prefixStart */
-
-		dctx->fseEntropy = 1;
-		{
-			int i;
-			for(i = 0; i < ZSTD_REP_NUM; i++)
-				seqState.prevOffset[i] = dctx->entropy.rep[i];
-		}
-		assert(dst != NULL);
-		assert(iend >= ip);
-		RETURN_ERROR_IF(
-		    ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
-		    corruption_detected, "");
-		ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
-		ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
-		ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
-
-		/* prepare in advance */
-		for(seqNb = 0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb < seqAdvance); seqNb++) {
-			seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-			prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
-			sequences[seqNb] = sequence;
-		}
-		RETURN_ERROR_IF(seqNb < seqAdvance, corruption_detected, "");
-
-		/* decompress without stomping litBuffer */
-		for(; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
-			seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-			size_t oneSeqSize;
-
-			if(dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) {
-				/* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
-				const size_t leftoverLit = dctx->litBufferEnd - litPtr;
-				if(leftoverLit) {
-					RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
-					ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
-					sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
-					op += leftoverLit;
-				}
-				litPtr = dctx->litExtraBuffer;
-				litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
-				dctx->litBufferLocation = ZSTD_not_in_dst;
-				oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-				assert(!ZSTD_isError(oneSeqSize));
-				if(frame)
-					ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
-#endif
-				if(ZSTD_isError(oneSeqSize))
-					return oneSeqSize;
-
-				prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
-				sequences[seqNb & STORED_SEQS_MASK] = sequence;
-				op += oneSeqSize;
-			} else {
-				/* lit buffer is either wholly contained in first or second split, or not split at all*/
-				oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-				assert(!ZSTD_isError(oneSeqSize));
-				if(frame)
-					ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
-#endif
-				if(ZSTD_isError(oneSeqSize))
-					return oneSeqSize;
-
-				prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
-				sequences[seqNb & STORED_SEQS_MASK] = sequence;
-				op += oneSeqSize;
-			}
-		}
-		RETURN_ERROR_IF(seqNb < nbSeq, corruption_detected, "");
-
-		/* finish queue */
-		seqNb -= seqAdvance;
-		for(; seqNb < nbSeq; seqNb++) {
-			seq_t* sequence = &(sequences[seqNb & STORED_SEQS_MASK]);
-			if(dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
-				const size_t leftoverLit = dctx->litBufferEnd - litPtr;
-				if(leftoverLit) {
-					RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
-					ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
-					sequence->litLength -= leftoverLit;
-					op += leftoverLit;
-				}
-				litPtr = dctx->litExtraBuffer;
-				litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
-				dctx->litBufferLocation = ZSTD_not_in_dst;
-				{
-					size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-					assert(!ZSTD_isError(oneSeqSize));
-					if(frame)
-						ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb & STORED_SEQS_MASK], prefixStart, dictStart);
-#endif
-					if(ZSTD_isError(oneSeqSize))
-						return oneSeqSize;
-					op += oneSeqSize;
-				}
-			} else {
-				size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-				assert(!ZSTD_isError(oneSeqSize));
-				if(frame)
-					ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb & STORED_SEQS_MASK], prefixStart, dictStart);
-#endif
-				if(ZSTD_isError(oneSeqSize))
-					return oneSeqSize;
-				op += oneSeqSize;
-			}
-		}
-
-		/* save reps for next block */
-		{
-			U32 i;
-			for(i = 0; i < ZSTD_REP_NUM; i++)
-				dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]);
-		}
-	}
-
-	/* last literal segment */
-	if(dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
-	{
-		size_t const lastLLSize = litBufferEnd - litPtr;
-		RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
-		if(op != NULL) {
-			ZSTD_memmove(op, litPtr, lastLLSize);
-			op += lastLLSize;
-		}
-		litPtr = dctx->litExtraBuffer;
-		litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
-	}
-	{
-		size_t const lastLLSize = litBufferEnd - litPtr;
-		RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
-		if(op != NULL) {
-			ZSTD_memmove(op, litPtr, lastLLSize);
-			op += lastLLSize;
-		}
-	}
-
-	return op - ostart;
-}
-
-static size_t
-ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
-                                     void* dst, size_t maxDstSize,
-                                     const void* seqStart, size_t seqSize, int nbSeq,
-                                     const ZSTD_longOffset_e isLongOffset,
-                                     const int frame) {
-	return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
-
-#if DYNAMIC_BMI2
-
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
-static BMI2_TARGET_ATTRIBUTE size_t
-    DONT_VECTORIZE
-    ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
-                                  void* dst, size_t maxDstSize,
-                                  const void* seqStart, size_t seqSize, int nbSeq,
-                                  const ZSTD_longOffset_e isLongOffset,
-                                  const int frame) {
-	return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-static BMI2_TARGET_ATTRIBUTE size_t
-    DONT_VECTORIZE
-    ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
-                                                void* dst, size_t maxDstSize,
-                                                const void* seqStart, size_t seqSize, int nbSeq,
-                                                const ZSTD_longOffset_e isLongOffset,
-                                                const int frame) {
-	return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
-
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-static BMI2_TARGET_ATTRIBUTE size_t
-ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
-                                  void* dst, size_t maxDstSize,
-                                  const void* seqStart, size_t seqSize, int nbSeq,
-                                  const ZSTD_longOffset_e isLongOffset,
-                                  const int frame) {
-	return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
-
-#endif /* DYNAMIC_BMI2 */
-
-typedef size_t (*ZSTD_decompressSequences_t)(
-    ZSTD_DCtx* dctx,
-    void* dst, size_t maxDstSize,
-    const void* seqStart, size_t seqSize, int nbSeq,
-    const ZSTD_longOffset_e isLongOffset,
-    const int frame);
-
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
-static size_t
-ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
-                         const void* seqStart, size_t seqSize, int nbSeq,
-                         const ZSTD_longOffset_e isLongOffset,
-                         const int frame) {
-	DEBUGLOG(5, "ZSTD_decompressSequences");
-#if DYNAMIC_BMI2
-	if(ZSTD_DCtx_get_bmi2(dctx)) {
-		return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-	}
-#endif
-	return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-static size_t
-ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
-                                       const void* seqStart, size_t seqSize, int nbSeq,
-                                       const ZSTD_longOffset_e isLongOffset,
-                                       const int frame) {
-	DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
-#if DYNAMIC_BMI2
-	if(ZSTD_DCtx_get_bmi2(dctx)) {
-		return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-	}
-#endif
-	return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
-
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-/* ZSTD_decompressSequencesLong() :
- * decompression function triggered when a minimum share of offsets is considered "long",
- * aka out of cache.
- * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
- * This function will try to mitigate main memory latency through the use of prefetching */
-static size_t
-ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
-                             void* dst, size_t maxDstSize,
-                             const void* seqStart, size_t seqSize, int nbSeq,
-                             const ZSTD_longOffset_e isLongOffset,
-                             const int frame) {
-	DEBUGLOG(5, "ZSTD_decompressSequencesLong");
-#if DYNAMIC_BMI2
-	if(ZSTD_DCtx_get_bmi2(dctx)) {
-		return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-	}
-#endif
-	return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
-}
-#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
-
-/**
- * @returns The total size of the history referencable by zstd, including
- * both the prefix and the extDict. At @p op any offset larger than this
- * is invalid.
- */
-static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const * virtualStart) {
-	return (size_t)(op - virtualStart);
-}
-
-typedef struct {
-	unsigned longOffsetShare;
-	unsigned maxNbAdditionalBits;
-} ZSTD_OffsetInfo;
-
-/* ZSTD_getOffsetInfo() :
- * condition : offTable must be valid
- * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- *           compared to maximum possible of (1<<OffFSELog),
- *           as well as the maximum number additional bits required.
- */
-static ZSTD_OffsetInfo
-ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq) {
-	ZSTD_OffsetInfo info = {0, 0};
-	/* If nbSeq == 0, then the offTable is uninitialized, but we have
-	 * no sequences, so both values should be 0.
-	 */
-	if(nbSeq != 0) {
-		const void* ptr = offTable;
-		U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
-		const ZSTD_seqSymbol* table = offTable + 1;
-		U32 const max = 1 << tableLog;
-		U32 u;
-		DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
-		assert(max <= (1 << OffFSELog)); /* max not too large */
-		for(u = 0; u < max; u++) {
-			info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
-			if(table[u].nbAdditionalBits > 22)
-				info.longOffsetShare += 1;
-		}
-
-		assert(tableLog <= OffFSELog);
-		info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
-	}
-
-	return info;
-}
-
-/**
- * @returns The maximum offset we can decode in one read of our bitstream, without
- * reloading more bits in the middle of the offset bits read. Any offsets larger
- * than this must use the long offset decoder.
- */
-static size_t ZSTD_maxShortOffset(void) {
-	if(MEM_64bits()) {
-		/* We can decode any offset without reloading bits.
-		 * This might change if the max window size grows.
-		 */
-		ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
-		return (size_t)-1;
-	} else {
-		/* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
-		 * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
-		 * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
-		 */
-		size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
-		size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
-		assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
-		return maxOffset;
-	}
-}
-
-size_t
-ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
-                              void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize, const int frame, const streaming_operation streaming) { /* blockType == blockCompressed */
-	const BYTE* ip = (const BYTE*)src;
-	DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
-
-	/* Note : the wording of the specification
-	 * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
-	 * This generally does not happen, as it makes little sense,
-	 * since an uncompressed block would feature same size and have no decompression cost.
-	 * Also, note that decoder from reference libzstd before < v1.5.4
-	 * would consider this edge case as an error.
-	 * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
-	 * for broader compatibility with the deployed ecosystem of zstd decoders */
-	RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
-
-	/* Decode literals section */
-	{
-		size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
-		DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
-		if(ZSTD_isError(litCSize))
-			return litCSize;
-		ip += litCSize;
-		srcSize -= litCSize;
-	}
-
-	/* Build Decoding Tables */
-	{
-		/* Compute the maximum block size, which must also work when !frame and fParams are unset.
-		 * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
-		 */
-		size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
-		size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const *)dctx->virtualStart);
-		/* isLongOffset must be true if there are long offsets.
-		 * Offsets are long if they are larger than ZSTD_maxShortOffset().
-		 * We don't expect that to be the case in 64-bit mode.
-		 *
-		 * We check here to see if our history is large enough to allow long offsets.
-		 * If it isn't, then we can't possible have (valid) long offsets. If the offset
-		 * is invalid, then it is okay to read it incorrectly.
-		 *
-		 * If isLongOffsets is true, then we will later check our decoding table to see
-		 * if it is even possible to generate long offsets.
-		 */
-		ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
-		/* These macros control at build-time which decompressor implementation
-		 * we use. If neither is defined, we do some inspection and dispatch at
-		 * runtime.
-		 */
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-		int usePrefetchDecoder = dctx->ddictIsCold;
-#else
-		/* Set to 1 to avoid computing offset info if we don't need to.
-		 * Otherwise this value is ignored.
-		 */
-		int usePrefetchDecoder = 1;
-#endif
-		int nbSeq;
-		size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
-		if(ZSTD_isError(seqHSize))
-			return seqHSize;
-		ip += seqHSize;
-		srcSize -= seqHSize;
-
-		RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
-
-		/* If we could potentially have long offsets, or we might want to use the prefetch decoder,
-		 * compute information about the share of long offsets, and the maximum nbAdditionalBits.
-		 * NOTE: could probably use a larger nbSeq limit
-		 */
-		if(isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
-			ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
-			if(isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
-				/* If isLongOffset, but the maximum number of additional bits that we see in our table is small
-				 * enough, then we know it is impossible to have too long an offset in this block, so we can
-				 * use the regular offset decoder.
-				 */
-				isLongOffset = ZSTD_lo_isRegularOffset;
-			}
-			if(!usePrefetchDecoder) {
-				U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-				usePrefetchDecoder = (info.longOffsetShare >= minShare);
-			}
-		}
-
-		dctx->ddictIsCold = 0;
-
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-		if(usePrefetchDecoder) {
-#else
-		(void)usePrefetchDecoder;
-		{
-#endif
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-			return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
-#endif
-		}
-
-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
-		/* else */
-		if(dctx->litBufferLocation == ZSTD_split)
-			return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
-		else
-			return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
-#endif
-	}
-}
-
-void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) {
-	if(dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
-		dctx->dictEnd = dctx->previousDstEnd;
-		dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
-		dctx->prefixStart = dst;
-		dctx->previousDstEnd = dst;
-	}
-}
-
-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
-                            void* dst, size_t dstCapacity,
-                            const void* src, size_t srcSize) {
-	size_t dSize;
-	ZSTD_checkContinuity(dctx, dst, dstCapacity);
-	dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
-	dctx->previousDstEnd = (char*)dst + dSize;
-	return dSize;
-}
diff --git a/src/zstd/zstd_decompress_internal.h b/src/zstd/zstd_decompress_internal.h
deleted file mode 100644
index 404d7fdc9..000000000
--- a/src/zstd/zstd_decompress_internal.h
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* zstd_decompress_internal:
- * objects and definitions shared within lib/decompress modules */
-
-#ifndef ZSTD_DECOMPRESS_INTERNAL_H
-#define ZSTD_DECOMPRESS_INTERNAL_H
-
-/*-*******************************************************
- *  Dependencies
- *********************************************************/
-#include "mem.h"           /* BYTE, U16, U32 */
-#include "zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. */
-
-/*-*******************************************************
- *  Constants
- *********************************************************/
-static UNUSED_ATTR const U32 LL_base[MaxLL + 1] = {
-    0, 1, 2, 3, 4, 5, 6, 7,
-    8, 9, 10, 11, 12, 13, 14, 15,
-    16, 18, 20, 22, 24, 28, 32, 40,
-    48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
-    0x2000, 0x4000, 0x8000, 0x10000};
-
-static UNUSED_ATTR const U32 OF_base[MaxOff + 1] = {
-    0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D,
-    0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD,
-    0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
-    0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD};
-
-static UNUSED_ATTR const U8 OF_bits[MaxOff + 1] = {
-    0, 1, 2, 3, 4, 5, 6, 7,
-    8, 9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31};
-
-static UNUSED_ATTR const U32 ML_base[MaxML + 1] = {
-    3, 4, 5, 6, 7, 8, 9, 10,
-    11, 12, 13, 14, 15, 16, 17, 18,
-    19, 20, 21, 22, 23, 24, 25, 26,
-    27, 28, 29, 30, 31, 32, 33, 34,
-    35, 37, 39, 41, 43, 47, 51, 59,
-    67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
-    0x1003, 0x2003, 0x4003, 0x8003, 0x10003};
-
-/*-*******************************************************
- *  Decompression types
- *********************************************************/
-typedef struct {
-	U32 fastMode;
-	U32 tableLog;
-} ZSTD_seqSymbol_header;
-
-typedef struct {
-	U16 nextState;
-	BYTE nbAdditionalBits;
-	BYTE nbBits;
-	U32 baseValue;
-} ZSTD_seqSymbol;
-
-#define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log)))
-
-#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
-#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
-#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
-
-typedef struct {
-	ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];             /* Note : Space reserved for FSE Tables */
-	ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];            /* is also used as temporary workspace while building hufTable during DDict creation */
-	ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];             /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
-	HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */
-	U32 rep[ZSTD_REP_NUM];
-	U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
-} ZSTD_entropyDTables_t;
-
-typedef enum { ZSTDds_getFrameHeaderSize,
-	           ZSTDds_decodeFrameHeader,
-	           ZSTDds_decodeBlockHeader,
-	           ZSTDds_decompressBlock,
-	           ZSTDds_decompressLastBlock,
-	           ZSTDds_checkChecksum,
-	           ZSTDds_decodeSkippableHeader,
-	           ZSTDds_skipFrame } ZSTD_dStage;
-
-typedef enum { zdss_init = 0,
-	           zdss_loadHeader,
-	           zdss_read,
-	           zdss_load,
-	           zdss_flush } ZSTD_dStreamStage;
-
-typedef enum {
-	ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
-	ZSTD_dont_use = 0,          /* Do not use the dictionary (if one exists free it) */
-	ZSTD_use_once = 1           /* Use the dictionary once and set to ZSTD_dont_use */
-} ZSTD_dictUses_e;
-
-/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */
-typedef struct {
-	const ZSTD_DDict** ddictPtrTable;
-	size_t ddictPtrTableSize;
-	size_t ddictPtrCount;
-} ZSTD_DDictHashSet;
-
-#ifndef ZSTD_DECODER_INTERNAL_BUFFER
-#define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16)
-#endif
-
-#define ZSTD_LBMIN 64
-#define ZSTD_LBMAX (128 << 10)
-
-/* extra buffer, compensates when dst is not large enough to store litBuffer */
-#define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX)
-
-typedef enum {
-	ZSTD_not_in_dst = 0, /* Stored entirely within litExtraBuffer */
-	ZSTD_in_dst = 1,     /* Stored entirely within dst (in memory after current output write) */
-	ZSTD_split = 2       /* Split between litExtraBuffer and dst */
-} ZSTD_litLocation_e;
-
-struct ZSTD_DCtx_s {
-	const ZSTD_seqSymbol* LLTptr;
-	const ZSTD_seqSymbol* MLTptr;
-	const ZSTD_seqSymbol* OFTptr;
-	const HUF_DTable* HUFptr;
-	ZSTD_entropyDTables_t entropy;
-	U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */
-	const void* previousDstEnd;                       /* detect continuity */
-	const void* prefixStart;                          /* start of current segment */
-	const void* virtualStart;                         /* virtual start of previous segment if it was just before current one */
-	const void* dictEnd;                              /* end of previous segment */
-	size_t expected;
-	ZSTD_frameHeader fParams;
-	U64 processedCSize;
-	U64 decodedSize;
-	blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
-	ZSTD_dStage stage;
-	U32 litEntropy;
-	U32 fseEntropy;
-	XXH64_state_t xxhState;
-	size_t headerSize;
-	ZSTD_format_e format;
-	ZSTD_forceIgnoreChecksum_e forceIgnoreChecksum; /* User specified: if == 1, will ignore checksums in compressed frame. Default == 0 */
-	U32 validateChecksum;                           /* if == 1, will validate checksum. Is == 1 if (fParams.checksumFlag == 1) and (forceIgnoreChecksum == 0). */
-	const BYTE* litPtr;
-	ZSTD_customMem customMem;
-	size_t litSize;
-	size_t rleSize;
-	size_t staticSize;
-#if DYNAMIC_BMI2 != 0
-	int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
-#endif
-
-	/* dictionary */
-	ZSTD_DDict* ddictLocal;
-	const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
-	U32 dictID;
-	int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
-	ZSTD_dictUses_e dictUses;
-	ZSTD_DDictHashSet* ddictSet;                /* Hash set for multiple ddicts */
-	ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
-	int disableHufAsm;
-
-	/* streaming */
-	ZSTD_dStreamStage streamStage;
-	char* inBuff;
-	size_t inBuffSize;
-	size_t inPos;
-	size_t maxWindowSize;
-	char* outBuff;
-	size_t outBuffSize;
-	size_t outStart;
-	size_t outEnd;
-	size_t lhSize;
-#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
-	void* legacyContext;
-	U32 previousLegacyVersion;
-	U32 legacyVersion;
-#endif
-	U32 hostageByte;
-	int noForwardProgress;
-	ZSTD_bufferMode_e outBufferMode;
-	ZSTD_outBuffer expectedOutBuffer;
-
-	/* workspace */
-	BYTE* litBuffer;
-	const BYTE* litBufferEnd;
-	ZSTD_litLocation_e litBufferLocation;
-	BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */
-	BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
-
-	size_t oversizedDuration;
-
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-	void const * dictContentBeginForFuzzing;
-	void const * dictContentEndForFuzzing;
-#endif
-
-	/* Tracing */
-#if ZSTD_TRACE
-	ZSTD_TraceCtx traceCtx;
-#endif
-}; /* typedef'd to ZSTD_DCtx within "zstd.h" */
-
-MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s* dctx) {
-#if DYNAMIC_BMI2 != 0
-	return dctx->bmi2;
-#else
-	(void)dctx;
-	return 0;
-#endif
-}
-
-/*-*******************************************************
- *  Shared internal functions
- *********************************************************/
-
-/*! ZSTD_loadDEntropy() :
- *  dict : must point at beginning of a valid zstd dictionary.
- * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
-size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
-                         const void* const dict, size_t const dictSize);
-
-/*! ZSTD_checkContinuity() :
- *  check if next `dst` follows previous position, where decompression ended.
- *  If yes, do nothing (continue on current segment).
- *  If not, classify previous segment as "external dictionary", and start a new segment.
- *  This function cannot fail. */
-void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize);
-
-#endif /* ZSTD_DECOMPRESS_INTERNAL_H */
diff --git a/src/zstd/zstd_double_fast.c b/src/zstd/zstd_double_fast.c
deleted file mode 100644
index 76629d8e4..000000000
--- a/src/zstd/zstd_double_fast.c
+++ /dev/null
@@ -1,800 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#include "zstd_compress_internal.h"
-#include "zstd_double_fast.h"
-
-static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
-                                             void const * end, ZSTD_dictTableLoadMethod_e dtlm) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashLarge = ms->hashTable;
-	U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
-	U32 const mls = cParams->minMatch;
-	U32* const hashSmall = ms->chainTable;
-	U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
-	const BYTE* const base = ms->window.base;
-	const BYTE* ip = base + ms->nextToUpdate;
-	const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
-	const U32 fastHashFillStep = 3;
-
-	/* Always insert every fastHashFillStep position into the hash tables.
-	 * Insert the other positions into the large hash table if their entry
-	 * is empty.
-	 */
-	for(; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
-		U32 const curr = (U32)(ip - base);
-		U32 i;
-		for(i = 0; i < fastHashFillStep; ++i) {
-			size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
-			size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
-			if(i == 0) {
-				ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
-			}
-			if(i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
-				ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
-			}
-			/* Only load extra positions for ZSTD_dtlm_full */
-			if(dtlm == ZSTD_dtlm_fast)
-				break;
-		}
-	}
-}
-
-static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
-                                            void const * end, ZSTD_dictTableLoadMethod_e dtlm) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashLarge = ms->hashTable;
-	U32 const hBitsL = cParams->hashLog;
-	U32 const mls = cParams->minMatch;
-	U32* const hashSmall = ms->chainTable;
-	U32 const hBitsS = cParams->chainLog;
-	const BYTE* const base = ms->window.base;
-	const BYTE* ip = base + ms->nextToUpdate;
-	const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
-	const U32 fastHashFillStep = 3;
-
-	/* Always insert every fastHashFillStep position into the hash tables.
-	 * Insert the other positions into the large hash table if their entry
-	 * is empty.
-	 */
-	for(; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
-		U32 const curr = (U32)(ip - base);
-		U32 i;
-		for(i = 0; i < fastHashFillStep; ++i) {
-			size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls);
-			size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8);
-			if(i == 0)
-				hashSmall[smHash] = curr + i;
-			if(i == 0 || hashLarge[lgHash] == 0)
-				hashLarge[lgHash] = curr + i;
-			/* Only load extra positions for ZSTD_dtlm_full */
-			if(dtlm == ZSTD_dtlm_fast)
-				break;
-		}
-	}
-}
-
-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              const void* const end,
-                              ZSTD_dictTableLoadMethod_e dtlm,
-                              ZSTD_tableFillPurpose_e tfp) {
-	if(tfp == ZSTD_tfp_forCDict) {
-		ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
-	} else {
-		ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
-	}
-}
-
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_doubleFast_noDict_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize, U32 const mls /* template */) {
-	ZSTD_compressionParameters const * cParams = &ms->cParams;
-	U32* const hashLong = ms->hashTable;
-	const U32 hBitsL = cParams->hashLog;
-	U32* const hashSmall = ms->chainTable;
-	const U32 hBitsS = cParams->chainLog;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* anchor = istart;
-	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
-	/* presumes that, if there is a dictionary, it must be using Attach mode */
-	const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
-	const BYTE* const prefixLowest = base + prefixLowestIndex;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = iend - HASH_READ_SIZE;
-	U32 offset_1 = rep[0], offset_2 = rep[1];
-	U32 offsetSaved1 = 0, offsetSaved2 = 0;
-
-	size_t mLength;
-	U32 offset;
-	U32 curr;
-
-	/* how many positions to search before increasing step size */
-	const size_t kStepIncr = 1 << kSearchStrength;
-	/* the position at which to increment the step size if no match is found */
-	const BYTE* nextStep;
-	size_t step; /* the current step size */
-
-	size_t hl0; /* the long hash at ip */
-	size_t hl1; /* the long hash at ip1 */
-
-	U32 idxl0; /* the long match index for ip */
-	U32 idxl1; /* the long match index for ip1 */
-
-	const BYTE* matchl0; /* the long match for ip */
-	const BYTE* matchs0; /* the short match for ip */
-	const BYTE* matchl1; /* the long match for ip1 */
-
-	const BYTE* ip = istart; /* the current position */
-	const BYTE* ip1;         /* the next position */
-
-	DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic");
-
-	/* init */
-	ip += ((ip - prefixLowest) == 0);
-	{
-		U32 const current = (U32)(ip - base);
-		U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
-		U32 const maxRep = current - windowLow;
-		if(offset_2 > maxRep)
-			offsetSaved2 = offset_2, offset_2 = 0;
-		if(offset_1 > maxRep)
-			offsetSaved1 = offset_1, offset_1 = 0;
-	}
-
-	/* Outer Loop: one iteration per match found and stored */
-	while(1) {
-		step = 1;
-		nextStep = ip + kStepIncr;
-		ip1 = ip + step;
-
-		if(ip1 > ilimit) {
-			goto _cleanup;
-		}
-
-		hl0 = ZSTD_hashPtr(ip, hBitsL, 8);
-		idxl0 = hashLong[hl0];
-		matchl0 = base + idxl0;
-
-		/* Inner Loop: one iteration per search / position */
-		do {
-			const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls);
-			const U32 idxs0 = hashSmall[hs0];
-			curr = (U32)(ip - base);
-			matchs0 = base + idxs0;
-
-			hashLong[hl0] = hashSmall[hs0] = curr; /* update hash tables */
-
-			/* check noDict repcode */
-			if((offset_1 > 0) & (MEM_read32(ip + 1 - offset_1) == MEM_read32(ip + 1))) {
-				mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4;
-				ip++;
-				ZSTD_storeSeq(seqStore, (size_t)(ip - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
-				goto _match_stored;
-			}
-
-			hl1 = ZSTD_hashPtr(ip1, hBitsL, 8);
-
-			if(idxl0 > prefixLowestIndex) {
-				/* check prefix long match */
-				if(MEM_read64(matchl0) == MEM_read64(ip)) {
-					mLength = ZSTD_count(ip + 8, matchl0 + 8, iend) + 8;
-					offset = (U32)(ip - matchl0);
-					while(((ip > anchor) & (matchl0 > prefixLowest)) && (ip[-1] == matchl0[-1])) {
-						ip--;
-						matchl0--;
-						mLength++;
-					} /* catch up */
-					goto _match_found;
-				}
-			}
-
-			idxl1 = hashLong[hl1];
-			matchl1 = base + idxl1;
-
-			if(idxs0 > prefixLowestIndex) {
-				/* check prefix short match */
-				if(MEM_read32(matchs0) == MEM_read32(ip)) {
-					goto _search_next_long;
-				}
-			}
-
-			if(ip1 >= nextStep) {
-				PREFETCH_L1(ip1 + 64);
-				PREFETCH_L1(ip1 + 128);
-				step++;
-				nextStep += kStepIncr;
-			}
-			ip = ip1;
-			ip1 += step;
-
-			hl0 = hl1;
-			idxl0 = idxl1;
-			matchl0 = matchl1;
-#if defined(__aarch64__)
-			PREFETCH_L1(ip + 256);
-#endif
-		} while(ip1 <= ilimit);
-
-	_cleanup:
-		/* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
-		 * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
-		offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
-
-		/* save reps for next block */
-		rep[0] = offset_1 ? offset_1 : offsetSaved1;
-		rep[1] = offset_2 ? offset_2 : offsetSaved2;
-
-		/* Return the last literals size */
-		return (size_t)(iend - anchor);
-
-	_search_next_long:
-
-		/* check prefix long +1 match */
-		if(idxl1 > prefixLowestIndex) {
-			if(MEM_read64(matchl1) == MEM_read64(ip1)) {
-				ip = ip1;
-				mLength = ZSTD_count(ip + 8, matchl1 + 8, iend) + 8;
-				offset = (U32)(ip - matchl1);
-				while(((ip > anchor) & (matchl1 > prefixLowest)) && (ip[-1] == matchl1[-1])) {
-					ip--;
-					matchl1--;
-					mLength++;
-				} /* catch up */
-				goto _match_found;
-			}
-		}
-
-		/* if no long +1 match, explore the short match we found */
-		mLength = ZSTD_count(ip + 4, matchs0 + 4, iend) + 4;
-		offset = (U32)(ip - matchs0);
-		while(((ip > anchor) & (matchs0 > prefixLowest)) && (ip[-1] == matchs0[-1])) {
-			ip--;
-			matchs0--;
-			mLength++;
-		} /* catch up */
-
-		/* fall-through */
-
-	_match_found: /* requires ip, offset, mLength */
-		offset_2 = offset_1;
-		offset_1 = offset;
-
-		if(step < 4) {
-			/* It is unsafe to write this value back to the hashtable when ip1 is
-			 * greater than or equal to the new ip we will have after we're done
-			 * processing this match. Rather than perform that test directly
-			 * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler
-			 * more predictable test. The minmatch even if we take a short match is
-			 * 4 bytes, so as long as step, the distance between ip and ip1
-			 * (initially) is less than 4, we know ip1 < new ip. */
-			hashLong[hl1] = (U32)(ip1 - base);
-		}
-
-		ZSTD_storeSeq(seqStore, (size_t)(ip - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
-
-	_match_stored:
-		/* match found */
-		ip += mLength;
-		anchor = ip;
-
-		if(ip <= ilimit) {
-			/* Complementary insertion */
-			/* done after iLimit test, as candidates could be > iend-8 */
-			{
-				U32 const indexToInsert = curr + 2;
-				hashLong[ZSTD_hashPtr(base + indexToInsert, hBitsL, 8)] = indexToInsert;
-				hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base);
-				hashSmall[ZSTD_hashPtr(base + indexToInsert, hBitsS, mls)] = indexToInsert;
-				hashSmall[ZSTD_hashPtr(ip - 1, hBitsS, mls)] = (U32)(ip - 1 - base);
-			}
-
-			/* check immediate repcode */
-			while((ip <= ilimit) && ((offset_2 > 0) & (MEM_read32(ip) == MEM_read32(ip - offset_2)))) {
-				/* store sequence */
-				size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4;
-				U32 const tmpOff = offset_2;
-				offset_2 = offset_1;
-				offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */
-				hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base);
-				hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base);
-				ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
-				ip += rLength;
-				anchor = ip;
-				continue; /* faster when present ... (?) */
-			}
-		}
-	}
-}
-
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize,
-    U32 const mls /* template */) {
-	ZSTD_compressionParameters const * cParams = &ms->cParams;
-	U32* const hashLong = ms->hashTable;
-	const U32 hBitsL = cParams->hashLog;
-	U32* const hashSmall = ms->chainTable;
-	const U32 hBitsS = cParams->chainLog;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* ip = istart;
-	const BYTE* anchor = istart;
-	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
-	/* presumes that, if there is a dictionary, it must be using Attach mode */
-	const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
-	const BYTE* const prefixLowest = base + prefixLowestIndex;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = iend - HASH_READ_SIZE;
-	U32 offset_1 = rep[0], offset_2 = rep[1];
-
-	const ZSTD_matchState_t* const dms = ms->dictMatchState;
-	const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
-	const U32* const dictHashLong = dms->hashTable;
-	const U32* const dictHashSmall = dms->chainTable;
-	const U32 dictStartIndex = dms->window.dictLimit;
-	const BYTE* const dictBase = dms->window.base;
-	const BYTE* const dictStart = dictBase + dictStartIndex;
-	const BYTE* const dictEnd = dms->window.nextSrc;
-	const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase);
-	const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
-	const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
-	const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
-
-	DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
-
-	/* if a dictionary is attached, it must be within window range */
-	assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
-
-	if(ms->prefetchCDictTables) {
-		size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
-		size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
-		PREFETCH_AREA(dictHashLong, hashTableBytes)
-		PREFETCH_AREA(dictHashSmall, chainTableBytes)
-	}
-
-	/* init */
-	ip += (dictAndPrefixLength == 0);
-
-	/* dictMatchState repCode checks don't currently handle repCode == 0
-	 * disabling. */
-	assert(offset_1 <= dictAndPrefixLength);
-	assert(offset_2 <= dictAndPrefixLength);
-
-	/* Main Search Loop */
-	while(ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
-		size_t mLength;
-		U32 offset;
-		size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
-		size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
-		size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
-		size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
-		U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
-		U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
-		int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
-		int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
-		U32 const curr = (U32)(ip - base);
-		U32 const matchIndexL = hashLong[h2];
-		U32 matchIndexS = hashSmall[h];
-		const BYTE* matchLong = base + matchIndexL;
-		const BYTE* match = base + matchIndexS;
-		const U32 repIndex = curr + 1 - offset_1;
-		const BYTE* repMatch = (repIndex < prefixLowestIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex;
-		hashLong[h2] = hashSmall[h] = curr; /* update hash tables */
-
-		/* check repcode */
-		if(((U32)((prefixLowestIndex - 1) - repIndex) >= 3 /* intentional underflow */) && (MEM_read32(repMatch) == MEM_read32(ip + 1))) {
-			const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
-			mLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixLowest) + 4;
-			ip++;
-			ZSTD_storeSeq(seqStore, (size_t)(ip - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
-			goto _match_stored;
-		}
-
-		if(matchIndexL > prefixLowestIndex) {
-			/* check prefix long match */
-			if(MEM_read64(matchLong) == MEM_read64(ip)) {
-				mLength = ZSTD_count(ip + 8, matchLong + 8, iend) + 8;
-				offset = (U32)(ip - matchLong);
-				while(((ip > anchor) & (matchLong > prefixLowest)) && (ip[-1] == matchLong[-1])) {
-					ip--;
-					matchLong--;
-					mLength++;
-				} /* catch up */
-				goto _match_found;
-			}
-		} else if(dictTagsMatchL) {
-			/* check dictMatchState long match */
-			U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
-			const BYTE* dictMatchL = dictBase + dictMatchIndexL;
-			assert(dictMatchL < dictEnd);
-
-			if(dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) {
-				mLength = ZSTD_count_2segments(ip + 8, dictMatchL + 8, iend, dictEnd, prefixLowest) + 8;
-				offset = (U32)(curr - dictMatchIndexL - dictIndexDelta);
-				while(((ip > anchor) & (dictMatchL > dictStart)) && (ip[-1] == dictMatchL[-1])) {
-					ip--;
-					dictMatchL--;
-					mLength++;
-				} /* catch up */
-				goto _match_found;
-			}
-		}
-
-		if(matchIndexS > prefixLowestIndex) {
-			/* check prefix short match */
-			if(MEM_read32(match) == MEM_read32(ip)) {
-				goto _search_next_long;
-			}
-		} else if(dictTagsMatchS) {
-			/* check dictMatchState short match */
-			U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
-			match = dictBase + dictMatchIndexS;
-			matchIndexS = dictMatchIndexS + dictIndexDelta;
-
-			if(match > dictStart && MEM_read32(match) == MEM_read32(ip)) {
-				goto _search_next_long;
-			}
-		}
-
-		ip += ((ip - anchor) >> kSearchStrength) + 1;
-#if defined(__aarch64__)
-		PREFETCH_L1(ip + 256);
-#endif
-		continue;
-
-	_search_next_long : {
-		size_t const hl3 = ZSTD_hashPtr(ip + 1, hBitsL, 8);
-		size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip + 1, dictHBitsL, 8);
-		U32 const matchIndexL3 = hashLong[hl3];
-		U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
-		int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
-		const BYTE* matchL3 = base + matchIndexL3;
-		hashLong[hl3] = curr + 1;
-
-		/* check prefix long +1 match */
-		if(matchIndexL3 > prefixLowestIndex) {
-			if(MEM_read64(matchL3) == MEM_read64(ip + 1)) {
-				mLength = ZSTD_count(ip + 9, matchL3 + 8, iend) + 8;
-				ip++;
-				offset = (U32)(ip - matchL3);
-				while(((ip > anchor) & (matchL3 > prefixLowest)) && (ip[-1] == matchL3[-1])) {
-					ip--;
-					matchL3--;
-					mLength++;
-				} /* catch up */
-				goto _match_found;
-			}
-		} else if(dictTagsMatchL3) {
-			/* check dict long +1 match */
-			U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
-			const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
-			assert(dictMatchL3 < dictEnd);
-			if(dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip + 1)) {
-				mLength = ZSTD_count_2segments(ip + 1 + 8, dictMatchL3 + 8, iend, dictEnd, prefixLowest) + 8;
-				ip++;
-				offset = (U32)(curr + 1 - dictMatchIndexL3 - dictIndexDelta);
-				while(((ip > anchor) & (dictMatchL3 > dictStart)) && (ip[-1] == dictMatchL3[-1])) {
-					ip--;
-					dictMatchL3--;
-					mLength++;
-				} /* catch up */
-				goto _match_found;
-			}
-		}
-	}
-
-		/* if no long +1 match, explore the short match we found */
-		if(matchIndexS < prefixLowestIndex) {
-			mLength = ZSTD_count_2segments(ip + 4, match + 4, iend, dictEnd, prefixLowest) + 4;
-			offset = (U32)(curr - matchIndexS);
-			while(((ip > anchor) & (match > dictStart)) && (ip[-1] == match[-1])) {
-				ip--;
-				match--;
-				mLength++;
-			} /* catch up */
-		} else {
-			mLength = ZSTD_count(ip + 4, match + 4, iend) + 4;
-			offset = (U32)(ip - match);
-			while(((ip > anchor) & (match > prefixLowest)) && (ip[-1] == match[-1])) {
-				ip--;
-				match--;
-				mLength++;
-			} /* catch up */
-		}
-
-	_match_found:
-		offset_2 = offset_1;
-		offset_1 = offset;
-
-		ZSTD_storeSeq(seqStore, (size_t)(ip - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
-
-	_match_stored:
-		/* match found */
-		ip += mLength;
-		anchor = ip;
-
-		if(ip <= ilimit) {
-			/* Complementary insertion */
-			/* done after iLimit test, as candidates could be > iend-8 */
-			{
-				U32 const indexToInsert = curr + 2;
-				hashLong[ZSTD_hashPtr(base + indexToInsert, hBitsL, 8)] = indexToInsert;
-				hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base);
-				hashSmall[ZSTD_hashPtr(base + indexToInsert, hBitsS, mls)] = indexToInsert;
-				hashSmall[ZSTD_hashPtr(ip - 1, hBitsS, mls)] = (U32)(ip - 1 - base);
-			}
-
-			/* check immediate repcode */
-			while(ip <= ilimit) {
-				U32 const current2 = (U32)(ip - base);
-				U32 const repIndex2 = current2 - offset_2;
-				const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? dictBase + repIndex2 - dictIndexDelta : base + repIndex2;
-				if(((U32)((prefixLowestIndex - 1) - (U32)repIndex2) >= 3 /* intentional overflow */) && (MEM_read32(repMatch2) == MEM_read32(ip))) {
-					const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
-					size_t const repLength2 = ZSTD_count_2segments(ip + 4, repMatch2 + 4, iend, repEnd2, prefixLowest) + 4;
-					U32 tmpOffset = offset_2;
-					offset_2 = offset_1;
-					offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
-					ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
-					hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
-					hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
-					ip += repLength2;
-					anchor = ip;
-					continue;
-				}
-				break;
-			}
-		}
-	} /* while (ip < ilimit) */
-
-	/* save reps for next block */
-	rep[0] = offset_1;
-	rep[1] = offset_2;
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-}
-
-#define ZSTD_GEN_DFAST_FN(dictMode, mls)                                                                 \
-	static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls(                                      \
-	    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                              \
-	    void const * src, size_t srcSize) {                                                              \
-		return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
-	}
-
-ZSTD_GEN_DFAST_FN(noDict, 4)
-ZSTD_GEN_DFAST_FN(noDict, 5)
-ZSTD_GEN_DFAST_FN(noDict, 6)
-ZSTD_GEN_DFAST_FN(noDict, 7)
-
-ZSTD_GEN_DFAST_FN(dictMatchState, 4)
-ZSTD_GEN_DFAST_FN(dictMatchState, 5)
-ZSTD_GEN_DFAST_FN(dictMatchState, 6)
-ZSTD_GEN_DFAST_FN(dictMatchState, 7)
-
-size_t ZSTD_compressBlock_doubleFast(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	const U32 mls = ms->cParams.minMatch;
-	switch(mls) {
-	default: /* includes case 3 */
-	case 4:
-		return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize);
-	case 5:
-		return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize);
-	case 6:
-		return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize);
-	case 7:
-		return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize);
-	}
-}
-
-size_t ZSTD_compressBlock_doubleFast_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	const U32 mls = ms->cParams.minMatch;
-	switch(mls) {
-	default: /* includes case 3 */
-	case 4:
-		return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
-	case 5:
-		return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
-	case 6:
-		return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
-	case 7:
-		return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
-	}
-}
-
-static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize,
-    U32 const mls /* template */) {
-	ZSTD_compressionParameters const * cParams = &ms->cParams;
-	U32* const hashLong = ms->hashTable;
-	U32 const hBitsL = cParams->hashLog;
-	U32* const hashSmall = ms->chainTable;
-	U32 const hBitsS = cParams->chainLog;
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* ip = istart;
-	const BYTE* anchor = istart;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = iend - 8;
-	const BYTE* const base = ms->window.base;
-	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
-	const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
-	const U32 dictStartIndex = lowLimit;
-	const U32 dictLimit = ms->window.dictLimit;
-	const U32 prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit;
-	const BYTE* const prefixStart = base + prefixStartIndex;
-	const BYTE* const dictBase = ms->window.dictBase;
-	const BYTE* const dictStart = dictBase + dictStartIndex;
-	const BYTE* const dictEnd = dictBase + prefixStartIndex;
-	U32 offset_1 = rep[0], offset_2 = rep[1];
-
-	DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
-
-	/* if extDict is invalidated due to maxDistance, switch to "regular" variant */
-	if(prefixStartIndex == dictStartIndex)
-		return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize);
-
-	/* Search Loop */
-	while(ip < ilimit) { /* < instead of <=, because (ip+1) */
-		const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
-		const U32 matchIndex = hashSmall[hSmall];
-		const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
-		const BYTE* match = matchBase + matchIndex;
-
-		const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
-		const U32 matchLongIndex = hashLong[hLong];
-		const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base;
-		const BYTE* matchLong = matchLongBase + matchLongIndex;
-
-		const U32 curr = (U32)(ip - base);
-		const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */
-		const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
-		const BYTE* const repMatch = repBase + repIndex;
-		size_t mLength;
-		hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */
-
-		if((((U32)((prefixStartIndex - 1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
-		    & (offset_1 <= curr + 1 - dictStartIndex))      /* note: we are searching at curr+1 */
-		   && (MEM_read32(repMatch) == MEM_read32(ip + 1))) {
-			const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-			mLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
-			ip++;
-			ZSTD_storeSeq(seqStore, (size_t)(ip - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
-		} else {
-			if((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
-				const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
-				const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart;
-				U32 offset;
-				mLength = ZSTD_count_2segments(ip + 8, matchLong + 8, iend, matchEnd, prefixStart) + 8;
-				offset = curr - matchLongIndex;
-				while(((ip > anchor) & (matchLong > lowMatchPtr)) && (ip[-1] == matchLong[-1])) {
-					ip--;
-					matchLong--;
-					mLength++;
-				} /* catch up */
-				offset_2 = offset_1;
-				offset_1 = offset;
-				ZSTD_storeSeq(seqStore, (size_t)(ip - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
-
-			} else if((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
-				size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8);
-				U32 const matchIndex3 = hashLong[h3];
-				const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base;
-				const BYTE* match3 = match3Base + matchIndex3;
-				U32 offset;
-				hashLong[h3] = curr + 1;
-				if((matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip + 1))) {
-					const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend;
-					const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart;
-					mLength = ZSTD_count_2segments(ip + 9, match3 + 8, iend, matchEnd, prefixStart) + 8;
-					ip++;
-					offset = curr + 1 - matchIndex3;
-					while(((ip > anchor) & (match3 > lowMatchPtr)) && (ip[-1] == match3[-1])) {
-						ip--;
-						match3--;
-						mLength++;
-					} /* catch up */
-				} else {
-					const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
-					const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
-					mLength = ZSTD_count_2segments(ip + 4, match + 4, iend, matchEnd, prefixStart) + 4;
-					offset = curr - matchIndex;
-					while(((ip > anchor) & (match > lowMatchPtr)) && (ip[-1] == match[-1])) {
-						ip--;
-						match--;
-						mLength++;
-					} /* catch up */
-				}
-				offset_2 = offset_1;
-				offset_1 = offset;
-				ZSTD_storeSeq(seqStore, (size_t)(ip - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
-
-			} else {
-				ip += ((ip - anchor) >> kSearchStrength) + 1;
-				continue;
-			}
-		}
-
-		/* move to next sequence start */
-		ip += mLength;
-		anchor = ip;
-
-		if(ip <= ilimit) {
-			/* Complementary insertion */
-			/* done after iLimit test, as candidates could be > iend-8 */
-			{
-				U32 const indexToInsert = curr + 2;
-				hashLong[ZSTD_hashPtr(base + indexToInsert, hBitsL, 8)] = indexToInsert;
-				hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base);
-				hashSmall[ZSTD_hashPtr(base + indexToInsert, hBitsS, mls)] = indexToInsert;
-				hashSmall[ZSTD_hashPtr(ip - 1, hBitsS, mls)] = (U32)(ip - 1 - base);
-			}
-
-			/* check immediate repcode */
-			while(ip <= ilimit) {
-				U32 const current2 = (U32)(ip - base);
-				U32 const repIndex2 = current2 - offset_2;
-				const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
-				if((((U32)((prefixStartIndex - 1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
-				    & (offset_2 <= current2 - dictStartIndex)) &&
-				   (MEM_read32(repMatch2) == MEM_read32(ip))) {
-					const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-					size_t const repLength2 = ZSTD_count_2segments(ip + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4;
-					U32 const tmpOffset = offset_2;
-					offset_2 = offset_1;
-					offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
-					ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
-					hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
-					hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
-					ip += repLength2;
-					anchor = ip;
-					continue;
-				}
-				break;
-			}
-		}
-	}
-
-	/* save reps for next block */
-	rep[0] = offset_1;
-	rep[1] = offset_2;
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-}
-
-ZSTD_GEN_DFAST_FN(extDict, 4)
-ZSTD_GEN_DFAST_FN(extDict, 5)
-ZSTD_GEN_DFAST_FN(extDict, 6)
-ZSTD_GEN_DFAST_FN(extDict, 7)
-
-size_t ZSTD_compressBlock_doubleFast_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	U32 const mls = ms->cParams.minMatch;
-	switch(mls) {
-	default: /* includes case 3 */
-	case 4:
-		return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize);
-	case 5:
-		return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize);
-	case 6:
-		return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize);
-	case 7:
-		return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
-	}
-}
diff --git a/src/zstd/zstd_double_fast.h b/src/zstd/zstd_double_fast.h
deleted file mode 100644
index 32c680dbe..000000000
--- a/src/zstd/zstd_double_fast.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_DOUBLE_FAST_H
-#define ZSTD_DOUBLE_FAST_H
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#include "mem.h"                    /* U32 */
-#include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */
-
-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              void const * end, ZSTD_dictTableLoadMethod_e dtlm,
-                              ZSTD_tableFillPurpose_e tfp);
-size_t ZSTD_compressBlock_doubleFast(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-size_t ZSTD_compressBlock_doubleFast_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-size_t ZSTD_compressBlock_doubleFast_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* ZSTD_DOUBLE_FAST_H */
diff --git a/src/zstd/zstd_errors.h b/src/zstd/zstd_errors.h
index a691edcd6..dc75eeeba 100644
--- a/src/zstd/zstd_errors.h
+++ b/src/zstd/zstd_errors.h
@@ -11,39 +11,40 @@
 #ifndef ZSTD_ERRORS_H_398273423
 #define ZSTD_ERRORS_H_398273423
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 extern "C" {
 #endif
 
 /*===== dependency =====*/
-#include <stddef.h> /* size_t */
+#include <stddef.h>   /* size_t */
+
 
 /* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
 #ifndef ZSTDERRORLIB_VISIBLE
-/* Backwards compatibility with old macro name */
-#ifdef ZSTDERRORLIB_VISIBILITY
-#define ZSTDERRORLIB_VISIBLE ZSTDERRORLIB_VISIBILITY
-#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
-#define ZSTDERRORLIB_VISIBLE __attribute__((visibility("default")))
-#else
-#define ZSTDERRORLIB_VISIBLE
-#endif
+   /* Backwards compatibility with old macro name */
+#  ifdef ZSTDERRORLIB_VISIBILITY
+#    define ZSTDERRORLIB_VISIBLE ZSTDERRORLIB_VISIBILITY
+#  elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZSTDERRORLIB_VISIBLE __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDERRORLIB_VISIBLE
+#  endif
 #endif
 
 #ifndef ZSTDERRORLIB_HIDDEN
-#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
-#define ZSTDERRORLIB_HIDDEN __attribute__((visibility("hidden")))
-#else
-#define ZSTDERRORLIB_HIDDEN
-#endif
+#  if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+#  else
+#    define ZSTDERRORLIB_HIDDEN
+#  endif
 #endif
 
-#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT == 1)
-#define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBLE
-#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT == 1)
-#define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBLE
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
 #else
-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE
+#  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE
 #endif
 
 /*-*********************************************
@@ -61,51 +62,52 @@ extern "C" {
  *  note 3 : ZSTD_isError() is always correct, whatever the library version.
  **********************************************/
 typedef enum {
-	ZSTD_error_no_error = 0,
-	ZSTD_error_GENERIC = 1,
-	ZSTD_error_prefix_unknown = 10,
-	ZSTD_error_version_unsupported = 12,
-	ZSTD_error_frameParameter_unsupported = 14,
-	ZSTD_error_frameParameter_windowTooLarge = 16,
-	ZSTD_error_corruption_detected = 20,
-	ZSTD_error_checksum_wrong = 22,
-	ZSTD_error_literals_headerWrong = 24,
-	ZSTD_error_dictionary_corrupted = 30,
-	ZSTD_error_dictionary_wrong = 32,
-	ZSTD_error_dictionaryCreation_failed = 34,
-	ZSTD_error_parameter_unsupported = 40,
-	ZSTD_error_parameter_combination_unsupported = 41,
-	ZSTD_error_parameter_outOfBound = 42,
-	ZSTD_error_tableLog_tooLarge = 44,
-	ZSTD_error_maxSymbolValue_tooLarge = 46,
-	ZSTD_error_maxSymbolValue_tooSmall = 48,
-	ZSTD_error_stabilityCondition_notRespected = 50,
-	ZSTD_error_stage_wrong = 60,
-	ZSTD_error_init_missing = 62,
-	ZSTD_error_memory_allocation = 64,
-	ZSTD_error_workSpace_tooSmall = 66,
-	ZSTD_error_dstSize_tooSmall = 70,
-	ZSTD_error_srcSize_wrong = 72,
-	ZSTD_error_dstBuffer_null = 74,
-	ZSTD_error_noForwardProgress_destFull = 80,
-	ZSTD_error_noForwardProgress_inputEmpty = 82,
-	/* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
-	ZSTD_error_frameIndex_tooLarge = 100,
-	ZSTD_error_seekableIO = 102,
-	ZSTD_error_dstBuffer_wrong = 104,
-	ZSTD_error_srcBuffer_wrong = 105,
-	ZSTD_error_sequenceProducer_failed = 106,
-	ZSTD_error_externalSequences_invalid = 107,
-	ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+  ZSTD_error_no_error = 0,
+  ZSTD_error_GENERIC  = 1,
+  ZSTD_error_prefix_unknown                = 10,
+  ZSTD_error_version_unsupported           = 12,
+  ZSTD_error_frameParameter_unsupported    = 14,
+  ZSTD_error_frameParameter_windowTooLarge = 16,
+  ZSTD_error_corruption_detected = 20,
+  ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_literals_headerWrong = 24,
+  ZSTD_error_dictionary_corrupted      = 30,
+  ZSTD_error_dictionary_wrong          = 32,
+  ZSTD_error_dictionaryCreation_failed = 34,
+  ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_combination_unsupported = 41,
+  ZSTD_error_parameter_outOfBound    = 42,
+  ZSTD_error_tableLog_tooLarge       = 44,
+  ZSTD_error_maxSymbolValue_tooLarge = 46,
+  ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stabilityCondition_notRespected = 50,
+  ZSTD_error_stage_wrong       = 60,
+  ZSTD_error_init_missing      = 62,
+  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall= 66,
+  ZSTD_error_dstSize_tooSmall = 70,
+  ZSTD_error_srcSize_wrong    = 72,
+  ZSTD_error_dstBuffer_null   = 74,
+  ZSTD_error_noForwardProgress_destFull = 80,
+  ZSTD_error_noForwardProgress_inputEmpty = 82,
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+  ZSTD_error_frameIndex_tooLarge = 100,
+  ZSTD_error_seekableIO          = 102,
+  ZSTD_error_dstBuffer_wrong     = 104,
+  ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_sequenceProducer_failed = 106,
+  ZSTD_error_externalSequences_invalid = 107,
+  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
 } ZSTD_ErrorCode;
 
 /*! ZSTD_getErrorCode() :
     convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
     which can be used to compare with enum list published above */
 ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
-ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);   /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
 
diff --git a/src/zstd/zstd_fast.c b/src/zstd/zstd_fast.c
deleted file mode 100644
index da8afa11d..000000000
--- a/src/zstd/zstd_fast.c
+++ /dev/null
@@ -1,968 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
-#include "zstd_fast.h"
-
-static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
-                                       const void* const end,
-                                       ZSTD_dictTableLoadMethod_e dtlm) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
-	U32 const mls = cParams->minMatch;
-	const BYTE* const base = ms->window.base;
-	const BYTE* ip = base + ms->nextToUpdate;
-	const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
-	const U32 fastHashFillStep = 3;
-
-	/* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
-	 * Feel free to remove this assert if there's a good reason! */
-	assert(dtlm == ZSTD_dtlm_full);
-
-	/* Always insert every fastHashFillStep position into the hash table.
-	 * Insert the other positions if their hash entry is empty.
-	 */
-	for(; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
-		U32 const curr = (U32)(ip - base);
-		{
-			size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
-			ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);
-		}
-
-		if(dtlm == ZSTD_dtlm_fast)
-			continue;
-		/* Only load extra positions for ZSTD_dtlm_full */
-		{
-			U32 p;
-			for(p = 1; p < fastHashFillStep; ++p) {
-				size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
-				if(hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */
-					ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
-				}
-			}
-		}
-	}
-}
-
-static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
-                                      const void* const end,
-                                      ZSTD_dictTableLoadMethod_e dtlm) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hBits = cParams->hashLog;
-	U32 const mls = cParams->minMatch;
-	const BYTE* const base = ms->window.base;
-	const BYTE* ip = base + ms->nextToUpdate;
-	const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
-	const U32 fastHashFillStep = 3;
-
-	/* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
-	 * Feel free to remove this assert if there's a good reason! */
-	assert(dtlm == ZSTD_dtlm_fast);
-
-	/* Always insert every fastHashFillStep position into the hash table.
-	 * Insert the other positions if their hash entry is empty.
-	 */
-	for(; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
-		U32 const curr = (U32)(ip - base);
-		size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
-		hashTable[hash0] = curr;
-		if(dtlm == ZSTD_dtlm_fast)
-			continue;
-		/* Only load extra positions for ZSTD_dtlm_full */
-		{
-			U32 p;
-			for(p = 1; p < fastHashFillStep; ++p) {
-				size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
-				if(hashTable[hash] == 0) { /* not yet filled */
-					hashTable[hash] = curr + p;
-				}
-			}
-		}
-	}
-}
-
-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        const void* const end,
-                        ZSTD_dictTableLoadMethod_e dtlm,
-                        ZSTD_tableFillPurpose_e tfp) {
-	if(tfp == ZSTD_tfp_forCDict) {
-		ZSTD_fillHashTableForCDict(ms, end, dtlm);
-	} else {
-		ZSTD_fillHashTableForCCtx(ms, end, dtlm);
-	}
-}
-
-/**
- * If you squint hard enough (and ignore repcodes), the search operation at any
- * given position is broken into 4 stages:
- *
- * 1. Hash   (map position to hash value via input read)
- * 2. Lookup (map hash val to index via hashtable read)
- * 3. Load   (map index to value at that position via input read)
- * 4. Compare
- *
- * Each of these steps involves a memory read at an address which is computed
- * from the previous step. This means these steps must be sequenced and their
- * latencies are cumulative.
- *
- * Rather than do 1->2->3->4 sequentially for a single position before moving
- * onto the next, this implementation interleaves these operations across the
- * next few positions:
- *
- * R = Repcode Read & Compare
- * H = Hash
- * T = Table Lookup
- * M = Match Read & Compare
- *
- * Pos | Time -->
- * ----+-------------------
- * N   | ... M
- * N+1 | ...   TM
- * N+2 |    R H   T M
- * N+3 |         H    TM
- * N+4 |           R H   T M
- * N+5 |                H   ...
- * N+6 |                  R ...
- *
- * This is very much analogous to the pipelining of execution in a CPU. And just
- * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
- * branch).
- *
- * When this happens, we throw away our current state, and do the following prep
- * to re-enter the loop:
- *
- * Pos | Time -->
- * ----+-------------------
- * N   | H T
- * N+1 |  H
- *
- * This is also the work we do at the beginning to enter the loop initially.
- */
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_fast_noDict_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize,
-    U32 const mls, U32 const hasStep) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hlog = cParams->hashLog;
-	/* support stepSize of 0 */
-	size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const istart = (const BYTE*)src;
-	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
-	const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
-	const BYTE* const prefixStart = base + prefixStartIndex;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = iend - HASH_READ_SIZE;
-
-	const BYTE* anchor = istart;
-	const BYTE* ip0 = istart;
-	const BYTE* ip1;
-	const BYTE* ip2;
-	const BYTE* ip3;
-	U32 current0;
-
-	U32 rep_offset1 = rep[0];
-	U32 rep_offset2 = rep[1];
-	U32 offsetSaved1 = 0, offsetSaved2 = 0;
-
-	size_t hash0; /* hash for ip0 */
-	size_t hash1; /* hash for ip1 */
-	U32 idx;      /* match idx for ip0 */
-	U32 mval;     /* src value at match idx */
-
-	U32 offcode;
-	const BYTE* match0;
-	size_t mLength;
-
-	/* ip0 and ip1 are always adjacent. The targetLength skipping and
-	 * uncompressibility acceleration is applied to every other position,
-	 * matching the behavior of #1562. step therefore represents the gap
-	 * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
-	size_t step;
-	const BYTE* nextStep;
-	const size_t kStepIncr = (1 << (kSearchStrength - 1));
-
-	DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
-	ip0 += (ip0 == prefixStart);
-	{
-		U32 const curr = (U32)(ip0 - base);
-		U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
-		U32 const maxRep = curr - windowLow;
-		if(rep_offset2 > maxRep)
-			offsetSaved2 = rep_offset2, rep_offset2 = 0;
-		if(rep_offset1 > maxRep)
-			offsetSaved1 = rep_offset1, rep_offset1 = 0;
-	}
-
-	/* start each op */
-_start: /* Requires: ip0 */
-
-	step = stepSize;
-	nextStep = ip0 + kStepIncr;
-
-	/* calculate positions, ip0 - anchor == 0, so we skip step calc */
-	ip1 = ip0 + 1;
-	ip2 = ip0 + step;
-	ip3 = ip2 + 1;
-
-	if(ip3 >= ilimit) {
-		goto _cleanup;
-	}
-
-	hash0 = ZSTD_hashPtr(ip0, hlog, mls);
-	hash1 = ZSTD_hashPtr(ip1, hlog, mls);
-
-	idx = hashTable[hash0];
-
-	do {
-		/* load repcode match for ip[2]*/
-		const U32 rval = MEM_read32(ip2 - rep_offset1);
-
-		/* write back hash table entry */
-		current0 = (U32)(ip0 - base);
-		hashTable[hash0] = current0;
-
-		/* check repcode at ip[2] */
-		if((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
-			ip0 = ip2;
-			match0 = ip0 - rep_offset1;
-			mLength = ip0[-1] == match0[-1];
-			ip0 -= mLength;
-			match0 -= mLength;
-			offcode = REPCODE1_TO_OFFBASE;
-			mLength += 4;
-
-			/* First write next hash table entry; we've already calculated it.
-			 * This write is known to be safe because the ip1 is before the
-			 * repcode (ip2). */
-			hashTable[hash1] = (U32)(ip1 - base);
-
-			goto _match;
-		}
-
-		/* load match for ip[0] */
-		if(idx >= prefixStartIndex) {
-			mval = MEM_read32(base + idx);
-		} else {
-			mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
-		}
-
-		/* check match at ip[0] */
-		if(MEM_read32(ip0) == mval) {
-			/* found a match! */
-
-			/* First write next hash table entry; we've already calculated it.
-			 * This write is known to be safe because the ip1 == ip0 + 1, so
-			 * we know we will resume searching after ip1 */
-			hashTable[hash1] = (U32)(ip1 - base);
-
-			goto _offset;
-		}
-
-		/* lookup ip[1] */
-		idx = hashTable[hash1];
-
-		/* hash ip[2] */
-		hash0 = hash1;
-		hash1 = ZSTD_hashPtr(ip2, hlog, mls);
-
-		/* advance to next positions */
-		ip0 = ip1;
-		ip1 = ip2;
-		ip2 = ip3;
-
-		/* write back hash table entry */
-		current0 = (U32)(ip0 - base);
-		hashTable[hash0] = current0;
-
-		/* load match for ip[0] */
-		if(idx >= prefixStartIndex) {
-			mval = MEM_read32(base + idx);
-		} else {
-			mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
-		}
-
-		/* check match at ip[0] */
-		if(MEM_read32(ip0) == mval) {
-			/* found a match! */
-
-			/* first write next hash table entry; we've already calculated it */
-			if(step <= 4) {
-				/* We need to avoid writing an index into the hash table >= the
-				 * position at which we will pick up our searching after we've
-				 * taken this match.
-				 *
-				 * The minimum possible match has length 4, so the earliest ip0
-				 * can be after we take this match will be the current ip0 + 4.
-				 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
-				 * write this position.
-				 */
-				hashTable[hash1] = (U32)(ip1 - base);
-			}
-
-			goto _offset;
-		}
-
-		/* lookup ip[1] */
-		idx = hashTable[hash1];
-
-		/* hash ip[2] */
-		hash0 = hash1;
-		hash1 = ZSTD_hashPtr(ip2, hlog, mls);
-
-		/* advance to next positions */
-		ip0 = ip1;
-		ip1 = ip2;
-		ip2 = ip0 + step;
-		ip3 = ip1 + step;
-
-		/* calculate step */
-		if(ip2 >= nextStep) {
-			step++;
-			PREFETCH_L1(ip1 + 64);
-			PREFETCH_L1(ip1 + 128);
-			nextStep += kStepIncr;
-		}
-	} while(ip3 < ilimit);
-
-_cleanup:
-	/* Note that there are probably still a couple positions we could search.
-	 * However, it seems to be a meaningful performance hit to try to search
-	 * them. So let's not. */
-
-	/* When the repcodes are outside of the prefix, we set them to zero before the loop.
-	 * When the offsets are still zero, we need to restore them after the block to have a correct
-	 * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
-	 * offsets were invalid. We need to figure out which offset to refill with.
-	 *     - If both offsets are zero they are in the same order.
-	 *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
-	 *     - If only one is zero, we need to decide which offset to restore.
-	 *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
-	 *         - It is impossible for rep_offset2 to be non-zero.
-	 *
-	 * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
-	 * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
-	 */
-	offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
-
-	/* save reps for next block */
-	rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
-	rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-
-_offset: /* Requires: ip0, idx */
-
-	/* Compute the offset code. */
-	match0 = base + idx;
-	rep_offset2 = rep_offset1;
-	rep_offset1 = (U32)(ip0 - match0);
-	offcode = OFFSET_TO_OFFBASE(rep_offset1);
-	mLength = 4;
-
-	/* Count the backwards match length. */
-	while(((ip0 > anchor) & (match0 > prefixStart)) && (ip0[-1] == match0[-1])) {
-		ip0--;
-		match0--;
-		mLength++;
-	}
-
-_match: /* Requires: ip0, match0, offcode */
-
-	/* Count the forward length. */
-	mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
-
-	ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
-
-	ip0 += mLength;
-	anchor = ip0;
-
-	/* Fill table and check for immediate repcode. */
-	if(ip0 <= ilimit) {
-		/* Fill Table */
-		assert(base + current0 + 2 > istart);                                   /* check base overflow */
-		hashTable[ZSTD_hashPtr(base + current0 + 2, hlog, mls)] = current0 + 2; /* here because current+2 could be > iend-8 */
-		hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32)(ip0 - 2 - base);
-
-		if(rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
-			while((ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2))) {
-				/* store sequence */
-				size_t const rLength = ZSTD_count(ip0 + 4, ip0 + 4 - rep_offset2, iend) + 4;
-				{
-					U32 const tmpOff = rep_offset2;
-					rep_offset2 = rep_offset1;
-					rep_offset1 = tmpOff;
-				} /* swap rep_offset2 <=> rep_offset1 */
-				hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0 - base);
-				ip0 += rLength;
-				ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
-				anchor = ip0;
-				continue; /* faster when present (confirmed on gcc-8) ... (?) */
-			}
-		}
-	}
-
-	goto _start;
-}
-
-#define ZSTD_GEN_FAST_FN(dictMode, mls, step)                                                            \
-	static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step(                                   \
-	    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                              \
-	    void const * src, size_t srcSize) {                                                              \
-		return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
-	}
-
-ZSTD_GEN_FAST_FN(noDict, 4, 1)
-ZSTD_GEN_FAST_FN(noDict, 5, 1)
-ZSTD_GEN_FAST_FN(noDict, 6, 1)
-ZSTD_GEN_FAST_FN(noDict, 7, 1)
-
-ZSTD_GEN_FAST_FN(noDict, 4, 0)
-ZSTD_GEN_FAST_FN(noDict, 5, 0)
-ZSTD_GEN_FAST_FN(noDict, 6, 0)
-ZSTD_GEN_FAST_FN(noDict, 7, 0)
-
-size_t ZSTD_compressBlock_fast(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	U32 const mls = ms->cParams.minMatch;
-	assert(ms->dictMatchState == NULL);
-	if(ms->cParams.targetLength > 1) {
-		switch(mls) {
-		default: /* includes case 3 */
-		case 4:
-			return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
-		case 5:
-			return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
-		case 6:
-			return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
-		case 7:
-			return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
-		}
-	} else {
-		switch(mls) {
-		default: /* includes case 3 */
-		case 4:
-			return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
-		case 5:
-			return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
-		case 6:
-			return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
-		case 7:
-			return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
-		}
-	}
-}
-
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_fast_dictMatchState_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize, U32 const mls, U32 const hasStep) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hlog = cParams->hashLog;
-	/* support stepSize of 0 */
-	U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
-	const BYTE* const base = ms->window.base;
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* ip0 = istart;
-	const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
-	const BYTE* anchor = istart;
-	const U32 prefixStartIndex = ms->window.dictLimit;
-	const BYTE* const prefixStart = base + prefixStartIndex;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = iend - HASH_READ_SIZE;
-	U32 offset_1 = rep[0], offset_2 = rep[1];
-
-	const ZSTD_matchState_t* const dms = ms->dictMatchState;
-	const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
-	const U32* const dictHashTable = dms->hashTable;
-	const U32 dictStartIndex = dms->window.dictLimit;
-	const BYTE* const dictBase = dms->window.base;
-	const BYTE* const dictStart = dictBase + dictStartIndex;
-	const BYTE* const dictEnd = dms->window.nextSrc;
-	const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
-	const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart);
-	const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
-
-	/* if a dictionary is still attached, it necessarily means that
-	 * it is within window size. So we just check it. */
-	const U32 maxDistance = 1U << cParams->windowLog;
-	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
-	assert(endIndex - prefixStartIndex <= maxDistance);
-	(void)maxDistance;
-	(void)endIndex; /* these variables are not used when assert() is disabled */
-
-	(void)hasStep; /* not currently specialized on whether it's accelerated */
-
-	/* ensure there will be no underflow
-	 * when translating a dict index into a local index */
-	assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
-
-	if(ms->prefetchCDictTables) {
-		size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
-		PREFETCH_AREA(dictHashTable, hashTableBytes)
-	}
-
-	/* init */
-	DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
-	ip0 += (dictAndPrefixLength == 0);
-	/* dictMatchState repCode checks don't currently handle repCode == 0
-	 * disabling. */
-	assert(offset_1 <= dictAndPrefixLength);
-	assert(offset_2 <= dictAndPrefixLength);
-
-	/* Outer search loop */
-	assert(stepSize >= 1);
-	while(ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
-		size_t mLength;
-		size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
-
-		size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
-		U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
-		int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
-
-		U32 matchIndex = hashTable[hash0];
-		U32 curr = (U32)(ip0 - base);
-		size_t step = stepSize;
-		const size_t kStepIncr = 1 << kSearchStrength;
-		const BYTE* nextStep = ip0 + kStepIncr;
-
-		/* Inner search loop */
-		while(1) {
-			const BYTE* match = base + matchIndex;
-			const U32 repIndex = curr + 1 - offset_1;
-			const BYTE* repMatch = (repIndex < prefixStartIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex;
-			const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
-			size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
-			hashTable[hash0] = curr; /* update hash table */
-
-			if(((U32)((prefixStartIndex - 1) - repIndex) >=
-			    3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
-			   && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
-				const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-				mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
-				ip0++;
-				ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
-				break;
-			}
-
-			if(dictTagsMatch) {
-				/* Found a possible dict match */
-				const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
-				const BYTE* dictMatch = dictBase + dictMatchIndex;
-				if(dictMatchIndex > dictStartIndex &&
-				   MEM_read32(dictMatch) == MEM_read32(ip0)) {
-					/* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
-					if(matchIndex <= prefixStartIndex) {
-						U32 const offset = (U32)(curr - dictMatchIndex - dictIndexDelta);
-						mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
-						while(((ip0 > anchor) & (dictMatch > dictStart)) && (ip0[-1] == dictMatch[-1])) {
-							ip0--;
-							dictMatch--;
-							mLength++;
-						} /* catch up */
-						offset_2 = offset_1;
-						offset_1 = offset;
-						ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
-						break;
-					}
-				}
-			}
-
-			if(matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
-				/* found a regular match */
-				U32 const offset = (U32)(ip0 - match);
-				mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
-				while(((ip0 > anchor) & (match > prefixStart)) && (ip0[-1] == match[-1])) {
-					ip0--;
-					match--;
-					mLength++;
-				} /* catch up */
-				offset_2 = offset_1;
-				offset_1 = offset;
-				ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
-				break;
-			}
-
-			/* Prepare for next iteration */
-			dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
-			dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
-			matchIndex = hashTable[hash1];
-
-			if(ip1 >= nextStep) {
-				step++;
-				nextStep += kStepIncr;
-			}
-			ip0 = ip1;
-			ip1 = ip1 + step;
-			if(ip1 > ilimit)
-				goto _cleanup;
-
-			curr = (U32)(ip0 - base);
-			hash0 = hash1;
-		} /* end inner search loop */
-
-		/* match found */
-		assert(mLength);
-		ip0 += mLength;
-		anchor = ip0;
-
-		if(ip0 <= ilimit) {
-			/* Fill Table */
-			assert(base + curr + 2 > istart);                               /* check base overflow */
-			hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */
-			hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32)(ip0 - 2 - base);
-
-			/* check immediate repcode */
-			while(ip0 <= ilimit) {
-				U32 const current2 = (U32)(ip0 - base);
-				U32 const repIndex2 = current2 - offset_2;
-				const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase - dictIndexDelta + repIndex2 : base + repIndex2;
-				if(((U32)((prefixStartIndex - 1) - (U32)repIndex2) >= 3 /* intentional overflow */) && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
-					const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-					size_t const repLength2 = ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4;
-					U32 tmpOffset = offset_2;
-					offset_2 = offset_1;
-					offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
-					ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
-					hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
-					ip0 += repLength2;
-					anchor = ip0;
-					continue;
-				}
-				break;
-			}
-		}
-
-		/* Prepare for next iteration */
-		assert(ip0 == anchor);
-		ip1 = ip0 + stepSize;
-	}
-
-_cleanup:
-	/* save reps for next block */
-	rep[0] = offset_1;
-	rep[1] = offset_2;
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-}
-
-ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
-ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
-ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
-ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
-
-size_t ZSTD_compressBlock_fast_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	U32 const mls = ms->cParams.minMatch;
-	assert(ms->dictMatchState != NULL);
-	switch(mls) {
-	default: /* includes case 3 */
-	case 4:
-		return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
-	case 5:
-		return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
-	case 6:
-		return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
-	case 7:
-		return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
-	}
-}
-
-static size_t ZSTD_compressBlock_fast_extDict_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize, U32 const mls, U32 const hasStep) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hlog = cParams->hashLog;
-	/* support stepSize of 0 */
-	size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const dictBase = ms->window.dictBase;
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* anchor = istart;
-	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
-	const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
-	const U32 dictStartIndex = lowLimit;
-	const BYTE* const dictStart = dictBase + dictStartIndex;
-	const U32 dictLimit = ms->window.dictLimit;
-	const U32 prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
-	const BYTE* const prefixStart = base + prefixStartIndex;
-	const BYTE* const dictEnd = dictBase + prefixStartIndex;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = iend - 8;
-	U32 offset_1 = rep[0], offset_2 = rep[1];
-	U32 offsetSaved1 = 0, offsetSaved2 = 0;
-
-	const BYTE* ip0 = istart;
-	const BYTE* ip1;
-	const BYTE* ip2;
-	const BYTE* ip3;
-	U32 current0;
-
-	size_t hash0;        /* hash for ip0 */
-	size_t hash1;        /* hash for ip1 */
-	U32 idx;             /* match idx for ip0 */
-	const BYTE* idxBase; /* base pointer for idx */
-
-	U32 offcode;
-	const BYTE* match0;
-	size_t mLength;
-	const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
-
-	size_t step;
-	const BYTE* nextStep;
-	const size_t kStepIncr = (1 << (kSearchStrength - 1));
-
-	(void)hasStep; /* not currently specialized on whether it's accelerated */
-
-	DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
-
-	/* switch to "regular" variant if extDict is invalidated due to maxDistance */
-	if(prefixStartIndex == dictStartIndex)
-		return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
-
-	{
-		U32 const curr = (U32)(ip0 - base);
-		U32 const maxRep = curr - dictStartIndex;
-		if(offset_2 >= maxRep)
-			offsetSaved2 = offset_2, offset_2 = 0;
-		if(offset_1 >= maxRep)
-			offsetSaved1 = offset_1, offset_1 = 0;
-	}
-
-	/* start each op */
-_start: /* Requires: ip0 */
-
-	step = stepSize;
-	nextStep = ip0 + kStepIncr;
-
-	/* calculate positions, ip0 - anchor == 0, so we skip step calc */
-	ip1 = ip0 + 1;
-	ip2 = ip0 + step;
-	ip3 = ip2 + 1;
-
-	if(ip3 >= ilimit) {
-		goto _cleanup;
-	}
-
-	hash0 = ZSTD_hashPtr(ip0, hlog, mls);
-	hash1 = ZSTD_hashPtr(ip1, hlog, mls);
-
-	idx = hashTable[hash0];
-	idxBase = idx < prefixStartIndex ? dictBase : base;
-
-	do {
-		{ /* load repcode match for ip[2] */
-			U32 const current2 = (U32)(ip2 - base);
-			U32 const repIndex = current2 - offset_1;
-			const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
-			U32 rval;
-			if(((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
-			   & (offset_1 > 0)) {
-				rval = MEM_read32(repBase + repIndex);
-			} else {
-				rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
-			}
-
-			/* write back hash table entry */
-			current0 = (U32)(ip0 - base);
-			hashTable[hash0] = current0;
-
-			/* check repcode at ip[2] */
-			if(MEM_read32(ip2) == rval) {
-				ip0 = ip2;
-				match0 = repBase + repIndex;
-				matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-				assert((match0 != prefixStart) & (match0 != dictStart));
-				mLength = ip0[-1] == match0[-1];
-				ip0 -= mLength;
-				match0 -= mLength;
-				offcode = REPCODE1_TO_OFFBASE;
-				mLength += 4;
-				goto _match;
-			}
-		}
-
-		{                                                                                             /* load match for ip[0] */
-			U32 const mval = idx >= dictStartIndex ? MEM_read32(idxBase + idx) : MEM_read32(ip0) ^ 1; /* guaranteed not to match */
-
-			/* check match at ip[0] */
-			if(MEM_read32(ip0) == mval) {
-				/* found a match! */
-				goto _offset;
-			}
-		}
-
-		/* lookup ip[1] */
-		idx = hashTable[hash1];
-		idxBase = idx < prefixStartIndex ? dictBase : base;
-
-		/* hash ip[2] */
-		hash0 = hash1;
-		hash1 = ZSTD_hashPtr(ip2, hlog, mls);
-
-		/* advance to next positions */
-		ip0 = ip1;
-		ip1 = ip2;
-		ip2 = ip3;
-
-		/* write back hash table entry */
-		current0 = (U32)(ip0 - base);
-		hashTable[hash0] = current0;
-
-		{                                                                                             /* load match for ip[0] */
-			U32 const mval = idx >= dictStartIndex ? MEM_read32(idxBase + idx) : MEM_read32(ip0) ^ 1; /* guaranteed not to match */
-
-			/* check match at ip[0] */
-			if(MEM_read32(ip0) == mval) {
-				/* found a match! */
-				goto _offset;
-			}
-		}
-
-		/* lookup ip[1] */
-		idx = hashTable[hash1];
-		idxBase = idx < prefixStartIndex ? dictBase : base;
-
-		/* hash ip[2] */
-		hash0 = hash1;
-		hash1 = ZSTD_hashPtr(ip2, hlog, mls);
-
-		/* advance to next positions */
-		ip0 = ip1;
-		ip1 = ip2;
-		ip2 = ip0 + step;
-		ip3 = ip1 + step;
-
-		/* calculate step */
-		if(ip2 >= nextStep) {
-			step++;
-			PREFETCH_L1(ip1 + 64);
-			PREFETCH_L1(ip1 + 128);
-			nextStep += kStepIncr;
-		}
-	} while(ip3 < ilimit);
-
-_cleanup:
-	/* Note that there are probably still a couple positions we could search.
-	 * However, it seems to be a meaningful performance hit to try to search
-	 * them. So let's not. */
-
-	/* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
-	 * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
-	offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
-
-	/* save reps for next block */
-	rep[0] = offset_1 ? offset_1 : offsetSaved1;
-	rep[1] = offset_2 ? offset_2 : offsetSaved2;
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-
-_offset: /* Requires: ip0, idx, idxBase */
-
-	/* Compute the offset code. */
-	{
-		U32 const offset = current0 - idx;
-		const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
-		matchEnd = idx < prefixStartIndex ? dictEnd : iend;
-		match0 = idxBase + idx;
-		offset_2 = offset_1;
-		offset_1 = offset;
-		offcode = OFFSET_TO_OFFBASE(offset);
-		mLength = 4;
-
-		/* Count the backwards match length. */
-		while(((ip0 > anchor) & (match0 > lowMatchPtr)) && (ip0[-1] == match0[-1])) {
-			ip0--;
-			match0--;
-			mLength++;
-		}
-	}
-
-_match: /* Requires: ip0, match0, offcode, matchEnd */
-
-	/* Count the forward length. */
-	assert(matchEnd != 0);
-	mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
-
-	ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
-
-	ip0 += mLength;
-	anchor = ip0;
-
-	/* write next hash table entry */
-	if(ip1 < ip0) {
-		hashTable[hash1] = (U32)(ip1 - base);
-	}
-
-	/* Fill table and check for immediate repcode. */
-	if(ip0 <= ilimit) {
-		/* Fill Table */
-		assert(base + current0 + 2 > istart);                                   /* check base overflow */
-		hashTable[ZSTD_hashPtr(base + current0 + 2, hlog, mls)] = current0 + 2; /* here because current+2 could be > iend-8 */
-		hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32)(ip0 - 2 - base);
-
-		while(ip0 <= ilimit) {
-			U32 const repIndex2 = (U32)(ip0 - base) - offset_2;
-			const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
-			if((((U32)((prefixStartIndex - 1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */
-			   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
-				const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-				size_t const repLength2 = ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4;
-				{
-					U32 const tmpOffset = offset_2;
-					offset_2 = offset_1;
-					offset_1 = tmpOffset;
-				} /* swap offset_2 <=> offset_1 */
-				ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
-				hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0 - base);
-				ip0 += repLength2;
-				anchor = ip0;
-				continue;
-			}
-			break;
-		}
-	}
-
-	goto _start;
-}
-
-ZSTD_GEN_FAST_FN(extDict, 4, 0)
-ZSTD_GEN_FAST_FN(extDict, 5, 0)
-ZSTD_GEN_FAST_FN(extDict, 6, 0)
-ZSTD_GEN_FAST_FN(extDict, 7, 0)
-
-size_t ZSTD_compressBlock_fast_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	U32 const mls = ms->cParams.minMatch;
-	assert(ms->dictMatchState == NULL);
-	switch(mls) {
-	default: /* includes case 3 */
-	case 4:
-		return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
-	case 5:
-		return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
-	case 6:
-		return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
-	case 7:
-		return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
-	}
-}
diff --git a/src/zstd/zstd_internal.h b/src/zstd/zstd_internal.h
deleted file mode 100644
index ac3e29406..000000000
--- a/src/zstd/zstd_internal.h
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_CCOMMON_H_MODULE
-#define ZSTD_CCOMMON_H_MODULE
-
-/* this module contains definitions which must be identical
- * across compression, decompression and dictBuilder.
- * It also contains a few functions useful to at least 2 of them
- * and which benefit from being inlined */
-
-/*-*************************************
- *  Dependencies
- ***************************************/
-#include "compiler.h"
-#include "cpu.h"
-#include "mem.h"
-#include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
-#include "error_private.h"
-#define ZSTD_STATIC_LINKING_ONLY
-#include "zstd.h"
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-#include "huf.h"
-#ifndef XXH_STATIC_LINKING_ONLY
-#define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */
-#endif
-#include "xxhash.h" /* XXH_reset, update, digest */
-#ifndef ZSTD_NO_TRACE
-#include "zstd_trace.h"
-#else
-#define ZSTD_TRACE 0
-#endif
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* ---- static assert (debug) --- */
-#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
-#define ZSTD_isError ERR_isError /* for inlining */
-#define FSE_isError ERR_isError
-#define HUF_isError ERR_isError
-
-/*-*************************************
- *  shared macros
- ***************************************/
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define BOUNDED(min, val, max) (MAX(min, MIN(val, max)))
-
-/*-*************************************
- *  Common constants
- ***************************************/
-#define ZSTD_OPT_NUM (1 << 12)
-
-#define ZSTD_REP_NUM 3 /* number of repcodes */
-static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] = {1, 4, 8};
-
-#define KB *(1 << 10)
-#define MB *(1 << 20)
-#define GB *(1U << 30)
-
-#define BIT7 128
-#define BIT6 64
-#define BIT5 32
-#define BIT4 16
-#define BIT1 2
-#define BIT0 1
-
-#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
-static UNUSED_ATTR const size_t ZSTD_fcs_fieldSize[4] = {0, 2, 4, 8};
-static UNUSED_ATTR const size_t ZSTD_did_fieldSize[4] = {0, 1, 2, 4};
-
-#define ZSTD_FRAMEIDSIZE 4 /* magic number size */
-
-#define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
-static UNUSED_ATTR const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
-typedef enum { bt_raw,
-	           bt_rle,
-	           bt_compressed,
-	           bt_reserved } blockType_e;
-
-#define ZSTD_FRAMECHECKSUMSIZE 4
-
-#define MIN_SEQUENCES_SIZE 1                                  /* nbSeq==0 */
-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */
-#define MIN_LITERALS_FOR_4_STREAMS 6
-
-typedef enum { set_basic,
-	           set_rle,
-	           set_compressed,
-	           set_repeat } symbolEncodingType_e;
-
-#define LONGNBSEQ 0x7F00
-
-#define MINMATCH 3
-
-#define Litbits 8
-#define LitHufLog 11
-#define MaxLit ((1 << Litbits) - 1)
-#define MaxML 52
-#define MaxLL 35
-#define DefaultMaxOff 28
-#define MaxOff 31
-#define MaxSeq MAX(MaxLL, MaxML) /* Assumption : MaxOff < MaxLL,MaxML */
-#define MLFSELog 9
-#define LLFSELog 9
-#define OffFSELog 8
-#define MaxFSELog MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
-#define MaxMLBits 16
-#define MaxLLBits 16
-
-#define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
-/* Each table cannot take more than #symbols * FSELog bits */
-#define ZSTD_MAX_FSE_HEADERS_SIZE (((MaxML + 1) * MLFSELog + (MaxLL + 1) * LLFSELog + (MaxOff + 1) * OffFSELog + 7) / 8)
-
-static UNUSED_ATTR const U8 LL_bits[MaxLL + 1] = {
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 2, 2, 3, 3,
-    4, 6, 7, 8, 9, 10, 11, 12,
-    13, 14, 15, 16};
-static UNUSED_ATTR const S16 LL_defaultNorm[MaxLL + 1] = {
-    4, 3, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    2, 3, 2, 1, 1, 1, 1, 1,
-    -1, -1, -1, -1};
-#define LL_DEFAULTNORMLOG 6 /* for static allocation */
-static UNUSED_ATTR const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
-
-static UNUSED_ATTR const U8 ML_bits[MaxML + 1] = {
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 2, 2, 3, 3,
-    4, 4, 5, 7, 8, 9, 10, 11,
-    12, 13, 14, 15, 16};
-static UNUSED_ATTR const S16 ML_defaultNorm[MaxML + 1] = {
-    1, 4, 3, 2, 2, 2, 2, 2,
-    2, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, -1, -1,
-    -1, -1, -1, -1, -1};
-#define ML_DEFAULTNORMLOG 6 /* for static allocation */
-static UNUSED_ATTR const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
-
-static UNUSED_ATTR const S16 OF_defaultNorm[DefaultMaxOff + 1] = {
-    1, 1, 1, 1, 1, 1, 2, 2,
-    2, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    -1, -1, -1, -1, -1};
-#define OF_DEFAULTNORMLOG 5 /* for static allocation */
-static UNUSED_ATTR const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
-
-/*-*******************************************
- *  Shared functions to include for inlining
- *********************************************/
-static void ZSTD_copy8(void* dst, const void* src) {
-#if defined(ZSTD_ARCH_ARM_NEON)
-	vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
-#else
-	ZSTD_memcpy(dst, src, 8);
-#endif
-}
-#define COPY8(d, s)       \
-	{                     \
-		ZSTD_copy8(d, s); \
-		d += 8;           \
-		s += 8;           \
-	}
-
-/* Need to use memmove here since the literal buffer can now be located within
-   the dst buffer. In circumstances where the op "catches up" to where the
-   literal buffer is, there can be partial overlaps in this call on the final
-   copy if the literal is being shifted by less than 16 bytes. */
-static void ZSTD_copy16(void* dst, const void* src) {
-#if defined(ZSTD_ARCH_ARM_NEON)
-	vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
-#elif defined(ZSTD_ARCH_X86_SSE2)
-	_mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));
-#elif defined(__clang__)
-	ZSTD_memmove(dst, src, 16);
-#else
-	/* ZSTD_memmove is not inlined properly by gcc */
-	BYTE copy16_buf[16];
-	ZSTD_memcpy(copy16_buf, src, 16);
-	ZSTD_memcpy(dst, copy16_buf, 16);
-#endif
-}
-#define COPY16(d, s)       \
-	{                      \
-		ZSTD_copy16(d, s); \
-		d += 16;           \
-		s += 16;           \
-	}
-
-#define WILDCOPY_OVERLENGTH 32
-#define WILDCOPY_VECLEN 16
-
-typedef enum {
-	ZSTD_no_overlap,
-	ZSTD_overlap_src_before_dst
-	/*  ZSTD_overlap_dst_before_src, */
-} ZSTD_overlap_e;
-
-/*! ZSTD_wildcopy() :
- *  Custom version of ZSTD_memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0)
- *  @param ovtype controls the overlap detection
- *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
- *         - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
- *           The src buffer must be before the dst buffer.
- */
-MEM_STATIC FORCE_INLINE_ATTR void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype) {
-	ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
-	const BYTE* ip = (const BYTE*)src;
-	BYTE* op = (BYTE*)dst;
-	BYTE* const oend = op + length;
-
-	if(ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
-		/* Handle short offset copies. */
-		do {
-			COPY8(op, ip)
-		} while(op < oend);
-	} else {
-		assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
-		/* Separate out the first COPY16() call because the copy length is
-		 * almost certain to be short, so the branches have different
-		 * probabilities. Since it is almost certain to be short, only do
-		 * one COPY16() in the first call. Then, do two calls per loop since
-		 * at that point it is more likely to have a high trip count.
-		 */
-		ZSTD_copy16(op, ip);
-		if(16 >= length)
-			return;
-		op += 16;
-		ip += 16;
-		do {
-			COPY16(op, ip);
-			COPY16(op, ip);
-		} while(op < oend);
-	}
-}
-
-MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) {
-	size_t const length = MIN(dstCapacity, srcSize);
-	if(length > 0) {
-		ZSTD_memcpy(dst, src, length);
-	}
-	return length;
-}
-
-/* define "workspace is too large" as this number of times larger than needed */
-#define ZSTD_WORKSPACETOOLARGE_FACTOR 3
-
-/* when workspace is continuously too large
- * during at least this number of times,
- * context's memory usage is considered wasteful,
- * because it's sized to handle a worst case scenario which rarely happens.
- * In which case, resize it down to free some memory */
-#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128
-
-/* Controls whether the input/output buffer is buffered or stable. */
-typedef enum {
-	ZSTD_bm_buffered = 0, /* Buffer the input/output */
-	ZSTD_bm_stable = 1    /* ZSTD_inBuffer/ZSTD_outBuffer is stable */
-} ZSTD_bufferMode_e;
-
-/*-*******************************************
- *  Private declarations
- *********************************************/
-typedef struct seqDef_s {
-	U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */
-	U16 litLength;
-	U16 mlBase; /* mlBase == matchLength - MINMATCH */
-} seqDef;
-
-/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */
-typedef enum {
-	ZSTD_llt_none = 0,          /* no longLengthType */
-	ZSTD_llt_literalLength = 1, /* represents a long literal */
-	ZSTD_llt_matchLength = 2    /* represents a long match */
-} ZSTD_longLengthType_e;
-
-typedef struct {
-	seqDef* sequencesStart;
-	seqDef* sequences; /* ptr to end of sequences */
-	BYTE* litStart;
-	BYTE* lit; /* ptr to end of literals */
-	BYTE* llCode;
-	BYTE* mlCode;
-	BYTE* ofCode;
-	size_t maxNbSeq;
-	size_t maxNbLit;
-
-	/* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength
-	 * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
-	 * the existing value of the litLength or matchLength by 0x10000.
-	 */
-	ZSTD_longLengthType_e longLengthType;
-	U32 longLengthPos; /* Index of the sequence to apply long length modification to */
-} seqStore_t;
-
-typedef struct {
-	U32 litLength;
-	U32 matchLength;
-} ZSTD_sequenceLength;
-
-/**
- * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences
- * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength.
- */
-MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const * seqStore, seqDef const * seq) {
-	ZSTD_sequenceLength seqLen;
-	seqLen.litLength = seq->litLength;
-	seqLen.matchLength = seq->mlBase + MINMATCH;
-	if(seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
-		if(seqStore->longLengthType == ZSTD_llt_literalLength) {
-			seqLen.litLength += 0x10000;
-		}
-		if(seqStore->longLengthType == ZSTD_llt_matchLength) {
-			seqLen.matchLength += 0x10000;
-		}
-	}
-	return seqLen;
-}
-
-/**
- * Contains the compressed frame size and an upper-bound for the decompressed frame size.
- * Note: before using `compressedSize`, check for errors using ZSTD_isError().
- *       similarly, before using `decompressedBound`, check for errors using:
- *          `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
- */
-typedef struct {
-	size_t nbBlocks;
-	size_t compressedSize;
-	unsigned long long decompressedBound;
-} ZSTD_frameSizeInfo; /* decompress & legacy */
-
-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */
-int ZSTD_seqToCodes(const seqStore_t* seqStorePtr);       /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
-
-/* custom memory allocation functions */
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
-
-/* ZSTD_invalidateRepCodes() :
- * ensures next compression will not use repcodes from previous block.
- * Note : only works with regular variant;
- *        do not use with extDict variant ! */
-void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx); /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
-
-typedef struct {
-	blockType_e blockType;
-	U32 lastBlock;
-	U32 origSize;
-} blockProperties_t; /* declared here for decompress and fullbench */
-
-/*! ZSTD_getcBlockSize() :
- *  Provides the size of compressed block from block header `src` */
-/* Used by: decompress, fullbench (does not get its definition from here) */
-size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
-                          blockProperties_t* bpPtr);
-
-/*! ZSTD_decodeSeqHeaders() :
- *  decode sequence header from src */
-/* Used by: decompress, fullbench (does not get its definition from here) */
-size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
-                             const void* src, size_t srcSize);
-
-/**
- * @returns true iff the CPU supports dynamic BMI2 dispatch.
- */
-MEM_STATIC int ZSTD_cpuSupportsBmi2(void) {
-	ZSTD_cpuid_t cpuid = ZSTD_cpuid();
-	return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* ZSTD_CCOMMON_H_MODULE */
diff --git a/src/zstd/zstd_lazy.c b/src/zstd/zstd_lazy.c
deleted file mode 100644
index 6b861cc38..000000000
--- a/src/zstd/zstd_lazy.c
+++ /dev/null
@@ -1,2124 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#include "zstd_compress_internal.h"
-#include "zstd_lazy.h"
-#include "bits.h" /* ZSTD_countTrailingZeros64 */
-
-/*-*************************************
- *  Binary Tree search
- ***************************************/
-
-static void
-ZSTD_updateDUBT(ZSTD_matchState_t* ms,
-                const BYTE* ip, const BYTE* iend,
-                U32 mls) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hashLog = cParams->hashLog;
-
-	U32* const bt = ms->chainTable;
-	U32 const btLog = cParams->chainLog - 1;
-	U32 const btMask = (1 << btLog) - 1;
-
-	const BYTE* const base = ms->window.base;
-	U32 const target = (U32)(ip - base);
-	U32 idx = ms->nextToUpdate;
-
-	if(idx != target)
-		DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
-		         idx, target, ms->window.dictLimit);
-	assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */
-	(void)iend;
-
-	assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */
-	for(; idx < target; idx++) {
-		size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */
-		U32 const matchIndex = hashTable[h];
-
-		U32* const nextCandidatePtr = bt + 2 * (idx & btMask);
-		U32* const sortMarkPtr = nextCandidatePtr + 1;
-
-		DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
-		hashTable[h] = idx;             /* Update Hash Table */
-		*nextCandidatePtr = matchIndex; /* update BT like a chain */
-		*sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
-	}
-	ms->nextToUpdate = target;
-}
-
-/** ZSTD_insertDUBT1() :
- *  sort one already inserted but unsorted position
- *  assumption : curr >= btlow == (curr - btmask)
- *  doesn't fail */
-static void
-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
-                 U32 curr, const BYTE* inputEnd,
-                 U32 nbCompares, U32 btLow,
-                 const ZSTD_dictMode_e dictMode) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const bt = ms->chainTable;
-	U32 const btLog = cParams->chainLog - 1;
-	U32 const btMask = (1 << btLog) - 1;
-	size_t commonLengthSmaller = 0, commonLengthLarger = 0;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const dictBase = ms->window.dictBase;
-	const U32 dictLimit = ms->window.dictLimit;
-	const BYTE* const ip = (curr >= dictLimit) ? base + curr : dictBase + curr;
-	const BYTE* const iend = (curr >= dictLimit) ? inputEnd : dictBase + dictLimit;
-	const BYTE* const dictEnd = dictBase + dictLimit;
-	const BYTE* const prefixStart = base + dictLimit;
-	const BYTE* match;
-	U32* smallerPtr = bt + 2 * (curr & btMask);
-	U32* largerPtr = smallerPtr + 1;
-	U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
-	U32 dummy32;                  /* to be nullified at the end */
-	U32 const windowValid = ms->window.lowLimit;
-	U32 const maxDistance = 1U << cParams->windowLog;
-	U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
-
-	DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
-	         curr, dictLimit, windowLow);
-	assert(curr >= btLow);
-	assert(ip < iend); /* condition for ZSTD_count */
-
-	for(; nbCompares && (matchIndex > windowLow); --nbCompares) {
-		U32* const nextPtr = bt + 2 * (matchIndex & btMask);
-		size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
-		assert(matchIndex < curr);
-		/* note : all candidates are now supposed sorted,
-		 * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
-		 * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
-
-		if((dictMode != ZSTD_extDict) || (matchIndex + matchLength >= dictLimit) /* both in current segment*/
-		   || (curr < dictLimit) /* both in extDict */) {
-			const BYTE* const mBase = ((dictMode != ZSTD_extDict) || (matchIndex + matchLength >= dictLimit)) ? base : dictBase;
-			assert((matchIndex + matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
-			       || (curr < dictLimit));
-			match = mBase + matchIndex;
-			matchLength += ZSTD_count(ip + matchLength, match + matchLength, iend);
-		} else {
-			match = dictBase + matchIndex;
-			matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
-			if(matchIndex + matchLength >= dictLimit)
-				match = base + matchIndex; /* preparation for next read of match[matchLength] */
-		}
-
-		DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
-		         curr, matchIndex, (U32)matchLength);
-
-		if(ip + matchLength == iend) { /* equal : no way to know if inf or sup */
-			break;                     /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
-		}
-
-		if(match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
-			/* match is smaller than current */
-			*smallerPtr = matchIndex;          /* update smaller idx */
-			commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
-			if(matchIndex <= btLow) {
-				smallerPtr = &dummy32;
-				break;
-			} /* beyond tree size, stop searching */
-			DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
-			         matchIndex, btLow, nextPtr[1]);
-			smallerPtr = nextPtr + 1; /* new "candidate" => larger than match, which was smaller than target */
-			matchIndex = nextPtr[1];  /* new matchIndex, larger than previous and closer to current */
-		} else {
-			/* match is larger than current */
-			*largerPtr = matchIndex;
-			commonLengthLarger = matchLength;
-			if(matchIndex <= btLow) {
-				largerPtr = &dummy32;
-				break;
-			} /* beyond tree size, stop searching */
-			DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
-			         matchIndex, btLow, nextPtr[0]);
-			largerPtr = nextPtr;
-			matchIndex = nextPtr[0];
-		}
-	}
-
-	*smallerPtr = *largerPtr = 0;
-}
-
-static size_t
-ZSTD_DUBT_findBetterDictMatch(
-    const ZSTD_matchState_t* ms,
-    const BYTE* const ip, const BYTE* const iend,
-    size_t* offsetPtr,
-    size_t bestLength,
-    U32 nbCompares,
-    U32 const mls,
-    const ZSTD_dictMode_e dictMode) {
-	const ZSTD_matchState_t* const dms = ms->dictMatchState;
-	const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
-	const U32* const dictHashTable = dms->hashTable;
-	U32 const hashLog = dmsCParams->hashLog;
-	size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
-	U32 dictMatchIndex = dictHashTable[h];
-
-	const BYTE* const base = ms->window.base;
-	const BYTE* const prefixStart = base + ms->window.dictLimit;
-	U32 const curr = (U32)(ip - base);
-	const BYTE* const dictBase = dms->window.base;
-	const BYTE* const dictEnd = dms->window.nextSrc;
-	U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
-	U32 const dictLowLimit = dms->window.lowLimit;
-	U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
-
-	U32* const dictBt = dms->chainTable;
-	U32 const btLog = dmsCParams->chainLog - 1;
-	U32 const btMask = (1 << btLog) - 1;
-	U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
-
-	size_t commonLengthSmaller = 0, commonLengthLarger = 0;
-
-	(void)dictMode;
-	assert(dictMode == ZSTD_dictMatchState);
-
-	for(; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
-		U32* const nextPtr = dictBt + 2 * (dictMatchIndex & btMask);
-		size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
-		const BYTE* match = dictBase + dictMatchIndex;
-		matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
-		if(dictMatchIndex + matchLength >= dictHighLimit)
-			match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */
-
-		if(matchLength > bestLength) {
-			U32 matchIndex = dictMatchIndex + dictIndexDelta;
-			if((4 * (int)(matchLength - bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)offsetPtr[0] + 1))) {
-				DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
-				         curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
-				bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
-			}
-			if(ip + matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
-				break;                     /* drop, to guarantee consistency (miss a little bit of compression) */
-			}
-		}
-
-		if(match[matchLength] < ip[matchLength]) {
-			if(dictMatchIndex <= btLow) {
-				break;
-			}                                  /* beyond tree size, stop the search */
-			commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
-			dictMatchIndex = nextPtr[1];       /* new matchIndex larger than previous (closer to current) */
-		} else {
-			/* match is larger than current */
-			if(dictMatchIndex <= btLow) {
-				break;
-			} /* beyond tree size, stop the search */
-			commonLengthLarger = matchLength;
-			dictMatchIndex = nextPtr[0];
-		}
-	}
-
-	if(bestLength >= MINMATCH) {
-		U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr);
-		(void)mIndex;
-		DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-		         curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
-	}
-	return bestLength;
-}
-
-static size_t
-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
-                        const BYTE* const ip, const BYTE* const iend,
-                        size_t* offBasePtr,
-                        U32 const mls,
-                        const ZSTD_dictMode_e dictMode) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hashLog = cParams->hashLog;
-	size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
-	U32 matchIndex = hashTable[h];
-
-	const BYTE* const base = ms->window.base;
-	U32 const curr = (U32)(ip - base);
-	U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
-
-	U32* const bt = ms->chainTable;
-	U32 const btLog = cParams->chainLog - 1;
-	U32 const btMask = (1 << btLog) - 1;
-	U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
-	U32 const unsortLimit = MAX(btLow, windowLow);
-
-	U32* nextCandidate = bt + 2 * (matchIndex & btMask);
-	U32* unsortedMark = bt + 2 * (matchIndex & btMask) + 1;
-	U32 nbCompares = 1U << cParams->searchLog;
-	U32 nbCandidates = nbCompares;
-	U32 previousCandidate = 0;
-
-	DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
-	assert(ip <= iend - 8); /* required for h calculation */
-	assert(dictMode != ZSTD_dedicatedDictSearch);
-
-	/* reach end of unsorted candidates list */
-	while((matchIndex > unsortLimit) && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK) && (nbCandidates > 1)) {
-		DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
-		         matchIndex);
-		*unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */
-		previousCandidate = matchIndex;
-		matchIndex = *nextCandidate;
-		nextCandidate = bt + 2 * (matchIndex & btMask);
-		unsortedMark = bt + 2 * (matchIndex & btMask) + 1;
-		nbCandidates--;
-	}
-
-	/* nullify last candidate if it's still unsorted
-	 * simplification, detrimental to compression ratio, beneficial for speed */
-	if((matchIndex > unsortLimit) && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)) {
-		DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
-		         matchIndex);
-		*nextCandidate = *unsortedMark = 0;
-	}
-
-	/* batch sort stacked candidates */
-	matchIndex = previousCandidate;
-	while(matchIndex) { /* will end on matchIndex == 0 */
-		U32* const nextCandidateIdxPtr = bt + 2 * (matchIndex & btMask) + 1;
-		U32 const nextCandidateIdx = *nextCandidateIdxPtr;
-		ZSTD_insertDUBT1(ms, matchIndex, iend,
-		                 nbCandidates, unsortLimit, dictMode);
-		matchIndex = nextCandidateIdx;
-		nbCandidates++;
-	}
-
-	/* find longest match */
-	{
-		size_t commonLengthSmaller = 0, commonLengthLarger = 0;
-		const BYTE* const dictBase = ms->window.dictBase;
-		const U32 dictLimit = ms->window.dictLimit;
-		const BYTE* const dictEnd = dictBase + dictLimit;
-		const BYTE* const prefixStart = base + dictLimit;
-		U32* smallerPtr = bt + 2 * (curr & btMask);
-		U32* largerPtr = bt + 2 * (curr & btMask) + 1;
-		U32 matchEndIdx = curr + 8 + 1;
-		U32 dummy32; /* to be nullified at the end */
-		size_t bestLength = 0;
-
-		matchIndex = hashTable[h];
-		hashTable[h] = curr; /* Update Hash Table */
-
-		for(; nbCompares && (matchIndex > windowLow); --nbCompares) {
-			U32* const nextPtr = bt + 2 * (matchIndex & btMask);
-			size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
-			const BYTE* match;
-
-			if((dictMode != ZSTD_extDict) || (matchIndex + matchLength >= dictLimit)) {
-				match = base + matchIndex;
-				matchLength += ZSTD_count(ip + matchLength, match + matchLength, iend);
-			} else {
-				match = dictBase + matchIndex;
-				matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
-				if(matchIndex + matchLength >= dictLimit)
-					match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
-			}
-
-			if(matchLength > bestLength) {
-				if(matchLength > matchEndIdx - matchIndex)
-					matchEndIdx = matchIndex + (U32)matchLength;
-				if((4 * (int)(matchLength - bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)))
-					bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
-				if(ip + matchLength == iend) { /* equal : no way to know if inf or sup */
-					if(dictMode == ZSTD_dictMatchState) {
-						nbCompares = 0; /* in addition to avoiding checking any
-						                 * further in this loop, make sure we
-						                 * skip checking in the dictionary. */
-					}
-					break; /* drop, to guarantee consistency (miss a little bit of compression) */
-				}
-			}
-
-			if(match[matchLength] < ip[matchLength]) {
-				/* match is smaller than current */
-				*smallerPtr = matchIndex;          /* update smaller idx */
-				commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
-				if(matchIndex <= btLow) {
-					smallerPtr = &dummy32;
-					break;
-				}                         /* beyond tree size, stop the search */
-				smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */
-				matchIndex = nextPtr[1];  /* new matchIndex larger than previous (closer to current) */
-			} else {
-				/* match is larger than current */
-				*largerPtr = matchIndex;
-				commonLengthLarger = matchLength;
-				if(matchIndex <= btLow) {
-					largerPtr = &dummy32;
-					break;
-				} /* beyond tree size, stop the search */
-				largerPtr = nextPtr;
-				matchIndex = nextPtr[0];
-			}
-		}
-
-		*smallerPtr = *largerPtr = 0;
-
-		assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
-		if(dictMode == ZSTD_dictMatchState && nbCompares) {
-			bestLength = ZSTD_DUBT_findBetterDictMatch(
-			    ms, ip, iend,
-			    offBasePtr, bestLength, nbCompares,
-			    mls, dictMode);
-		}
-
-		assert(matchEndIdx > curr + 8);     /* ensure nextToUpdate is increased */
-		ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
-		if(bestLength >= MINMATCH) {
-			U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr);
-			(void)mIndex;
-			DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-			         curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
-		}
-		return bestLength;
-	}
-}
-
-/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_BtFindBestMatch(ZSTD_matchState_t* ms,
-                     const BYTE* const ip, const BYTE* const iLimit,
-                     size_t* offBasePtr,
-                     const U32 mls /* template */,
-                     const ZSTD_dictMode_e dictMode) {
-	DEBUGLOG(7, "ZSTD_BtFindBestMatch");
-	if(ip < ms->window.base + ms->nextToUpdate)
-		return 0; /* skipped area */
-	ZSTD_updateDUBT(ms, ip, iLimit, mls);
-	return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
-}
-
-/***********************************
- * Dedicated dict search
- ***********************************/
-
-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) {
-	const BYTE* const base = ms->window.base;
-	U32 const target = (U32)(ip - base);
-	U32* const hashTable = ms->hashTable;
-	U32* const chainTable = ms->chainTable;
-	U32 const chainSize = 1 << ms->cParams.chainLog;
-	U32 idx = ms->nextToUpdate;
-	U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
-	U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
-	U32 const cacheSize = bucketSize - 1;
-	U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
-	U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
-
-	/* We know the hashtable is oversized by a factor of `bucketSize`.
-	 * We are going to temporarily pretend `bucketSize == 1`, keeping only a
-	 * single entry. We will use the rest of the space to construct a temporary
-	 * chaintable.
-	 */
-	U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
-	U32* const tmpHashTable = hashTable;
-	U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
-	U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
-	U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
-	U32 hashIdx;
-
-	assert(ms->cParams.chainLog <= 24);
-	assert(ms->cParams.hashLog > ms->cParams.chainLog);
-	assert(idx != 0);
-	assert(tmpMinChain <= minChain);
-
-	/* fill conventional hash table and conventional chain table */
-	for(; idx < target; idx++) {
-		U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
-		if(idx >= tmpMinChain) {
-			tmpChainTable[idx - tmpMinChain] = hashTable[h];
-		}
-		tmpHashTable[h] = idx;
-	}
-
-	/* sort chains into ddss chain table */
-	{
-		U32 chainPos = 0;
-		for(hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
-			U32 count;
-			U32 countBeyondMinChain = 0;
-			U32 i = tmpHashTable[hashIdx];
-			for(count = 0; i >= tmpMinChain && count < cacheSize; count++) {
-				/* skip through the chain to the first position that won't be
-				 * in the hash cache bucket */
-				if(i < minChain) {
-					countBeyondMinChain++;
-				}
-				i = tmpChainTable[i - tmpMinChain];
-			}
-			if(count == cacheSize) {
-				for(count = 0; count < chainLimit;) {
-					if(i < minChain) {
-						if(!i || ++countBeyondMinChain > cacheSize) {
-							/* only allow pulling `cacheSize` number of entries
-							 * into the cache or chainTable beyond `minChain`,
-							 * to replace the entries pulled out of the
-							 * chainTable into the cache. This lets us reach
-							 * back further without increasing the total number
-							 * of entries in the chainTable, guaranteeing the
-							 * DDSS chain table will fit into the space
-							 * allocated for the regular one. */
-							break;
-						}
-					}
-					chainTable[chainPos++] = i;
-					count++;
-					if(i < tmpMinChain) {
-						break;
-					}
-					i = tmpChainTable[i - tmpMinChain];
-				}
-			} else {
-				count = 0;
-			}
-			if(count) {
-				tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
-			} else {
-				tmpHashTable[hashIdx] = 0;
-			}
-		}
-		assert(chainPos <= chainSize); /* I believe this is guaranteed... */
-	}
-
-	/* move chain pointers into the last entry of each hash bucket */
-	for(hashIdx = (1 << hashLog); hashIdx;) {
-		U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
-		U32 const chainPackedPointer = tmpHashTable[hashIdx];
-		U32 i;
-		for(i = 0; i < cacheSize; i++) {
-			hashTable[bucketIdx + i] = 0;
-		}
-		hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
-	}
-
-	/* fill the buckets of the hash table */
-	for(idx = ms->nextToUpdate; idx < target; idx++) {
-		U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
-		              << ZSTD_LAZY_DDSS_BUCKET_LOG;
-		U32 i;
-		/* Shift hash cache down 1. */
-		for(i = cacheSize - 1; i; i--)
-			hashTable[h + i] = hashTable[h + i - 1];
-		hashTable[h] = idx;
-	}
-
-	ms->nextToUpdate = target;
-}
-
-/* Returns the longest match length found in the dedicated dict search structure.
- * If none are longer than the argument ml, then ml will be returned.
- */
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
-                                            const ZSTD_matchState_t* const dms,
-                                            const BYTE* const ip, const BYTE* const iLimit,
-                                            const BYTE* const prefixStart, const U32 curr,
-                                            const U32 dictLimit, const size_t ddsIdx) {
-	const U32 ddsLowestIndex = dms->window.dictLimit;
-	const BYTE* const ddsBase = dms->window.base;
-	const BYTE* const ddsEnd = dms->window.nextSrc;
-	const U32 ddsSize = (U32)(ddsEnd - ddsBase);
-	const U32 ddsIndexDelta = dictLimit - ddsSize;
-	const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
-	const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
-	U32 ddsAttempt;
-	U32 matchIndex;
-
-	for(ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
-		PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
-	}
-
-	{
-		U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
-		U32 const chainIndex = chainPackedPointer >> 8;
-
-		PREFETCH_L1(&dms->chainTable[chainIndex]);
-	}
-
-	for(ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
-		size_t currentMl = 0;
-		const BYTE* match;
-		matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
-		match = ddsBase + matchIndex;
-
-		if(!matchIndex) {
-			return ml;
-		}
-
-		/* guaranteed by table construction */
-		(void)ddsLowestIndex;
-		assert(matchIndex >= ddsLowestIndex);
-		assert(match + 4 <= ddsEnd);
-		if(MEM_read32(match) == MEM_read32(ip)) {
-			/* assumption : matchIndex <= dictLimit-4 (by table construction) */
-			currentMl = ZSTD_count_2segments(ip + 4, match + 4, iLimit, ddsEnd, prefixStart) + 4;
-		}
-
-		/* save best solution */
-		if(currentMl > ml) {
-			ml = currentMl;
-			*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
-			if(ip + currentMl == iLimit) {
-				/* best possible, avoids read overflow on next attempt */
-				return ml;
-			}
-		}
-	}
-
-	{
-		U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
-		U32 chainIndex = chainPackedPointer >> 8;
-		U32 const chainLength = chainPackedPointer & 0xFF;
-		U32 const chainAttempts = nbAttempts - ddsAttempt;
-		U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
-		U32 chainAttempt;
-
-		for(chainAttempt = 0; chainAttempt < chainLimit; chainAttempt++) {
-			PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
-		}
-
-		for(chainAttempt = 0; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
-			size_t currentMl = 0;
-			const BYTE* match;
-			matchIndex = dms->chainTable[chainIndex];
-			match = ddsBase + matchIndex;
-
-			/* guaranteed by table construction */
-			assert(matchIndex >= ddsLowestIndex);
-			assert(match + 4 <= ddsEnd);
-			if(MEM_read32(match) == MEM_read32(ip)) {
-				/* assumption : matchIndex <= dictLimit-4 (by table construction) */
-				currentMl = ZSTD_count_2segments(ip + 4, match + 4, iLimit, ddsEnd, prefixStart) + 4;
-			}
-
-			/* save best solution */
-			if(currentMl > ml) {
-				ml = currentMl;
-				*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
-				if(ip + currentMl == iLimit)
-					break; /* best possible, avoids read overflow on next attempt */
-			}
-		}
-	}
-	return ml;
-}
-
-/* *********************************
- *  Hash Chain
- ***********************************/
-#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
-
-/* Update chains up to ip (excluded)
-   Assumption : always within prefix (i.e. not within extDict) */
-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
-    ZSTD_matchState_t* ms,
-    const ZSTD_compressionParameters* const cParams,
-    const BYTE* ip, U32 const mls) {
-	U32* const hashTable = ms->hashTable;
-	const U32 hashLog = cParams->hashLog;
-	U32* const chainTable = ms->chainTable;
-	const U32 chainMask = (1 << cParams->chainLog) - 1;
-	const BYTE* const base = ms->window.base;
-	const U32 target = (U32)(ip - base);
-	U32 idx = ms->nextToUpdate;
-
-	while(idx < target) { /* catch up */
-		size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls);
-		NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
-		hashTable[h] = idx;
-		idx++;
-	}
-
-	ms->nextToUpdate = target;
-	return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
-}
-
-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
-}
-
-/* inlining is important to hardwire a hot branch (template emulation) */
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_HcFindBestMatch(
-    ZSTD_matchState_t* ms,
-    const BYTE* const ip, const BYTE* const iLimit,
-    size_t* offsetPtr,
-    const U32 mls, const ZSTD_dictMode_e dictMode) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const chainTable = ms->chainTable;
-	const U32 chainSize = (1 << cParams->chainLog);
-	const U32 chainMask = chainSize - 1;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const dictBase = ms->window.dictBase;
-	const U32 dictLimit = ms->window.dictLimit;
-	const BYTE* const prefixStart = base + dictLimit;
-	const BYTE* const dictEnd = dictBase + dictLimit;
-	const U32 curr = (U32)(ip - base);
-	const U32 maxDistance = 1U << cParams->windowLog;
-	const U32 lowestValid = ms->window.lowLimit;
-	const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
-	const U32 isDictionary = (ms->loadedDictEnd != 0);
-	const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
-	const U32 minChain = curr > chainSize ? curr - chainSize : 0;
-	U32 nbAttempts = 1U << cParams->searchLog;
-	size_t ml = 4 - 1;
-
-	const ZSTD_matchState_t* const dms = ms->dictMatchState;
-	const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
-	                           ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG
-	                           : 0;
-	const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
-	                          ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG
-	                          : 0;
-
-	U32 matchIndex;
-
-	if(dictMode == ZSTD_dedicatedDictSearch) {
-		const U32* entry = &dms->hashTable[ddsIdx];
-		PREFETCH_L1(entry);
-	}
-
-	/* HC4 match finder */
-	matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
-
-	for(; (matchIndex >= lowLimit) & (nbAttempts > 0); nbAttempts--) {
-		size_t currentMl = 0;
-		if((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
-			const BYTE* const match = base + matchIndex;
-			assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
-			/* read 4B starting from (match + ml + 1 - sizeof(U32)) */
-			if(MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
-				currentMl = ZSTD_count(ip, match, iLimit);
-		} else {
-			const BYTE* const match = dictBase + matchIndex;
-			assert(match + 4 <= dictEnd);
-			if(MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-				currentMl = ZSTD_count_2segments(ip + 4, match + 4, iLimit, dictEnd, prefixStart) + 4;
-		}
-
-		/* save best solution */
-		if(currentMl > ml) {
-			ml = currentMl;
-			*offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
-			if(ip + currentMl == iLimit)
-				break; /* best possible, avoids read overflow on next attempt */
-		}
-
-		if(matchIndex <= minChain)
-			break;
-		matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
-	}
-
-	assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
-	if(dictMode == ZSTD_dedicatedDictSearch) {
-		ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
-		                                          ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
-	} else if(dictMode == ZSTD_dictMatchState) {
-		const U32* const dmsChainTable = dms->chainTable;
-		const U32 dmsChainSize = (1 << dms->cParams.chainLog);
-		const U32 dmsChainMask = dmsChainSize - 1;
-		const U32 dmsLowestIndex = dms->window.dictLimit;
-		const BYTE* const dmsBase = dms->window.base;
-		const BYTE* const dmsEnd = dms->window.nextSrc;
-		const U32 dmsSize = (U32)(dmsEnd - dmsBase);
-		const U32 dmsIndexDelta = dictLimit - dmsSize;
-		const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
-
-		matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
-
-		for(; (matchIndex >= dmsLowestIndex) & (nbAttempts > 0); nbAttempts--) {
-			size_t currentMl = 0;
-			const BYTE* const match = dmsBase + matchIndex;
-			assert(match + 4 <= dmsEnd);
-			if(MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-				currentMl = ZSTD_count_2segments(ip + 4, match + 4, iLimit, dmsEnd, prefixStart) + 4;
-
-			/* save best solution */
-			if(currentMl > ml) {
-				ml = currentMl;
-				assert(curr > matchIndex + dmsIndexDelta);
-				*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
-				if(ip + currentMl == iLimit)
-					break; /* best possible, avoids read overflow on next attempt */
-			}
-
-			if(matchIndex <= dmsMinChain)
-				break;
-
-			matchIndex = dmsChainTable[matchIndex & dmsChainMask];
-		}
-	}
-
-	return ml;
-}
-
-/* *********************************
- * (SIMD) Row-based matchfinder
- ***********************************/
-/* Constants for row-based hash */
-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
-#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
-#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
-
-#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
-
-typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
-
-/* ZSTD_VecMask_next():
- * Starting from the LSB, returns the idx of the next non-zero bit.
- * Basically counting the nb of trailing zeroes.
- */
-MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
-	return ZSTD_countTrailingZeros64(val);
-}
-
-/* ZSTD_rotateRight_*():
- * Rotates a bitfield to the right by "count" bits.
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
- */
-FORCE_INLINE_TEMPLATE
-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
-	assert(count < 64);
-	count &= 0x3F; /* for fickle pattern recognition */
-	return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
-}
-
-FORCE_INLINE_TEMPLATE
-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
-	assert(count < 32);
-	count &= 0x1F; /* for fickle pattern recognition */
-	return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
-}
-
-FORCE_INLINE_TEMPLATE
-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
-	assert(count < 16);
-	count &= 0x0F; /* for fickle pattern recognition */
-	return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
-}
-
-/* ZSTD_row_nextIndex():
- * Returns the next index to insert at within a tagTable row, and updates the "head"
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
- */
-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
-	U32 const next = (*tagRow - 1) & rowMask;
-	*tagRow = (BYTE)next;
-	return next;
-}
-
-/* ZSTD_isAligned():
- * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
- */
-MEM_STATIC int ZSTD_isAligned(void const * ptr, size_t align) {
-	assert((align & (align - 1)) == 0);
-	return (((size_t)ptr) & (align - 1)) == 0;
-}
-
-/* ZSTD_row_prefetch():
- * Performs prefetching for the hashTable and tagTable at a given row.
- */
-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const * hashTable, U16 const * tagTable, U32 const relRow, U32 const rowLog) {
-	PREFETCH_L1(hashTable + relRow);
-	if(rowLog >= 5) {
-		PREFETCH_L1(hashTable + relRow + 16);
-		/* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
-	}
-	PREFETCH_L1(tagTable + relRow);
-	if(rowLog == 6) {
-		PREFETCH_L1(tagTable + relRow + 32);
-	}
-	assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
-	assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
-	assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
-}
-
-/* ZSTD_row_fillHashCache():
- * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
- * but not beyond iLimit.
- */
-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
-                                                  U32 const rowLog, U32 const mls,
-                                                  U32 idx, const BYTE* const iLimit) {
-	U32 const * const hashTable = ms->hashTable;
-	U16 const * const tagTable = ms->tagTable;
-	U32 const hashLog = ms->rowHashLog;
-	U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
-	U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
-
-	for(; idx < lim; ++idx) {
-		U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
-		U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
-		ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
-		ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
-	}
-
-	DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
-	         ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
-	         ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
-}
-
-/* ZSTD_row_nextCachedHash():
- * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
- * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
- */
-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const * hashTable,
-                                                  U16 const * tagTable, BYTE const * base,
-                                                  U32 idx, U32 const hashLog,
-                                                  U32 const rowLog, U32 const mls) {
-	U32 const newHash = (U32)ZSTD_hashPtr(base + idx + ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
-	U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
-	ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
-	{
-		U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
-		cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
-		return hash;
-	}
-}
-
-/* ZSTD_row_update_internalImpl():
- * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
- */
-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
-                                                        U32 updateStartIdx, U32 const updateEndIdx,
-                                                        U32 const mls, U32 const rowLog,
-                                                        U32 const rowMask, U32 const useCache) {
-	U32* const hashTable = ms->hashTable;
-	U16* const tagTable = ms->tagTable;
-	U32 const hashLog = ms->rowHashLog;
-	const BYTE* const base = ms->window.base;
-
-	DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
-	for(; updateStartIdx < updateEndIdx; ++updateStartIdx) {
-		U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
-		                          : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
-		U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
-		U32* const row = hashTable + relRow;
-		BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
-		                                              Explicit cast allows us to get exact desired position within each row */
-		U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
-
-		assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
-		((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
-		row[pos] = updateStartIdx;
-	}
-}
-
-/* ZSTD_row_update_internal():
- * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
- * Skips sections of long matches as is necessary.
- */
-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
-                                                    U32 const mls, U32 const rowLog,
-                                                    U32 const rowMask, U32 const useCache) {
-	U32 idx = ms->nextToUpdate;
-	const BYTE* const base = ms->window.base;
-	const U32 target = (U32)(ip - base);
-	const U32 kSkipThreshold = 384;
-	const U32 kMaxMatchStartPositionsToUpdate = 96;
-	const U32 kMaxMatchEndPositionsToUpdate = 32;
-
-	if(useCache) {
-		/* Only skip positions when using hash cache, i.e.
-		 * if we are loading a dict, don't skip anything.
-		 * If we decide to skip, then we only update a set number
-		 * of positions at the beginning and end of the match.
-		 */
-		if(UNLIKELY(target - idx > kSkipThreshold)) {
-			U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
-			ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
-			idx = target - kMaxMatchEndPositionsToUpdate;
-			ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip + 1);
-		}
-	}
-	assert(target >= idx);
-	ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
-	ms->nextToUpdate = target;
-}
-
-/* ZSTD_row_update():
- * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
- * processing.
- */
-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
-	const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
-	const U32 rowMask = (1u << rowLog) - 1;
-	const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
-
-	DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
-	ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
-}
-
-/* Returns the mask width of bits group of which will be set to 1. Given not all
- * architectures have easy movemask instruction, this helps to iterate over
- * groups of bits easier and faster.
- */
-FORCE_INLINE_TEMPLATE U32
-ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) {
-	assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
-	assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
-	(void)rowEntries;
-#if defined(ZSTD_ARCH_ARM_NEON)
-	/* NEON path only works for little endian */
-	if(!MEM_isLittleEndian()) {
-		return 1;
-	}
-	if(rowEntries == 16) {
-		return 4;
-	}
-	if(rowEntries == 32) {
-		return 2;
-	}
-	if(rowEntries == 64) {
-		return 1;
-	}
-#endif
-	return 1;
-}
-
-#if defined(ZSTD_ARCH_X86_SSE2)
-FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) {
-	const __m128i comparisonMask = _mm_set1_epi8((char)tag);
-	int matches[4] = {0};
-	int i;
-	assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
-	for(i = 0; i < nbChunks; i++) {
-		const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16 * i));
-		const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
-		matches[i] = _mm_movemask_epi8(equalMask);
-	}
-	if(nbChunks == 1)
-		return ZSTD_rotateRight_U16((U16)matches[0], head);
-	if(nbChunks == 2)
-		return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
-	assert(nbChunks == 4);
-	return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
-}
-#endif
-
-#if defined(ZSTD_ARCH_ARM_NEON)
-FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) {
-	assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
-	if(rowEntries == 16) {
-		/* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
-		 * After that groups of 4 bits represent the equalMask. We lower
-		 * all bits except the highest in these groups by doing AND with
-		 * 0x88 = 0b10001000.
-		 */
-		const uint8x16_t chunk = vld1q_u8(src);
-		const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
-		const uint8x8_t res = vshrn_n_u16(equalMask, 4);
-		const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
-		return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
-	} else if(rowEntries == 32) {
-		/* Same idea as with rowEntries == 16 but doing AND with
-		 * 0x55 = 0b01010101.
-		 */
-		const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
-		const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
-		const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
-		const uint8x16_t dup = vdupq_n_u8(tag);
-		const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
-		const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
-		const uint8x8_t res = vsli_n_u8(t0, t1, 4);
-		const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
-		return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
-	} else { /* rowEntries == 64 */
-		const uint8x16x4_t chunk = vld4q_u8(src);
-		const uint8x16_t dup = vdupq_n_u8(tag);
-		const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
-		const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
-		const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
-		const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
-
-		const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
-		const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
-		const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
-		const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
-		const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
-		const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
-		return ZSTD_rotateRight_U64(matches, headGrouped);
-	}
-}
-#endif
-
-/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
- * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
- * matches the hash at the nth position in a row of the tagTable.
- * Each row is a circular buffer beginning at the value of "headGrouped". So we
- * must rotate the "matches" bitfield to match up with the actual layout of the
- * entries within the hashTable */
-FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) {
-	const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
-	assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
-	assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
-	assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
-
-#if defined(ZSTD_ARCH_X86_SSE2)
-
-	return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
-
-#else /* SW or NEON-LE */
-
-#if defined(ZSTD_ARCH_ARM_NEON)
-	/* This NEON path only works for little endian - otherwise use SWAR below */
-	if(MEM_isLittleEndian()) {
-		return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
-	}
-#endif /* ZSTD_ARCH_ARM_NEON */
-	/* SWAR */
-	{
-		const int chunkSize = sizeof(size_t);
-		const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
-		const size_t xFF = ~((size_t)0);
-		const size_t x01 = xFF / 0xFF;
-		const size_t x80 = x01 << 7;
-		const size_t splatChar = tag * x01;
-		ZSTD_VecMask matches = 0;
-		int i = rowEntries - chunkSize;
-		assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
-		if(MEM_isLittleEndian()) { /* runtime check so have two loops */
-			const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
-			do {
-				size_t chunk = MEM_readST(&src[i]);
-				chunk ^= splatChar;
-				chunk = (((chunk | x80) - x01) | chunk) & x80;
-				matches <<= chunkSize;
-				matches |= (chunk * extractMagic) >> shiftAmount;
-				i -= chunkSize;
-			} while(i >= 0);
-		} else { /* big endian: reverse bits during extraction */
-			const size_t msb = xFF ^ (xFF >> 1);
-			const size_t extractMagic = (msb / 0x1FF) | msb;
-			do {
-				size_t chunk = MEM_readST(&src[i]);
-				chunk ^= splatChar;
-				chunk = (((chunk | x80) - x01) | chunk) & x80;
-				matches <<= chunkSize;
-				matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
-				i -= chunkSize;
-			} while(i >= 0);
-		}
-		matches = ~matches;
-		if(rowEntries == 16) {
-			return ZSTD_rotateRight_U16((U16)matches, headGrouped);
-		} else if(rowEntries == 32) {
-			return ZSTD_rotateRight_U32((U32)matches, headGrouped);
-		} else {
-			return ZSTD_rotateRight_U64((U64)matches, headGrouped);
-		}
-	}
-#endif
-}
-
-/* The high-level approach of the SIMD row based match finder is as follows:
- * - Figure out where to insert the new entry:
- *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
- *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
- *        which row to insert into.
- *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
- *        be considered as a circular buffer with a "head" index that resides in the tagTable.
- *      - Also insert the "tag" into the equivalent row and position in the tagTable.
- *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
- *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
- *                  for alignment/performance reasons, leaving some bytes unused.
- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
- *   generate a bitfield that we can cycle through to check the collisions in the hash table.
- * - Pick the longest match.
- */
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_RowFindBestMatch(
-    ZSTD_matchState_t* ms,
-    const BYTE* const ip, const BYTE* const iLimit,
-    size_t* offsetPtr,
-    const U32 mls, const ZSTD_dictMode_e dictMode,
-    const U32 rowLog) {
-	U32* const hashTable = ms->hashTable;
-	U16* const tagTable = ms->tagTable;
-	U32* const hashCache = ms->hashCache;
-	const U32 hashLog = ms->rowHashLog;
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const dictBase = ms->window.dictBase;
-	const U32 dictLimit = ms->window.dictLimit;
-	const BYTE* const prefixStart = base + dictLimit;
-	const BYTE* const dictEnd = dictBase + dictLimit;
-	const U32 curr = (U32)(ip - base);
-	const U32 maxDistance = 1U << cParams->windowLog;
-	const U32 lowestValid = ms->window.lowLimit;
-	const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
-	const U32 isDictionary = (ms->loadedDictEnd != 0);
-	const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
-	const U32 rowEntries = (1U << rowLog);
-	const U32 rowMask = rowEntries - 1;
-	const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
-	const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
-	U32 nbAttempts = 1U << cappedSearchLog;
-	size_t ml = 4 - 1;
-
-	/* DMS/DDS variables that may be referenced laster */
-	const ZSTD_matchState_t* const dms = ms->dictMatchState;
-
-	/* Initialize the following variables to satisfy static analyzer */
-	size_t ddsIdx = 0;
-	U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
-	U32 dmsTag = 0;
-	U32* dmsRow = NULL;
-	BYTE* dmsTagRow = NULL;
-
-	if(dictMode == ZSTD_dedicatedDictSearch) {
-		const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
-		{ /* Prefetch DDS hashtable entry */
-			ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
-			PREFETCH_L1(&dms->hashTable[ddsIdx]);
-		}
-		ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
-	}
-
-	if(dictMode == ZSTD_dictMatchState) {
-		/* Prefetch DMS rows */
-		U32* const dmsHashTable = dms->hashTable;
-		U16* const dmsTagTable = dms->tagTable;
-		U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
-		U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
-		dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
-		dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
-		dmsRow = dmsHashTable + dmsRelRow;
-		ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
-	}
-
-	/* Update the hashTable and tagTable up to (but not including) ip */
-	ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
-	{ /* Get the hash for ip, compute the appropriate row */
-		U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
-		U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
-		U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
-		U32* const row = hashTable + relRow;
-		BYTE* tagRow = (BYTE*)(tagTable + relRow);
-		U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
-		U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
-		size_t numMatches = 0;
-		size_t currMatch = 0;
-		ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
-
-		/* Cycle through the matches and prefetch */
-		for(; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-			U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
-			U32 const matchIndex = row[matchPos];
-			assert(numMatches < rowEntries);
-			if(matchIndex < lowLimit)
-				break;
-			if((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
-				PREFETCH_L1(base + matchIndex);
-			} else {
-				PREFETCH_L1(dictBase + matchIndex);
-			}
-			matchBuffer[numMatches++] = matchIndex;
-		}
-
-		/* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
-		   in ZSTD_row_update_internal() at the next search. */
-		{
-			U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
-			tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
-			row[pos] = ms->nextToUpdate++;
-		}
-
-		/* Return the longest match */
-		for(; currMatch < numMatches; ++currMatch) {
-			U32 const matchIndex = matchBuffer[currMatch];
-			size_t currentMl = 0;
-			assert(matchIndex < curr);
-			assert(matchIndex >= lowLimit);
-
-			if((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
-				const BYTE* const match = base + matchIndex;
-				assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
-				/* read 4B starting from (match + ml + 1 - sizeof(U32)) */
-				if(MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
-					currentMl = ZSTD_count(ip, match, iLimit);
-			} else {
-				const BYTE* const match = dictBase + matchIndex;
-				assert(match + 4 <= dictEnd);
-				if(MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-					currentMl = ZSTD_count_2segments(ip + 4, match + 4, iLimit, dictEnd, prefixStart) + 4;
-			}
-
-			/* Save best solution */
-			if(currentMl > ml) {
-				ml = currentMl;
-				*offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
-				if(ip + currentMl == iLimit)
-					break; /* best possible, avoids read overflow on next attempt */
-			}
-		}
-	}
-
-	assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
-	if(dictMode == ZSTD_dedicatedDictSearch) {
-		ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
-		                                          ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
-	} else if(dictMode == ZSTD_dictMatchState) {
-		/* TODO: Measure and potentially add prefetching to DMS */
-		const U32 dmsLowestIndex = dms->window.dictLimit;
-		const BYTE* const dmsBase = dms->window.base;
-		const BYTE* const dmsEnd = dms->window.nextSrc;
-		const U32 dmsSize = (U32)(dmsEnd - dmsBase);
-		const U32 dmsIndexDelta = dictLimit - dmsSize;
-
-		{
-			U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
-			U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
-			size_t numMatches = 0;
-			size_t currMatch = 0;
-			ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
-
-			for(; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-				U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
-				U32 const matchIndex = dmsRow[matchPos];
-				if(matchIndex < dmsLowestIndex)
-					break;
-				PREFETCH_L1(dmsBase + matchIndex);
-				matchBuffer[numMatches++] = matchIndex;
-			}
-
-			/* Return the longest match */
-			for(; currMatch < numMatches; ++currMatch) {
-				U32 const matchIndex = matchBuffer[currMatch];
-				size_t currentMl = 0;
-				assert(matchIndex >= dmsLowestIndex);
-				assert(matchIndex < curr);
-
-				{
-					const BYTE* const match = dmsBase + matchIndex;
-					assert(match + 4 <= dmsEnd);
-					if(MEM_read32(match) == MEM_read32(ip))
-						currentMl = ZSTD_count_2segments(ip + 4, match + 4, iLimit, dmsEnd, prefixStart) + 4;
-				}
-
-				if(currentMl > ml) {
-					ml = currentMl;
-					assert(curr > matchIndex + dmsIndexDelta);
-					*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
-					if(ip + currentMl == iLimit)
-						break;
-				}
-			}
-		}
-	}
-	return ml;
-}
-
-/**
- * Generate search functions templated on (dictMode, mls, rowLog).
- * These functions are outlined for code size & compilation time.
- * ZSTD_searchMax() dispatches to the correct implementation function.
- *
- * TODO: The start of the search function involves loading and calculating a
- * bunch of constants from the ZSTD_matchState_t. These computations could be
- * done in an initialization function, and saved somewhere in the match state.
- * Then we could pass a pointer to the saved state instead of the match state,
- * and avoid duplicate computations.
- *
- * TODO: Move the match re-winding into searchMax. This improves compression
- * ratio, and unlocks further simplifications with the next TODO.
- *
- * TODO: Try moving the repcode search into searchMax. After the re-winding
- * and repcode search are in searchMax, there is no more logic in the match
- * finder loop that requires knowledge about the dictMode. So we should be
- * able to avoid force inlining it, and we can join the extDict loop with
- * the single segment loop. It should go in searchMax instead of its own
- * function to avoid having multiple virtual function calls per search.
- */
-
-#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
-#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
-#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
-
-#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
-
-#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls)                                           \
-	ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)(                      \
-	    ZSTD_matchState_t * ms,                                                        \
-	    const BYTE* ip, const BYTE* const iLimit,                                      \
-	    size_t* offBasePtr) {                                                          \
-		assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                           \
-		return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
-	}
-
-#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls)                                          \
-	ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)(                     \
-	    ZSTD_matchState_t * ms,                                                       \
-	    const BYTE* ip, const BYTE* const iLimit,                                     \
-	    size_t* offsetPtr) {                                                          \
-		assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                          \
-		return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
-	}
-
-#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)                                          \
-	ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(                     \
-	    ZSTD_matchState_t * ms,                                                                \
-	    const BYTE* ip, const BYTE* const iLimit,                                              \
-	    size_t* offsetPtr) {                                                                   \
-		assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                                   \
-		assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog);                               \
-		return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
-	}
-
-#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
-	X(dictMode, mls, 4)                        \
-	X(dictMode, mls, 5)                        \
-	X(dictMode, mls, 6)
-
-#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
-	ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4)      \
-	ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5)      \
-	ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
-
-#define ZSTD_FOR_EACH_MLS(X, dictMode) \
-	X(dictMode, 4)                     \
-	X(dictMode, 5)                     \
-	X(dictMode, 6)
-
-#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
-	X(__VA_ARGS__, noDict)              \
-	X(__VA_ARGS__, extDict)             \
-	X(__VA_ARGS__, dictMatchState)      \
-	X(__VA_ARGS__, dedicatedDictSearch)
-
-/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
-/* Generate binary Tree search fns for each combination of (dictMode, mls) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
-/* Generate hash chain search fns for each combination of (dictMode, mls) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
-
-typedef enum { search_hashChain = 0,
-	           search_binaryTree = 1,
-	           search_rowHash = 2 } searchMethod_e;
-
-#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
-	case mls:                                     \
-		return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
-#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
-	case mls:                                     \
-		return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
-#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
-	case rowLog:                                           \
-		return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
-
-#define ZSTD_SWITCH_MLS(X, dictMode)   \
-	switch(mls) {                      \
-		ZSTD_FOR_EACH_MLS(X, dictMode) \
-	}
-
-#define ZSTD_SWITCH_ROWLOG(dictMode, mls)                                    \
-	case mls:                                                                \
-		switch(rowLog) {                                                     \
-			ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
-		}                                                                    \
-		ZSTD_UNREACHABLE;                                                    \
-		break;
-
-#define ZSTD_SWITCH_SEARCH_METHOD(dictMode)                   \
-	switch(searchMethod) {                                    \
-	case search_hashChain:                                    \
-		ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
-		break;                                                \
-	case search_binaryTree:                                   \
-		ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
-		break;                                                \
-	case search_rowHash:                                      \
-		ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode)         \
-		break;                                                \
-	}                                                         \
-	ZSTD_UNREACHABLE;
-
-/**
- * Searches for the longest match at @p ip.
- * Dispatches to the correct implementation function based on the
- * (searchMethod, dictMode, mls, rowLog). We use switch statements
- * here instead of using an indirect function call through a function
- * pointer because after Spectre and Meltdown mitigations, indirect
- * function calls can be very costly, especially in the kernel.
- *
- * NOTE: dictMode and searchMethod should be templated, so those switch
- * statements should be optimized out. Only the mls & rowLog switches
- * should be left.
- *
- * @param ms The match state.
- * @param ip The position to search at.
- * @param iend The end of the input data.
- * @param[out] offsetPtr Stores the match offset into this pointer.
- * @param mls The minimum search length, in the range [4, 6].
- * @param rowLog The row log (if applicable), in the range [4, 6].
- * @param searchMethod The search method to use (templated).
- * @param dictMode The dictMode (templated).
- *
- * @returns The length of the longest match found, or < mls if no match is found.
- * If a match is found its offset is stored in @p offsetPtr.
- */
-FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
-    ZSTD_matchState_t* ms,
-    const BYTE* ip,
-    const BYTE* iend,
-    size_t* offsetPtr,
-    U32 const mls,
-    U32 const rowLog,
-    searchMethod_e const searchMethod,
-    ZSTD_dictMode_e const dictMode) {
-	if(dictMode == ZSTD_noDict) {
-		ZSTD_SWITCH_SEARCH_METHOD(noDict)
-	} else if(dictMode == ZSTD_extDict) {
-		ZSTD_SWITCH_SEARCH_METHOD(extDict)
-	} else if(dictMode == ZSTD_dictMatchState) {
-		ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
-	} else if(dictMode == ZSTD_dedicatedDictSearch) {
-		ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
-	}
-	ZSTD_UNREACHABLE;
-	return 0;
-}
-
-/* *******************************
- *  Common parser - lazy strategy
- *********************************/
-
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_lazy_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore,
-    U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize,
-    const searchMethod_e searchMethod, const U32 depth,
-    ZSTD_dictMode_e const dictMode) {
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* ip = istart;
-	const BYTE* anchor = istart;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
-	const BYTE* const base = ms->window.base;
-	const U32 prefixLowestIndex = ms->window.dictLimit;
-	const BYTE* const prefixLowest = base + prefixLowestIndex;
-	const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
-	const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
-
-	U32 offset_1 = rep[0], offset_2 = rep[1];
-	U32 offsetSaved1 = 0, offsetSaved2 = 0;
-
-	const int isDMS = dictMode == ZSTD_dictMatchState;
-	const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
-	const int isDxS = isDMS || isDDS;
-	const ZSTD_matchState_t* const dms = ms->dictMatchState;
-	const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
-	const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
-	const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
-	const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
-	const U32 dictIndexDelta = isDxS ? prefixLowestIndex - (U32)(dictEnd - dictBase) : 0;
-	const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
-
-	DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
-	ip += (dictAndPrefixLength == 0);
-	if(dictMode == ZSTD_noDict) {
-		U32 const curr = (U32)(ip - base);
-		U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
-		U32 const maxRep = curr - windowLow;
-		if(offset_2 > maxRep)
-			offsetSaved2 = offset_2, offset_2 = 0;
-		if(offset_1 > maxRep)
-			offsetSaved1 = offset_1, offset_1 = 0;
-	}
-	if(isDxS) {
-		/* dictMatchState repCode checks don't currently handle repCode == 0
-		 * disabling. */
-		assert(offset_1 <= dictAndPrefixLength);
-		assert(offset_2 <= dictAndPrefixLength);
-	}
-
-	if(searchMethod == search_rowHash) {
-		ZSTD_row_fillHashCache(ms, base, rowLog,
-		                       MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-		                       ms->nextToUpdate, ilimit);
-	}
-
-	/* Match Loop */
-#if defined(__GNUC__) && defined(__x86_64__)
-	/* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
-	 * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
-	 */
-	__asm__(".p2align 5");
-#endif
-	while(ip < ilimit) {
-		size_t matchLength = 0;
-		size_t offBase = REPCODE1_TO_OFFBASE;
-		const BYTE* start = ip + 1;
-		DEBUGLOG(7, "search baseline (depth 0)");
-
-		/* check repCode */
-		if(isDxS) {
-			const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
-			const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch) && repIndex < prefixLowestIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex;
-			if(((U32)((prefixLowestIndex - 1) - repIndex) >= 3 /* intentional underflow */) && (MEM_read32(repMatch) == MEM_read32(ip + 1))) {
-				const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
-				matchLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixLowest) + 4;
-				if(depth == 0)
-					goto _storeSequence;
-			}
-		}
-		if(dictMode == ZSTD_noDict && ((offset_1 > 0) & (MEM_read32(ip + 1 - offset_1) == MEM_read32(ip + 1)))) {
-			matchLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4;
-			if(depth == 0)
-				goto _storeSequence;
-		}
-
-		/* first search (depth 0) */
-		{
-			size_t offbaseFound = 999999999;
-			size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
-			if(ml2 > matchLength)
-				matchLength = ml2, start = ip, offBase = offbaseFound;
-		}
-
-		if(matchLength < 4) {
-			ip += ((ip - anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
-			continue;
-		}
-
-		/* let's try to find a better solution */
-		if(depth >= 1)
-			while(ip < ilimit) {
-				DEBUGLOG(7, "search depth 1");
-				ip++;
-				if((dictMode == ZSTD_noDict) && (offBase) && ((offset_1 > 0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
-					size_t const mlRep = ZSTD_count(ip + 4, ip + 4 - offset_1, iend) + 4;
-					int const gain2 = (int)(mlRep * 3);
-					int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offBase) + 1);
-					if((mlRep >= 4) && (gain2 > gain1))
-						matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
-				}
-				if(isDxS) {
-					const U32 repIndex = (U32)(ip - base) - offset_1;
-					const BYTE* repMatch = repIndex < prefixLowestIndex ? dictBase + (repIndex - dictIndexDelta) : base + repIndex;
-					if(((U32)((prefixLowestIndex - 1) - repIndex) >= 3 /* intentional underflow */) && (MEM_read32(repMatch) == MEM_read32(ip))) {
-						const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
-						size_t const mlRep = ZSTD_count_2segments(ip + 4, repMatch + 4, iend, repMatchEnd, prefixLowest) + 4;
-						int const gain2 = (int)(mlRep * 3);
-						int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offBase) + 1);
-						if((mlRep >= 4) && (gain2 > gain1))
-							matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
-					}
-				}
-				{
-					size_t ofbCandidate = 999999999;
-					size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
-					int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
-					int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offBase) + 4);
-					if((ml2 >= 4) && (gain2 > gain1)) {
-						matchLength = ml2, offBase = ofbCandidate, start = ip;
-						continue; /* search a better one */
-					}
-				}
-
-				/* let's find an even better one */
-				if((depth == 2) && (ip < ilimit)) {
-					DEBUGLOG(7, "search depth 2");
-					ip++;
-					if((dictMode == ZSTD_noDict) && (offBase) && ((offset_1 > 0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
-						size_t const mlRep = ZSTD_count(ip + 4, ip + 4 - offset_1, iend) + 4;
-						int const gain2 = (int)(mlRep * 4);
-						int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offBase) + 1);
-						if((mlRep >= 4) && (gain2 > gain1))
-							matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
-					}
-					if(isDxS) {
-						const U32 repIndex = (U32)(ip - base) - offset_1;
-						const BYTE* repMatch = repIndex < prefixLowestIndex ? dictBase + (repIndex - dictIndexDelta) : base + repIndex;
-						if(((U32)((prefixLowestIndex - 1) - repIndex) >= 3 /* intentional underflow */) && (MEM_read32(repMatch) == MEM_read32(ip))) {
-							const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
-							size_t const mlRep = ZSTD_count_2segments(ip + 4, repMatch + 4, iend, repMatchEnd, prefixLowest) + 4;
-							int const gain2 = (int)(mlRep * 4);
-							int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offBase) + 1);
-							if((mlRep >= 4) && (gain2 > gain1))
-								matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
-						}
-					}
-					{
-						size_t ofbCandidate = 999999999;
-						size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
-						int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
-						int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offBase) + 7);
-						if((ml2 >= 4) && (gain2 > gain1)) {
-							matchLength = ml2, offBase = ofbCandidate, start = ip;
-							continue;
-						}
-					}
-				}
-				break; /* nothing found : store previous solution */
-			}
-
-		/* NOTE:
-		 * Pay attention that `start[-value]` can lead to strange undefined behavior
-		 * notably if `value` is unsigned, resulting in a large positive `-value`.
-		 */
-		/* catch up */
-		if(OFFBASE_IS_OFFSET(offBase)) {
-			if(dictMode == ZSTD_noDict) {
-				while(((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) && (start[-1] == (start - OFFBASE_TO_OFFSET(offBase))[-1])) /* only search for offset within prefix */
-				{
-					start--;
-					matchLength++;
-				}
-			}
-			if(isDxS) {
-				U32 const matchIndex = (U32)((size_t)(start - base) - OFFBASE_TO_OFFSET(offBase));
-				const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
-				const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
-				while((start > anchor) && (match > mStart) && (start[-1] == match[-1])) {
-					start--;
-					match--;
-					matchLength++;
-				} /* catch up */
-			}
-			offset_2 = offset_1;
-			offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
-		}
-		/* store sequence */
-	_storeSequence : {
-		size_t const litLength = (size_t)(start - anchor);
-		ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
-		anchor = ip = start + matchLength;
-	}
-
-		/* check immediate repcode */
-		if(isDxS) {
-			while(ip <= ilimit) {
-				U32 const current2 = (U32)(ip - base);
-				U32 const repIndex = current2 - offset_2;
-				const BYTE* repMatch = repIndex < prefixLowestIndex ? dictBase - dictIndexDelta + repIndex : base + repIndex;
-				if(((U32)((prefixLowestIndex - 1) - (U32)repIndex) >= 3 /* intentional overflow */) && (MEM_read32(repMatch) == MEM_read32(ip))) {
-					const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
-					matchLength = ZSTD_count_2segments(ip + 4, repMatch + 4, iend, repEnd2, prefixLowest) + 4;
-					offBase = offset_2;
-					offset_2 = offset_1;
-					offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
-					ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
-					ip += matchLength;
-					anchor = ip;
-					continue;
-				}
-				break;
-			}
-		}
-
-		if(dictMode == ZSTD_noDict) {
-			while(((ip <= ilimit) & (offset_2 > 0)) && (MEM_read32(ip) == MEM_read32(ip - offset_2))) {
-				/* store sequence */
-				matchLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4;
-				offBase = offset_2;
-				offset_2 = offset_1;
-				offset_1 = (U32)offBase; /* swap repcodes */
-				ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
-				ip += matchLength;
-				anchor = ip;
-				continue; /* faster when present ... (?) */
-			}
-		}
-	}
-
-	/* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
-	 * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
-	offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
-
-	/* save reps for next block */
-	rep[0] = offset_1 ? offset_1 : offsetSaved1;
-	rep[1] = offset_2 ? offset_2 : offsetSaved2;
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-}
-
-size_t ZSTD_compressBlock_btlazy2(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_lazy2(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_lazy(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_greedy(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_btlazy2_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_lazy2_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_lazy_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_greedy_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
-}
-
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
-}
-
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
-}
-
-/* Row-based matchfinder */
-size_t ZSTD_compressBlock_lazy2_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_lazy_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_greedy_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_lazy_dictMatchState_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_greedy_dictMatchState_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
-}
-
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
-}
-
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
-}
-
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_lazy_extDict_generic(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore,
-    U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize,
-    const searchMethod_e searchMethod, const U32 depth) {
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* ip = istart;
-	const BYTE* anchor = istart;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
-	const BYTE* const base = ms->window.base;
-	const U32 dictLimit = ms->window.dictLimit;
-	const BYTE* const prefixStart = base + dictLimit;
-	const BYTE* const dictBase = ms->window.dictBase;
-	const BYTE* const dictEnd = dictBase + dictLimit;
-	const BYTE* const dictStart = dictBase + ms->window.lowLimit;
-	const U32 windowLog = ms->cParams.windowLog;
-	const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
-	const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
-
-	U32 offset_1 = rep[0], offset_2 = rep[1];
-
-	DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
-
-	/* init */
-	ip += (ip == prefixStart);
-	if(searchMethod == search_rowHash) {
-		ZSTD_row_fillHashCache(ms, base, rowLog,
-		                       MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-		                       ms->nextToUpdate, ilimit);
-	}
-
-	/* Match Loop */
-#if defined(__GNUC__) && defined(__x86_64__)
-	/* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
-	 * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
-	 */
-	__asm__(".p2align 5");
-#endif
-	while(ip < ilimit) {
-		size_t matchLength = 0;
-		size_t offBase = REPCODE1_TO_OFFBASE;
-		const BYTE* start = ip + 1;
-		U32 curr = (U32)(ip - base);
-
-		/* check repCode */
-		{
-			const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr + 1, windowLog);
-			const U32 repIndex = (U32)(curr + 1 - offset_1);
-			const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
-			const BYTE* const repMatch = repBase + repIndex;
-			if(((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional overflow */
-			   & (offset_1 <= curr + 1 - windowLow))    /* note: we are searching at curr+1 */
-				if(MEM_read32(ip + 1) == MEM_read32(repMatch)) {
-					/* repcode detected we should take it */
-					const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-					matchLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repEnd, prefixStart) + 4;
-					if(depth == 0)
-						goto _storeSequence;
-				}
-		}
-
-		/* first search (depth 0) */
-		{
-			size_t ofbCandidate = 999999999;
-			size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
-			if(ml2 > matchLength)
-				matchLength = ml2, start = ip, offBase = ofbCandidate;
-		}
-
-		if(matchLength < 4) {
-			ip += ((ip - anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
-			continue;
-		}
-
-		/* let's try to find a better solution */
-		if(depth >= 1)
-			while(ip < ilimit) {
-				ip++;
-				curr++;
-				/* check repCode */
-				if(offBase) {
-					const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
-					const U32 repIndex = (U32)(curr - offset_1);
-					const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
-					const BYTE* const repMatch = repBase + repIndex;
-					if(((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
-					   & (offset_1 <= curr - windowLow))        /* equivalent to `curr > repIndex >= windowLow` */
-						if(MEM_read32(ip) == MEM_read32(repMatch)) {
-							/* repcode detected */
-							const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-							size_t const repLength = ZSTD_count_2segments(ip + 4, repMatch + 4, iend, repEnd, prefixStart) + 4;
-							int const gain2 = (int)(repLength * 3);
-							int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offBase) + 1);
-							if((repLength >= 4) && (gain2 > gain1))
-								matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
-						}
-				}
-
-				/* search match, depth 1 */
-				{
-					size_t ofbCandidate = 999999999;
-					size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
-					int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
-					int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offBase) + 4);
-					if((ml2 >= 4) && (gain2 > gain1)) {
-						matchLength = ml2, offBase = ofbCandidate, start = ip;
-						continue; /* search a better one */
-					}
-				}
-
-				/* let's find an even better one */
-				if((depth == 2) && (ip < ilimit)) {
-					ip++;
-					curr++;
-					/* check repCode */
-					if(offBase) {
-						const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
-						const U32 repIndex = (U32)(curr - offset_1);
-						const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
-						const BYTE* const repMatch = repBase + repIndex;
-						if(((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
-						   & (offset_1 <= curr - windowLow))        /* equivalent to `curr > repIndex >= windowLow` */
-							if(MEM_read32(ip) == MEM_read32(repMatch)) {
-								/* repcode detected */
-								const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-								size_t const repLength = ZSTD_count_2segments(ip + 4, repMatch + 4, iend, repEnd, prefixStart) + 4;
-								int const gain2 = (int)(repLength * 4);
-								int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offBase) + 1);
-								if((repLength >= 4) && (gain2 > gain1))
-									matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
-							}
-					}
-
-					/* search match, depth 2 */
-					{
-						size_t ofbCandidate = 999999999;
-						size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
-						int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
-						int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offBase) + 7);
-						if((ml2 >= 4) && (gain2 > gain1)) {
-							matchLength = ml2, offBase = ofbCandidate, start = ip;
-							continue;
-						}
-					}
-				}
-				break; /* nothing found : store previous solution */
-			}
-
-		/* catch up */
-		if(OFFBASE_IS_OFFSET(offBase)) {
-			U32 const matchIndex = (U32)((size_t)(start - base) - OFFBASE_TO_OFFSET(offBase));
-			const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
-			const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
-			while((start > anchor) && (match > mStart) && (start[-1] == match[-1])) {
-				start--;
-				match--;
-				matchLength++;
-			} /* catch up */
-			offset_2 = offset_1;
-			offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
-		}
-
-		/* store sequence */
-	_storeSequence : {
-		size_t const litLength = (size_t)(start - anchor);
-		ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
-		anchor = ip = start + matchLength;
-	}
-
-		/* check immediate repcode */
-		while(ip <= ilimit) {
-			const U32 repCurrent = (U32)(ip - base);
-			const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
-			const U32 repIndex = repCurrent - offset_2;
-			const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
-			const BYTE* const repMatch = repBase + repIndex;
-			if(((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
-			   & (offset_2 <= repCurrent - windowLow))  /* equivalent to `curr > repIndex >= windowLow` */
-				if(MEM_read32(ip) == MEM_read32(repMatch)) {
-					/* repcode detected we should take it */
-					const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-					matchLength = ZSTD_count_2segments(ip + 4, repMatch + 4, iend, repEnd, prefixStart) + 4;
-					offBase = offset_2;
-					offset_2 = offset_1;
-					offset_1 = (U32)offBase; /* swap offset history */
-					ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
-					ip += matchLength;
-					anchor = ip;
-					continue; /* faster when present ... (?) */
-				}
-			break;
-		}
-	}
-
-	/* Save reps for next block */
-	rep[0] = offset_1;
-	rep[1] = offset_2;
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-}
-
-size_t ZSTD_compressBlock_greedy_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
-}
-
-size_t ZSTD_compressBlock_lazy_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize)
-
-{
-	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
-}
-
-size_t ZSTD_compressBlock_lazy2_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize)
-
-{
-	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
-}
-
-size_t ZSTD_compressBlock_btlazy2_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize)
-
-{
-	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
-}
-
-size_t ZSTD_compressBlock_greedy_extDict_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
-}
-
-size_t ZSTD_compressBlock_lazy_extDict_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize)
-
-{
-	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
-}
-
-size_t ZSTD_compressBlock_lazy2_extDict_row(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize) {
-	return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
-}
diff --git a/src/zstd/zstd_lazy.h b/src/zstd/zstd_lazy.h
deleted file mode 100644
index 1de0e50b0..000000000
--- a/src/zstd/zstd_lazy.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_LAZY_H
-#define ZSTD_LAZY_H
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#include "zstd_compress_internal.h"
-
-/**
- * Dedicated Dictionary Search Structure bucket log. In the
- * ZSTD_dedicatedDictSearch mode, the hashTable has
- * 2 ** ZSTD_LAZY_DDSS_BUCKET_LOG entries in each bucket, rather than just
- * one.
- */
-#define ZSTD_LAZY_DDSS_BUCKET_LOG 2
-
-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
-
-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
-
-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
-
-void ZSTD_preserveUnsortedMark(U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */
-
-size_t ZSTD_compressBlock_btlazy2(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-
-size_t ZSTD_compressBlock_btlazy2_dictMatchState(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_dictMatchState(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dictMatchState(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dictMatchState(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_dictMatchState_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dictMatchState_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dictMatchState_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-
-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-
-size_t ZSTD_compressBlock_greedy_extDict(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_extDict(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_extDict(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_extDict_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_extDict_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_extDict_row(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_btlazy2_extDict(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* ZSTD_LAZY_H */
diff --git a/src/zstd/zstd_ldm.c b/src/zstd/zstd_ldm.c
deleted file mode 100644
index d7bf918dd..000000000
--- a/src/zstd/zstd_ldm.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#include "zstd_ldm.h"
-
-#include "debug.h"
-#include "xxhash.h"
-#include "zstd_fast.h"        /* ZSTD_fillHashTable() */
-#include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */
-#include "zstd_ldm_geartab.h"
-
-#define LDM_BUCKET_SIZE_LOG 3
-#define LDM_MIN_MATCH_LENGTH 64
-#define LDM_HASH_RLOG 7
-
-typedef struct {
-	U64 rolling;
-	U64 stopMask;
-} ldmRollingHashState_t;
-
-/** ZSTD_ldm_gear_init():
- *
- * Initializes the rolling hash state such that it will honor the
- * settings in params. */
-static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const * params) {
-	unsigned maxBitsInMask = MIN(params->minMatchLength, 64);
-	unsigned hashRateLog = params->hashRateLog;
-
-	state->rolling = ~(U32)0;
-
-	/* The choice of the splitting criterion is subject to two conditions:
-	 *   1. it has to trigger on average every 2^(hashRateLog) bytes;
-	 *   2. ideally, it has to depend on a window of minMatchLength bytes.
-	 *
-	 * In the gear hash algorithm, bit n depends on the last n bytes;
-	 * so in order to obtain a good quality splitting criterion it is
-	 * preferable to use bits with high weight.
-	 *
-	 * To match condition 1 we use a mask with hashRateLog bits set
-	 * and, because of the previous remark, we make sure these bits
-	 * have the highest possible weight while still respecting
-	 * condition 2.
-	 */
-	if(hashRateLog > 0 && hashRateLog <= maxBitsInMask) {
-		state->stopMask = (((U64)1 << hashRateLog) - 1) << (maxBitsInMask - hashRateLog);
-	} else {
-		/* In this degenerate case we simply honor the hash rate. */
-		state->stopMask = ((U64)1 << hashRateLog) - 1;
-	}
-}
-
-/** ZSTD_ldm_gear_reset()
- * Feeds [data, data + minMatchLength) into the hash without registering any
- * splits. This effectively resets the hash state. This is used when skipping
- * over data, either at the beginning of a block, or skipping sections.
- */
-static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state,
-                                BYTE const * data, size_t minMatchLength) {
-	U64 hash = state->rolling;
-	size_t n = 0;
-
-#define GEAR_ITER_ONCE()                                       \
-	do {                                                       \
-		hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
-		n += 1;                                                \
-	} while(0)
-	while(n + 3 < minMatchLength) {
-		GEAR_ITER_ONCE();
-		GEAR_ITER_ONCE();
-		GEAR_ITER_ONCE();
-		GEAR_ITER_ONCE();
-	}
-	while(n < minMatchLength) {
-		GEAR_ITER_ONCE();
-	}
-#undef GEAR_ITER_ONCE
-}
-
-/** ZSTD_ldm_gear_feed():
- *
- * Registers in the splits array all the split points found in the first
- * size bytes following the data pointer. This function terminates when
- * either all the data has been processed or LDM_BATCH_SIZE splits are
- * present in the splits array.
- *
- * Precondition: The splits array must not be full.
- * Returns: The number of bytes processed. */
-static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state,
-                                 BYTE const * data, size_t size,
-                                 size_t* splits, unsigned* numSplits) {
-	size_t n;
-	U64 hash, mask;
-
-	hash = state->rolling;
-	mask = state->stopMask;
-	n = 0;
-
-#define GEAR_ITER_ONCE()                                       \
-	do {                                                       \
-		hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
-		n += 1;                                                \
-		if(UNLIKELY((hash & mask) == 0)) {                     \
-			splits[*numSplits] = n;                            \
-			*numSplits += 1;                                   \
-			if(*numSplits == LDM_BATCH_SIZE)                   \
-				goto done;                                     \
-		}                                                      \
-	} while(0)
-
-	while(n + 3 < size) {
-		GEAR_ITER_ONCE();
-		GEAR_ITER_ONCE();
-		GEAR_ITER_ONCE();
-		GEAR_ITER_ONCE();
-	}
-	while(n < size) {
-		GEAR_ITER_ONCE();
-	}
-
-#undef GEAR_ITER_ONCE
-
-done:
-	state->rolling = hash;
-	return n;
-}
-
-void ZSTD_ldm_adjustParameters(ldmParams_t* params,
-                               ZSTD_compressionParameters const * cParams) {
-	params->windowLog = cParams->windowLog;
-	ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
-	DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
-	if(!params->bucketSizeLog)
-		params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
-	if(!params->minMatchLength)
-		params->minMatchLength = LDM_MIN_MATCH_LENGTH;
-	if(params->hashLog == 0) {
-		params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
-		assert(params->hashLog <= ZSTD_HASHLOG_MAX);
-	}
-	if(params->hashRateLog == 0) {
-		params->hashRateLog = params->windowLog < params->hashLog
-		                          ? 0
-		                          : params->windowLog - params->hashLog;
-	}
-	params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
-}
-
-size_t ZSTD_ldm_getTableSize(ldmParams_t params) {
-	size_t const ldmHSize = ((size_t)1) << params.hashLog;
-	size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog);
-	size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
-	size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t));
-	return params.enableLdm == ZSTD_ps_enable ? totalSize : 0;
-}
-
-size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) {
-	return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0;
-}
-
-/** ZSTD_ldm_getBucket() :
- *  Returns a pointer to the start of the bucket associated with hash. */
-static ldmEntry_t* ZSTD_ldm_getBucket(
-    ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) {
-	return ldmState->hashTable + (hash << ldmParams.bucketSizeLog);
-}
-
-/** ZSTD_ldm_insertEntry() :
- *  Insert the entry with corresponding hash into the hash table */
-static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
-                                 size_t const hash, const ldmEntry_t entry,
-                                 ldmParams_t const ldmParams) {
-	BYTE* const pOffset = ldmState->bucketOffsets + hash;
-	unsigned const offset = *pOffset;
-
-	*(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry;
-	*pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1));
-}
-
-/** ZSTD_ldm_countBackwardsMatch() :
- *  Returns the number of bytes that match backwards before pIn and pMatch.
- *
- *  We count only bytes where pMatch >= pBase and pIn >= pAnchor. */
-static size_t ZSTD_ldm_countBackwardsMatch(
-    const BYTE* pIn, const BYTE* pAnchor,
-    const BYTE* pMatch, const BYTE* pMatchBase) {
-	size_t matchLength = 0;
-	while(pIn > pAnchor && pMatch > pMatchBase && pIn[-1] == pMatch[-1]) {
-		pIn--;
-		pMatch--;
-		matchLength++;
-	}
-	return matchLength;
-}
-
-/** ZSTD_ldm_countBackwardsMatch_2segments() :
- *  Returns the number of bytes that match backwards from pMatch,
- *  even with the backwards match spanning 2 different segments.
- *
- *  On reaching `pMatchBase`, start counting from mEnd */
-static size_t ZSTD_ldm_countBackwardsMatch_2segments(
-    const BYTE* pIn, const BYTE* pAnchor,
-    const BYTE* pMatch, const BYTE* pMatchBase,
-    const BYTE* pExtDictStart, const BYTE* pExtDictEnd) {
-	size_t matchLength = ZSTD_ldm_countBackwardsMatch(pIn, pAnchor, pMatch, pMatchBase);
-	if(pMatch - matchLength != pMatchBase || pMatchBase == pExtDictStart) {
-		/* If backwards match is entirely in the extDict or prefix, immediately return */
-		return matchLength;
-	}
-	DEBUGLOG(7, "ZSTD_ldm_countBackwardsMatch_2segments: found 2-parts backwards match (length in prefix==%zu)", matchLength);
-	matchLength += ZSTD_ldm_countBackwardsMatch(pIn - matchLength, pAnchor, pExtDictEnd, pExtDictStart);
-	DEBUGLOG(7, "final backwards match length = %zu", matchLength);
-	return matchLength;
-}
-
-/** ZSTD_ldm_fillFastTables() :
- *
- *  Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies.
- *  This is similar to ZSTD_loadDictionaryContent.
- *
- *  The tables for the other strategies are filled within their
- *  block compressors. */
-static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
-                                      void const * end) {
-	const BYTE* const iend = (const BYTE*)end;
-
-	switch(ms->cParams.strategy) {
-	case ZSTD_fast:
-		ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
-		break;
-
-	case ZSTD_dfast:
-		ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
-		break;
-
-	case ZSTD_greedy:
-	case ZSTD_lazy:
-	case ZSTD_lazy2:
-	case ZSTD_btlazy2:
-	case ZSTD_btopt:
-	case ZSTD_btultra:
-	case ZSTD_btultra2:
-		break;
-	default:
-		assert(0); /* not possible : not a valid strategy id */
-	}
-
-	return 0;
-}
-
-void ZSTD_ldm_fillHashTable(
-    ldmState_t* ldmState, const BYTE* ip,
-    const BYTE* iend, ldmParams_t const * params) {
-	U32 const minMatchLength = params->minMatchLength;
-	U32 const hBits = params->hashLog - params->bucketSizeLog;
-	BYTE const * const base = ldmState->window.base;
-	BYTE const * const istart = ip;
-	ldmRollingHashState_t hashState;
-	size_t* const splits = ldmState->splitIndices;
-	unsigned numSplits;
-
-	DEBUGLOG(5, "ZSTD_ldm_fillHashTable");
-
-	ZSTD_ldm_gear_init(&hashState, params);
-	while(ip < iend) {
-		size_t hashed;
-		unsigned n;
-
-		numSplits = 0;
-		hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits);
-
-		for(n = 0; n < numSplits; n++) {
-			if(ip + splits[n] >= istart + minMatchLength) {
-				BYTE const * const split = ip + splits[n] - minMatchLength;
-				U64 const xxhash = XXH64(split, minMatchLength, 0);
-				U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1));
-				ldmEntry_t entry;
-
-				entry.offset = (U32)(split - base);
-				entry.checksum = (U32)(xxhash >> 32);
-				ZSTD_ldm_insertEntry(ldmState, hash, entry, *params);
-			}
-		}
-
-		ip += hashed;
-	}
-}
-
-/** ZSTD_ldm_limitTableUpdate() :
- *
- *  Sets cctx->nextToUpdate to a position corresponding closer to anchor
- *  if it is far way
- *  (after a long match, only update tables a limited amount). */
-static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) {
-	U32 const curr = (U32)(anchor - ms->window.base);
-	if(curr > ms->nextToUpdate + 1024) {
-		ms->nextToUpdate =
-		    curr - MIN(512, curr - ms->nextToUpdate - 1024);
-	}
-}
-
-static size_t ZSTD_ldm_generateSequences_internal(
-    ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
-    ldmParams_t const * params, void const * src, size_t srcSize) {
-	/* LDM parameters */
-	int const extDict = ZSTD_window_hasExtDict(ldmState->window);
-	U32 const minMatchLength = params->minMatchLength;
-	U32 const entsPerBucket = 1U << params->bucketSizeLog;
-	U32 const hBits = params->hashLog - params->bucketSizeLog;
-	/* Prefix and extDict parameters */
-	U32 const dictLimit = ldmState->window.dictLimit;
-	U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
-	BYTE const * const base = ldmState->window.base;
-	BYTE const * const dictBase = extDict ? ldmState->window.dictBase : NULL;
-	BYTE const * const dictStart = extDict ? dictBase + lowestIndex : NULL;
-	BYTE const * const dictEnd = extDict ? dictBase + dictLimit : NULL;
-	BYTE const * const lowPrefixPtr = base + dictLimit;
-	/* Input bounds */
-	BYTE const * const istart = (BYTE const *)src;
-	BYTE const * const iend = istart + srcSize;
-	BYTE const * const ilimit = iend - HASH_READ_SIZE;
-	/* Input positions */
-	BYTE const * anchor = istart;
-	BYTE const * ip = istart;
-	/* Rolling hash state */
-	ldmRollingHashState_t hashState;
-	/* Arrays for staged-processing */
-	size_t* const splits = ldmState->splitIndices;
-	ldmMatchCandidate_t* const candidates = ldmState->matchCandidates;
-	unsigned numSplits;
-
-	if(srcSize < minMatchLength)
-		return iend - anchor;
-
-	/* Initialize the rolling hash state with the first minMatchLength bytes */
-	ZSTD_ldm_gear_init(&hashState, params);
-	ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength);
-	ip += minMatchLength;
-
-	while(ip < ilimit) {
-		size_t hashed;
-		unsigned n;
-
-		numSplits = 0;
-		hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip,
-		                            splits, &numSplits);
-
-		for(n = 0; n < numSplits; n++) {
-			BYTE const * const split = ip + splits[n] - minMatchLength;
-			U64 const xxhash = XXH64(split, minMatchLength, 0);
-			U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1));
-
-			candidates[n].split = split;
-			candidates[n].hash = hash;
-			candidates[n].checksum = (U32)(xxhash >> 32);
-			candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params);
-			PREFETCH_L1(candidates[n].bucket);
-		}
-
-		for(n = 0; n < numSplits; n++) {
-			size_t forwardMatchLength = 0, backwardMatchLength = 0,
-			       bestMatchLength = 0, mLength;
-			U32 offset;
-			BYTE const * const split = candidates[n].split;
-			U32 const checksum = candidates[n].checksum;
-			U32 const hash = candidates[n].hash;
-			ldmEntry_t* const bucket = candidates[n].bucket;
-			ldmEntry_t const * cur;
-			ldmEntry_t const * bestEntry = NULL;
-			ldmEntry_t newEntry;
-
-			newEntry.offset = (U32)(split - base);
-			newEntry.checksum = checksum;
-
-			/* If a split point would generate a sequence overlapping with
-			 * the previous one, we merely register it in the hash table and
-			 * move on */
-			if(split < anchor) {
-				ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
-				continue;
-			}
-
-			for(cur = bucket; cur < bucket + entsPerBucket; cur++) {
-				size_t curForwardMatchLength, curBackwardMatchLength,
-				    curTotalMatchLength;
-				if(cur->checksum != checksum || cur->offset <= lowestIndex) {
-					continue;
-				}
-				if(extDict) {
-					BYTE const * const curMatchBase =
-					    cur->offset < dictLimit ? dictBase : base;
-					BYTE const * const pMatch = curMatchBase + cur->offset;
-					BYTE const * const matchEnd =
-					    cur->offset < dictLimit ? dictEnd : iend;
-					BYTE const * const lowMatchPtr =
-					    cur->offset < dictLimit ? dictStart : lowPrefixPtr;
-					curForwardMatchLength =
-					    ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr);
-					if(curForwardMatchLength < minMatchLength) {
-						continue;
-					}
-					curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments(
-					    split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd);
-				} else { /* !extDict */
-					BYTE const * const pMatch = base + cur->offset;
-					curForwardMatchLength = ZSTD_count(split, pMatch, iend);
-					if(curForwardMatchLength < minMatchLength) {
-						continue;
-					}
-					curBackwardMatchLength =
-					    ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr);
-				}
-				curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength;
-
-				if(curTotalMatchLength > bestMatchLength) {
-					bestMatchLength = curTotalMatchLength;
-					forwardMatchLength = curForwardMatchLength;
-					backwardMatchLength = curBackwardMatchLength;
-					bestEntry = cur;
-				}
-			}
-
-			/* No match found -- insert an entry into the hash table
-			 * and process the next candidate match */
-			if(bestEntry == NULL) {
-				ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
-				continue;
-			}
-
-			/* Match found */
-			offset = (U32)(split - base) - bestEntry->offset;
-			mLength = forwardMatchLength + backwardMatchLength;
-			{
-				rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
-
-				/* Out of sequence storage */
-				if(rawSeqStore->size == rawSeqStore->capacity)
-					return ERROR(dstSize_tooSmall);
-				seq->litLength = (U32)(split - backwardMatchLength - anchor);
-				seq->matchLength = (U32)mLength;
-				seq->offset = offset;
-				rawSeqStore->size++;
-			}
-
-			/* Insert the current entry into the hash table --- it must be
-			 * done after the previous block to avoid clobbering bestEntry */
-			ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
-
-			anchor = split + forwardMatchLength;
-
-			/* If we find a match that ends after the data that we've hashed
-			 * then we have a repeating, overlapping, pattern. E.g. all zeros.
-			 * If one repetition of the pattern matches our `stopMask` then all
-			 * repetitions will. We don't need to insert them all into out table,
-			 * only the first one. So skip over overlapping matches.
-			 * This is a major speed boost (20x) for compressing a single byte
-			 * repeated, when that byte ends up in the table.
-			 */
-			if(anchor > ip + hashed) {
-				ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength);
-				/* Continue the outer loop at anchor (ip + hashed == anchor). */
-				ip = anchor - hashed;
-				break;
-			}
-		}
-
-		ip += hashed;
-	}
-
-	return iend - anchor;
-}
-
-/*! ZSTD_ldm_reduceTable() :
- *  reduce table indexes by `reducerValue` */
-static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
-                                 U32 const reducerValue) {
-	U32 u;
-	for(u = 0; u < size; u++) {
-		if(table[u].offset < reducerValue)
-			table[u].offset = 0;
-		else
-			table[u].offset -= reducerValue;
-	}
-}
-
-size_t ZSTD_ldm_generateSequences(
-    ldmState_t* ldmState, rawSeqStore_t* sequences,
-    ldmParams_t const * params, void const * src, size_t srcSize) {
-	U32 const maxDist = 1U << params->windowLog;
-	BYTE const * const istart = (BYTE const *)src;
-	BYTE const * const iend = istart + srcSize;
-	size_t const kMaxChunkSize = 1 << 20;
-	size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);
-	size_t chunk;
-	size_t leftoverSize = 0;
-
-	assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
-	/* Check that ZSTD_window_update() has been called for this chunk prior
-	 * to passing it to this function.
-	 */
-	assert(ldmState->window.nextSrc >= (BYTE const *)src + srcSize);
-	/* The input could be very large (in zstdmt), so it must be broken up into
-	 * chunks to enforce the maximum distance and handle overflow correction.
-	 */
-	assert(sequences->pos <= sequences->size);
-	assert(sequences->size <= sequences->capacity);
-	for(chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
-		BYTE const * const chunkStart = istart + chunk * kMaxChunkSize;
-		size_t const remaining = (size_t)(iend - chunkStart);
-		BYTE const * const chunkEnd =
-		    (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
-		size_t const chunkSize = chunkEnd - chunkStart;
-		size_t newLeftoverSize;
-		size_t const prevSize = sequences->size;
-
-		assert(chunkStart < iend);
-		/* 1. Perform overflow correction if necessary. */
-		if(ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) {
-			U32 const ldmHSize = 1U << params->hashLog;
-			U32 const correction = ZSTD_window_correctOverflow(
-			    &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart);
-			ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
-			/* invalidate dictionaries on overflow correction */
-			ldmState->loadedDictEnd = 0;
-		}
-		/* 2. We enforce the maximum offset allowed.
-		 *
-		 * kMaxChunkSize should be small enough that we don't lose too much of
-		 * the window through early invalidation.
-		 * TODO: * Test the chunk size.
-		 *       * Try invalidation after the sequence generation and test the
-		 *         offset against maxDist directly.
-		 *
-		 * NOTE: Because of dictionaries + sequence splitting we MUST make sure
-		 * that any offset used is valid at the END of the sequence, since it may
-		 * be split into two sequences. This condition holds when using
-		 * ZSTD_window_enforceMaxDist(), but if we move to checking offsets
-		 * against maxDist directly, we'll have to carefully handle that case.
-		 */
-		ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL);
-		/* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
-		newLeftoverSize = ZSTD_ldm_generateSequences_internal(
-		    ldmState, sequences, params, chunkStart, chunkSize);
-		if(ZSTD_isError(newLeftoverSize))
-			return newLeftoverSize;
-		/* 4. We add the leftover literals from previous iterations to the first
-		 *    newly generated sequence, or add the `newLeftoverSize` if none are
-		 *    generated.
-		 */
-		/* Prepend the leftover literals from the last call */
-		if(prevSize < sequences->size) {
-			sequences->seq[prevSize].litLength += (U32)leftoverSize;
-			leftoverSize = newLeftoverSize;
-		} else {
-			assert(newLeftoverSize == chunkSize);
-			leftoverSize += chunkSize;
-		}
-	}
-	return 0;
-}
-
-void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
-	while(srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
-		rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
-		if(srcSize <= seq->litLength) {
-			/* Skip past srcSize literals */
-			seq->litLength -= (U32)srcSize;
-			return;
-		}
-		srcSize -= seq->litLength;
-		seq->litLength = 0;
-		if(srcSize < seq->matchLength) {
-			/* Skip past the first srcSize of the match */
-			seq->matchLength -= (U32)srcSize;
-			if(seq->matchLength < minMatch) {
-				/* The match is too short, omit it */
-				if(rawSeqStore->pos + 1 < rawSeqStore->size) {
-					seq[1].litLength += seq[0].matchLength;
-				}
-				rawSeqStore->pos++;
-			}
-			return;
-		}
-		srcSize -= seq->matchLength;
-		seq->matchLength = 0;
-		rawSeqStore->pos++;
-	}
-}
-
-/**
- * If the sequence length is longer than remaining then the sequence is split
- * between this block and the next.
- *
- * Returns the current sequence to handle, or if the rest of the block should
- * be literals, it returns a sequence with offset == 0.
- */
-static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
-                                 U32 const remaining, U32 const minMatch) {
-	rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos];
-	assert(sequence.offset > 0);
-	/* Likely: No partial sequence */
-	if(remaining >= sequence.litLength + sequence.matchLength) {
-		rawSeqStore->pos++;
-		return sequence;
-	}
-	/* Cut the sequence short (offset == 0 ==> rest is literals). */
-	if(remaining <= sequence.litLength) {
-		sequence.offset = 0;
-	} else if(remaining < sequence.litLength + sequence.matchLength) {
-		sequence.matchLength = remaining - sequence.litLength;
-		if(sequence.matchLength < minMatch) {
-			sequence.offset = 0;
-		}
-	}
-	/* Skip past `remaining` bytes for the future sequences. */
-	ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
-	return sequence;
-}
-
-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
-	U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes);
-	while(currPos && rawSeqStore->pos < rawSeqStore->size) {
-		rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos];
-		if(currPos >= currSeq.litLength + currSeq.matchLength) {
-			currPos -= currSeq.litLength + currSeq.matchLength;
-			rawSeqStore->pos++;
-		} else {
-			rawSeqStore->posInSequence = currPos;
-			break;
-		}
-	}
-	if(currPos == 0 || rawSeqStore->pos == rawSeqStore->size) {
-		rawSeqStore->posInSequence = 0;
-	}
-}
-
-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
-                              ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-                              ZSTD_paramSwitch_e useRowMatchFinder,
-                              void const * src, size_t srcSize) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	unsigned const minMatch = cParams->minMatch;
-	ZSTD_blockCompressor const blockCompressor =
-	    ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms));
-	/* Input bounds */
-	BYTE const * const istart = (BYTE const *)src;
-	BYTE const * const iend = istart + srcSize;
-	/* Input positions */
-	BYTE const * ip = istart;
-
-	DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
-	/* If using opt parser, use LDMs only as candidates rather than always accepting them */
-	if(cParams->strategy >= ZSTD_btopt) {
-		size_t lastLLSize;
-		ms->ldmSeqStore = rawSeqStore;
-		lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize);
-		ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore, srcSize);
-		return lastLLSize;
-	}
-
-	assert(rawSeqStore->pos <= rawSeqStore->size);
-	assert(rawSeqStore->size <= rawSeqStore->capacity);
-	/* Loop through each sequence and apply the block compressor to the literals */
-	while(rawSeqStore->pos < rawSeqStore->size && ip < iend) {
-		/* maybeSplitSequence updates rawSeqStore->pos */
-		rawSeq const sequence = maybeSplitSequence(rawSeqStore,
-		                                           (U32)(iend - ip), minMatch);
-		int i;
-		/* End signal */
-		if(sequence.offset == 0)
-			break;
-
-		assert(ip + sequence.litLength + sequence.matchLength <= iend);
-
-		/* Fill tables for block compressor */
-		ZSTD_ldm_limitTableUpdate(ms, ip);
-		ZSTD_ldm_fillFastTables(ms, ip);
-		/* Run the block compressor */
-		DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip - istart), sequence.litLength);
-		{
-			size_t const newLitLength =
-			    blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
-			ip += sequence.litLength;
-			/* Update the repcodes */
-			for(i = ZSTD_REP_NUM - 1; i > 0; i--)
-				rep[i] = rep[i - 1];
-			rep[0] = sequence.offset;
-			/* Store the sequence */
-			ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
-			              OFFSET_TO_OFFBASE(sequence.offset),
-			              sequence.matchLength);
-			ip += sequence.matchLength;
-		}
-	}
-	/* Fill the tables for the block compressor */
-	ZSTD_ldm_limitTableUpdate(ms, ip);
-	ZSTD_ldm_fillFastTables(ms, ip);
-	/* Compress the last literals */
-	return blockCompressor(ms, seqStore, rep, ip, iend - ip);
-}
diff --git a/src/zstd/zstd_opt.c b/src/zstd/zstd_opt.c
deleted file mode 100644
index 39d198151..000000000
--- a/src/zstd/zstd_opt.c
+++ /dev/null
@@ -1,1483 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#include "zstd_compress_internal.h"
-#include "hist.h"
-#include "zstd_opt.h"
-
-#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
-#define ZSTD_MAX_PRICE (1 << 30)
-
-#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
-
-/*-*************************************
- *  Price functions for optimal parser
- ***************************************/
-
-#if 0 /* approximation at bit level (for tests) */
-#define BITCOST_ACCURACY 0
-#define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
-#elif 0 /* fractional bit accuracy (for tests) */
-#define BITCOST_ACCURACY 8
-#define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#define WEIGHT(stat, opt) ((void)(opt), ZSTD_fracWeight(stat))
-#else /* opt==approx, ultra==accurate */
-#define BITCOST_ACCURACY 8
-#define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#define WEIGHT(stat, opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
-#endif
-
-/* ZSTD_bitWeight() :
- * provide estimated "cost" of a stat in full bits only */
-MEM_STATIC U32 ZSTD_bitWeight(U32 stat) {
-	return (ZSTD_highbit32(stat + 1) * BITCOST_MULTIPLIER);
-}
-
-/* ZSTD_fracWeight() :
- * provide fractional-bit "cost" of a stat,
- * using linear interpolation approximation */
-MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) {
-	U32 const stat = rawStat + 1;
-	U32 const hb = ZSTD_highbit32(stat);
-	U32 const BWeight = hb * BITCOST_MULTIPLIER;
-	/* Fweight was meant for "Fractional weight"
-	 * but it's effectively a value between 1 and 2
-	 * using fixed point arithmetic */
-	U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
-	U32 const weight = BWeight + FWeight;
-	assert(hb + BITCOST_ACCURACY < 31);
-	return weight;
-}
-
-#if(DEBUGLEVEL >= 2)
-/* debugging function,
- * @return price in bytes as fractional value
- * for debug messages only */
-MEM_STATIC double ZSTD_fCost(int price) {
-	return (double)price / (BITCOST_MULTIPLIER * 8);
-}
-#endif
-
-static int ZSTD_compressedLiterals(optState_t const * const optPtr) {
-	return optPtr->literalCompressionMode != ZSTD_ps_disable;
-}
-
-static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) {
-	if(ZSTD_compressedLiterals(optPtr))
-		optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel);
-	optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel);
-	optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel);
-	optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel);
-}
-
-static U32 sum_u32(const unsigned table[], size_t nbElts) {
-	size_t n;
-	U32 total = 0;
-	for(n = 0; n < nbElts; n++) {
-		total += table[n];
-	}
-	return total;
-}
-
-typedef enum { base_0possible = 0,
-	           base_1guaranteed = 1 } base_directive_e;
-
-static U32
-ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) {
-	U32 s, sum = 0;
-	DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
-	         (unsigned)lastEltIndex + 1, (unsigned)shift);
-	assert(shift < 30);
-	for(s = 0; s < lastEltIndex + 1; s++) {
-		unsigned const base = base1 ? 1 : (table[s] > 0);
-		unsigned const newStat = base + (table[s] >> shift);
-		sum += newStat;
-		table[s] = newStat;
-	}
-	return sum;
-}
-
-/* ZSTD_scaleStats() :
- * reduce all elt frequencies in table if sum too large
- * return the resulting sum of elements */
-static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) {
-	U32 const prevsum = sum_u32(table, lastEltIndex + 1);
-	U32 const factor = prevsum >> logTarget;
-	DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex + 1, (unsigned)logTarget);
-	assert(logTarget < 30);
-	if(factor <= 1)
-		return prevsum;
-	return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
-}
-
-/* ZSTD_rescaleFreqs() :
- * if first block (detected by optPtr->litLengthSum == 0) : init statistics
- *    take hints from dictionary if there is one
- *    and init from zero if there is none,
- *    using src for literals stats, and baseline stats for sequence symbols
- * otherwise downscale existing stats, to be used as seed for next block.
- */
-static void
-ZSTD_rescaleFreqs(optState_t* const optPtr,
-                  const BYTE* const src, size_t const srcSize,
-                  int const optLevel) {
-	int const compressedLiterals = ZSTD_compressedLiterals(optPtr);
-	DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
-	optPtr->priceType = zop_dynamic;
-
-	if(optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */
-
-		/* heuristic: use pre-defined stats for too small inputs */
-		if(srcSize <= ZSTD_PREDEF_THRESHOLD) {
-			DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
-			optPtr->priceType = zop_predef;
-		}
-
-		assert(optPtr->symbolCosts != NULL);
-		if(optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
-
-			/* huffman stats covering the full value set : table presumed generated by dictionary */
-			optPtr->priceType = zop_dynamic;
-
-			if(compressedLiterals) {
-				/* generate literals statistics from huffman table */
-				unsigned lit;
-				assert(optPtr->litFreq != NULL);
-				optPtr->litSum = 0;
-				for(lit = 0; lit <= MaxLit; lit++) {
-					U32 const scaleLog = 11; /* scale to 2K */
-					U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit);
-					assert(bitCost <= scaleLog);
-					optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog - bitCost) : 1 /*minimum to calculate cost*/;
-					optPtr->litSum += optPtr->litFreq[lit];
-				}
-			}
-
-			{
-				unsigned ll;
-				FSE_CState_t llstate;
-				FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable);
-				optPtr->litLengthSum = 0;
-				for(ll = 0; ll <= MaxLL; ll++) {
-					U32 const scaleLog = 10; /* scale to 1K */
-					U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
-					assert(bitCost < scaleLog);
-					optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog - bitCost) : 1 /*minimum to calculate cost*/;
-					optPtr->litLengthSum += optPtr->litLengthFreq[ll];
-				}
-			}
-
-			{
-				unsigned ml;
-				FSE_CState_t mlstate;
-				FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable);
-				optPtr->matchLengthSum = 0;
-				for(ml = 0; ml <= MaxML; ml++) {
-					U32 const scaleLog = 10;
-					U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
-					assert(bitCost < scaleLog);
-					optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog - bitCost) : 1 /*minimum to calculate cost*/;
-					optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
-				}
-			}
-
-			{
-				unsigned of;
-				FSE_CState_t ofstate;
-				FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable);
-				optPtr->offCodeSum = 0;
-				for(of = 0; of <= MaxOff; of++) {
-					U32 const scaleLog = 10;
-					U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
-					assert(bitCost < scaleLog);
-					optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog - bitCost) : 1 /*minimum to calculate cost*/;
-					optPtr->offCodeSum += optPtr->offCodeFreq[of];
-				}
-			}
-
-		} else { /* first block, no dictionary */
-
-			assert(optPtr->litFreq != NULL);
-			if(compressedLiterals) {
-				/* base initial cost of literals on direct frequency within src */
-				unsigned lit = MaxLit;
-				HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
-				optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
-			}
-
-			{
-				unsigned const baseLLfreqs[MaxLL + 1] = {
-				    4, 2, 1, 1, 1, 1, 1, 1,
-				    1, 1, 1, 1, 1, 1, 1, 1,
-				    1, 1, 1, 1, 1, 1, 1, 1,
-				    1, 1, 1, 1, 1, 1, 1, 1,
-				    1, 1, 1, 1};
-				ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs));
-				optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL + 1);
-			}
-
-			{
-				unsigned ml;
-				for(ml = 0; ml <= MaxML; ml++)
-					optPtr->matchLengthFreq[ml] = 1;
-			}
-			optPtr->matchLengthSum = MaxML + 1;
-
-			{
-				unsigned const baseOFCfreqs[MaxOff + 1] = {
-				    6, 2, 1, 1, 2, 3, 4, 4,
-				    4, 3, 2, 1, 1, 1, 1, 1,
-				    1, 1, 1, 1, 1, 1, 1, 1,
-				    1, 1, 1, 1, 1, 1, 1, 1};
-				ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs));
-				optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff + 1);
-			}
-		}
-
-	} else { /* new block : scale down accumulated statistics */
-
-		if(compressedLiterals)
-			optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
-		optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11);
-		optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11);
-		optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11);
-	}
-
-	ZSTD_setBasePrices(optPtr, optLevel);
-}
-
-/* ZSTD_rawLiteralsCost() :
- * price of literals (only) in specified segment (which length can be 0).
- * does not include price of literalLength symbol */
-static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
-                                const optState_t* const optPtr,
-                                int optLevel) {
-	if(litLength == 0)
-		return 0;
-
-	if(!ZSTD_compressedLiterals(optPtr))
-		return (litLength << 3) * BITCOST_MULTIPLIER; /* Uncompressed - 8 bytes per literal. */
-
-	if(optPtr->priceType == zop_predef)
-		return (litLength * 6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */
-
-	/* dynamic statistics */
-	{
-		U32 price = optPtr->litSumBasePrice * litLength;
-		U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
-		U32 u;
-		assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
-		for(u = 0; u < litLength; u++) {
-			U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
-			if(UNLIKELY(litPrice > litPriceMax))
-				litPrice = litPriceMax;
-			price -= litPrice;
-		}
-		return price;
-	}
-}
-
-/* ZSTD_litLengthPrice() :
- * cost of literalLength symbol */
-static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) {
-	assert(litLength <= ZSTD_BLOCKSIZE_MAX);
-	if(optPtr->priceType == zop_predef)
-		return WEIGHT(litLength, optLevel);
-
-	/* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
-	 * because it isn't representable in the zstd format.
-	 * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
-	 * In such a case, the block would be all literals.
-	 */
-	if(litLength == ZSTD_BLOCKSIZE_MAX)
-		return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
-
-	/* dynamic statistics */
-	{
-		U32 const llCode = ZSTD_LLcode(litLength);
-		return (LL_bits[llCode] * BITCOST_MULTIPLIER) + optPtr->litLengthSumBasePrice - WEIGHT(optPtr->litLengthFreq[llCode], optLevel);
-	}
-}
-
-/* ZSTD_getMatchPrice() :
- * Provides the cost of the match part (offset + matchLength) of a sequence.
- * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
- * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
- * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
- */
-FORCE_INLINE_TEMPLATE U32
-ZSTD_getMatchPrice(U32 const offBase,
-                   U32 const matchLength,
-                   const optState_t* const optPtr,
-                   int const optLevel) {
-	U32 price;
-	U32 const offCode = ZSTD_highbit32(offBase);
-	U32 const mlBase = matchLength - MINMATCH;
-	assert(matchLength >= MINMATCH);
-
-	if(optPtr->priceType == zop_predef)                                          /* fixed scheme, does not use statistics */
-		return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
-
-	/* dynamic statistics */
-	price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
-	if((optLevel < 2) /*static*/ && offCode >= 20)
-		price += (offCode - 19) * 2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */
-
-	/* match Length */
-	{
-		U32 const mlCode = ZSTD_MLcode(mlBase);
-		price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel));
-	}
-
-	price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor less sequences -> faster decompression speed */
-
-	DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price);
-	return price;
-}
-
-/* ZSTD_updateStats() :
- * assumption : literals + litLength <= iend */
-static void ZSTD_updateStats(optState_t* const optPtr,
-                             U32 litLength, const BYTE* literals,
-                             U32 offBase, U32 matchLength) {
-	/* literals */
-	if(ZSTD_compressedLiterals(optPtr)) {
-		U32 u;
-		for(u = 0; u < litLength; u++)
-			optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
-		optPtr->litSum += litLength * ZSTD_LITFREQ_ADD;
-	}
-
-	/* literal Length */
-	{
-		U32 const llCode = ZSTD_LLcode(litLength);
-		optPtr->litLengthFreq[llCode]++;
-		optPtr->litLengthSum++;
-	}
-
-	/* offset code : follows storeSeq() numeric representation */
-	{
-		U32 const offCode = ZSTD_highbit32(offBase);
-		assert(offCode <= MaxOff);
-		optPtr->offCodeFreq[offCode]++;
-		optPtr->offCodeSum++;
-	}
-
-	/* match Length */
-	{
-		U32 const mlBase = matchLength - MINMATCH;
-		U32 const mlCode = ZSTD_MLcode(mlBase);
-		optPtr->matchLengthFreq[mlCode]++;
-		optPtr->matchLengthSum++;
-	}
-}
-
-/* ZSTD_readMINMATCH() :
- * function safe only for comparisons
- * assumption : memPtr must be at least 4 bytes before end of buffer */
-MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) {
-	switch(length) {
-	default:
-	case 4:
-		return MEM_read32(memPtr);
-	case 3:
-		if(MEM_isLittleEndian())
-			return MEM_read32(memPtr) << 8;
-		else
-			return MEM_read32(memPtr) >> 8;
-	}
-}
-
-/* Update hashTable3 up to ip (excluded)
-   Assumption : always within prefix (i.e. not within extDict) */
-static U32 ZSTD_insertAndFindFirstIndexHash3(const ZSTD_matchState_t* ms,
-                                             U32* nextToUpdate3,
-                                             const BYTE* const ip) {
-	U32* const hashTable3 = ms->hashTable3;
-	U32 const hashLog3 = ms->hashLog3;
-	const BYTE* const base = ms->window.base;
-	U32 idx = *nextToUpdate3;
-	U32 const target = (U32)(ip - base);
-	size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3);
-	assert(hashLog3 > 0);
-
-	while(idx < target) {
-		hashTable3[ZSTD_hash3Ptr(base + idx, hashLog3)] = idx;
-		idx++;
-	}
-
-	*nextToUpdate3 = target;
-	return hashTable3[hash3];
-}
-
-/*-*************************************
- *  Binary Tree search
- ***************************************/
-/** ZSTD_insertBt1() : add one or multiple positions to tree.
- * @param ip assumed <= iend-8 .
- * @param target The target of ZSTD_updateTree_internal() - we are filling to this position
- * @return : nb of positions added */
-static U32 ZSTD_insertBt1(
-    const ZSTD_matchState_t* ms,
-    const BYTE* const ip, const BYTE* const iend,
-    U32 const target,
-    U32 const mls, const int extDict) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32* const hashTable = ms->hashTable;
-	U32 const hashLog = cParams->hashLog;
-	size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
-	U32* const bt = ms->chainTable;
-	U32 const btLog = cParams->chainLog - 1;
-	U32 const btMask = (1 << btLog) - 1;
-	U32 matchIndex = hashTable[h];
-	size_t commonLengthSmaller = 0, commonLengthLarger = 0;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const dictBase = ms->window.dictBase;
-	const U32 dictLimit = ms->window.dictLimit;
-	const BYTE* const dictEnd = dictBase + dictLimit;
-	const BYTE* const prefixStart = base + dictLimit;
-	const BYTE* match;
-	const U32 curr = (U32)(ip - base);
-	const U32 btLow = btMask >= curr ? 0 : curr - btMask;
-	U32* smallerPtr = bt + 2 * (curr & btMask);
-	U32* largerPtr = smallerPtr + 1;
-	U32 dummy32; /* to be nullified at the end */
-	/* windowLow is based on target because
-	 * we only need positions that will be in the window at the end of the tree update.
-	 */
-	U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog);
-	U32 matchEndIdx = curr + 8 + 1;
-	size_t bestLength = 8;
-	U32 nbCompares = 1U << cParams->searchLog;
-#ifdef ZSTD_C_PREDICT
-	U32 predictedSmall = *(bt + 2 * ((curr - 1) & btMask) + 0);
-	U32 predictedLarge = *(bt + 2 * ((curr - 1) & btMask) + 1);
-	predictedSmall += (predictedSmall > 0);
-	predictedLarge += (predictedLarge > 0);
-#endif /* ZSTD_C_PREDICT */
-
-	DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr);
-
-	assert(curr <= target);
-	assert(ip <= iend - 8); /* required for h calculation */
-	hashTable[h] = curr;    /* Update Hash Table */
-
-	assert(windowLow > 0);
-	for(; nbCompares && (matchIndex >= windowLow); --nbCompares) {
-		U32* const nextPtr = bt + 2 * (matchIndex & btMask);
-		size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
-		assert(matchIndex < curr);
-
-#ifdef ZSTD_C_PREDICT                                                 /* note : can create issues when hlog small <= 11 */
-		const U32* predictPtr = bt + 2 * ((matchIndex - 1) & btMask); /* written this way, as bt is a roll buffer */
-		if(matchIndex == predictedSmall) {
-			/* no need to check length, result known */
-			*smallerPtr = matchIndex;
-			if(matchIndex <= btLow) {
-				smallerPtr = &dummy32;
-				break;
-			}                         /* beyond tree size, stop the search */
-			smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */
-			matchIndex = nextPtr[1];  /* new matchIndex larger than previous (closer to current) */
-			predictedSmall = predictPtr[1] + (predictPtr[1] > 0);
-			continue;
-		}
-		if(matchIndex == predictedLarge) {
-			*largerPtr = matchIndex;
-			if(matchIndex <= btLow) {
-				largerPtr = &dummy32;
-				break;
-			} /* beyond tree size, stop the search */
-			largerPtr = nextPtr;
-			matchIndex = nextPtr[0];
-			predictedLarge = predictPtr[0] + (predictPtr[0] > 0);
-			continue;
-		}
-#endif
-
-		if(!extDict || (matchIndex + matchLength >= dictLimit)) {
-			assert(matchIndex + matchLength >= dictLimit); /* might be wrong if actually extDict */
-			match = base + matchIndex;
-			matchLength += ZSTD_count(ip + matchLength, match + matchLength, iend);
-		} else {
-			match = dictBase + matchIndex;
-			matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
-			if(matchIndex + matchLength >= dictLimit)
-				match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
-		}
-
-		if(matchLength > bestLength) {
-			bestLength = matchLength;
-			if(matchLength > matchEndIdx - matchIndex)
-				matchEndIdx = matchIndex + (U32)matchLength;
-		}
-
-		if(ip + matchLength == iend) { /* equal : no way to know if inf or sup */
-			break;                     /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
-		}
-
-		if(match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
-			/* match is smaller than current */
-			*smallerPtr = matchIndex;          /* update smaller idx */
-			commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
-			if(matchIndex <= btLow) {
-				smallerPtr = &dummy32;
-				break;
-			}                         /* beyond tree size, stop searching */
-			smallerPtr = nextPtr + 1; /* new "candidate" => larger than match, which was smaller than target */
-			matchIndex = nextPtr[1];  /* new matchIndex, larger than previous and closer to current */
-		} else {
-			/* match is larger than current */
-			*largerPtr = matchIndex;
-			commonLengthLarger = matchLength;
-			if(matchIndex <= btLow) {
-				largerPtr = &dummy32;
-				break;
-			} /* beyond tree size, stop searching */
-			largerPtr = nextPtr;
-			matchIndex = nextPtr[0];
-		}
-	}
-
-	*smallerPtr = *largerPtr = 0;
-	{
-		U32 positions = 0;
-		if(bestLength > 384)
-			positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */
-		assert(matchEndIdx > curr + 8);
-		return MAX(positions, matchEndIdx - (curr + 8));
-	}
-}
-
-FORCE_INLINE_TEMPLATE
-void ZSTD_updateTree_internal(
-    ZSTD_matchState_t* ms,
-    const BYTE* const ip, const BYTE* const iend,
-    const U32 mls, const ZSTD_dictMode_e dictMode) {
-	const BYTE* const base = ms->window.base;
-	U32 const target = (U32)(ip - base);
-	U32 idx = ms->nextToUpdate;
-	DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
-	         idx, target, dictMode);
-
-	while(idx < target) {
-		U32 const forward = ZSTD_insertBt1(ms, base + idx, iend, target, mls, dictMode == ZSTD_extDict);
-		assert(idx < (U32)(idx + forward));
-		idx += forward;
-	}
-	assert((size_t)(ip - base) <= (size_t)(U32)(-1));
-	assert((size_t)(iend - base) <= (size_t)(U32)(-1));
-	ms->nextToUpdate = target;
-}
-
-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
-	ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
-}
-
-FORCE_INLINE_TEMPLATE U32
-ZSTD_insertBtAndGetAllMatches(
-    ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
-    ZSTD_matchState_t* ms,
-    U32* nextToUpdate3,
-    const BYTE* const ip, const BYTE* const iLimit,
-    const ZSTD_dictMode_e dictMode,
-    const U32 rep[ZSTD_REP_NUM],
-    const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
-    const U32 lengthToBeat,
-    const U32 mls /* template */) {
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-	U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM - 1);
-	const BYTE* const base = ms->window.base;
-	U32 const curr = (U32)(ip - base);
-	U32 const hashLog = cParams->hashLog;
-	U32 const minMatch = (mls == 3) ? 3 : 4;
-	U32* const hashTable = ms->hashTable;
-	size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
-	U32 matchIndex = hashTable[h];
-	U32* const bt = ms->chainTable;
-	U32 const btLog = cParams->chainLog - 1;
-	U32 const btMask = (1U << btLog) - 1;
-	size_t commonLengthSmaller = 0, commonLengthLarger = 0;
-	const BYTE* const dictBase = ms->window.dictBase;
-	U32 const dictLimit = ms->window.dictLimit;
-	const BYTE* const dictEnd = dictBase + dictLimit;
-	const BYTE* const prefixStart = base + dictLimit;
-	U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
-	U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
-	U32 const matchLow = windowLow ? windowLow : 1;
-	U32* smallerPtr = bt + 2 * (curr & btMask);
-	U32* largerPtr = bt + 2 * (curr & btMask) + 1;
-	U32 matchEndIdx = curr + 8 + 1; /* farthest referenced position of any match => detects repetitive patterns */
-	U32 dummy32;                    /* to be nullified at the end */
-	U32 mnum = 0;
-	U32 nbCompares = 1U << cParams->searchLog;
-
-	const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
-	const ZSTD_compressionParameters* const dmsCParams =
-	    dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL;
-	const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
-	const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL;
-	U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0;
-	U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0;
-	U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0;
-	U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog;
-	U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog;
-	U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0;
-	U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit;
-
-	size_t bestLength = lengthToBeat - 1;
-	DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", curr);
-
-	/* check repCode */
-	assert(ll0 <= 1); /* necessarily 1 or 0 */
-	{
-		U32 const lastR = ZSTD_REP_NUM + ll0;
-		U32 repCode;
-		for(repCode = ll0; repCode < lastR; repCode++) {
-			U32 const repOffset = (repCode == ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
-			U32 const repIndex = curr - repOffset;
-			U32 repLen = 0;
-			assert(curr >= dictLimit);
-			if(repOffset - 1 /* intentional overflow, discards 0 and -1 */ < curr - dictLimit) { /* equivalent to `curr > repIndex >= dictLimit` */
-				/* We must validate the repcode offset because when we're using a dictionary the
-				 * valid offset range shrinks when the dictionary goes out of bounds.
-				 */
-				if((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) {
-					repLen = (U32)ZSTD_count(ip + minMatch, ip + minMatch - repOffset, iLimit) + minMatch;
-				}
-			} else { /* repIndex < dictLimit || repIndex >= curr */
-				const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ? dmsBase + repIndex - dmsIndexDelta : dictBase + repIndex;
-				assert(curr >= windowLow);
-				if(dictMode == ZSTD_extDict && (((repOffset - 1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */
-				                                & (((U32)((dictLimit - 1) - repIndex) >= 3)) /* intentional overflow : do not test positions overlapping 2 memory segments */) &&
-				   (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) {
-					repLen = (U32)ZSTD_count_2segments(ip + minMatch, repMatch + minMatch, iLimit, dictEnd, prefixStart) + minMatch;
-				}
-				if(dictMode == ZSTD_dictMatchState && (((repOffset - 1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */
-				                                       & ((U32)((dictLimit - 1) - repIndex) >= 3))                                       /* intentional overflow : do not test positions overlapping 2 memory segments */
-				   && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) {
-					repLen = (U32)ZSTD_count_2segments(ip + minMatch, repMatch + minMatch, iLimit, dmsEnd, prefixStart) + minMatch;
-				}
-			}
-			/* save longer solution */
-			if(repLen > bestLength) {
-				DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
-				         repCode, ll0, repOffset, repLen);
-				bestLength = repLen;
-				matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */
-				matches[mnum].len = (U32)repLen;
-				mnum++;
-				if((repLen > sufficient_len) | (ip + repLen == iLimit)) { /* best possible */
-					return mnum;
-				}
-			}
-		}
-	}
-
-	/* HC3 match finder */
-	if((mls == 3) /*static*/ && (bestLength < mls)) {
-		U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip);
-		if((matchIndex3 >= matchLow) & (curr - matchIndex3 < (1 << 18)) /*heuristic : longer distance likely too expensive*/) {
-			size_t mlen;
-			if((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) {
-				const BYTE* const match = base + matchIndex3;
-				mlen = ZSTD_count(ip, match, iLimit);
-			} else {
-				const BYTE* const match = dictBase + matchIndex3;
-				mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart);
-			}
-
-			/* save best solution */
-			if(mlen >= mls /* == 3 > bestLength */) {
-				DEBUGLOG(8, "found small match with hlog3, of length %u",
-				         (U32)mlen);
-				bestLength = mlen;
-				assert(curr > matchIndex3);
-				assert(mnum == 0); /* no prior solution */
-				matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
-				matches[0].len = (U32)mlen;
-				mnum = 1;
-				if((mlen > sufficient_len) |
-				   (ip + mlen == iLimit)) {      /* best possible length */
-					ms->nextToUpdate = curr + 1; /* skip insertion */
-					return 1;
-				}
-			}
-		}
-		/* no dictMatchState lookup: dicts don't have a populated HC3 table */
-	} /* if (mls == 3) */
-
-	hashTable[h] = curr; /* Update Hash Table */
-
-	for(; nbCompares && (matchIndex >= matchLow); --nbCompares) {
-		U32* const nextPtr = bt + 2 * (matchIndex & btMask);
-		const BYTE* match;
-		size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
-		assert(curr > matchIndex);
-
-		if((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex + matchLength >= dictLimit)) {
-			assert(matchIndex + matchLength >= dictLimit); /* ensure the condition is correct when !extDict */
-			match = base + matchIndex;
-			if(matchIndex >= dictLimit)
-				assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */
-			matchLength += ZSTD_count(ip + matchLength, match + matchLength, iLimit);
-		} else {
-			match = dictBase + matchIndex;
-			assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */
-			matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iLimit, dictEnd, prefixStart);
-			if(matchIndex + matchLength >= dictLimit)
-				match = base + matchIndex; /* prepare for match[matchLength] read */
-		}
-
-		if(matchLength > bestLength) {
-			DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
-			         (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
-			assert(matchEndIdx > matchIndex);
-			if(matchLength > matchEndIdx - matchIndex)
-				matchEndIdx = matchIndex + (U32)matchLength;
-			bestLength = matchLength;
-			matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
-			matches[mnum].len = (U32)matchLength;
-			mnum++;
-			if((matchLength > ZSTD_OPT_NUM) | (ip + matchLength == iLimit) /* equal : no way to know if inf or sup */) {
-				if(dictMode == ZSTD_dictMatchState)
-					nbCompares = 0; /* break should also skip searching dms */
-				break;              /* drop, to preserve bt consistency (miss a little bit of compression) */
-			}
-		}
-
-		if(match[matchLength] < ip[matchLength]) {
-			/* match smaller than current */
-			*smallerPtr = matchIndex;          /* update smaller idx */
-			commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
-			if(matchIndex <= btLow) {
-				smallerPtr = &dummy32;
-				break;
-			}                         /* beyond tree size, stop the search */
-			smallerPtr = nextPtr + 1; /* new candidate => larger than match, which was smaller than current */
-			matchIndex = nextPtr[1];  /* new matchIndex, larger than previous, closer to current */
-		} else {
-			*largerPtr = matchIndex;
-			commonLengthLarger = matchLength;
-			if(matchIndex <= btLow) {
-				largerPtr = &dummy32;
-				break;
-			} /* beyond tree size, stop the search */
-			largerPtr = nextPtr;
-			matchIndex = nextPtr[0];
-		}
-	}
-
-	*smallerPtr = *largerPtr = 0;
-
-	assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
-	if(dictMode == ZSTD_dictMatchState && nbCompares) {
-		size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls);
-		U32 dictMatchIndex = dms->hashTable[dmsH];
-		const U32* const dmsBt = dms->chainTable;
-		commonLengthSmaller = commonLengthLarger = 0;
-		for(; nbCompares && (dictMatchIndex > dmsLowLimit); --nbCompares) {
-			const U32* const nextPtr = dmsBt + 2 * (dictMatchIndex & dmsBtMask);
-			size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
-			const BYTE* match = dmsBase + dictMatchIndex;
-			matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iLimit, dmsEnd, prefixStart);
-			if(dictMatchIndex + matchLength >= dmsHighLimit)
-				match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */
-
-			if(matchLength > bestLength) {
-				matchIndex = dictMatchIndex + dmsIndexDelta;
-				DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
-				         (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
-				if(matchLength > matchEndIdx - matchIndex)
-					matchEndIdx = matchIndex + (U32)matchLength;
-				bestLength = matchLength;
-				matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
-				matches[mnum].len = (U32)matchLength;
-				mnum++;
-				if((matchLength > ZSTD_OPT_NUM) | (ip + matchLength == iLimit) /* equal : no way to know if inf or sup */) {
-					break; /* drop, to guarantee consistency (miss a little bit of compression) */
-				}
-			}
-
-			if(dictMatchIndex <= dmsBtLow) {
-				break;
-			} /* beyond tree size, stop the search */
-			if(match[matchLength] < ip[matchLength]) {
-				commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
-				dictMatchIndex = nextPtr[1];       /* new matchIndex larger than previous (closer to current) */
-			} else {
-				/* match is larger than current */
-				commonLengthLarger = matchLength;
-				dictMatchIndex = nextPtr[0];
-			}
-		}
-	} /* if (dictMode == ZSTD_dictMatchState) */
-
-	assert(matchEndIdx > curr + 8);
-	ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
-	return mnum;
-}
-
-typedef U32 (*ZSTD_getAllMatchesFn)(
-    ZSTD_match_t*,
-    ZSTD_matchState_t*,
-    U32*,
-    const BYTE*,
-    const BYTE*,
-    const U32 rep[ZSTD_REP_NUM],
-    U32 const ll0,
-    U32 const lengthToBeat);
-
-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
-    ZSTD_match_t* matches,
-    ZSTD_matchState_t* ms,
-    U32* nextToUpdate3,
-    const BYTE* ip,
-    const BYTE* const iHighLimit,
-    const U32 rep[ZSTD_REP_NUM],
-    U32 const ll0,
-    U32 const lengthToBeat,
-    const ZSTD_dictMode_e dictMode,
-    const U32 mls) {
-	assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls);
-	DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls);
-	if(ip < ms->window.base + ms->nextToUpdate)
-		return 0; /* skipped area */
-	ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode);
-	return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls);
-}
-
-#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls
-
-#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls)        \
-	static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)(  \
-	    ZSTD_match_t * matches,                            \
-	    ZSTD_matchState_t * ms,                            \
-	    U32 * nextToUpdate3,                               \
-	    const BYTE* ip,                                    \
-	    const BYTE* const iHighLimit,                      \
-	    const U32 rep[ZSTD_REP_NUM],                       \
-	    U32 const ll0,                                     \
-	    U32 const lengthToBeat) {                          \
-		return ZSTD_btGetAllMatches_internal(              \
-		    matches, ms, nextToUpdate3, ip, iHighLimit,    \
-		    rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \
-	}
-
-#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \
-	GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \
-	GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \
-	GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \
-	GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6)
-
-GEN_ZSTD_BT_GET_ALL_MATCHES(noDict)
-GEN_ZSTD_BT_GET_ALL_MATCHES(extDict)
-GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState)
-
-#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode)      \
-	{                                                \
-		ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3),     \
-		    ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \
-		    ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \
-		    ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6)  \
-	}
-
-static ZSTD_getAllMatchesFn
-ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const * ms, ZSTD_dictMode_e const dictMode) {
-	ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = {
-	    ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict),
-	    ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict),
-	    ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState)};
-	U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6);
-	assert((U32)dictMode < 3);
-	assert(mls - 3 < 4);
-	return getAllMatchesFns[(int)dictMode][mls - 3];
-}
-
-/*************************
- *  LDM helper functions  *
- *************************/
-
-/* Struct containing info needed to make decision about ldm inclusion */
-typedef struct {
-	rawSeqStore_t seqStore; /* External match candidates store for this block */
-	U32 startPosInBlock;    /* Start position of the current match candidate */
-	U32 endPosInBlock;      /* End position of the current match candidate */
-	U32 offset;             /* Offset of the match candidate */
-} ZSTD_optLdm_t;
-
-/* ZSTD_optLdm_skipRawSeqStoreBytes():
- * Moves forward in @rawSeqStore by @nbBytes,
- * which will update the fields 'pos' and 'posInSequence'.
- */
-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
-	U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes);
-	while(currPos && rawSeqStore->pos < rawSeqStore->size) {
-		rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos];
-		if(currPos >= currSeq.litLength + currSeq.matchLength) {
-			currPos -= currSeq.litLength + currSeq.matchLength;
-			rawSeqStore->pos++;
-		} else {
-			rawSeqStore->posInSequence = currPos;
-			break;
-		}
-	}
-	if(currPos == 0 || rawSeqStore->pos == rawSeqStore->size) {
-		rawSeqStore->posInSequence = 0;
-	}
-}
-
-/* ZSTD_opt_getNextMatchAndUpdateSeqStore():
- * Calculates the beginning and end of the next match in the current block.
- * Updates 'pos' and 'posInSequence' of the ldmSeqStore.
- */
-static void
-ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock,
-                                       U32 blockBytesRemaining) {
-	rawSeq currSeq;
-	U32 currBlockEndPos;
-	U32 literalsBytesRemaining;
-	U32 matchBytesRemaining;
-
-	/* Setting match end position to MAX to ensure we never use an LDM during this block */
-	if(optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) {
-		optLdm->startPosInBlock = UINT_MAX;
-		optLdm->endPosInBlock = UINT_MAX;
-		return;
-	}
-	/* Calculate appropriate bytes left in matchLength and litLength
-	 * after adjusting based on ldmSeqStore->posInSequence */
-	currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos];
-	assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength);
-	currBlockEndPos = currPosInBlock + blockBytesRemaining;
-	literalsBytesRemaining = (optLdm->seqStore.posInSequence < currSeq.litLength) ? currSeq.litLength - (U32)optLdm->seqStore.posInSequence : 0;
-	matchBytesRemaining = (literalsBytesRemaining == 0) ? currSeq.matchLength - ((U32)optLdm->seqStore.posInSequence - currSeq.litLength) : currSeq.matchLength;
-
-	/* If there are more literal bytes than bytes remaining in block, no ldm is possible */
-	if(literalsBytesRemaining >= blockBytesRemaining) {
-		optLdm->startPosInBlock = UINT_MAX;
-		optLdm->endPosInBlock = UINT_MAX;
-		ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, blockBytesRemaining);
-		return;
-	}
-
-	/* Matches may be < MINMATCH by this process. In that case, we will reject them
-	   when we are deciding whether or not to add the ldm */
-	optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining;
-	optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining;
-	optLdm->offset = currSeq.offset;
-
-	if(optLdm->endPosInBlock > currBlockEndPos) {
-		/* Match ends after the block ends, we can't use the whole match */
-		optLdm->endPosInBlock = currBlockEndPos;
-		ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, currBlockEndPos - currPosInBlock);
-	} else {
-		/* Consume nb of bytes equal to size of sequence left */
-		ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, literalsBytesRemaining + matchBytesRemaining);
-	}
-}
-
-/* ZSTD_optLdm_maybeAddMatch():
- * Adds a match if it's long enough,
- * based on it's 'matchStartPosInBlock' and 'matchEndPosInBlock',
- * into 'matches'. Maintains the correct ordering of 'matches'.
- */
-static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
-                                      const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) {
-	U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
-	/* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
-	U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
-
-	/* Ensure that current block position is not outside of the match */
-	if(currPosInBlock < optLdm->startPosInBlock || currPosInBlock >= optLdm->endPosInBlock || candidateMatchLength < MINMATCH) {
-		return;
-	}
-
-	if(*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches - 1].len) && *nbMatches < ZSTD_OPT_NUM)) {
-		U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
-		DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
-		         candidateOffBase, candidateMatchLength, currPosInBlock);
-		matches[*nbMatches].len = candidateMatchLength;
-		matches[*nbMatches].off = candidateOffBase;
-		(*nbMatches)++;
-	}
-}
-
-/* ZSTD_optLdm_processMatchCandidate():
- * Wrapper function to update ldm seq store and call ldm functions as necessary.
- */
-static void
-ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
-                                  ZSTD_match_t* matches, U32* nbMatches,
-                                  U32 currPosInBlock, U32 remainingBytes) {
-	if(optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) {
-		return;
-	}
-
-	if(currPosInBlock >= optLdm->endPosInBlock) {
-		if(currPosInBlock > optLdm->endPosInBlock) {
-			/* The position at which ZSTD_optLdm_processMatchCandidate() is called is not necessarily
-			 * at the end of a match from the ldm seq store, and will often be some bytes
-			 * over beyond matchEndPosInBlock. As such, we need to correct for these "overshoots"
-			 */
-			U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock;
-			ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot);
-		}
-		ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes);
-	}
-	ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock);
-}
-
-/*-*******************************
- *  Optimal parser
- *********************************/
-
-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) {
-	return sol.litlen + sol.mlen;
-}
-
-#if 0 /* debug */
-
-static void
-listStats(const U32* table, int lastEltID)
-{
-    int const nbElts = lastEltID + 1;
-    int enb;
-    for (enb=0; enb < nbElts; enb++) {
-        (void)table;
-        /* RAWLOG(2, "%3i:%3i,  ", enb, table[enb]); */
-        RAWLOG(2, "%4i,", table[enb]);
-    }
-    RAWLOG(2, " \n");
-}
-
-#endif
-
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
-                               seqStore_t* seqStore,
-                               U32 rep[ZSTD_REP_NUM],
-                               const void* src, size_t srcSize,
-                               const int optLevel,
-                               const ZSTD_dictMode_e dictMode) {
-	optState_t* const optStatePtr = &ms->opt;
-	const BYTE* const istart = (const BYTE*)src;
-	const BYTE* ip = istart;
-	const BYTE* anchor = istart;
-	const BYTE* const iend = istart + srcSize;
-	const BYTE* const ilimit = iend - 8;
-	const BYTE* const base = ms->window.base;
-	const BYTE* const prefixStart = base + ms->window.dictLimit;
-	const ZSTD_compressionParameters* const cParams = &ms->cParams;
-
-	ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode);
-
-	U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM - 1);
-	U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
-	U32 nextToUpdate3 = ms->nextToUpdate;
-
-	ZSTD_optimal_t* const opt = optStatePtr->priceTable;
-	ZSTD_match_t* const matches = optStatePtr->matchTable;
-	ZSTD_optimal_t lastSequence;
-	ZSTD_optLdm_t optLdm;
-
-	optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
-	optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
-	ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip - istart), (U32)(iend - ip));
-
-	/* init */
-	DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u",
-	         (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate);
-	assert(optLevel <= 2);
-	ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
-	ip += (ip == prefixStart);
-
-	/* Match Loop */
-	while(ip < ilimit) {
-		U32 cur, last_pos = 0;
-
-		/* find first match */
-		{
-			U32 const litlen = (U32)(ip - anchor);
-			U32 const ll0 = !litlen;
-			U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
-			ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
-			                                  (U32)(ip - istart), (U32)(iend - ip));
-			if(!nbMatches) {
-				ip++;
-				continue;
-			}
-
-			/* initialize opt[0] */
-			{
-				U32 i;
-				for(i = 0; i < ZSTD_REP_NUM; i++)
-					opt[0].rep[i] = rep[i];
-			}
-			opt[0].mlen = 0; /* means is_a_literal */
-			opt[0].litlen = litlen;
-			/* We don't need to include the actual price of the literals because
-			 * it is static for the duration of the forward pass, and is included
-			 * in every price. We include the literal length to avoid negative
-			 * prices when we subtract the previous literal length.
-			 */
-			opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
-
-			/* large match -> immediate encoding */
-			{
-				U32 const maxML = matches[nbMatches - 1].len;
-				U32 const maxOffBase = matches[nbMatches - 1].off;
-				DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
-				         nbMatches, maxML, maxOffBase, (U32)(ip - prefixStart));
-
-				if(maxML > sufficient_len) {
-					lastSequence.litlen = litlen;
-					lastSequence.mlen = maxML;
-					lastSequence.off = maxOffBase;
-					DEBUGLOG(6, "large match (%u>%u), immediate encoding",
-					         maxML, sufficient_len);
-					cur = 0;
-					last_pos = ZSTD_totalLen(lastSequence);
-					goto _shortestPath;
-				}
-			}
-
-			/* set prices for first matches starting position == 0 */
-			assert(opt[0].price >= 0);
-			{
-				U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
-				U32 pos;
-				U32 matchNb;
-				for(pos = 1; pos < minMatch; pos++) {
-					opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */
-				}
-				for(matchNb = 0; matchNb < nbMatches; matchNb++) {
-					U32 const offBase = matches[matchNb].off;
-					U32 const end = matches[matchNb].len;
-					for(; pos <= end; pos++) {
-						U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
-						U32 const sequencePrice = literalsPrice + matchPrice;
-						DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
-						         pos, ZSTD_fCost((int)sequencePrice));
-						opt[pos].mlen = pos;
-						opt[pos].off = offBase;
-						opt[pos].litlen = litlen;
-						opt[pos].price = (int)sequencePrice;
-					}
-				}
-				last_pos = pos - 1;
-			}
-		}
-
-		/* check further positions */
-		for(cur = 1; cur <= last_pos; cur++) {
-			const BYTE* const inr = ip + cur;
-			assert(cur < ZSTD_OPT_NUM);
-			DEBUGLOG(7, "cPos:%zi==rPos:%u", inr - istart, cur)
-
-			/* Fix current position with one literal if cheaper */
-			{
-				U32 const litlen = (opt[cur - 1].mlen == 0) ? opt[cur - 1].litlen + 1 : 1;
-				int const price = opt[cur - 1].price + (int)ZSTD_rawLiteralsCost(ip + cur - 1, 1, optStatePtr, optLevel) + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) - (int)ZSTD_litLengthPrice(litlen - 1, optStatePtr, optLevel);
-				assert(price < 1000000000); /* overflow check */
-				if(price <= opt[cur].price) {
-					DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
-					         inr - istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
-					         opt[cur - 1].rep[0], opt[cur - 1].rep[1], opt[cur - 1].rep[2]);
-					opt[cur].mlen = 0;
-					opt[cur].off = 0;
-					opt[cur].litlen = litlen;
-					opt[cur].price = price;
-				} else {
-					DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
-					         inr - istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
-					         opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
-				}
-			}
-
-			/* Set the repcodes of the current position. We must do it here
-			 * because we rely on the repcodes of the 2nd to last sequence being
-			 * correct to set the next chunks repcodes during the backward
-			 * traversal.
-			 */
-			ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
-			assert(cur >= opt[cur].mlen);
-			if(opt[cur].mlen != 0) {
-				U32 const prev = cur - opt[cur].mlen;
-				repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen == 0);
-				ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
-			} else {
-				ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
-			}
-
-			/* last match must start at a minimum distance of 8 from oend */
-			if(inr > ilimit)
-				continue;
-
-			if(cur == last_pos)
-				break;
-
-			if((optLevel == 0) /*static_test*/
-			   && (opt[cur + 1].price <= opt[cur].price + (BITCOST_MULTIPLIER / 2))) {
-				DEBUGLOG(7, "move to next rPos:%u : price is <=", cur + 1);
-				continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
-			}
-
-			assert(opt[cur].price >= 0);
-			{
-				U32 const ll0 = (opt[cur].mlen != 0);
-				U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
-				U32 const previousPrice = (U32)opt[cur].price;
-				U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
-				U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
-				U32 matchNb;
-
-				ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
-				                                  (U32)(inr - istart), (U32)(iend - inr));
-
-				if(!nbMatches) {
-					DEBUGLOG(7, "rPos:%u : no match found", cur);
-					continue;
-				}
-
-				{
-					U32 const maxML = matches[nbMatches - 1].len;
-					DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
-					         inr - istart, cur, nbMatches, maxML);
-
-					if((maxML > sufficient_len) || (cur + maxML >= ZSTD_OPT_NUM)) {
-						lastSequence.mlen = maxML;
-						lastSequence.off = matches[nbMatches - 1].off;
-						lastSequence.litlen = litlen;
-						cur -= (opt[cur].mlen == 0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
-						last_pos = cur + ZSTD_totalLen(lastSequence);
-						if(cur > ZSTD_OPT_NUM)
-							cur = 0; /* underflow => first match */
-						goto _shortestPath;
-					}
-				}
-
-				/* set prices using matches found at position == cur */
-				for(matchNb = 0; matchNb < nbMatches; matchNb++) {
-					U32 const offset = matches[matchNb].off;
-					U32 const lastML = matches[matchNb].len;
-					U32 const startML = (matchNb > 0) ? matches[matchNb - 1].len + 1 : minMatch;
-					U32 mlen;
-
-					DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
-					         matchNb, matches[matchNb].off, lastML, litlen);
-
-					for(mlen = lastML; mlen >= startML; mlen--) { /* scan downward */
-						U32 const pos = cur + mlen;
-						int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
-
-						if((pos > last_pos) || (price < opt[pos].price)) {
-							DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
-							         pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
-							while(last_pos < pos) {
-								opt[last_pos + 1].price = ZSTD_MAX_PRICE;
-								last_pos++;
-							} /* fill empty positions */
-							opt[pos].mlen = mlen;
-							opt[pos].off = offset;
-							opt[pos].litlen = litlen;
-							opt[pos].price = price;
-						} else {
-							DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
-							         pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
-							if(optLevel == 0)
-								break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
-						}
-					}
-				}
-			}
-		} /* for (cur = 1; cur <= last_pos; cur++) */
-
-		lastSequence = opt[last_pos];
-		cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */
-		assert(cur < ZSTD_OPT_NUM);                                                                /* control overflow*/
-
-	_shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
-		assert(opt[0].mlen == 0);
-
-		/* Set the next chunk's repcodes based on the repcodes of the beginning
-		 * of the last match, and the last sequence. This avoids us having to
-		 * update them while traversing the sequences.
-		 */
-		if(lastSequence.mlen != 0) {
-			repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen == 0);
-			ZSTD_memcpy(rep, &reps, sizeof(reps));
-		} else {
-			ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
-		}
-
-		{
-			U32 const storeEnd = cur + 1;
-			U32 storeStart = storeEnd;
-			U32 seqPos = cur;
-
-			DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
-			         last_pos, cur);
-			(void)last_pos;
-			assert(storeEnd < ZSTD_OPT_NUM);
-			DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
-			         storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
-			opt[storeEnd] = lastSequence;
-			while(seqPos > 0) {
-				U32 const backDist = ZSTD_totalLen(opt[seqPos]);
-				storeStart--;
-				DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
-				         seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
-				opt[storeStart] = opt[seqPos];
-				seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
-			}
-
-			/* save sequences */
-			DEBUGLOG(6, "sending selected sequences into seqStore") {
-				U32 storePos;
-				for(storePos = storeStart; storePos <= storeEnd; storePos++) {
-					U32 const llen = opt[storePos].litlen;
-					U32 const mlen = opt[storePos].mlen;
-					U32 const offBase = opt[storePos].off;
-					U32 const advance = llen + mlen;
-					DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
-					         anchor - istart, (unsigned)llen, (unsigned)mlen);
-
-					if(mlen == 0) {                   /* only literals => must be last "sequence", actually starting a new stream of sequences */
-						assert(storePos == storeEnd); /* must be last sequence */
-						ip = anchor + llen;           /* last "sequence" is a bunch of literals => don't progress anchor */
-						continue;                     /* will finish */
-					}
-
-					assert(anchor + llen <= iend);
-					ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
-					ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
-					anchor += advance;
-					ip = anchor;
-				}
-			}
-			ZSTD_setBasePrices(optStatePtr, optLevel);
-		}
-	} /* while (ip < ilimit) */
-
-	/* Return the last literals size */
-	return (size_t)(iend - anchor);
-}
-
-static size_t ZSTD_compressBlock_opt0(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) {
-	return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
-}
-
-static size_t ZSTD_compressBlock_opt2(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) {
-	return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
-}
-
-size_t ZSTD_compressBlock_btopt(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize) {
-	DEBUGLOG(5, "ZSTD_compressBlock_btopt");
-	return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
-}
-
-/* ZSTD_initStats_ultra():
- * make a first compression pass, just to seed stats with more accurate starting values.
- * only works on first block, with no dictionary and no ldm.
- * this function cannot error out, its narrow contract must be respected.
- */
-static void
-ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
-                     seqStore_t* seqStore,
-                     U32 rep[ZSTD_REP_NUM],
-                     const void* src, size_t srcSize) {
-	U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */
-	ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));
-
-	DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize);
-	assert(ms->opt.litLengthSum == 0);                       /* first block */
-	assert(seqStore->sequences == seqStore->sequencesStart); /* no ldm */
-	assert(ms->window.dictLimit == ms->window.lowLimit);     /* no dictionary */
-	assert(ms->window.dictLimit - ms->nextToUpdate <= 1);    /* no prefix (note: intentional overflow, defined as 2-complement) */
-
-	ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/
-
-	/* invalidate first scan from history, only keep entropy stats */
-	ZSTD_resetSeqStore(seqStore);
-	ms->window.base -= srcSize;
-	ms->window.dictLimit += (U32)srcSize;
-	ms->window.lowLimit = ms->window.dictLimit;
-	ms->nextToUpdate = ms->window.dictLimit;
-}
-
-size_t ZSTD_compressBlock_btultra(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize) {
-	DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
-	return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_btultra2(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize) {
-	U32 const curr = (U32)((const BYTE*)src - ms->window.base);
-	DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
-
-	/* 2-passes strategy:
-	 * this strategy makes a first pass over first block to collect statistics
-	 * in order to seed next round's statistics with it.
-	 * After 1st pass, function forgets history, and starts a new block.
-	 * Consequently, this can only work if no data has been previously loaded in tables,
-	 * aka, no dictionary, no prefix, no ldm preprocessing.
-	 * The compression ratio gain is generally small (~0.5% on first block),
-	 ** the cost is 2x cpu time on first block. */
-	assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
-	if((ms->opt.litLengthSum == 0)                          /* first block */
-	   && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */
-	   && (ms->window.dictLimit == ms->window.lowLimit)     /* no dictionary */
-	   && (curr == ms->window.dictLimit)                    /* start of frame, nothing already loaded nor skipped */
-	   && (srcSize > ZSTD_PREDEF_THRESHOLD)                 /* input large enough to not employ default stats */
-	) {
-		ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
-	}
-
-	return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
-}
-
-size_t ZSTD_compressBlock_btopt_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize) {
-	return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_btultra_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize) {
-	return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
-}
-
-size_t ZSTD_compressBlock_btopt_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize) {
-	return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
-}
-
-size_t ZSTD_compressBlock_btultra_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    const void* src, size_t srcSize) {
-	return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
-}
-
-/* note : no btultra2 variant for extDict nor dictMatchState,
- * because btultra2 is not meant to work with dictionaries
- * and is only specific for the first block (no prefix) */
diff --git a/src/zstd/zstd_opt.h b/src/zstd/zstd_opt.h
deleted file mode 100644
index 72c308cf3..000000000
--- a/src/zstd/zstd_opt.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-#ifndef ZSTD_OPT_H
-#define ZSTD_OPT_H
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#include "zstd_compress_internal.h"
-
-/* used in ZSTD_loadDictionaryContent() */
-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
-
-size_t ZSTD_compressBlock_btopt(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra2(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-
-size_t ZSTD_compressBlock_btopt_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra_dictMatchState(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-
-size_t ZSTD_compressBlock_btopt_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra_extDict(
-    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    void const * src, size_t srcSize);
-
-/* note : no btultra2 variant for extDict nor dictMatchState,
- * because btultra2 is not meant to work with dictionaries
- * and is only specific for the first block (no prefix) */
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* ZSTD_OPT_H */
diff --git a/src/zstd/zstdmt_compress.c b/src/zstd/zstdmt_compress.c
deleted file mode 100644
index 6abc654e8..000000000
--- a/src/zstd/zstdmt_compress.c
+++ /dev/null
@@ -1,1852 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
-
-/* ======   Compiler specifics   ====== */
-#if defined(_MSC_VER)
-#pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
-#endif
-
-/* ======   Constants   ====== */
-#define ZSTDMT_OVERLAPLOG_DEFAULT 0
-
-/* ======   Dependencies   ====== */
-#include "../common/zstd_deps.h"    /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */
-#include "../common/mem.h"          /* MEM_STATIC */
-#include "../common/pool.h"         /* threadpool */
-#include "../common/threading.h"    /* mutex */
-#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
-#include "zstd_ldm.h"
-#include "zstdmt_compress.h"
-
-/* Guards code to support resizing the SeqPool.
- * We will want to resize the SeqPool to save memory in the future.
- * Until then, comment the code out since it is unused.
- */
-#define ZSTD_RESIZE_SEQPOOL 0
-
-/* ======   Debug   ====== */
-#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 2) && !defined(_MSC_VER) && !defined(__MINGW32__)
-
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/times.h>
-
-#define DEBUG_PRINTHEX(l, p, n)                                       \
-	{                                                                 \
-		unsigned debug_u;                                             \
-		for(debug_u = 0; debug_u < (n); debug_u++)                    \
-			RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
-		RAWLOG(l, " \n");                                             \
-	}
-
-static unsigned long long GetCurrentClockTimeMicroseconds(void) {
-	static clock_t _ticksPerSecond = 0;
-	if(_ticksPerSecond <= 0)
-		_ticksPerSecond = sysconf(_SC_CLK_TCK);
-
-	{
-		struct tms junk;
-		clock_t newTicks = (clock_t)times(&junk);
-		return ((((unsigned long long)newTicks) * (1000000)) / _ticksPerSecond);
-	}
-}
-
-#define MUTEX_WAIT_TIME_DLEVEL 6
-#define ZSTD_PTHREAD_MUTEX_LOCK(mutex)                                                                       \
-	{                                                                                                        \
-		if(DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) {                                                           \
-			unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds();                         \
-			ZSTD_pthread_mutex_lock(mutex);                                                                  \
-			{                                                                                                \
-				unsigned long long const afterTime = GetCurrentClockTimeMicroseconds();                      \
-				unsigned long long const elapsedTime = (afterTime - beforeTime);                             \
-				if(elapsedTime > 1000) { /* or whatever threshold you like; I'm using 1 millisecond here */  \
-					DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
-					         elapsedTime, #mutex);                                                           \
-				}                                                                                            \
-			}                                                                                                \
-		} else {                                                                                             \
-			ZSTD_pthread_mutex_lock(mutex);                                                                  \
-		}                                                                                                    \
-	}
-
-#else
-
-#define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m)
-#define DEBUG_PRINTHEX(l, p, n) \
-	{ }
-
-#endif
-
-/* =====   Buffer Pool   ===== */
-/* a single Buffer Pool can be invoked from multiple threads in parallel */
-
-typedef struct buffer_s {
-	void* start;
-	size_t capacity;
-} buffer_t;
-
-static const buffer_t g_nullBuffer = {NULL, 0};
-
-typedef struct ZSTDMT_bufferPool_s {
-	ZSTD_pthread_mutex_t poolMutex;
-	size_t bufferSize;
-	unsigned totalBuffers;
-	unsigned nbBuffers;
-	ZSTD_customMem cMem;
-	buffer_t bTable[1]; /* variable size */
-} ZSTDMT_bufferPool;
-
-static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem) {
-	ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_customCalloc(
-	    sizeof(ZSTDMT_bufferPool) + (maxNbBuffers - 1) * sizeof(buffer_t), cMem);
-	if(bufPool == NULL)
-		return NULL;
-	if(ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
-		ZSTD_customFree(bufPool, cMem);
-		return NULL;
-	}
-	bufPool->bufferSize = 64 KB;
-	bufPool->totalBuffers = maxNbBuffers;
-	bufPool->nbBuffers = 0;
-	bufPool->cMem = cMem;
-	return bufPool;
-}
-
-static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) {
-	unsigned u;
-	DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool);
-	if(!bufPool)
-		return; /* compatibility with free on NULL */
-	for(u = 0; u < bufPool->totalBuffers; u++) {
-		DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->bTable[u].start);
-		ZSTD_customFree(bufPool->bTable[u].start, bufPool->cMem);
-	}
-	ZSTD_pthread_mutex_destroy(&bufPool->poolMutex);
-	ZSTD_customFree(bufPool, bufPool->cMem);
-}
-
-/* only works at initialization, not during compression */
-static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool) {
-	size_t const poolSize = sizeof(*bufPool) + (bufPool->totalBuffers - 1) * sizeof(buffer_t);
-	unsigned u;
-	size_t totalBufferSize = 0;
-	ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
-	for(u = 0; u < bufPool->totalBuffers; u++)
-		totalBufferSize += bufPool->bTable[u].capacity;
-	ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
-
-	return poolSize + totalBufferSize;
-}
-
-/* ZSTDMT_setBufferSize() :
- * all future buffers provided by this buffer pool will have _at least_ this size
- * note : it's better for all buffers to have same size,
- * as they become freely interchangeable, reducing malloc/free usages and memory fragmentation */
-static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const bSize) {
-	ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
-	DEBUGLOG(4, "ZSTDMT_setBufferSize: bSize = %u", (U32)bSize);
-	bufPool->bufferSize = bSize;
-	ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
-}
-
-static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, unsigned maxNbBuffers) {
-	if(srcBufPool == NULL)
-		return NULL;
-	if(srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */
-		return srcBufPool;
-	/* need a larger buffer pool */
-	{
-		ZSTD_customMem const cMem = srcBufPool->cMem;
-		size_t const bSize = srcBufPool->bufferSize; /* forward parameters */
-		ZSTDMT_bufferPool* newBufPool;
-		ZSTDMT_freeBufferPool(srcBufPool);
-		newBufPool = ZSTDMT_createBufferPool(maxNbBuffers, cMem);
-		if(newBufPool == NULL)
-			return newBufPool;
-		ZSTDMT_setBufferSize(newBufPool, bSize);
-		return newBufPool;
-	}
-}
-
-/** ZSTDMT_getBuffer() :
- *  assumption : bufPool must be valid
- * @return : a buffer, with start pointer and size
- *  note: allocation may fail, in this case, start==NULL and size==0 */
-static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) {
-	size_t const bSize = bufPool->bufferSize;
-	DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize);
-	ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
-	if(bufPool->nbBuffers) { /* try to use an existing buffer */
-		buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)];
-		size_t const availBufferSize = buf.capacity;
-		bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer;
-		if((availBufferSize >= bSize) & ((availBufferSize >> 3) <= bSize)) {
-			/* large enough, but not too much */
-			DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u",
-			         bufPool->nbBuffers, (U32)buf.capacity);
-			ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
-			return buf;
-		}
-		/* size conditions not respected : scratch this buffer, create new one */
-		DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing");
-		ZSTD_customFree(buf.start, bufPool->cMem);
-	}
-	ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
-	/* create new buffer */
-	DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer");
-	{
-		buffer_t buffer;
-		void* const start = ZSTD_customMalloc(bSize, bufPool->cMem);
-		buffer.start = start; /* note : start can be NULL if malloc fails ! */
-		buffer.capacity = (start == NULL) ? 0 : bSize;
-		if(start == NULL) {
-			DEBUGLOG(5, "ZSTDMT_getBuffer: buffer allocation failure !!");
-		} else {
-			DEBUGLOG(5, "ZSTDMT_getBuffer: created buffer of size %u", (U32)bSize);
-		}
-		return buffer;
-	}
-}
-
-#if ZSTD_RESIZE_SEQPOOL
-/** ZSTDMT_resizeBuffer() :
- * assumption : bufPool must be valid
- * @return : a buffer that is at least the buffer pool buffer size.
- *           If a reallocation happens, the data in the input buffer is copied.
- */
-static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer) {
-	size_t const bSize = bufPool->bufferSize;
-	if(buffer.capacity < bSize) {
-		void* const start = ZSTD_customMalloc(bSize, bufPool->cMem);
-		buffer_t newBuffer;
-		newBuffer.start = start;
-		newBuffer.capacity = start == NULL ? 0 : bSize;
-		if(start != NULL) {
-			assert(newBuffer.capacity >= buffer.capacity);
-			ZSTD_memcpy(newBuffer.start, buffer.start, buffer.capacity);
-			DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)bSize);
-			return newBuffer;
-		}
-		DEBUGLOG(5, "ZSTDMT_resizeBuffer: buffer allocation failure !!");
-	}
-	return buffer;
-}
-#endif
-
-/* store buffer for later re-use, up to pool capacity */
-static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf) {
-	DEBUGLOG(5, "ZSTDMT_releaseBuffer");
-	if(buf.start == NULL)
-		return; /* compatible with release on NULL */
-	ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
-	if(bufPool->nbBuffers < bufPool->totalBuffers) {
-		bufPool->bTable[bufPool->nbBuffers++] = buf; /* stored for later use */
-		DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u",
-		         (U32)buf.capacity, (U32)(bufPool->nbBuffers - 1));
-		ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
-		return;
-	}
-	ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
-	/* Reached bufferPool capacity (should not happen) */
-	DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing ");
-	ZSTD_customFree(buf.start, bufPool->cMem);
-}
-
-/* We need 2 output buffers per worker since each dstBuff must be flushed after it is released.
- * The 3 additional buffers are as follows:
- *   1 buffer for input loading
- *   1 buffer for "next input" when submitting current one
- *   1 buffer stuck in queue */
-#define BUF_POOL_MAX_NB_BUFFERS(nbWorkers) (2 * (nbWorkers) + 3)
-
-/* After a worker releases its rawSeqStore, it is immediately ready for reuse.
- * So we only need one seq buffer per worker. */
-#define SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) (nbWorkers)
-
-/* =====   Seq Pool Wrapper   ====== */
-
-typedef ZSTDMT_bufferPool ZSTDMT_seqPool;
-
-static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool) {
-	return ZSTDMT_sizeof_bufferPool(seqPool);
-}
-
-static rawSeqStore_t bufferToSeq(buffer_t buffer) {
-	rawSeqStore_t seq = kNullRawSeqStore;
-	seq.seq = (rawSeq*)buffer.start;
-	seq.capacity = buffer.capacity / sizeof(rawSeq);
-	return seq;
-}
-
-static buffer_t seqToBuffer(rawSeqStore_t seq) {
-	buffer_t buffer;
-	buffer.start = seq.seq;
-	buffer.capacity = seq.capacity * sizeof(rawSeq);
-	return buffer;
-}
-
-static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool) {
-	if(seqPool->bufferSize == 0) {
-		return kNullRawSeqStore;
-	}
-	return bufferToSeq(ZSTDMT_getBuffer(seqPool));
-}
-
-#if ZSTD_RESIZE_SEQPOOL
-static rawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) {
-	return bufferToSeq(ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq)));
-}
-#endif
-
-static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) {
-	ZSTDMT_releaseBuffer(seqPool, seqToBuffer(seq));
-}
-
-static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq) {
-	ZSTDMT_setBufferSize(seqPool, nbSeq * sizeof(rawSeq));
-}
-
-static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem) {
-	ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(SEQ_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
-	if(seqPool == NULL)
-		return NULL;
-	ZSTDMT_setNbSeq(seqPool, 0);
-	return seqPool;
-}
-
-static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool) {
-	ZSTDMT_freeBufferPool(seqPool);
-}
-
-static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers) {
-	return ZSTDMT_expandBufferPool(pool, SEQ_POOL_MAX_NB_BUFFERS(nbWorkers));
-}
-
-/* =====   CCtx Pool   ===== */
-/* a single CCtx Pool can be invoked from multiple threads in parallel */
-
-typedef struct {
-	ZSTD_pthread_mutex_t poolMutex;
-	int totalCCtx;
-	int availCCtx;
-	ZSTD_customMem cMem;
-	ZSTD_CCtx* cctx[1]; /* variable size */
-} ZSTDMT_CCtxPool;
-
-/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */
-static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) {
-	int cid;
-	for(cid = 0; cid < pool->totalCCtx; cid++)
-		ZSTD_freeCCtx(pool->cctx[cid]); /* note : compatible with free on NULL */
-	ZSTD_pthread_mutex_destroy(&pool->poolMutex);
-	ZSTD_customFree(pool, pool->cMem);
-}
-
-/* ZSTDMT_createCCtxPool() :
- * implies nbWorkers >= 1 , checked by caller ZSTDMT_createCCtx() */
-static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers,
-                                              ZSTD_customMem cMem) {
-	ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*)ZSTD_customCalloc(
-	    sizeof(ZSTDMT_CCtxPool) + (nbWorkers - 1) * sizeof(ZSTD_CCtx*), cMem);
-	assert(nbWorkers > 0);
-	if(!cctxPool)
-		return NULL;
-	if(ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) {
-		ZSTD_customFree(cctxPool, cMem);
-		return NULL;
-	}
-	cctxPool->cMem = cMem;
-	cctxPool->totalCCtx = nbWorkers;
-	cctxPool->availCCtx = 1; /* at least one cctx for single-thread mode */
-	cctxPool->cctx[0] = ZSTD_createCCtx_advanced(cMem);
-	if(!cctxPool->cctx[0]) {
-		ZSTDMT_freeCCtxPool(cctxPool);
-		return NULL;
-	}
-	DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers);
-	return cctxPool;
-}
-
-static ZSTDMT_CCtxPool* ZSTDMT_expandCCtxPool(ZSTDMT_CCtxPool* srcPool,
-                                              int nbWorkers) {
-	if(srcPool == NULL)
-		return NULL;
-	if(nbWorkers <= srcPool->totalCCtx)
-		return srcPool; /* good enough */
-	/* need a larger cctx pool */
-	{
-		ZSTD_customMem const cMem = srcPool->cMem;
-		ZSTDMT_freeCCtxPool(srcPool);
-		return ZSTDMT_createCCtxPool(nbWorkers, cMem);
-	}
-}
-
-/* only works during initialization phase, not during compression */
-static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool) {
-	ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
-	{
-		unsigned const nbWorkers = cctxPool->totalCCtx;
-		size_t const poolSize = sizeof(*cctxPool) + (nbWorkers - 1) * sizeof(ZSTD_CCtx*);
-		unsigned u;
-		size_t totalCCtxSize = 0;
-		for(u = 0; u < nbWorkers; u++) {
-			totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctx[u]);
-		}
-		ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
-		assert(nbWorkers > 0);
-		return poolSize + totalCCtxSize;
-	}
-}
-
-static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool) {
-	DEBUGLOG(5, "ZSTDMT_getCCtx");
-	ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
-	if(cctxPool->availCCtx) {
-		cctxPool->availCCtx--;
-		{
-			ZSTD_CCtx* const cctx = cctxPool->cctx[cctxPool->availCCtx];
-			ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
-			return cctx;
-		}
-	}
-	ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
-	DEBUGLOG(5, "create one more CCtx");
-	return ZSTD_createCCtx_advanced(cctxPool->cMem); /* note : can be NULL, when creation fails ! */
-}
-
-static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx) {
-	if(cctx == NULL)
-		return; /* compatibility with release on NULL */
-	ZSTD_pthread_mutex_lock(&pool->poolMutex);
-	if(pool->availCCtx < pool->totalCCtx)
-		pool->cctx[pool->availCCtx++] = cctx;
-	else {
-		/* pool overflow : should not happen, since totalCCtx==nbWorkers */
-		DEBUGLOG(4, "CCtx pool overflow : free cctx");
-		ZSTD_freeCCtx(cctx);
-	}
-	ZSTD_pthread_mutex_unlock(&pool->poolMutex);
-}
-
-/* ====   Serial State   ==== */
-
-typedef struct {
-	void const * start;
-	size_t size;
-} range_t;
-
-typedef struct {
-	/* All variables in the struct are protected by mutex. */
-	ZSTD_pthread_mutex_t mutex;
-	ZSTD_pthread_cond_t cond;
-	ZSTD_CCtx_params params;
-	ldmState_t ldmState;
-	XXH64_state_t xxhState;
-	unsigned nextJobID;
-	/* Protects ldmWindow.
-	 * Must be acquired after the main mutex when acquiring both.
-	 */
-	ZSTD_pthread_mutex_t ldmWindowMutex;
-	ZSTD_pthread_cond_t ldmWindowCond; /* Signaled when ldmWindow is updated */
-	ZSTD_window_t ldmWindow;           /* A thread-safe copy of ldmState.window */
-} serialState_t;
-
-static int
-ZSTDMT_serialState_reset(serialState_t* serialState,
-                         ZSTDMT_seqPool* seqPool,
-                         ZSTD_CCtx_params params,
-                         size_t jobSize,
-                         const void* dict, size_t const dictSize,
-                         ZSTD_dictContentType_e dictContentType) {
-	/* Adjust parameters */
-	if(params.ldmParams.enableLdm == ZSTD_ps_enable) {
-		DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
-		ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
-		assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
-		assert(params.ldmParams.hashRateLog < 32);
-	} else {
-		ZSTD_memset(&params.ldmParams, 0, sizeof(params.ldmParams));
-	}
-	serialState->nextJobID = 0;
-	if(params.fParams.checksumFlag)
-		XXH64_reset(&serialState->xxhState, 0);
-	if(params.ldmParams.enableLdm == ZSTD_ps_enable) {
-		ZSTD_customMem cMem = params.customMem;
-		unsigned const hashLog = params.ldmParams.hashLog;
-		size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t);
-		unsigned const bucketLog =
-		    params.ldmParams.hashLog - params.ldmParams.bucketSizeLog;
-		unsigned const prevBucketLog =
-		    serialState->params.ldmParams.hashLog -
-		    serialState->params.ldmParams.bucketSizeLog;
-		size_t const numBuckets = (size_t)1 << bucketLog;
-		/* Size the seq pool tables */
-		ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize));
-		/* Reset the window */
-		ZSTD_window_init(&serialState->ldmState.window);
-		/* Resize tables and output space if necessary. */
-		if(serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) {
-			ZSTD_customFree(serialState->ldmState.hashTable, cMem);
-			serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_customMalloc(hashSize, cMem);
-		}
-		if(serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) {
-			ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem);
-			serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(numBuckets, cMem);
-		}
-		if(!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets)
-			return 1;
-		/* Zero the tables */
-		ZSTD_memset(serialState->ldmState.hashTable, 0, hashSize);
-		ZSTD_memset(serialState->ldmState.bucketOffsets, 0, numBuckets);
-
-		/* Update window state and fill hash table with dict */
-		serialState->ldmState.loadedDictEnd = 0;
-		if(dictSize > 0) {
-			if(dictContentType == ZSTD_dct_rawContent) {
-				BYTE const * const dictEnd = (const BYTE*)dict + dictSize;
-				ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0);
-				ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, &params.ldmParams);
-				serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base);
-			} else {
-				/* don't even load anything */
-			}
-		}
-
-		/* Initialize serialState's copy of ldmWindow. */
-		serialState->ldmWindow = serialState->ldmState.window;
-	}
-
-	serialState->params = params;
-	serialState->params.jobSize = (U32)jobSize;
-	return 0;
-}
-
-static int ZSTDMT_serialState_init(serialState_t* serialState) {
-	int initError = 0;
-	ZSTD_memset(serialState, 0, sizeof(*serialState));
-	initError |= ZSTD_pthread_mutex_init(&serialState->mutex, NULL);
-	initError |= ZSTD_pthread_cond_init(&serialState->cond, NULL);
-	initError |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL);
-	initError |= ZSTD_pthread_cond_init(&serialState->ldmWindowCond, NULL);
-	return initError;
-}
-
-static void ZSTDMT_serialState_free(serialState_t* serialState) {
-	ZSTD_customMem cMem = serialState->params.customMem;
-	ZSTD_pthread_mutex_destroy(&serialState->mutex);
-	ZSTD_pthread_cond_destroy(&serialState->cond);
-	ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex);
-	ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond);
-	ZSTD_customFree(serialState->ldmState.hashTable, cMem);
-	ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem);
-}
-
-static void ZSTDMT_serialState_update(serialState_t* serialState,
-                                      ZSTD_CCtx* jobCCtx, rawSeqStore_t seqStore,
-                                      range_t src, unsigned jobID) {
-	/* Wait for our turn */
-	ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
-	while(serialState->nextJobID < jobID) {
-		DEBUGLOG(5, "wait for serialState->cond");
-		ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex);
-	}
-	/* A future job may error and skip our job */
-	if(serialState->nextJobID == jobID) {
-		/* It is now our turn, do any processing necessary */
-		if(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable) {
-			size_t error;
-			assert(seqStore.seq != NULL && seqStore.pos == 0 &&
-			       seqStore.size == 0 && seqStore.capacity > 0);
-			assert(src.size <= serialState->params.jobSize);
-			ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0);
-			error = ZSTD_ldm_generateSequences(
-			    &serialState->ldmState, &seqStore,
-			    &serialState->params.ldmParams, src.start, src.size);
-			/* We provide a large enough buffer to never fail. */
-			assert(!ZSTD_isError(error));
-			(void)error;
-			/* Update ldmWindow to match the ldmState.window and signal the main
-			 * thread if it is waiting for a buffer.
-			 */
-			ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);
-			serialState->ldmWindow = serialState->ldmState.window;
-			ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
-			ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
-		}
-		if(serialState->params.fParams.checksumFlag && src.size > 0)
-			XXH64_update(&serialState->xxhState, src.start, src.size);
-	}
-	/* Now it is the next jobs turn */
-	serialState->nextJobID++;
-	ZSTD_pthread_cond_broadcast(&serialState->cond);
-	ZSTD_pthread_mutex_unlock(&serialState->mutex);
-
-	if(seqStore.size > 0) {
-		size_t const err = ZSTD_referenceExternalSequences(
-		    jobCCtx, seqStore.seq, seqStore.size);
-		assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable);
-		assert(!ZSTD_isError(err));
-		(void)err;
-	}
-}
-
-static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState,
-                                              unsigned jobID, size_t cSize) {
-	ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
-	if(serialState->nextJobID <= jobID) {
-		assert(ZSTD_isError(cSize));
-		(void)cSize;
-		DEBUGLOG(5, "Skipping past job %u because of error", jobID);
-		serialState->nextJobID = jobID + 1;
-		ZSTD_pthread_cond_broadcast(&serialState->cond);
-
-		ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);
-		ZSTD_window_clear(&serialState->ldmWindow);
-		ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
-		ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
-	}
-	ZSTD_pthread_mutex_unlock(&serialState->mutex);
-}
-
-/* ------------------------------------------ */
-/* =====          Worker thread         ===== */
-/* ------------------------------------------ */
-
-static const range_t kNullRange = {NULL, 0};
-
-typedef struct {
-	size_t consumed;                  /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */
-	size_t cSize;                     /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */
-	ZSTD_pthread_mutex_t job_mutex;   /* Thread-safe - used by mtctx and worker */
-	ZSTD_pthread_cond_t job_cond;     /* Thread-safe - used by mtctx and worker */
-	ZSTDMT_CCtxPool* cctxPool;        /* Thread-safe - used by mtctx and (all) workers */
-	ZSTDMT_bufferPool* bufPool;       /* Thread-safe - used by mtctx and (all) workers */
-	ZSTDMT_seqPool* seqPool;          /* Thread-safe - used by mtctx and (all) workers */
-	serialState_t* serial;            /* Thread-safe - used by mtctx and (all) workers */
-	buffer_t dstBuff;                 /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */
-	range_t prefix;                   /* set by mtctx, then read by worker & mtctx => no barrier */
-	range_t src;                      /* set by mtctx, then read by worker & mtctx => no barrier */
-	unsigned jobID;                   /* set by mtctx, then read by worker => no barrier */
-	unsigned firstJob;                /* set by mtctx, then read by worker => no barrier */
-	unsigned lastJob;                 /* set by mtctx, then read by worker => no barrier */
-	ZSTD_CCtx_params params;          /* set by mtctx, then read by worker => no barrier */
-	const ZSTD_CDict* cdict;          /* set by mtctx, then read by worker => no barrier */
-	unsigned long long fullFrameSize; /* set by mtctx, then read by worker => no barrier */
-	size_t dstFlushed;                /* used only by mtctx */
-	unsigned frameChecksumNeeded;     /* used only by mtctx */
-} ZSTDMT_jobDescription;
-
-#define JOB_ERROR(e)                                \
-	{                                               \
-		ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);   \
-		job->cSize = e;                             \
-		ZSTD_pthread_mutex_unlock(&job->job_mutex); \
-		goto _endJob;                               \
-	}
-
-/* ZSTDMT_compressionJob() is a POOL_function type */
-static void ZSTDMT_compressionJob(void* jobDescription) {
-	ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
-	ZSTD_CCtx_params jobParams = job->params; /* do not modify job->params ! copy it, modify the copy */
-	ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool);
-	rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool);
-	buffer_t dstBuff = job->dstBuff;
-	size_t lastCBlockSize = 0;
-
-	/* resources */
-	if(cctx == NULL)
-		JOB_ERROR(ERROR(memory_allocation));
-	if(dstBuff.start == NULL) { /* streaming job : doesn't provide a dstBuffer */
-		dstBuff = ZSTDMT_getBuffer(job->bufPool);
-		if(dstBuff.start == NULL)
-			JOB_ERROR(ERROR(memory_allocation));
-		job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */
-	}
-	if(jobParams.ldmParams.enableLdm == ZSTD_ps_enable && rawSeqStore.seq == NULL)
-		JOB_ERROR(ERROR(memory_allocation));
-
-	/* Don't compute the checksum for chunks, since we compute it externally,
-	 * but write it in the header.
-	 */
-	if(job->jobID != 0)
-		jobParams.fParams.checksumFlag = 0;
-	/* Don't run LDM for the chunks, since we handle it externally */
-	jobParams.ldmParams.enableLdm = ZSTD_ps_disable;
-	/* Correct nbWorkers to 0. */
-	jobParams.nbWorkers = 0;
-
-	/* init */
-	if(job->cdict) {
-		size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, &jobParams, job->fullFrameSize);
-		assert(job->firstJob); /* only allowed for first job */
-		if(ZSTD_isError(initError))
-			JOB_ERROR(initError);
-	} else { /* srcStart points at reloaded section */
-		U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
-		{
-			size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob);
-			if(ZSTD_isError(forceWindowError))
-				JOB_ERROR(forceWindowError);
-		}
-		if(!job->firstJob) {
-			size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0);
-			if(ZSTD_isError(err))
-				JOB_ERROR(err);
-		}
-		{
-			size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
-			                                                              job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
-			                                                              ZSTD_dtlm_fast,
-			                                                              NULL, /*cdict*/
-			                                                              &jobParams, pledgedSrcSize);
-			if(ZSTD_isError(initError))
-				JOB_ERROR(initError);
-		}
-	}
-
-	/* Perform serial step as early as possible, but after CCtx initialization */
-	ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID);
-
-	if(!job->firstJob) { /* flush and overwrite frame header when it's not first job */
-		size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
-		if(ZSTD_isError(hSize))
-			JOB_ERROR(hSize);
-		DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
-		ZSTD_invalidateRepCodes(cctx);
-	}
-
-	/* compress */
-	{
-		size_t const chunkSize = 4 * ZSTD_BLOCKSIZE_MAX;
-		int const nbChunks = (int)((job->src.size + (chunkSize - 1)) / chunkSize);
-		const BYTE* ip = (const BYTE*)job->src.start;
-		BYTE* const ostart = (BYTE*)dstBuff.start;
-		BYTE* op = ostart;
-		BYTE* oend = op + dstBuff.capacity;
-		int chunkNb;
-		if(sizeof(size_t) > sizeof(int))
-			assert(job->src.size < ((size_t)INT_MAX) * chunkSize); /* check overflow */
-		DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks);
-		assert(job->cSize == 0);
-		for(chunkNb = 1; chunkNb < nbChunks; chunkNb++) {
-			size_t const cSize = ZSTD_compressContinue(cctx, op, oend - op, ip, chunkSize);
-			if(ZSTD_isError(cSize))
-				JOB_ERROR(cSize);
-			ip += chunkSize;
-			op += cSize;
-			assert(op < oend);
-			/* stats */
-			ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
-			job->cSize += cSize;
-			job->consumed = chunkSize * chunkNb;
-			DEBUGLOG(5, "ZSTDMT_compressionJob: compress new block : cSize==%u bytes (total: %u)",
-			         (U32)cSize, (U32)job->cSize);
-			ZSTD_pthread_cond_signal(&job->job_cond); /* warns some more data is ready to be flushed */
-			ZSTD_pthread_mutex_unlock(&job->job_mutex);
-		}
-		/* last block */
-		assert(chunkSize > 0);
-		assert((chunkSize & (chunkSize - 1)) == 0); /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
-		if((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/) {
-			size_t const lastBlockSize1 = job->src.size & (chunkSize - 1);
-			size_t const lastBlockSize = ((lastBlockSize1 == 0) & (job->src.size >= chunkSize)) ? chunkSize : lastBlockSize1;
-			size_t const cSize = (job->lastJob) ? ZSTD_compressEnd(cctx, op, oend - op, ip, lastBlockSize) : ZSTD_compressContinue(cctx, op, oend - op, ip, lastBlockSize);
-			if(ZSTD_isError(cSize))
-				JOB_ERROR(cSize);
-			lastCBlockSize = cSize;
-		}
-	}
-	if(!job->firstJob) {
-		/* Double check that we don't have an ext-dict, because then our
-		 * repcode invalidation doesn't work.
-		 */
-		assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
-	}
-	ZSTD_CCtx_trace(cctx, 0);
-
-_endJob:
-	ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize);
-	if(job->prefix.size > 0)
-		DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start);
-	DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start);
-	/* release resources */
-	ZSTDMT_releaseSeq(job->seqPool, rawSeqStore);
-	ZSTDMT_releaseCCtx(job->cctxPool, cctx);
-	/* report */
-	ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
-	if(ZSTD_isError(job->cSize))
-		assert(lastCBlockSize == 0);
-	job->cSize += lastCBlockSize;
-	job->consumed = job->src.size; /* when job->consumed == job->src.size , compression job is presumed completed */
-	ZSTD_pthread_cond_signal(&job->job_cond);
-	ZSTD_pthread_mutex_unlock(&job->job_mutex);
-}
-
-/* ------------------------------------------ */
-/* =====   Multi-threaded compression   ===== */
-/* ------------------------------------------ */
-
-typedef struct {
-	range_t prefix; /* read-only non-owned prefix buffer */
-	buffer_t buffer;
-	size_t filled;
-} inBuff_t;
-
-typedef struct {
-	BYTE* buffer;    /* The round input buffer. All jobs get references
-	                  * to pieces of the buffer. ZSTDMT_tryGetInputRange()
-	                  * handles handing out job input buffers, and makes
-	                  * sure it doesn't overlap with any pieces still in use.
-	                  */
-	size_t capacity; /* The capacity of buffer. */
-	size_t pos;      /* The position of the current inBuff in the round
-	                  * buffer. Updated past the end if the inBuff once
-	                  * the inBuff is sent to the worker thread.
-	                  * pos <= capacity.
-	                  */
-} roundBuff_t;
-
-static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
-
-#define RSYNC_LENGTH 32
-/* Don't create chunks smaller than the zstd block size.
- * This stops us from regressing compression ratio too much,
- * and ensures our output fits in ZSTD_compressBound().
- *
- * If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then
- * ZSTD_COMPRESSBOUND() will need to be updated.
- */
-#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX
-#define RSYNC_MIN_BLOCK_SIZE (1 << RSYNC_MIN_BLOCK_LOG)
-
-typedef struct {
-	U64 hash;
-	U64 hitMask;
-	U64 primePower;
-} rsyncState_t;
-
-struct ZSTDMT_CCtx_s {
-	POOL_ctx* factory;
-	ZSTDMT_jobDescription* jobs;
-	ZSTDMT_bufferPool* bufPool;
-	ZSTDMT_CCtxPool* cctxPool;
-	ZSTDMT_seqPool* seqPool;
-	ZSTD_CCtx_params params;
-	size_t targetSectionSize;
-	size_t targetPrefixSize;
-	int jobReady; /* 1 => one job is already prepared, but pool has shortage of workers. Don't create a new job. */
-	inBuff_t inBuff;
-	roundBuff_t roundBuff;
-	serialState_t serial;
-	rsyncState_t rsync;
-	unsigned jobIDMask;
-	unsigned doneJobID;
-	unsigned nextJobID;
-	unsigned frameEnded;
-	unsigned allJobsCompleted;
-	unsigned long long frameContentSize;
-	unsigned long long consumed;
-	unsigned long long produced;
-	ZSTD_customMem cMem;
-	ZSTD_CDict* cdictLocal;
-	const ZSTD_CDict* cdict;
-	unsigned providedFactory : 1;
-};
-
-static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem) {
-	U32 jobNb;
-	if(jobTable == NULL)
-		return;
-	for(jobNb = 0; jobNb < nbJobs; jobNb++) {
-		ZSTD_pthread_mutex_destroy(&jobTable[jobNb].job_mutex);
-		ZSTD_pthread_cond_destroy(&jobTable[jobNb].job_cond);
-	}
-	ZSTD_customFree(jobTable, cMem);
-}
-
-/* ZSTDMT_allocJobsTable()
- * allocate and init a job table.
- * update *nbJobsPtr to next power of 2 value, as size of table */
-static ZSTDMT_jobDescription* ZSTDMT_createJobsTable(U32* nbJobsPtr, ZSTD_customMem cMem) {
-	U32 const nbJobsLog2 = ZSTD_highbit32(*nbJobsPtr) + 1;
-	U32 const nbJobs = 1 << nbJobsLog2;
-	U32 jobNb;
-	ZSTDMT_jobDescription* const jobTable = (ZSTDMT_jobDescription*)
-	    ZSTD_customCalloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem);
-	int initError = 0;
-	if(jobTable == NULL)
-		return NULL;
-	*nbJobsPtr = nbJobs;
-	for(jobNb = 0; jobNb < nbJobs; jobNb++) {
-		initError |= ZSTD_pthread_mutex_init(&jobTable[jobNb].job_mutex, NULL);
-		initError |= ZSTD_pthread_cond_init(&jobTable[jobNb].job_cond, NULL);
-	}
-	if(initError != 0) {
-		ZSTDMT_freeJobsTable(jobTable, nbJobs, cMem);
-		return NULL;
-	}
-	return jobTable;
-}
-
-static size_t ZSTDMT_expandJobsTable(ZSTDMT_CCtx* mtctx, U32 nbWorkers) {
-	U32 nbJobs = nbWorkers + 2;
-	if(nbJobs > mtctx->jobIDMask + 1) { /* need more job capacity */
-		ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask + 1, mtctx->cMem);
-		mtctx->jobIDMask = 0;
-		mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, mtctx->cMem);
-		if(mtctx->jobs == NULL)
-			return ERROR(memory_allocation);
-		assert((nbJobs != 0) && ((nbJobs & (nbJobs - 1)) == 0)); /* ensure nbJobs is a power of 2 */
-		mtctx->jobIDMask = nbJobs - 1;
-	}
-	return 0;
-}
-
-/* ZSTDMT_CCtxParam_setNbWorkers():
- * Internal use only */
-static size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers) {
-	return ZSTD_CCtxParams_setParameter(params, ZSTD_c_nbWorkers, (int)nbWorkers);
-}
-
-MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool) {
-	ZSTDMT_CCtx* mtctx;
-	U32 nbJobs = nbWorkers + 2;
-	int initError;
-	DEBUGLOG(3, "ZSTDMT_createCCtx_advanced (nbWorkers = %u)", nbWorkers);
-
-	if(nbWorkers < 1)
-		return NULL;
-	nbWorkers = MIN(nbWorkers, ZSTDMT_NBWORKERS_MAX);
-	if((cMem.customAlloc != NULL) ^ (cMem.customFree != NULL))
-		/* invalid custom allocator */
-		return NULL;
-
-	mtctx = (ZSTDMT_CCtx*)ZSTD_customCalloc(sizeof(ZSTDMT_CCtx), cMem);
-	if(!mtctx)
-		return NULL;
-	ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);
-	mtctx->cMem = cMem;
-	mtctx->allJobsCompleted = 1;
-	if(pool != NULL) {
-		mtctx->factory = pool;
-		mtctx->providedFactory = 1;
-	} else {
-		mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem);
-		mtctx->providedFactory = 0;
-	}
-	mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem);
-	assert(nbJobs > 0);
-	assert((nbJobs & (nbJobs - 1)) == 0); /* ensure nbJobs is a power of 2 */
-	mtctx->jobIDMask = nbJobs - 1;
-	mtctx->bufPool = ZSTDMT_createBufferPool(BUF_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
-	mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
-	mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem);
-	initError = ZSTDMT_serialState_init(&mtctx->serial);
-	mtctx->roundBuff = kNullRoundBuff;
-	if(!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) {
-		ZSTDMT_freeCCtx(mtctx);
-		return NULL;
-	}
-	DEBUGLOG(3, "mt_cctx created, for %u threads", nbWorkers);
-	return mtctx;
-}
-
-ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool) {
-#ifdef ZSTD_MULTITHREAD
-	return ZSTDMT_createCCtx_advanced_internal(nbWorkers, cMem, pool);
-#else
-	(void)nbWorkers;
-	(void)cMem;
-	(void)pool;
-	return NULL;
-#endif
-}
-
-/* ZSTDMT_releaseAllJobResources() :
- * note : ensure all workers are killed first ! */
-static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx) {
-	unsigned jobID;
-	DEBUGLOG(3, "ZSTDMT_releaseAllJobResources");
-	for(jobID = 0; jobID <= mtctx->jobIDMask; jobID++) {
-		/* Copy the mutex/cond out */
-		ZSTD_pthread_mutex_t const mutex = mtctx->jobs[jobID].job_mutex;
-		ZSTD_pthread_cond_t const cond = mtctx->jobs[jobID].job_cond;
-
-		DEBUGLOG(4, "job%02u: release dst address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].dstBuff.start);
-		ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff);
-
-		/* Clear the job description, but keep the mutex/cond */
-		ZSTD_memset(&mtctx->jobs[jobID], 0, sizeof(mtctx->jobs[jobID]));
-		mtctx->jobs[jobID].job_mutex = mutex;
-		mtctx->jobs[jobID].job_cond = cond;
-	}
-	mtctx->inBuff.buffer = g_nullBuffer;
-	mtctx->inBuff.filled = 0;
-	mtctx->allJobsCompleted = 1;
-}
-
-static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx) {
-	DEBUGLOG(4, "ZSTDMT_waitForAllJobsCompleted");
-	while(mtctx->doneJobID < mtctx->nextJobID) {
-		unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask;
-		ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
-		while(mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
-			DEBUGLOG(4, "waiting for jobCompleted signal from job %u", mtctx->doneJobID); /* we want to block when waiting for data to flush */
-			ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
-		}
-		ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex);
-		mtctx->doneJobID++;
-	}
-}
-
-size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx) {
-	if(mtctx == NULL)
-		return 0; /* compatible with free on NULL */
-	if(!mtctx->providedFactory)
-		POOL_free(mtctx->factory);        /* stop and free worker threads */
-	ZSTDMT_releaseAllJobResources(mtctx); /* release job resources into pools first */
-	ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask + 1, mtctx->cMem);
-	ZSTDMT_freeBufferPool(mtctx->bufPool);
-	ZSTDMT_freeCCtxPool(mtctx->cctxPool);
-	ZSTDMT_freeSeqPool(mtctx->seqPool);
-	ZSTDMT_serialState_free(&mtctx->serial);
-	ZSTD_freeCDict(mtctx->cdictLocal);
-	if(mtctx->roundBuff.buffer)
-		ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem);
-	ZSTD_customFree(mtctx, mtctx->cMem);
-	return 0;
-}
-
-size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx) {
-	if(mtctx == NULL)
-		return 0; /* supports sizeof NULL */
-	return sizeof(*mtctx) + POOL_sizeof(mtctx->factory) + ZSTDMT_sizeof_bufferPool(mtctx->bufPool) + (mtctx->jobIDMask + 1) * sizeof(ZSTDMT_jobDescription) + ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool) + ZSTDMT_sizeof_seqPool(mtctx->seqPool) + ZSTD_sizeof_CDict(mtctx->cdictLocal) + mtctx->roundBuff.capacity;
-}
-
-/* ZSTDMT_resize() :
- * @return : error code if fails, 0 on success */
-static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers) {
-	if(POOL_resize(mtctx->factory, nbWorkers))
-		return ERROR(memory_allocation);
-	FORWARD_IF_ERROR(ZSTDMT_expandJobsTable(mtctx, nbWorkers), "");
-	mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, BUF_POOL_MAX_NB_BUFFERS(nbWorkers));
-	if(mtctx->bufPool == NULL)
-		return ERROR(memory_allocation);
-	mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers);
-	if(mtctx->cctxPool == NULL)
-		return ERROR(memory_allocation);
-	mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers);
-	if(mtctx->seqPool == NULL)
-		return ERROR(memory_allocation);
-	ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);
-	return 0;
-}
-
-/*! ZSTDMT_updateCParams_whileCompressing() :
- *  Updates a selected set of compression parameters, remaining compatible with currently active frame.
- *  New parameters will be applied to next compression job. */
-void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams) {
-	U32 const saved_wlog = mtctx->params.cParams.windowLog; /* Do not modify windowLog while compressing */
-	int const compressionLevel = cctxParams->compressionLevel;
-	DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)",
-	         compressionLevel);
-	mtctx->params.compressionLevel = compressionLevel;
-	{
-		ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
-		cParams.windowLog = saved_wlog;
-		mtctx->params.cParams = cParams;
-	}
-}
-
-/* ZSTDMT_getFrameProgression():
- * tells how much data has been consumed (input) and produced (output) for current frame.
- * able to count progression inside worker threads.
- * Note : mutex will be acquired during statistics collection inside workers. */
-ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx) {
-	ZSTD_frameProgression fps;
-	DEBUGLOG(5, "ZSTDMT_getFrameProgression");
-	fps.ingested = mtctx->consumed + mtctx->inBuff.filled;
-	fps.consumed = mtctx->consumed;
-	fps.produced = fps.flushed = mtctx->produced;
-	fps.currentJobID = mtctx->nextJobID;
-	fps.nbActiveWorkers = 0;
-	{
-		unsigned jobNb;
-		unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady;
-		assert(mtctx->jobReady <= 1);
-		DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
-		         mtctx->doneJobID, lastJobNb, mtctx->jobReady)
-		for(jobNb = mtctx->doneJobID; jobNb < lastJobNb; jobNb++) {
-			unsigned const wJobID = jobNb & mtctx->jobIDMask;
-			ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID];
-			ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);
-			{
-				size_t const cResult = jobPtr->cSize;
-				size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
-				size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
-				assert(flushed <= produced);
-				fps.ingested += jobPtr->src.size;
-				fps.consumed += jobPtr->consumed;
-				fps.produced += produced;
-				fps.flushed += flushed;
-				fps.nbActiveWorkers += (jobPtr->consumed < jobPtr->src.size);
-			}
-			ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
-		}
-	}
-	return fps;
-}
-
-size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx) {
-	size_t toFlush;
-	unsigned const jobID = mtctx->doneJobID;
-	assert(jobID <= mtctx->nextJobID);
-	if(jobID == mtctx->nextJobID)
-		return 0; /* no active job => nothing to flush */
-
-	/* look into oldest non-fully-flushed job */
-	{
-		unsigned const wJobID = jobID & mtctx->jobIDMask;
-		ZSTDMT_jobDescription* const jobPtr = &mtctx->jobs[wJobID];
-		ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);
-		{
-			size_t const cResult = jobPtr->cSize;
-			size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
-			size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
-			assert(flushed <= produced);
-			assert(jobPtr->consumed <= jobPtr->src.size);
-			toFlush = produced - flushed;
-			/* if toFlush==0, nothing is available to flush.
-			 * However, jobID is expected to still be active:
-			 * if jobID was already completed and fully flushed,
-			 * ZSTDMT_flushProduced() should have already moved onto next job.
-			 * Therefore, some input has not yet been consumed. */
-			if(toFlush == 0) {
-				assert(jobPtr->consumed < jobPtr->src.size);
-			}
-		}
-		ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
-	}
-
-	return toFlush;
-}
-
-/* ------------------------------------------ */
-/* =====   Multi-threaded compression   ===== */
-/* ------------------------------------------ */
-
-static unsigned ZSTDMT_computeTargetJobLog(const ZSTD_CCtx_params* params) {
-	unsigned jobLog;
-	if(params->ldmParams.enableLdm == ZSTD_ps_enable) {
-		/* In Long Range Mode, the windowLog is typically oversized.
-		 * In which case, it's preferable to determine the jobSize
-		 * based on cycleLog instead. */
-		jobLog = MAX(21, ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy) + 3);
-	} else {
-		jobLog = MAX(20, params->cParams.windowLog + 2);
-	}
-	return MIN(jobLog, (unsigned)ZSTDMT_JOBLOG_MAX);
-}
-
-static int ZSTDMT_overlapLog_default(ZSTD_strategy strat) {
-	switch(strat) {
-	case ZSTD_btultra2:
-		return 9;
-	case ZSTD_btultra:
-	case ZSTD_btopt:
-		return 8;
-	case ZSTD_btlazy2:
-	case ZSTD_lazy2:
-		return 7;
-	case ZSTD_lazy:
-	case ZSTD_greedy:
-	case ZSTD_dfast:
-	case ZSTD_fast:
-	default:;
-	}
-	return 6;
-}
-
-static int ZSTDMT_overlapLog(int ovlog, ZSTD_strategy strat) {
-	assert(0 <= ovlog && ovlog <= 9);
-	if(ovlog == 0)
-		return ZSTDMT_overlapLog_default(strat);
-	return ovlog;
-}
-
-static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params) {
-	int const overlapRLog = 9 - ZSTDMT_overlapLog(params->overlapLog, params->cParams.strategy);
-	int ovLog = (overlapRLog >= 8) ? 0 : (params->cParams.windowLog - overlapRLog);
-	assert(0 <= overlapRLog && overlapRLog <= 8);
-	if(params->ldmParams.enableLdm == ZSTD_ps_enable) {
-		/* In Long Range Mode, the windowLog is typically oversized.
-		 * In which case, it's preferable to determine the jobSize
-		 * based on chainLog instead.
-		 * Then, ovLog becomes a fraction of the jobSize, rather than windowSize */
-		ovLog = MIN(params->cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2) - overlapRLog;
-	}
-	assert(0 <= ovLog && ovLog <= ZSTD_WINDOWLOG_MAX);
-	DEBUGLOG(4, "overlapLog : %i", params->overlapLog);
-	DEBUGLOG(4, "overlap size : %i", 1 << ovLog);
-	return (ovLog == 0) ? 0 : (size_t)1 << ovLog;
-}
-
-/* ====================================== */
-/* =======      Streaming API     ======= */
-/* ====================================== */
-
-size_t ZSTDMT_initCStream_internal(
-    ZSTDMT_CCtx* mtctx,
-    const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
-    const ZSTD_CDict* cdict, ZSTD_CCtx_params params,
-    unsigned long long pledgedSrcSize) {
-	DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u)",
-	         (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx);
-
-	/* params supposed partially fully validated at this point */
-	assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
-	assert(!((dict) && (cdict))); /* either dict or cdict, not both */
-
-	/* init */
-	if(params.nbWorkers != mtctx->params.nbWorkers)
-		FORWARD_IF_ERROR(ZSTDMT_resize(mtctx, params.nbWorkers), "");
-
-	if(params.jobSize != 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN)
-		params.jobSize = ZSTDMT_JOBSIZE_MIN;
-	if(params.jobSize > (size_t)ZSTDMT_JOBSIZE_MAX)
-		params.jobSize = (size_t)ZSTDMT_JOBSIZE_MAX;
-
-	DEBUGLOG(4, "ZSTDMT_initCStream_internal: %u workers", params.nbWorkers);
-
-	if(mtctx->allJobsCompleted == 0) { /* previous compression not correctly finished */
-		ZSTDMT_waitForAllJobsCompleted(mtctx);
-		ZSTDMT_releaseAllJobResources(mtctx);
-		mtctx->allJobsCompleted = 1;
-	}
-
-	mtctx->params = params;
-	mtctx->frameContentSize = pledgedSrcSize;
-	if(dict) {
-		ZSTD_freeCDict(mtctx->cdictLocal);
-		mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
-		                                              ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */
-		                                              params.cParams, mtctx->cMem);
-		mtctx->cdict = mtctx->cdictLocal;
-		if(mtctx->cdictLocal == NULL)
-			return ERROR(memory_allocation);
-	} else {
-		ZSTD_freeCDict(mtctx->cdictLocal);
-		mtctx->cdictLocal = NULL;
-		mtctx->cdict = cdict;
-	}
-
-	mtctx->targetPrefixSize = ZSTDMT_computeOverlapSize(&params);
-	DEBUGLOG(4, "overlapLog=%i => %u KB", params.overlapLog, (U32)(mtctx->targetPrefixSize >> 10));
-	mtctx->targetSectionSize = params.jobSize;
-	if(mtctx->targetSectionSize == 0) {
-		mtctx->targetSectionSize = 1ULL << ZSTDMT_computeTargetJobLog(&params);
-	}
-	assert(mtctx->targetSectionSize <= (size_t)ZSTDMT_JOBSIZE_MAX);
-
-	if(params.rsyncable) {
-		/* Aim for the targetsectionSize as the average job size. */
-		U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10);
-		U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10);
-		/* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our
-		 * expected job size is at least 4x larger. */
-		assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2);
-		DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
-		mtctx->rsync.hash = 0;
-		mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
-		mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
-	}
-	if(mtctx->targetSectionSize < mtctx->targetPrefixSize)
-		mtctx->targetSectionSize = mtctx->targetPrefixSize; /* job size must be >= overlap size */
-	DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize >> 10), (U32)params.jobSize);
-	DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize >> 10));
-	ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
-	{
-		/* If ldm is enabled we need windowSize space. */
-		size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0;
-		/* Two buffers of slack, plus extra space for the overlap
-		 * This is the minimum slack that LDM works with. One extra because
-		 * flush might waste up to targetSectionSize-1 bytes. Another extra
-		 * for the overlap (if > 0), then one to fill which doesn't overlap
-		 * with the LDM window.
-		 */
-		size_t const nbSlackBuffers = 2 + (mtctx->targetPrefixSize > 0);
-		size_t const slackSize = mtctx->targetSectionSize * nbSlackBuffers;
-		/* Compute the total size, and always have enough slack */
-		size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1);
-		size_t const sectionsSize = mtctx->targetSectionSize * nbWorkers;
-		size_t const capacity = MAX(windowSize, sectionsSize) + slackSize;
-		if(mtctx->roundBuff.capacity < capacity) {
-			if(mtctx->roundBuff.buffer)
-				ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem);
-			mtctx->roundBuff.buffer = (BYTE*)ZSTD_customMalloc(capacity, mtctx->cMem);
-			if(mtctx->roundBuff.buffer == NULL) {
-				mtctx->roundBuff.capacity = 0;
-				return ERROR(memory_allocation);
-			}
-			mtctx->roundBuff.capacity = capacity;
-		}
-	}
-	DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity >> 10));
-	mtctx->roundBuff.pos = 0;
-	mtctx->inBuff.buffer = g_nullBuffer;
-	mtctx->inBuff.filled = 0;
-	mtctx->inBuff.prefix = kNullRange;
-	mtctx->doneJobID = 0;
-	mtctx->nextJobID = 0;
-	mtctx->frameEnded = 0;
-	mtctx->allJobsCompleted = 0;
-	mtctx->consumed = 0;
-	mtctx->produced = 0;
-	if(ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize,
-	                            dict, dictSize, dictContentType))
-		return ERROR(memory_allocation);
-	return 0;
-}
-
-/* ZSTDMT_writeLastEmptyBlock()
- * Write a single empty block with an end-of-frame to finish a frame.
- * Job must be created from streaming variant.
- * This function is always successful if expected conditions are fulfilled.
- */
-static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job) {
-	assert(job->lastJob == 1);
-	assert(job->src.size == 0);         /* last job is empty -> will be simplified into a last empty block */
-	assert(job->firstJob == 0);         /* cannot be first job, as it also needs to create frame header */
-	assert(job->dstBuff.start == NULL); /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */
-	job->dstBuff = ZSTDMT_getBuffer(job->bufPool);
-	if(job->dstBuff.start == NULL) {
-		job->cSize = ERROR(memory_allocation);
-		return;
-	}
-	assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize); /* no buffer should ever be that small */
-	job->src = kNullRange;
-	job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity);
-	assert(!ZSTD_isError(job->cSize));
-	assert(job->consumed == 0);
-}
-
-static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZSTD_EndDirective endOp) {
-	unsigned const jobID = mtctx->nextJobID & mtctx->jobIDMask;
-	int const endFrame = (endOp == ZSTD_e_end);
-
-	if(mtctx->nextJobID > mtctx->doneJobID + mtctx->jobIDMask) {
-		DEBUGLOG(5, "ZSTDMT_createCompressionJob: will not create new job : table is full");
-		assert((mtctx->nextJobID & mtctx->jobIDMask) == (mtctx->doneJobID & mtctx->jobIDMask));
-		return 0;
-	}
-
-	if(!mtctx->jobReady) {
-		BYTE const * src = (BYTE const *)mtctx->inBuff.buffer.start;
-		DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ",
-		         mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size);
-		mtctx->jobs[jobID].src.start = src;
-		mtctx->jobs[jobID].src.size = srcSize;
-		assert(mtctx->inBuff.filled >= srcSize);
-		mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix;
-		mtctx->jobs[jobID].consumed = 0;
-		mtctx->jobs[jobID].cSize = 0;
-		mtctx->jobs[jobID].params = mtctx->params;
-		mtctx->jobs[jobID].cdict = mtctx->nextJobID == 0 ? mtctx->cdict : NULL;
-		mtctx->jobs[jobID].fullFrameSize = mtctx->frameContentSize;
-		mtctx->jobs[jobID].dstBuff = g_nullBuffer;
-		mtctx->jobs[jobID].cctxPool = mtctx->cctxPool;
-		mtctx->jobs[jobID].bufPool = mtctx->bufPool;
-		mtctx->jobs[jobID].seqPool = mtctx->seqPool;
-		mtctx->jobs[jobID].serial = &mtctx->serial;
-		mtctx->jobs[jobID].jobID = mtctx->nextJobID;
-		mtctx->jobs[jobID].firstJob = (mtctx->nextJobID == 0);
-		mtctx->jobs[jobID].lastJob = endFrame;
-		mtctx->jobs[jobID].frameChecksumNeeded = mtctx->params.fParams.checksumFlag && endFrame && (mtctx->nextJobID > 0);
-		mtctx->jobs[jobID].dstFlushed = 0;
-
-		/* Update the round buffer pos and clear the input buffer to be reset */
-		mtctx->roundBuff.pos += srcSize;
-		mtctx->inBuff.buffer = g_nullBuffer;
-		mtctx->inBuff.filled = 0;
-		/* Set the prefix */
-		if(!endFrame) {
-			size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize);
-			mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize;
-			mtctx->inBuff.prefix.size = newPrefixSize;
-		} else { /* endFrame==1 => no need for another input buffer */
-			mtctx->inBuff.prefix = kNullRange;
-			mtctx->frameEnded = endFrame;
-			if(mtctx->nextJobID == 0) {
-				/* single job exception : checksum is already calculated directly within worker thread */
-				mtctx->params.fParams.checksumFlag = 0;
-			}
-		}
-
-		if((srcSize == 0) && (mtctx->nextJobID > 0) /*single job must also write frame header*/) {
-			DEBUGLOG(5, "ZSTDMT_createCompressionJob: creating a last empty block to end frame");
-			assert(endOp == ZSTD_e_end); /* only possible case : need to end the frame with an empty last block */
-			ZSTDMT_writeLastEmptyBlock(mtctx->jobs + jobID);
-			mtctx->nextJobID++;
-			return 0;
-		}
-	}
-
-	DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes  (end:%u, jobNb == %u (mod:%u))",
-	         mtctx->nextJobID,
-	         (U32)mtctx->jobs[jobID].src.size,
-	         mtctx->jobs[jobID].lastJob,
-	         mtctx->nextJobID,
-	         jobID);
-	if(POOL_tryAdd(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[jobID])) {
-		mtctx->nextJobID++;
-		mtctx->jobReady = 0;
-	} else {
-		DEBUGLOG(5, "ZSTDMT_createCompressionJob: no worker available for job %u", mtctx->nextJobID);
-		mtctx->jobReady = 1;
-	}
-	return 0;
-}
-
-/*! ZSTDMT_flushProduced() :
- *  flush whatever data has been produced but not yet flushed in current job.
- *  move to next job if current one is fully flushed.
- * `output` : `pos` will be updated with amount of data flushed .
- * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush .
- * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */
-static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, unsigned blockToFlush, ZSTD_EndDirective end) {
-	unsigned const wJobID = mtctx->doneJobID & mtctx->jobIDMask;
-	DEBUGLOG(5, "ZSTDMT_flushProduced (blocking:%u , job %u <= %u)",
-	         blockToFlush, mtctx->doneJobID, mtctx->nextJobID);
-	assert(output->size >= output->pos);
-
-	ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
-	if(blockToFlush && (mtctx->doneJobID < mtctx->nextJobID)) {
-		assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize);
-		while(mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) { /* nothing to flush */
-			if(mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) {
-				DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none",
-				         mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size);
-				break;
-			}
-			DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)",
-			         mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
-			ZSTD_pthread_cond_wait(&mtctx->jobs[wJobID].job_cond, &mtctx->jobs[wJobID].job_mutex); /* block when nothing to flush but some to come */
-		}
-	}
-
-	/* try to flush something */
-	{
-		size_t cSize = mtctx->jobs[wJobID].cSize;                /* shared */
-		size_t const srcConsumed = mtctx->jobs[wJobID].consumed; /* shared */
-		size_t const srcSize = mtctx->jobs[wJobID].src.size;     /* read-only, could be done after mutex lock, but no-declaration-after-statement */
-		ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
-		if(ZSTD_isError(cSize)) {
-			DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s",
-			         mtctx->doneJobID, ZSTD_getErrorName(cSize));
-			ZSTDMT_waitForAllJobsCompleted(mtctx);
-			ZSTDMT_releaseAllJobResources(mtctx);
-			return cSize;
-		}
-		/* add frame checksum if necessary (can only happen once) */
-		assert(srcConsumed <= srcSize);
-		if((srcConsumed == srcSize) /* job completed -> worker no longer active */
-		   && mtctx->jobs[wJobID].frameChecksumNeeded) {
-			U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState);
-			DEBUGLOG(4, "ZSTDMT_flushProduced: writing checksum : %08X \n", checksum);
-			MEM_writeLE32((char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].cSize, checksum);
-			cSize += 4;
-			mtctx->jobs[wJobID].cSize += 4; /* can write this shared value, as worker is no longer active */
-			mtctx->jobs[wJobID].frameChecksumNeeded = 0;
-		}
-
-		if(cSize > 0) { /* compression is ongoing or completed */
-			size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos);
-			DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)",
-			         (U32)toFlush, mtctx->doneJobID, (U32)srcConsumed, (U32)srcSize, (U32)cSize);
-			assert(mtctx->doneJobID < mtctx->nextJobID);
-			assert(cSize >= mtctx->jobs[wJobID].dstFlushed);
-			assert(mtctx->jobs[wJobID].dstBuff.start != NULL);
-			if(toFlush > 0) {
-				ZSTD_memcpy((char*)output->dst + output->pos,
-				            (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed,
-				            toFlush);
-			}
-			output->pos += toFlush;
-			mtctx->jobs[wJobID].dstFlushed += toFlush; /* can write : this value is only used by mtctx */
-
-			if((srcConsumed == srcSize)                        /* job is completed */
-			   && (mtctx->jobs[wJobID].dstFlushed == cSize)) { /* output buffer fully flushed => free this job position */
-				DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one",
-				         mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
-				ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff);
-				DEBUGLOG(5, "dstBuffer released");
-				mtctx->jobs[wJobID].dstBuff = g_nullBuffer;
-				mtctx->jobs[wJobID].cSize = 0; /* ensure this job slot is considered "not started" in future check */
-				mtctx->consumed += srcSize;
-				mtctx->produced += cSize;
-				mtctx->doneJobID++;
-			}
-		}
-
-		/* return value : how many bytes left in buffer ; fake it to 1 when unknown but >0 */
-		if(cSize > mtctx->jobs[wJobID].dstFlushed)
-			return (cSize - mtctx->jobs[wJobID].dstFlushed);
-		if(srcSize > srcConsumed)
-			return 1; /* current job not completely compressed */
-	}
-	if(mtctx->doneJobID < mtctx->nextJobID)
-		return 1; /* some more jobs ongoing */
-	if(mtctx->jobReady)
-		return 1; /* one job is ready to push, just not yet in the list */
-	if(mtctx->inBuff.filled > 0)
-		return 1;                                /* input is not empty, and still needs to be converted into a job */
-	mtctx->allJobsCompleted = mtctx->frameEnded; /* all jobs are entirely flushed => if this one is last one, frame is completed */
-	if(end == ZSTD_e_end)
-		return !mtctx->frameEnded; /* for ZSTD_e_end, question becomes : is frame completed ? instead of : are internal buffers fully flushed ? */
-	return 0;                      /* internal buffers fully flushed */
-}
-
-/**
- * Returns the range of data used by the earliest job that is not yet complete.
- * If the data of the first job is broken up into two segments, we cover both
- * sections.
- */
-static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) {
-	unsigned const firstJobID = mtctx->doneJobID;
-	unsigned const lastJobID = mtctx->nextJobID;
-	unsigned jobID;
-
-	for(jobID = firstJobID; jobID < lastJobID; ++jobID) {
-		unsigned const wJobID = jobID & mtctx->jobIDMask;
-		size_t consumed;
-
-		ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
-		consumed = mtctx->jobs[wJobID].consumed;
-		ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
-
-		if(consumed < mtctx->jobs[wJobID].src.size) {
-			range_t range = mtctx->jobs[wJobID].prefix;
-			if(range.size == 0) {
-				/* Empty prefix */
-				range = mtctx->jobs[wJobID].src;
-			}
-			/* Job source in multiple segments not supported yet */
-			assert(range.start <= mtctx->jobs[wJobID].src.start);
-			return range;
-		}
-	}
-	return kNullRange;
-}
-
-/**
- * Returns non-zero iff buffer and range overlap.
- */
-static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range) {
-	BYTE const * const bufferStart = (BYTE const *)buffer.start;
-	BYTE const * const rangeStart = (BYTE const *)range.start;
-
-	if(rangeStart == NULL || bufferStart == NULL)
-		return 0;
-
-	{
-		BYTE const * const bufferEnd = bufferStart + buffer.capacity;
-		BYTE const * const rangeEnd = rangeStart + range.size;
-
-		/* Empty ranges cannot overlap */
-		if(bufferStart == bufferEnd || rangeStart == rangeEnd)
-			return 0;
-
-		return bufferStart < rangeEnd && rangeStart < bufferEnd;
-	}
-}
-
-static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window) {
-	range_t extDict;
-	range_t prefix;
-
-	DEBUGLOG(5, "ZSTDMT_doesOverlapWindow");
-	extDict.start = window.dictBase + window.lowLimit;
-	extDict.size = window.dictLimit - window.lowLimit;
-
-	prefix.start = window.base + window.dictLimit;
-	prefix.size = window.nextSrc - (window.base + window.dictLimit);
-	DEBUGLOG(5, "extDict [0x%zx, 0x%zx)",
-	         (size_t)extDict.start,
-	         (size_t)extDict.start + extDict.size);
-	DEBUGLOG(5, "prefix  [0x%zx, 0x%zx)",
-	         (size_t)prefix.start,
-	         (size_t)prefix.start + prefix.size);
-
-	return ZSTDMT_isOverlapped(buffer, extDict) || ZSTDMT_isOverlapped(buffer, prefix);
-}
-
-static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer) {
-	if(mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable) {
-		ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
-		DEBUGLOG(5, "ZSTDMT_waitForLdmComplete");
-		DEBUGLOG(5, "source  [0x%zx, 0x%zx)",
-		         (size_t)buffer.start,
-		         (size_t)buffer.start + buffer.capacity);
-		ZSTD_PTHREAD_MUTEX_LOCK(mutex);
-		while(ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) {
-			DEBUGLOG(5, "Waiting for LDM to finish...");
-			ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex);
-		}
-		DEBUGLOG(6, "Done waiting for LDM to finish");
-		ZSTD_pthread_mutex_unlock(mutex);
-	}
-}
-
-/**
- * Attempts to set the inBuff to the next section to fill.
- * If any part of the new section is still in use we give up.
- * Returns non-zero if the buffer is filled.
- */
-static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx) {
-	range_t const inUse = ZSTDMT_getInputDataInUse(mtctx);
-	size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos;
-	size_t const target = mtctx->targetSectionSize;
-	buffer_t buffer;
-
-	DEBUGLOG(5, "ZSTDMT_tryGetInputRange");
-	assert(mtctx->inBuff.buffer.start == NULL);
-	assert(mtctx->roundBuff.capacity >= target);
-
-	if(spaceLeft < target) {
-		/* ZSTD_invalidateRepCodes() doesn't work for extDict variants.
-		 * Simply copy the prefix to the beginning in that case.
-		 */
-		BYTE* const start = (BYTE*)mtctx->roundBuff.buffer;
-		size_t const prefixSize = mtctx->inBuff.prefix.size;
-
-		buffer.start = start;
-		buffer.capacity = prefixSize;
-		if(ZSTDMT_isOverlapped(buffer, inUse)) {
-			DEBUGLOG(5, "Waiting for buffer...");
-			return 0;
-		}
-		ZSTDMT_waitForLdmComplete(mtctx, buffer);
-		ZSTD_memmove(start, mtctx->inBuff.prefix.start, prefixSize);
-		mtctx->inBuff.prefix.start = start;
-		mtctx->roundBuff.pos = prefixSize;
-	}
-	buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos;
-	buffer.capacity = target;
-
-	if(ZSTDMT_isOverlapped(buffer, inUse)) {
-		DEBUGLOG(5, "Waiting for buffer...");
-		return 0;
-	}
-	assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
-
-	ZSTDMT_waitForLdmComplete(mtctx, buffer);
-
-	DEBUGLOG(5, "Using prefix range [%zx, %zx)",
-	         (size_t)mtctx->inBuff.prefix.start,
-	         (size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size);
-	DEBUGLOG(5, "Using source range [%zx, %zx)",
-	         (size_t)buffer.start,
-	         (size_t)buffer.start + buffer.capacity);
-
-	mtctx->inBuff.buffer = buffer;
-	mtctx->inBuff.filled = 0;
-	assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity);
-	return 1;
-}
-
-typedef struct {
-	size_t toLoad; /* The number of bytes to load from the input. */
-	int flush;     /* Boolean declaring if we must flush because we found a synchronization point. */
-} syncPoint_t;
-
-/**
- * Searches through the input for a synchronization point. If one is found, we
- * will instruct the caller to flush, and return the number of bytes to load.
- * Otherwise, we will load as many bytes as possible and instruct the caller
- * to continue as normal.
- */
-static syncPoint_t
-findSynchronizationPoint(ZSTDMT_CCtx const * mtctx, ZSTD_inBuffer const input) {
-	BYTE const * const istart = (BYTE const *)input.src + input.pos;
-	U64 const primePower = mtctx->rsync.primePower;
-	U64 const hitMask = mtctx->rsync.hitMask;
-
-	syncPoint_t syncPoint;
-	U64 hash;
-	BYTE const * prev;
-	size_t pos;
-
-	syncPoint.toLoad = MIN(input.size - input.pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
-	syncPoint.flush = 0;
-	if(!mtctx->params.rsyncable)
-		/* Rsync is disabled. */
-		return syncPoint;
-	if(mtctx->inBuff.filled + input.size - input.pos < RSYNC_MIN_BLOCK_SIZE)
-		/* We don't emit synchronization points if it would produce too small blocks.
-		 * We don't have enough input to find a synchronization point, so don't look.
-		 */
-		return syncPoint;
-	if(mtctx->inBuff.filled + syncPoint.toLoad < RSYNC_LENGTH)
-		/* Not enough to compute the hash.
-		 * We will miss any synchronization points in this RSYNC_LENGTH byte
-		 * window. However, since it depends only in the internal buffers, if the
-		 * state is already synchronized, we will remain synchronized.
-		 * Additionally, the probability that we miss a synchronization point is
-		 * low: RSYNC_LENGTH / targetSectionSize.
-		 */
-		return syncPoint;
-	/* Initialize the loop variables. */
-	if(mtctx->inBuff.filled < RSYNC_MIN_BLOCK_SIZE) {
-		/* We don't need to scan the first RSYNC_MIN_BLOCK_SIZE positions
-		 * because they can't possibly be a sync point. So we can start
-		 * part way through the input buffer.
-		 */
-		pos = RSYNC_MIN_BLOCK_SIZE - mtctx->inBuff.filled;
-		if(pos >= RSYNC_LENGTH) {
-			prev = istart + pos - RSYNC_LENGTH;
-			hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
-		} else {
-			assert(mtctx->inBuff.filled >= RSYNC_LENGTH);
-			prev = (BYTE const *)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
-			hash = ZSTD_rollingHash_compute(prev + pos, (RSYNC_LENGTH - pos));
-			hash = ZSTD_rollingHash_append(hash, istart, pos);
-		}
-	} else {
-		/* We have enough bytes buffered to initialize the hash,
-		 * and have processed enough bytes to find a sync point.
-		 * Start scanning at the beginning of the input.
-		 */
-		assert(mtctx->inBuff.filled >= RSYNC_MIN_BLOCK_SIZE);
-		assert(RSYNC_MIN_BLOCK_SIZE >= RSYNC_LENGTH);
-		pos = 0;
-		prev = (BYTE const *)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
-		hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
-		if((hash & hitMask) == hitMask) {
-			/* We're already at a sync point so don't load any more until
-			 * we're able to flush this sync point.
-			 * This likely happened because the job table was full so we
-			 * couldn't add our job.
-			 */
-			syncPoint.toLoad = 0;
-			syncPoint.flush = 1;
-			return syncPoint;
-		}
-	}
-	/* Starting with the hash of the previous RSYNC_LENGTH bytes, roll
-	 * through the input. If we hit a synchronization point, then cut the
-	 * job off, and tell the compressor to flush the job. Otherwise, load
-	 * all the bytes and continue as normal.
-	 * If we go too long without a synchronization point (targetSectionSize)
-	 * then a block will be emitted anyways, but this is okay, since if we
-	 * are already synchronized we will remain synchronized.
-	 */
-	assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
-	for(; pos < syncPoint.toLoad; ++pos) {
-		BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH];
-		/* This assert is very expensive, and Debian compiles with asserts enabled.
-		 * So disable it for now. We can get similar coverage by checking it at the
-		 * beginning & end of the loop.
-		 * assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
-		 */
-		hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower);
-		assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE);
-		if((hash & hitMask) == hitMask) {
-			syncPoint.toLoad = pos + 1;
-			syncPoint.flush = 1;
-			++pos; /* for assert */
-			break;
-		}
-	}
-	assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
-	return syncPoint;
-}
-
-size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx) {
-	size_t hintInSize = mtctx->targetSectionSize - mtctx->inBuff.filled;
-	if(hintInSize == 0)
-		hintInSize = mtctx->targetSectionSize;
-	return hintInSize;
-}
-
-/** ZSTDMT_compressStream_generic() :
- *  internal use only - exposed to be invoked from zstd_compress.c
- *  assumption : output and input are valid (pos <= size)
- * @return : minimum amount of data remaining to flush, 0 if none */
-size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
-                                     ZSTD_outBuffer* output,
-                                     ZSTD_inBuffer* input,
-                                     ZSTD_EndDirective endOp) {
-	unsigned forwardInputProgress = 0;
-	DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)",
-	         (U32)endOp, (U32)(input->size - input->pos));
-	assert(output->pos <= output->size);
-	assert(input->pos <= input->size);
-
-	if((mtctx->frameEnded) && (endOp == ZSTD_e_continue)) {
-		/* current frame being ended. Only flush/end are allowed */
-		return ERROR(stage_wrong);
-	}
-
-	/* fill input buffer */
-	if((!mtctx->jobReady) && (input->size > input->pos)) { /* support NULL input */
-		if(mtctx->inBuff.buffer.start == NULL) {
-			assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */
-			if(!ZSTDMT_tryGetInputRange(mtctx)) {
-				/* It is only possible for this operation to fail if there are
-				 * still compression jobs ongoing.
-				 */
-				DEBUGLOG(5, "ZSTDMT_tryGetInputRange failed");
-				assert(mtctx->doneJobID != mtctx->nextJobID);
-			} else
-				DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start);
-		}
-		if(mtctx->inBuff.buffer.start != NULL) {
-			syncPoint_t const syncPoint = findSynchronizationPoint(mtctx, *input);
-			if(syncPoint.flush && endOp == ZSTD_e_continue) {
-				endOp = ZSTD_e_flush;
-			}
-			assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize);
-			DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u",
-			         (U32)syncPoint.toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize);
-			ZSTD_memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, syncPoint.toLoad);
-			input->pos += syncPoint.toLoad;
-			mtctx->inBuff.filled += syncPoint.toLoad;
-			forwardInputProgress = syncPoint.toLoad > 0;
-		}
-	}
-	if((input->pos < input->size) && (endOp == ZSTD_e_end)) {
-		/* Can't end yet because the input is not fully consumed.
-		 * We are in one of these cases:
-		 * - mtctx->inBuff is NULL & empty: we couldn't get an input buffer so don't create a new job.
-		 * - We filled the input buffer: flush this job but don't end the frame.
-		 * - We hit a synchronization point: flush this job but don't end the frame.
-		 */
-		assert(mtctx->inBuff.filled == 0 || mtctx->inBuff.filled == mtctx->targetSectionSize || mtctx->params.rsyncable);
-		endOp = ZSTD_e_flush;
-	}
-
-	if((mtctx->jobReady) || (mtctx->inBuff.filled >= mtctx->targetSectionSize) /* filled enough : let's compress */
-	   || ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0))           /* something to flush : let's go */
-	   || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded))) {                   /* must finish the frame with a zero-size block */
-		size_t const jobSize = mtctx->inBuff.filled;
-		assert(mtctx->inBuff.filled <= mtctx->targetSectionSize);
-		FORWARD_IF_ERROR(ZSTDMT_createCompressionJob(mtctx, jobSize, endOp), "");
-	}
-
-	/* check for potential compressed data ready to be flushed */
-	{
-		size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */
-		if(input->pos < input->size)
-			return MAX(remainingToFlush, 1); /* input not consumed : do not end flush yet */
-		DEBUGLOG(5, "end of ZSTDMT_compressStream_generic: remainingToFlush = %u", (U32)remainingToFlush);
-		return remainingToFlush;
-	}
-}