Remove KV-cache compression disabling flag for compressed models (#1141)
* Remove kv cache compression disabling flag for compressed models

* Add kv-cache precision flag check to a separate method

* Add deprecation warning for OVDynamicQuantizationConfig

* Fix test

* Update optimum/intel/openvino/configuration.py

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>

---------

Co-authored-by: Alexander Kozlov <alexander.kozlov@intel.com>
nikita-savelyevv and AlexKoff88 authored Feb 5, 2025
1 parent 61a74cd commit f601b8b
Showing 6 changed files with 85 additions and 59 deletions.
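
In effect, weight-compressed models produced by the OpenVINO export path no longer carry the runtime_options/KV_CACHE_PRECISION hint that previously pinned the KV cache to f16 and thereby disabled KV-cache compression. A minimal sketch of how the flag can be inspected on an exported IR, assuming an illustrative model path (not taken from this commit):

    import openvino as ov

    core = ov.Core()
    # "model_dir/openvino_model.xml" is a hypothetical path to an exported IR.
    ov_model = core.read_model("model_dir/openvino_model.xml")

    # Before this change, int8/int4-compressed models kept a KV_CACHE_PRECISION entry
    # under runtime_options set to "f16", which disabled u8 KV-cache compression.
    if ov_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
        precision = ov_model.get_rt_info(["runtime_options", "KV_CACHE_PRECISION"]).value
        print(f"KV cache precision pinned to: {precision}")
    else:
        print("No KV_CACHE_PRECISION override; runtime defaults apply.")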
6 changes: 0 additions & 6 deletions optimum/exporters/openvino/__main__.py
@@ -493,12 +493,6 @@ class StoreAttr(object):
from optimum.intel.openvino.quantization import _weight_only_quantization

_weight_only_quantization(submodel, quantization_config)
# kv cache compression disabled if quantization config is not provided,
# to keep aligned result of applying auto int8 compression and via explicit setting config, we should update it
if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
prev_rt_info = submodel.get_rt_info("runtime_options").value
prev_rt_info.pop("KV_CACHE_PRECISION")
submodel.set_rt_info(prev_rt_info, "runtime_options")
compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
del submodel
7 changes: 7 additions & 0 deletions optimum/intel/openvino/configuration.py
@@ -578,6 +578,13 @@ def __init__(
):
super().__init__(bits=bits, sym=sym, group_size=weights_group_size, **kwargs)
self.activations_group_size = activations_group_size
logger.warning(
"OVDynamicQuantizationConfig is deprecated and will be removed in optimum-intel v1.24.0. "
"Dynamic quantization and KV cache compression are enabled by default starting from OpenVINO 2024.6 and "
"there is no need to enable them manually. If you need precise control over these parameters, please "
"provide `DYNAMIC_QUANTIZATION_GROUP_SIZE` and `KV_CACHE_PRECISION` with `ov_config` argument during model "
"inference."
)


@dataclass
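For code that currently relies on OVDynamicQuantizationConfig, the deprecation message above points to ov_config as the replacement. A hedged sketch of that path, with an illustrative model identifier and example property values (neither taken from this commit):

    from optimum.intel import OVModelForCausalLM

    # Pass the OpenVINO runtime options directly instead of using OVDynamicQuantizationConfig.
    # Both keys are standard OpenVINO properties named in the deprecation warning.
    model = OVModelForCausalLM.from_pretrained(
        "model_id_or_path",  # hypothetical identifier
        ov_config={
            "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32",  # example group size
            "KV_CACHE_PRECISION": "u8",               # keep KV-cache compression enabled
        },
    )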
11 changes: 10 additions & 1 deletion optimum/intel/openvino/quantization.py
@@ -1042,7 +1042,7 @@ def _weight_only_quantization(
else:
mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM

return nncf.compress_weights(
compressed_model = nncf.compress_weights(
model,
mode=mode,
ratio=config.ratio,
@@ -1060,6 +1060,15 @@ def _weight_only_quantization(
**kwargs,
)

# If KV cache compression was disabled, remove the disabling flag from the model
if compressed_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
prev_rt_info = compressed_model.get_rt_info("runtime_options").value
if prev_rt_info["KV_CACHE_PRECISION"] == "f16":
prev_rt_info.pop("KV_CACHE_PRECISION")
compressed_model.set_rt_info(prev_rt_info, "runtime_options")

return compressed_model


def _full_quantization(
model: openvino.runtime.Model,
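Moving the cleanup into _weight_only_quantization means it now runs for every weight-only compression path, not only the default int8 export in __main__.py. A hedged sketch of one such path through the public optimum-intel API, with an illustrative model identifier and 4-bit settings (assumptions, not values from this commit):

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    # Weight-only compression invokes _weight_only_quantization under the hood; with this
    # change, the compressed model is returned without the "f16" KV_CACHE_PRECISION override
    # if the compression step added one.
    quantization_config = OVWeightQuantizationConfig(bits=4, group_size=128, ratio=0.8)
    model = OVModelForCausalLM.from_pretrained(
        "model_id_or_path",  # hypothetical identifier
        export=True,
        quantization_config=quantization_config,
    )
    model.save_pretrained("ov_model_int4")  # hypothetical output directory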
42 changes: 21 additions & 21 deletions tests/openvino/test_exporters_cli.py
@@ -21,7 +21,7 @@
from utils_tests import (
_ARCHITECTURES_TO_EXPECTED_INT8,
MODEL_NAMES,
compare_num_quantized_nodes_per_model,
check_compression_state_per_model,
get_num_quantized_nodes,
)

@@ -192,27 +192,27 @@ class OVCLIExportTestCase(unittest.TestCase):
"image-text-to-text",
"llava_next",
"int4 --group-size 16 --ratio 0.8",
[{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}],
[{"int8": 14, "int4": 16}, {"int8": 1}, {"int8": 9}],
),
(
"image-text-to-text",
"llava_next",
'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
"--dataset contextual --num-samples 1",
[{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
[{"int8": 6, "int4": 24}, {"int8": 1}, {"int8": 9}],
),
(
"image-text-to-text",
"nanollava",
"int4 --group-size 8 --ratio 0.8 --trust-remote-code",
[{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
[{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
),
(
"image-text-to-text",
"nanollava",
'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" '
"--dataset contextual --num-samples 1 --trust-remote-code",
[{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
[{"int8": 16, "int4": 14}, {"int8": 1}, {"int8": 15}],
),
]
)
@@ -224,40 +224,40 @@ class OVCLIExportTestCase(unittest.TestCase):
"image-text-to-text",
"minicpmv",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
[{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
[{"int8": 10, "int4": 20}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
),
(
"image-text-to-text",
"minicpmv",
'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
"--dataset contextual --num-samples 1 --trust-remote-code",
[{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
[{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 26}, {"int8": 6}],
),
(
"image-text-to-text",
"internvl2",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
[{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
[{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
),
(
"image-text-to-text",
"internvl2",
'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
"--dataset contextual --num-samples 1 --trust-remote-code",
[{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
[{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
),
(
"image-text-to-text",
"phi3_v",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
[{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
[{"int8": 8, "int4": 10}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
),
(
"image-text-to-text",
"phi3_v",
'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
"--dataset contextual --num-samples 1 --trust-remote-code",
[{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
[{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
),
(
"image-text-to-text",
@@ -369,14 +369,15 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2
)
elif task.startswith("image-text-to-text"):
models = [model.language_model, model.vision_embeddings]
models = list(model.submodels.values())
else:
models = [model]

expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
for i, model in enumerate(models):
_, num_weight_nodes = get_num_quantized_nodes(model)
self.assertEqual(expected_int8[i], num_weight_nodes["int8"])
expected_int8 = [{"int8": it} for it in expected_int8]
if task.startswith("text2text-generation") and (not task.endswith("with-past") or model.decoder.stateful):
expected_int8 = expected_int8[:2]
check_compression_state_per_model(self, models, expected_int8)

@parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
def test_exporters_cli_hybrid_quantization(
@@ -389,11 +389,11 @@ def test_exporters_cli_hybrid_quantization(
check=True,
)
model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir)
num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(
model.unet if model.unet is not None else model.transformer
)
vision_model = model.unet.model if model.unet is not None else model.transformer.model
num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(vision_model)
self.assertEqual(expected_int8_nodes, num_weight_nodes["int8"])
self.assertEqual(expected_fake_nodes, num_fake_nodes)
self.assertFalse(vision_model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]))

@parameterized.expand(TEST_4BIT_CONFIGURATIONS)
def test_exporters_cli_4bit(
@@ -419,10 +419,9 @@ def test_exporters_cli_4bit(
if task == "text-generation-with-past":
submodels = [model]
elif task == "image-text-to-text":
submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
submodels += [getattr(model, part) for part in model.additional_parts]
submodels = list(model.submodels.values())

compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)
check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)

self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
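The tests now delegate the per-submodel assertions to check_compression_state_per_model, a helper from tests/openvino/utils_tests.py that this commit updates but whose diff is not rendered above. Judging only from the call sites visible here, a sketch of what such a helper might plausibly look like — an assumption, not the actual implementation:

    def check_compression_state_per_model(test_case, models, expected_num_weight_nodes_per_model):
        # Assumed shape: one expected dict per submodel, e.g. {"int8": 14, "int4": 16}.
        # get_num_quantized_nodes is the existing helper imported alongside it in the test above.
        test_case.assertEqual(len(expected_num_weight_nodes_per_model), len(models))
        for submodel, expected in zip(models, expected_num_weight_nodes_per_model):
            _, num_weight_nodes = get_num_quantized_nodes(submodel)
            # Compare only the precisions listed in the expectation.
            actual = {precision: num_weight_nodes.get(precision, 0) for precision in expected}
            test_case.assertEqual(expected, actual)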
(Diffs for the remaining two changed files are not rendered in this view.)

