From ad7e2da6470939c2469d849530a45ae7dd0f2524 Mon Sep 17 00:00:00 2001
From: angazenn
Date: Fri, 21 Feb 2025 10:05:25 +0800
Subject: [PATCH 1/5] add int8 cache dtype when using attention quantization

Signed-off-by: angazenn
---
 vllm_ascend/worker.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm_ascend/worker.py b/vllm_ascend/worker.py
index c5884e36..2323e439 100644
--- a/vllm_ascend/worker.py
+++ b/vllm_ascend/worker.py
@@ -101,6 +101,17 @@ def __init__(
             not in ["medusa", "mlp_speculator", "eagle"]) \
             else {"return_hidden_states": True}
 
+        if vllm_config.quant_config is not None and \
+            'fa_quant_type' in vllm_config.quant_config.quant_description.keys():
+            # Using Ascend attention quantization.
+            # TODO: Updates of cache_config should be added into
+            # NPUPlatform.check_and_update_config. However, this function fails to
+            # update STR_DTYPE_TO_TORCH_DTYPE which is used by vLLM 0.7.1 to convert
+            # dtype string to torch.dtype. Hence we have to move this code here.
+            from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+            cache_config.cache_dtype = 'int8'
+            STR_DTYPE_TO_TORCH_DTYPE['int8'] = torch.int8
+
         ModelRunnerClass: Type[ModelRunnerBase] = NPUModelRunner
         if model_config.runner_type == "pooling":
             ModelRunnerClass = PoolingModelRunner

From 09c7eaf0a867a19624080db90a1384f11a080ebd Mon Sep 17 00:00:00 2001
From: angazenn
Date: Fri, 21 Feb 2025 10:14:32 +0800
Subject: [PATCH 2/5] fix bugs

Signed-off-by: angazenn
---
 vllm_ascend/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/worker.py b/vllm_ascend/worker.py
index 2323e439..119ce747 100644
--- a/vllm_ascend/worker.py
+++ b/vllm_ascend/worker.py
@@ -109,7 +109,7 @@ def __init__(
             # update STR_DTYPE_TO_TORCH_DTYPE which is used by vLLM 0.7.1 to convert
             # dtype string to torch.dtype. Hence we have to move this code here.
             from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-            cache_config.cache_dtype = 'int8'
+            self.cache_config.cache_dtype = 'int8'
             STR_DTYPE_TO_TORCH_DTYPE['int8'] = torch.int8
 
         ModelRunnerClass: Type[ModelRunnerBase] = NPUModelRunner

From e41fc9b472a65a529ec5415274da84a98164d905 Mon Sep 17 00:00:00 2001
From: angazenn
Date: Fri, 21 Feb 2025 10:48:17 +0800
Subject: [PATCH 3/5] fix yapf check

Signed-off-by: angazenn
---
 vllm_ascend/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/worker.py b/vllm_ascend/worker.py
index 119ce747..e2424082 100644
--- a/vllm_ascend/worker.py
+++ b/vllm_ascend/worker.py
@@ -106,7 +106,7 @@ def __init__(
             # Using Ascend attention quantization.
             # TODO: Updates of cache_config should be added into
             # NPUPlatform.check_and_update_config. However, this function fails to
-            # update STR_DTYPE_TO_TORCH_DTYPE which is used by vLLM 0.7.1 to convert 
+            # update STR_DTYPE_TO_TORCH_DTYPE which is used by vLLM 0.7.1 to convert
             # dtype string to torch.dtype. Hence we have to move this code here.
             from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
             self.cache_config.cache_dtype = 'int8'

From 168154e61b8fbf031bf4226bd2cda31c4ae3ec4f Mon Sep 17 00:00:00 2001
From: angazenn
Date: Fri, 21 Feb 2025 17:45:17 +0800
Subject: [PATCH 4/5] add patches

Signed-off-by: angazenn
---
 vllm_ascend/patch/__init__.py          |  19 +++
 vllm_ascend/patch/patch_attention.py   | 161 +++++++++++++++++++++++++
 vllm_ascend/patch/patch_cache_dtype.py |  23 ++++
 3 files changed, 203 insertions(+)
 create mode 100644 vllm_ascend/patch/__init__.py
 create mode 100644 vllm_ascend/patch/patch_attention.py
 create mode 100644 vllm_ascend/patch/patch_cache_dtype.py

diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
new file mode 100644
index 00000000..5b72ffbf
--- /dev/null
+++ b/vllm_ascend/patch/__init__.py
@@ -0,0 +1,19 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from vllm_ascend.patch import patch_cache_dtype # noqa
+from vllm_ascend.patch import patch_attention # noqa
\ No newline at end of file
diff --git a/vllm_ascend/patch/patch_attention.py b/vllm_ascend/patch/patch_attention.py
new file mode 100644
index 00000000..8a0e8da7
--- /dev/null
+++ b/vllm_ascend/patch/patch_attention.py
@@ -0,0 +1,161 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/vllm/attention/layer.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file is used to monkey patch the vLLM Attention.__init__ function
+# and move the instantiation of num_heads, head_size, num_kv_heads
+# ahead of the initialization of attention quant methods, which is 
+# required by the Ascend attention quant method to initialize.
+# Remove this file when vllm supports it.
+
+from typing import Any, Dict, List, Optional
+
+import torch
+
+import vllm.envs as envs
+from vllm.attention import Attention, AttentionType
+from vllm.attention.selector import backend_name_to_enum, get_attn_backend
+from vllm.config import CacheConfig, get_current_vllm_config
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
+
+
+def attention_init(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+        alibi_slopes: Optional[List[float]] = None,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        per_layer_sliding_window: Optional[int] = None,
+        use_mla: bool = False,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+        **extra_impl_args,
+    ) -> None:
+        super(Attention, self).__init__()
+        if per_layer_sliding_window is not None:
+            # per-layer sliding window
+            sliding_window = per_layer_sliding_window
+        elif cache_config is not None:
+            # model-level sliding window
+            sliding_window = cache_config.sliding_window
+        else:
+            sliding_window = None
+
+        if cache_config is not None:
+            kv_cache_dtype = cache_config.cache_dtype
+            block_size = cache_config.block_size
+            is_attention_free = cache_config.is_attention_free
+            calculate_kv_scales = cache_config.calculate_kv_scales
+        else:
+            kv_cache_dtype = "auto"
+            block_size = 16
+            is_attention_free = False
+            calculate_kv_scales = False
+        if num_kv_heads is None:
+            num_kv_heads = num_heads
+
+        # The default k/v_scale is set to 1.0. This is ignored
+        # when kv-cache is not fp8, and should be used with
+        # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
+        # expect the pre-quantized k/v_scale to be loaded along
+        # with the model weights.
+        self.kv_cache_dtype = kv_cache_dtype
+        self.calculate_kv_scales = calculate_kv_scales
+        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
+        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
+
+        # We also keep the float32 versions of k/v_scale for attention
+        # backends that don't support tensors (Flashinfer)
+        self._k_scale_float = 1.0
+        self._v_scale_float = 1.0
+
+        # These three lines must be set before the quant method is instantiated.
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.num_kv_heads = num_kv_heads
+
+        quant_method = quant_config.get_quant_method(
+            self, prefix=prefix) if quant_config else None
+        if quant_method is not None:
+            assert isinstance(quant_method, BaseKVCacheMethod)
+            # TODO (mgoin): kv cache dtype should be specified in the FP8
+            # checkpoint config and become the "auto" behavior
+            if self.kv_cache_dtype == "fp8_e5m2":
+                raise ValueError("fp8_e5m2 kv-cache is not supported with "
+                                 "fp8 checkpoints.")
+            # If quantization is enabled, we make "k_scale" and "v_scale"
+            # parameters so that it can be loaded from the model checkpoint.
+            # The k/v_scale will then be converted back to native float32
+            # values after weight loading.
+            self.quant_method = quant_method
+            self.quant_method.create_weights(self)
+
+        # During model initialization, the default dtype is set as the model
+        # weight and activation dtype.
+        dtype = torch.get_default_dtype()
+        attn_backend = get_attn_backend(head_size,
+                                        dtype,
+                                        kv_cache_dtype,
+                                        block_size,
+                                        is_attention_free,
+                                        blocksparse_params is not None,
+                                        use_mla=use_mla)
+        impl_cls = attn_backend.get_impl_cls()
+        self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
+                             alibi_slopes, sliding_window, kv_cache_dtype,
+                             blocksparse_params, logits_soft_cap, attn_type,
+                             **extra_impl_args)
+        self.sliding_window = sliding_window
+        self.backend = backend_name_to_enum(attn_backend.get_name())
+        self.dtype = dtype
+
+        # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
+        # torch.compile works by registering the attention as one giant
+        # opaque custom op. For other platforms, we directly call them
+        # and let torch.compile handle them.
+        self.use_direct_call = not current_platform.is_cuda_alike(
+        ) and not current_platform.is_cpu()
+
+        self.use_output = attn_backend.accept_output_buffer
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
+        self.layer_name = prefix
+        self.attn_type = attn_type
+        # use a placeholder kv cache tensor during init, which will be replaced
+        # by bind_kv_cache
+        # this variable will not be accessed if use_direct_call is True
+        self.kv_cache = [
+            torch.tensor([]) for _ in range(get_current_vllm_config(
+            ).parallel_config.pipeline_parallel_size)
+        ]
+
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
+
+
+Attention.__init__ = attention_init
\ No newline at end of file
diff --git a/vllm_ascend/patch/patch_cache_dtype.py b/vllm_ascend/patch/patch_cache_dtype.py
new file mode 100644
index 00000000..a5714d1b
--- /dev/null
+++ b/vllm_ascend/patch/patch_cache_dtype.py
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file is used to monkey patch the int8 cache dtype into vLLM to support
+# Ascend. Remove this file when vllm supports the int8 cache dtype.
+
+import torch
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+
+STR_DTYPE_TO_TORCH_DTYPE['int8'] = torch.int8
\ No newline at end of file

From 6075a6e33727f1e7bfb2a14c223419c7bb2f4a61 Mon Sep 17 00:00:00 2001
From: angazenn
Date: Sat, 22 Feb 2025 09:40:30 +0800
Subject: [PATCH 5/5] clean code

Signed-off-by: angazenn
---
 vllm_ascend/patch/__init__.py        |   3 +-
 vllm_ascend/patch/patch_attention.py | 239 +++++++++++++--------------
 2 files changed, 120 insertions(+), 122 deletions(-)

diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 5b72ffbf..bb5cb27b 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -15,5 +15,4 @@
 # limitations under the License.
 #
 
-from vllm_ascend.patch import patch_cache_dtype # noqa
-from vllm_ascend.patch import patch_attention # noqa
\ No newline at end of file
+from vllm_ascend.patch import patch_attention, patch_cache_dtype # noqa
\ No newline at end of file
diff --git a/vllm_ascend/patch/patch_attention.py b/vllm_ascend/patch/patch_attention.py
index 8a0e8da7..c2a86cec 100644
--- a/vllm_ascend/patch/patch_attention.py
+++ b/vllm_ascend/patch/patch_attention.py
@@ -18,7 +18,7 @@
 # This file is used to monkey patch the vLLM Attention.__init__ function
 # and move the instantiation of num_heads, head_size, num_kv_heads
-# ahead of the initialization of attention quant methods, which is 
+# ahead of the initialization of attention quant methods, which is
 # required by the Ascend attention quant method to initialize.
 # Remove this file when vllm supports it.
 
@@ -34,128 +34,127 @@
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
 
 
 def attention_init(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[List[float]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        per_layer_sliding_window: Optional[int] = None,
-        use_mla: bool = False,
-        prefix: str = "",
-        attn_type: str = AttentionType.DECODER,
-        **extra_impl_args,
-    ) -> None:
-        super(Attention, self).__init__()
-        if per_layer_sliding_window is not None:
-            # per-layer sliding window
-            sliding_window = per_layer_sliding_window
-        elif cache_config is not None:
-            # model-level sliding window
-            sliding_window = cache_config.sliding_window
-        else:
-            sliding_window = None
-
-        if cache_config is not None:
-            kv_cache_dtype = cache_config.cache_dtype
-            block_size = cache_config.block_size
-            is_attention_free = cache_config.is_attention_free
-            calculate_kv_scales = cache_config.calculate_kv_scales
-        else:
-            kv_cache_dtype = "auto"
-            block_size = 16
-            is_attention_free = False
-            calculate_kv_scales = False
-        if num_kv_heads is None:
-            num_kv_heads = num_heads
-
-        # The default k/v_scale is set to 1.0. This is ignored
-        # when kv-cache is not fp8, and should be used with
-        # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
-        # expect the pre-quantized k/v_scale to be loaded along
-        # with the model weights.
-        self.kv_cache_dtype = kv_cache_dtype
-        self.calculate_kv_scales = calculate_kv_scales
-        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
-        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
-
-        # We also keep the float32 versions of k/v_scale for attention
-        # backends that don't support tensors (Flashinfer)
-        self._k_scale_float = 1.0
-        self._v_scale_float = 1.0
-
-        # These three lines must be set before the quant method is instantiated.
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.num_kv_heads = num_kv_heads
-
-        quant_method = quant_config.get_quant_method(
-            self, prefix=prefix) if quant_config else None
-        if quant_method is not None:
-            assert isinstance(quant_method, BaseKVCacheMethod)
-            # TODO (mgoin): kv cache dtype should be specified in the FP8
-            # checkpoint config and become the "auto" behavior
-            if self.kv_cache_dtype == "fp8_e5m2":
-                raise ValueError("fp8_e5m2 kv-cache is not supported with "
-                                 "fp8 checkpoints.")
-            # If quantization is enabled, we make "k_scale" and "v_scale"
-            # parameters so that it can be loaded from the model checkpoint.
-            # The k/v_scale will then be converted back to native float32
-            # values after weight loading.
-            self.quant_method = quant_method
-            self.quant_method.create_weights(self)
-
-        # During model initialization, the default dtype is set as the model
-        # weight and activation dtype.
-        dtype = torch.get_default_dtype()
-        attn_backend = get_attn_backend(head_size,
-                                        dtype,
-                                        kv_cache_dtype,
-                                        block_size,
-                                        is_attention_free,
-                                        blocksparse_params is not None,
-                                        use_mla=use_mla)
-        impl_cls = attn_backend.get_impl_cls()
-        self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
-                             alibi_slopes, sliding_window, kv_cache_dtype,
-                             blocksparse_params, logits_soft_cap, attn_type,
-                             **extra_impl_args)
-        self.sliding_window = sliding_window
-        self.backend = backend_name_to_enum(attn_backend.get_name())
-        self.dtype = dtype
-
-        # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
-        # torch.compile works by registering the attention as one giant
-        # opaque custom op. For other platforms, we directly call them
-        # and let torch.compile handle them.
-        self.use_direct_call = not current_platform.is_cuda_alike(
-        ) and not current_platform.is_cpu()
-
-        self.use_output = attn_backend.accept_output_buffer
-        compilation_config = get_current_vllm_config().compilation_config
-        if prefix in compilation_config.static_forward_context:
-            raise ValueError(f"Duplicate layer name: {prefix}")
-        compilation_config.static_forward_context[prefix] = self
-        self.layer_name = prefix
-        self.attn_type = attn_type
-        # use a placeholder kv cache tensor during init, which will be replaced
-        # by bind_kv_cache
-        # this variable will not be accessed if use_direct_call is True
-        self.kv_cache = [
-            torch.tensor([]) for _ in range(get_current_vllm_config(
-            ).parallel_config.pipeline_parallel_size)
-        ]
-
-        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
-        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
+    self,
+    num_heads: int,
+    head_size: int,
+    scale: float,
+    num_kv_heads: Optional[int] = None,
+    alibi_slopes: Optional[List[float]] = None,
+    cache_config: Optional[CacheConfig] = None,
+    quant_config: Optional[QuantizationConfig] = None,
+    blocksparse_params: Optional[Dict[str, Any]] = None,
+    logits_soft_cap: Optional[float] = None,
+    per_layer_sliding_window: Optional[int] = None,
+    use_mla: bool = False,
+    prefix: str = "",
+    attn_type: str = AttentionType.DECODER,
+    **extra_impl_args,
+) -> None:
+    super(Attention, self).__init__()
+    if per_layer_sliding_window is not None:
+        # per-layer sliding window
+        sliding_window = per_layer_sliding_window
+    elif cache_config is not None:
+        # model-level sliding window
+        sliding_window = cache_config.sliding_window
+    else:
+        sliding_window = None
+
+    if cache_config is not None:
+        kv_cache_dtype = cache_config.cache_dtype
+        block_size = cache_config.block_size
+        is_attention_free = cache_config.is_attention_free
+        calculate_kv_scales = cache_config.calculate_kv_scales
+    else:
+        kv_cache_dtype = "auto"
+        block_size = 16
+        is_attention_free = False
+        calculate_kv_scales = False
+    if num_kv_heads is None:
+        num_kv_heads = num_heads
+
+    # The default k/v_scale is set to 1.0. This is ignored
+    # when kv-cache is not fp8, and should be used with
+    # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
+    # expect the pre-quantized k/v_scale to be loaded along
+    # with the model weights.
+    self.kv_cache_dtype = kv_cache_dtype
+    self.calculate_kv_scales = calculate_kv_scales
+    self._k_scale = torch.tensor(1.0, dtype=torch.float32)
+    self._v_scale = torch.tensor(1.0, dtype=torch.float32)
+
+    # We also keep the float32 versions of k/v_scale for attention
+    # backends that don't support tensors (Flashinfer)
+    self._k_scale_float = 1.0
+    self._v_scale_float = 1.0
+
+    # These three lines must be set before the quant method is instantiated.
+    self.num_heads = num_heads
+    self.head_size = head_size
+    self.num_kv_heads = num_kv_heads
+
+    quant_method = quant_config.get_quant_method(
+        self, prefix=prefix) if quant_config else None
+    if quant_method is not None:
+        assert isinstance(quant_method, BaseKVCacheMethod)
+        # TODO (mgoin): kv cache dtype should be specified in the FP8
+        # checkpoint config and become the "auto" behavior
+        if self.kv_cache_dtype == "fp8_e5m2":
+            raise ValueError("fp8_e5m2 kv-cache is not supported with "
+                             "fp8 checkpoints.")
+        # If quantization is enabled, we make "k_scale" and "v_scale"
+        # parameters so that it can be loaded from the model checkpoint.
+        # The k/v_scale will then be converted back to native float32
+        # values after weight loading.
+        self.quant_method = quant_method
+        self.quant_method.create_weights(self)
+
+    # During model initialization, the default dtype is set as the model
+    # weight and activation dtype.
+    dtype = torch.get_default_dtype()
+    attn_backend = get_attn_backend(head_size,
+                                    dtype,
+                                    kv_cache_dtype,
+                                    block_size,
+                                    is_attention_free,
+                                    blocksparse_params is not None,
+                                    use_mla=use_mla)
+    impl_cls = attn_backend.get_impl_cls()
+    self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         blocksparse_params, logits_soft_cap, attn_type,
+                         **extra_impl_args)
+    self.sliding_window = sliding_window
+    self.backend = backend_name_to_enum(attn_backend.get_name())
+    self.dtype = dtype
+
+    # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
+    # torch.compile works by registering the attention as one giant
+    # opaque custom op. For other platforms, we directly call them
+    # and let torch.compile handle them.
+    self.use_direct_call = not current_platform.is_cuda_alike(
+    ) and not current_platform.is_cpu()
+
+    self.use_output = attn_backend.accept_output_buffer
+    compilation_config = get_current_vllm_config().compilation_config
+    if prefix in compilation_config.static_forward_context:
+        raise ValueError(f"Duplicate layer name: {prefix}")
+    compilation_config.static_forward_context[prefix] = self
+    self.layer_name = prefix
+    self.attn_type = attn_type
+    # use a placeholder kv cache tensor during init, which will be replaced
+    # by bind_kv_cache
+    # this variable will not be accessed if use_direct_call is True
+    self.kv_cache = [
+        torch.tensor([]) for _ in range(
+            get_current_vllm_config().parallel_config.pipeline_parallel_size)
+    ]
+
+    self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+    self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
 
 
 Attention.__init__ = attention_init
\ No newline at end of file
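
Note for reviewers: below is a minimal sketch of how this series is expected to
take effect, assuming vllm_ascend.patch is imported once during worker start-up.
The series itself does not show that import site, so the importing line and the
two assertions are illustrative, not part of the patches:

    # Hypothetical smoke test; assumes that importing vllm_ascend.patch is
    # enough to apply both monkey patches from PATCH 4/5 and PATCH 5/5.
    import torch

    import vllm_ascend.patch  # noqa: F401  - the import alone applies the patches
    from vllm.attention import Attention
    from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

    # patch_cache_dtype: the 'int8' string now resolves to a torch dtype, so
    # cache_config.cache_dtype = 'int8' can be converted on Ascend.
    assert STR_DTYPE_TO_TORCH_DTYPE['int8'] == torch.int8

    # patch_attention: Attention.__init__ is replaced so that num_heads,
    # head_size and num_kv_heads exist before quant_method.create_weights(self)
    # runs, which the Ascend attention quant method requires.
    assert Attention.__init__.__name__ == 'attention_init'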