[Core] Add Ascend Quantize #7
Changes from all commits:
tests/quantization/test_mindie_turbo.py (new file, +66 lines)
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Tests whether Ascend quantization based on MindIE-Turbo is enabled correctly.

Run `pytest tests/quantization/test_mindie_turbo.py`.
"""

import os

import pytest

import vllm  # noqa: F401
import vllm_ascend  # noqa: F401
from vllm_ascend.quantization.quant_config import AscendLinearMethod

from tests.conftest import VllmRunner
from tests.quantization.utils import is_mindie_turbo_supported, example_quantization

MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
]


@pytest.mark.skipif(not is_mindie_turbo_supported(),
                    reason="MindIE-Turbo is not installed.")
@pytest.mark.parametrize("model_name_or_path", MODELS)
@pytest.mark.parametrize("max_tokens", [5])
def test_mindie_turbo(
    model_name_or_path: str,
    max_tokens: int,
) -> None:
    # vLLM must load weights from disk, so we save the quantized weights
    # first and then let vLLM load them from there.
    temp_path = os.path.join(os.path.dirname(__file__), "temp_weight")
    if not os.path.exists(temp_path):
        os.makedirs(temp_path)
    example_quantization(model_name_or_path, temp_path)
[Inline review comment] Why are these packages imported inside the test?

[Reply] This is because mindie_turbo had to be imported before vllm in early versions of mindie_turbo. Perhaps this conflict has been resolved by now, so that these imports can be moved outside the test. (A sketch of the earlier workaround follows the test file below.)
    prompt = "What's deep learning?"
    example_prompts = [prompt]

    with VllmRunner(temp_path,
                    max_model_len=8192,
                    dtype="bfloat16",
                    enforce_eager=False,
                    gpu_memory_utilization=0.7) as vllm_model:
        output = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert output
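The inline exchange above refers to an earlier revision of this test, where the imports lived inside the test body so that mindie_turbo was loaded before vllm. A minimal sketch of that earlier workaround, assuming the ordering constraint the reply describes (the test name here is hypothetical and the body is abbreviated):

@pytest.mark.skipif(not is_mindie_turbo_supported(),
                    reason="MindIE-Turbo is not installed.")
def test_mindie_turbo_legacy_imports() -> None:
    # Early versions of MindIE-Turbo required importing mindie_turbo
    # before vllm, so the heavy imports were deferred into the test body
    # in that order instead of living at module level.
    import mindie_turbo  # noqa: F401
    import vllm  # noqa: F401
    import vllm_ascend  # noqa: F401
    ...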
tests/quantization/utils.py (new file, +115 lines)
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig, AntiOutlier
from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig


def is_mindie_turbo_supported() -> bool:
    try:
        import mindie_turbo  # noqa: F401
    except ImportError:
        return False

    return True


def example_quantization(model_name_or_path: str, tmp_path: str) -> None:
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path)

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        device_map="npu:0",
        torch_dtype="auto").eval()

    data_list = ["What's deep learning?"]
    dataset_calib = []
    for calib_data in data_list:
        inputs = tokenizer(calib_data, return_tensors='pt').to("npu:0")
        dataset_calib.append([inputs.data['input_ids']])

    anti_config = AntiOutlierConfig(anti_method="m2", dev_type="npu", dev_id=0)
    anti_outlier = AntiOutlier(model, calib_data=dataset_calib, cfg=anti_config)
    anti_outlier.process()

    disable_names = ['lm_head']
    for layer_index in range(24):
        disable_names.append(f'model.layers.{layer_index}.mlp.down_proj')

    quant_config = QuantConfig(
        a_bit=8,
        w_bit=8,
        disable_names=disable_names,
        dev_type='npu',
        dev_id=0,
        act_method=3,
        pr=1.0,
        w_sym=True,
        mm_tensor=False)

    calibrator = Calibrator(model,
                            quant_config,
                            calib_data=dataset_calib,
                            disable_level='L0')
    calibrator.run()

    # Currently, we need to add config.json manually for quantized weights
    # generated by msmodelslim. The following code will be removed once
    # msmodelslim can generate complete weights, leaving only
    # 'calibrator.save(tmp_path, save_type=["safe_tensor"])'.
    class EmptyModule(torch.nn.Module):
        def __init__(self) -> None:
            super().__init__()

        def forward(self, x):
            return x

    test_quant_config = {
        "group_size": 0,
        "kv_quant_type": None,
        "fa_quant_type": None,
        "w_bit": 8,
        "a_bit": 8,
        "dev_type": "npu",
        "fraction": 0.01,
        "act_method": 3,
        "co_sparse": False,
        "anti_method": "m2",
        "disable_level": "L0",
        "do_smooth": False,
        "use_sigma": False,
        "sigma_factor": 3.0,
        "is_lowbit": False,
        "mm_tensor": False,
        "w_sym": True,
        "open_outlier": True,
        "is_dynamic": False
    }

    calibrator.model.config.update({"quantization_config": test_quant_config})
    calibrator.model.config.quantization_config.update({
        "quant_description": calibrator.quant_model_json_description.quant_model_description
    })

    calibrator.save(tmp_path, save_type=["safe_tensor"])
    # Saving with an empty state_dict writes config.json and related files
    # without duplicating the quantized weights saved above.
    calibrator.model.save_pretrained(tmp_path, state_dict=EmptyModule().state_dict())
    tokenizer.save_pretrained(tmp_path)
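For reference, the helper can also be driven outside pytest; a minimal sketch using the names from the file above (the output directory is arbitrary, and an Ascend NPU plus msmodelslim are required):

from tests.quantization.utils import example_quantization

# Quantizes the model with msmodelslim and writes safetensors weights,
# config.json, and tokenizer files into ./temp_weight for vLLM to load.
example_quantization("Qwen/Qwen2.5-0.5B-Instruct", "./temp_weight")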
[Review comment] Please add a TODO here; once more quantization methods are available in vllm-ascend, the skip can be removed. (A sketch of the suggested annotation follows.)
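A minimal sketch of the requested annotation on the skip marker from the test file above (the TODO wording is an assumption):

# TODO: remove this skip once vllm-ascend supports quantization methods
# beyond MindIE-Turbo.
@pytest.mark.skipif(not is_mindie_turbo_supported(),
                    reason="MindIE-Turbo is not installed.")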
[Reply] This test case is designed for quantization methods based on MindIE-Turbo. For other quantization methods that may be added in the future, we can add new test cases, e.g. parametrized as sketched below.
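A hedged sketch of how such future cases could be parametrized over the available methods, reusing the helpers above (the QUANT_METHODS registry and any entries beyond MindIE-Turbo are hypothetical):

import pytest

from tests.conftest import VllmRunner
from tests.quantization.utils import example_quantization

# Hypothetical registry: quantization method name -> helper that writes
# quantized weights to a directory. Only the MindIE-Turbo entry exists
# in this PR; future methods would register their own helpers here.
QUANT_METHODS = {
    "mindie_turbo": example_quantization,
}


@pytest.mark.parametrize("quant_method", sorted(QUANT_METHODS))
def test_quantization_method(quant_method: str, tmp_path) -> None:
    # tmp_path is the built-in pytest fixture providing a fresh directory.
    QUANT_METHODS[quant_method]("Qwen/Qwen2.5-0.5B-Instruct", str(tmp_path))
    with VllmRunner(str(tmp_path), dtype="bfloat16") as vllm_model:
        assert vllm_model.generate_greedy(["What's deep learning?"], 5)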