From 5d5acd93bb80ab8b84096e637da6896548155abb Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Thu, 27 Feb 2025 15:23:23 +0800
Subject: [PATCH] [CI] upgrade to newest pta

Co-authored-by: angazenn
Signed-off-by: wangxiyuan
---
 .github/workflows/vllm_ascend_test.yaml |  4 +-
 docs/source/installation.md             | 10 ++--
 vllm_ascend/attention.py                | 67 +++++++------------------
 vllm_ascend/ops/rotary_embedding.py     |  2 +-
 4 files changed, 27 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index c041f91b..5991d879 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -108,9 +108,9 @@ jobs:
       run: |
         mkdir pta
         cd pta
-        wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz
+        wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
         tar -xvf pytorch_v2.5.1_py310.tar.gz
-        pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+        pip install ./torch_npu-2.5.1.dev20250226-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
         cd ..
         rm -rf pta
diff --git a/docs/source/installation.md b/docs/source/installation.md
index ffd79a1c..3501b0c8 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -12,7 +12,7 @@ This document describes how to install vllm-ascend manually.
   | Software | Supported version | Note |
   | ------------ | ----------------- | ---- |
   | CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu |
-  | torch-npu | >= 2.5.1.dev20250218 | Required for vllm-ascend |
+  | torch-npu | >= 2.5.1.dev20250226 | Required for vllm-ascend |
   | torch | >= 2.5.1 | Required for torch-npu and vllm |
 
-You have 2 way to install:
+You have two ways to install:
@@ -134,15 +134,15 @@ pip install vllm-ascend==|pip_vllm_ascend_version| --extra-index https://downloa
 #
 # Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See:
 #
-# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py39.tar.gz
-# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz
-# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py311.tar.gz
+# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py39.tar.gz
+# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
+# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py311.tar.gz
 #
 mkdir pta
 cd pta
-wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz
+wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250226.4/pytorch_v2.5.1_py310.tar.gz
 tar -xvf pytorch_v2.5.1_py310.tar.gz
-pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+pip install ./torch_npu-2.5.1.dev20250226-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
 ```
 
 or build from **source code**:
diff --git a/vllm_ascend/attention.py b/vllm_ascend/attention.py
index 66bc45e1..433e83fe 100644
--- a/vllm_ascend/attention.py
+++ b/vllm_ascend/attention.py
@@ -577,13 +577,11 @@ def forward(
                                                self.num_kv_heads, self.head_size)
             slots = attn_metadata.slot_mapping
-            torch_npu.npu_reshapecache(key=key,
-                                       value=value,
-                                       keyCache=key_cache,
-                                       valueCache=value_cache,
-                                       slotMapping=slots,
-                                       compressType=0,
-                                       kvCacheCfg=0)
+            torch_npu._npu_reshape_and_cache(key=key,
+                                             value=value,
+                                             key_cache=key_cache,
+                                             value_cache=value_cache,
+                                             slot_indices=slots)
 
         if attn_metadata.num_prefills > 0:
@@ -596,32 +594,15 @@ def forward(
                     np.array(
                         attn_metadata.prefill_metadata.seq_lens).astype(
                             np.int32))
-                torch_npu.npu_selfattention(
+                torch_npu._npu_flash_attention(
                     query=query,
                     key=key,
                     value=value,
                     mask=mask,
-                    maskType=1,
-                    isTriuMask=0,
-                    seqLen=self.seq_lens_tensor_cpu,
-                    scale=self.scale,
-                    qScale=1,
-                    headNum=self.num_heads,
-                    kvHeadNum=self.num_kv_heads,
-                    mlaVHeadSize=0,
-                    calcType=3,
-                    kernelType=0,
-                    clampType=0,
-                    scaleType=0,
-                    quantType=0,
-                    cacheType=0,
-                    batchRunStatusEnable=False,
-                    kvcacheCfg=0,
-                    clampMin=0,
-                    clampMax=0,
-                    inputLayout=0,
-                    windowSize=0,
-                    outDataType=0,
+                    seq_len=self.seq_lens_tensor_cpu,
+                    scale_value=self.scale,
+                    num_heads=self.num_heads,
+                    num_kv_heads=self.num_kv_heads,
                     out=output)
             else:
                 # TODO: Will support prefix cache and chunked prefill soon.
@@ -634,26 +615,16 @@ def forward(
                 np.array(attn_metadata.decode_metadata.seq_lens).astype(
                     np.int32))
             block_tables = attn_metadata.decode_metadata.block_tables
-            torch_npu.npu_pagedattention(
+            torch_npu._npu_paged_attention(
                 query=query,
-                keyCache=key_cache,
-                valueCache=value_cache,
-                contextLens=self.seq_lens_tensor_cpu,
-                maskType=0,
-                kvHeadNum=self.num_kv_heads,
-                headNum=self.num_heads,
-                mlaVHeadSize=0,
-                qkScale=self.scale,
-                scaleType=0,
-                blockTables=block_tables,
-                batchRunStatusEnable=False,
-                hasQuantOffset=False,
-                calcType=3,
-                quantType=0,
-                compressType=0,
-                inputLayout=0,
-                outDataType=0,
-                attnOut=output)
+                key_cache=key_cache,
+                value_cache=value_cache,
+                num_kv_heads=self.num_kv_heads,
+                num_heads=self.num_heads,
+                scale_value=self.scale,
+                block_table=block_tables,
+                context_lens=self.seq_lens_tensor_cpu,
+                out=output)
 
         return output.view(num_tokens, self.hidden_size)
diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index 1999386b..552ef9f6 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -42,7 +42,7 @@ def rope_forward_oot(
     # TODO: Remove the contiguous in the future.
     query = query.contiguous()
     key = key.contiguous()
-    torch_npu.npu_rope(
+    torch_npu._npu_rotary_embedding(
         positions,
         query,
         key,
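For reference, the kernel renames made by this patch are summarized below. The dict is informational only, derived from the hunks above; it is not an API shipped by torch-npu or vllm-ascend, and the new names are private (underscore-prefixed) bindings.

```python
# Old torch_npu wrapper name -> new torch_npu binding used by vllm-ascend
# after this patch. Informational summary only, taken from the diff above.
KERNEL_RENAMES = {
    "npu_reshapecache": "_npu_reshape_and_cache",   # scatter key/value into the paged KV cache
    "npu_selfattention": "_npu_flash_attention",    # prefill attention
    "npu_pagedattention": "_npu_paged_attention",   # decode (paged) attention
    "npu_rope": "_npu_rotary_embedding",            # rotary position embedding
}
```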
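Because the renamed bindings only exist in sufficiently new torch-npu builds, a quick post-install check can catch a stale wheel before vllm-ascend fails with an AttributeError at runtime. This is a minimal sketch, not part of the patch: it assumes the wheel is installed under the distribution name `torch_npu` (as in the wheel filename above) and that `import torch_npu` succeeds in the target CANN/Ascend environment.

```python
# Minimal post-install sanity check (illustrative, not part of this patch).
from importlib.metadata import version

import torch_npu  # requires a working CANN / Ascend environment

REQUIRED_DEV = "2.5.1.dev20250226"
RENAMED_OPS = (
    "_npu_reshape_and_cache",
    "_npu_flash_attention",
    "_npu_paged_attention",
    "_npu_rotary_embedding",
)

# Report the installed torch-npu build against the version this patch expects.
installed = version("torch_npu")
print(f"torch_npu {installed} installed (this patch expects >= {REQUIRED_DEV})")

# Verify that the renamed private bindings used by vllm-ascend are present.
missing = [name for name in RENAMED_OPS if not hasattr(torch_npu, name)]
if missing:
    raise RuntimeError(
        f"torch_npu build is too old for this patch; missing bindings: {missing}")
print("All renamed bindings are available.")
```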