From 5c392ceb27c1b51ab734a82fdc99842dc59f1c70 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 18 Feb 2025 15:18:25 +0800 Subject: [PATCH] [Doc] Update doc to work with release Signed-off-by: wangxiyuan --- .github/workflows/vllm_ascend_test.yaml | 2 +- Dockerfile | 2 +- docs/source/conf.py | 5 +- docs/source/developer_guide/contributing.md | 5 +- .../developer_guide/versioning_policy.md | 4 +- .../developer_guide/versioning_policy.zh.md | 4 +- docs/source/installation.md | 86 +++++++++------ docs/source/quick_start.md | 101 ++++-------------- docs/source/tutorials.md | 16 +-- docs/source/user_guide/release_notes.md | 26 +++-- docs/source/user_guide/supported_models.md | 2 + 11 files changed, 120 insertions(+), 133 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 4df43365..5d882a5f 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -48,7 +48,7 @@ jobs: runs-on: ascend-arm64 # actionlint-ignore: runner-label container: - image: quay.io/ascend/cann:8.0.0.beta1-910b-ubuntu22.04-py3.10 + image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10 volumes: - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi diff --git a/Dockerfile b/Dockerfile index e806c324..5a77b710 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ # limitations under the License. # -FROM quay.io/ascend/cann:8.0.0.beta1-910b-ubuntu22.04-py3.10 +FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10 # Define environments ENV DEBIAN_FRONTEND=noninteractive diff --git a/docs/source/conf.py b/docs/source/conf.py index f51af25f..4fc3809c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -65,7 +65,10 @@ 'vllm_version': 'main', # the branch of vllm-ascend, used in vllm-ascend clone and image tag # such as 'main', 'v0.7.1-dev', 'v0.7.1rc1' - 'vllm_ascend_version': 'main' + 'vllm_ascend_version': 'main', + # the newest release version of vllm, used in quick start or container image tag. + # This value should be updated when cut down release. + 'vllm_newest_release_version': "v0.7.1.rc1", } # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/developer_guide/contributing.md b/docs/source/developer_guide/contributing.md index fda7dabb..953550b5 100644 --- a/docs/source/developer_guide/contributing.md +++ b/docs/source/developer_guide/contributing.md @@ -98,8 +98,9 @@ Only specific types of PRs will be reviewed. The PR title is prefixed appropriat - `[CI]` for build or continuous integration improvements. - `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -> [!NOTE] -> If the PR spans more than one category, please include all relevant prefixes. +:::{note} +If the PR spans more than one category, please include all relevant prefixes. 
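+For example, a PR that updates both the CI workflow and the related documentation could combine prefixes, such as `[CI][Doc]`.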
+::: ## Others diff --git a/docs/source/developer_guide/versioning_policy.md b/docs/source/developer_guide/versioning_policy.md index b571a93a..7bde2258 100644 --- a/docs/source/developer_guide/versioning_policy.md +++ b/docs/source/developer_guide/versioning_policy.md @@ -43,7 +43,7 @@ Usually, each minor version of vLLM (such as 0.7) will correspond to a vllm-asce | Branch | Status | Note | |-----------|------------|--------------------------------------| | main | Maintained | CI commitment for vLLM main branch | -| 0.7.1-dev | Maintained | CI commitment for vLLM 0.7.1 version | +| v0.7.1-dev | Maintained | CI commitment for vLLM 0.7.1 version | ## Release Compatibility Matrix @@ -51,7 +51,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | vllm-ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |--------------|--------------| --- | --- | --- | -| v0.7.x (TBD) | v0.7.x (TBD) | 3.9 - 3.12 | 8.0.0.beta1 | 2.5.1 / 2.5.1rc1 | +| v0.7.1.rc1 | v0.7.1 | 3.9 - 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250218 | ## Release cadence diff --git a/docs/source/developer_guide/versioning_policy.zh.md b/docs/source/developer_guide/versioning_policy.zh.md index 7ce66168..eeb9f7f2 100644 --- a/docs/source/developer_guide/versioning_policy.zh.md +++ b/docs/source/developer_guide/versioning_policy.zh.md @@ -43,7 +43,7 @@ vllm-ascend有主干和开发两种分支。 | 分支 | 状态 | 备注 | |-----------|------------|--------------------------------------| | main | Maintained | 基于vLLM main分支CI看护 | -| 0.7.1-dev | Maintained | 基于vLLM 0.7.1版本CI看护 | +| v0.7.1-dev | Maintained | 基于vLLM 0.7.1版本CI看护 | ## 版本配套 @@ -51,7 +51,7 @@ vLLM Ascend Plugin (`vllm-ascend`) 的关键配套关系如下: | vllm-ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |--------------|---------| --- | --- | --- | -| v0.7.x (TBD) | v0.7.x (TBD) | 3.9 - 3.12 | 8.0.0.beta1 | 2.5.1 / 2.5.1rc1 | +| v0.7.1rc1 | v0.7.1 | 3.9 - 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250218 | ## 发布节奏 diff --git a/docs/source/installation.md b/docs/source/installation.md index bc047875..937ae68f 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -11,7 +11,7 @@ This document describes how to install vllm-ascend manually. | Software | Supported version | Note | | ------------ | ----------------- | ---- | - | CANN | >= 8.0.0.beta1 | Required for vllm-ascend and torch-npu | + | CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu | | torch-npu | >= 2.5.1rc1 | Required for vllm-ascend | | torch | >= 2.5.1 | Required for torch-npu and vllm | @@ -46,7 +46,7 @@ The easiest way to prepare your software environment is using CANN image directl ```bash # Update DEVICE according to your device (/dev/davinci[0-7]) -DEVICE=/dev/davinci7 +export DEVICE=/dev/davinci7 docker run --rm \ --name vllm-ascend-env \ @@ -59,11 +59,14 @@ docker run --rm \ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ -v /etc/ascend_install.info:/etc/ascend_install.info \ - -it quay.io/ascend/cann:8.0.0.beta1-910b-ubuntu22.04-py3.10 bash + -it quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10 bash ``` You can also install CANN manually: -> NOTE: This guide takes aarc64 as an example. If you run on x86, you need to replace `aarch64` with `x86_64` for the package name shown below. + +:::{note} +This guide takes aarch64 as an example. If you run on x86, you need to replace `aarch64` with `x86_64` for the package name shown below. 
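+For example, on x86 you would download `Ascend-cann-toolkit_8.0.0_linux-x86_64.run` instead of `Ascend-cann-toolkit_8.0.0_linux-aarch64.run`.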
+::: ```bash # Create a virtual environment @@ -83,11 +86,11 @@ chmod +x ./Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run ./Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run --install wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-nnal_8.0.0_linux-aarch64.run -chmod +x./Ascend-cann-nnal_8.0.0_linux-aarch64.run +chmod +x. /Ascend-cann-nnal_8.0.0_linux-aarch64.run ./Ascend-cann-nnal_8.0.0_linux-aarch64.run --install source /usr/local/Ascend/ascend-toolkit/set_env.sh -source /usr/local/Ascend/nnal/set_env.sh +source /usr/local/Ascend/nnal/atb/set_env.sh ``` :::: @@ -112,7 +115,30 @@ Once it's done, you can start to set up `vllm` and `vllm-ascend`. You can install `vllm` and `vllm-ascend` from **pre-built wheel**: ```bash -pip install vllm vllm-ascend -f https://download.pytorch.org/whl/torch/ +# Install vllm from source, since `pip install vllm` doesn't work on CPU currently. +# It'll be fixed in the next vllm release, e.g. v0.7.3. +git clone --branch v0.7.1 https://github.com/vllm-project/vllm +cd vllm +VLLM_TARGET_DEVICE=empty pip install . -f https://download.pytorch.org/whl/torch/ + +# Install vllm-ascend from pypi. +pip install vllm-ascend -f https://download.pytorch.org/whl/torch/ + +# Once the packages are installed, you need to install `torch-npu` manually, +# because that vllm-ascend relies on an unreleased version of torch-npu. +# This step will be removed in the next vllm-ascend release. +# +# Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See: +# +# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py39.tar.gz +# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz +# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py311.tar.gz +# +mkdir pta +cd pta +wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz +tar -xvf pytorch_v2.5.1_py310.tar.gz +pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ``` or build from **source code**: @@ -136,7 +162,9 @@ pip install -e . -f https://download.pytorch.org/whl/torch/ You can just pull the **prebuilt image** and run it with bash. -```bash +```{code-block} bash + :substitutions: + # Update DEVICE according to your device (/dev/davinci[0-7]) DEVICE=/dev/davinci7 # Update the vllm-ascend image @@ -185,7 +213,7 @@ prompts = [ ] # Create a sampling params object. -sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") @@ -207,25 +235,23 @@ python example.py The output will be like: ```bash -INFO 02-18 02:33:37 __init__.py:28] Available plugins for group vllm.platform_plugins: -INFO 02-18 02:33:37 __init__.py:30] name=ascend, value=vllm_ascend:register -INFO 02-18 02:33:37 __init__.py:32] all available plugins for group vllm.platform_plugins will be loaded. -INFO 02-18 02:33:37 __init__.py:34] set environment variable VLLM_PLUGINS to control which plugins to load. -INFO 02-18 02:33:37 __init__.py:42] plugin ascend loaded. -INFO 02-18 02:33:37 __init__.py:174] Platform plugin ascend is activated -INFO 02-18 02:33:50 config.py:526] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. 
Defaulting to 'generate'. -INFO 02-18 02:33:50 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='./opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./opt-125m, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, -INFO 02-18 02:33:52 importing.py:14] Triton not installed or not compatible; certain GPU-related functions will not be available. -Loading pt checkpoint shards: 0% Completed | 0/1 [00:00 - -### Prepare Environment - -You can use the container image directly with one line command: - -```bash -# Update DEVICE according to your device (/dev/davinci[0-7]) -DEVICE=/dev/davinci7 -IMAGE=quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 -docker run \ - --name vllm-ascend-env --device $DEVICE \ - --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc \ - -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ - -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ - -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ - -v /etc/ascend_install.info:/etc/ascend_install.info \ - -v /root/.cache:/root/.cache \ - -it --rm $IMAGE bash -``` - -You can verify by running below commands in above container shell: - -```bash -npu-smi info -``` - -You will see following message: - -``` -+-------------------------------------------------------------------------------------------+ -| npu-smi 23.0.2 Version: 23.0.2 | -+----------------------+---------------+----------------------------------------------------+ -| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)| -| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) | -+======================+===============+====================================================+ -| 0 xxx | OK | 0.0 40 0 / 0 | -| 0 | 0000:C1:00.0 | 0 882 / 15169 0 / 32768 | -+======================+===============+====================================================+ -``` - - -## Installation - -Prepare: - -```bash -apt update -apt install git curl vim -y -# Config pypi mirror to speedup -pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -``` - -Create your venv - -```bash -python3 -m venv .venv -source .venv/bin/activate -pip install --upgrade pip -``` - -You can install vLLM and vllm-ascend plugin by using: +## Setup environment using container ```{code-block} bash :substitutions: -# Install vLLM (About 5 mins) -git clone --depth 1 --branch |vllm_version| 
https://github.com/vllm-project/vllm.git -cd vllm -VLLM_TARGET_DEVICE=empty pip install . -cd .. - -# Install vLLM Ascend Plugin: -git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git -cd vllm-ascend -pip install -e . -cd .. -``` +# You can change version a suitable one base on your requirement, e.g. main +export IMAGE=ghcr.io/vllm-project/vllm-ascend:|vllm_newest_release_version| +docker run \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` ## Usage -After vLLM and vLLM Ascend plugin installation, you can start to -try [vLLM QuickStart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html). - -You have two ways to start vLLM on Ascend NPU: +There are two ways to start vLLM on Ascend NPU: ### Offline Batched Inference with vLLM @@ -99,7 +40,6 @@ With vLLM installed, you can start generating texts for list of input prompts (i ```bash # Use Modelscope mirror to speed up download -pip install modelscope export VLLM_USE_MODELSCOPE=true ``` @@ -132,7 +72,6 @@ the following command to start the vLLM server with the ```bash # Use Modelscope mirror to speed up download -pip install modelscope export VLLM_USE_MODELSCOPE=true # Deploy vLLM server (The first run will take about 3-5 mins (10 MB/s) to download models) vllm serve Qwen/Qwen2.5-0.5B-Instruct & @@ -178,7 +117,7 @@ kill -2 $VLLM_PID You will see output as below: ``` -INFO 02-12 03:34:10 launcher.py:59] Shutting down FastAPI HTTP server. +INFO: Shutting down FastAPI HTTP server. INFO: Shutting down INFO: Waiting for application shutdown. INFO: Application shutdown complete. diff --git a/docs/source/tutorials.md b/docs/source/tutorials.md index 01dedc81..9bd9cb62 100644 --- a/docs/source/tutorials.md +++ b/docs/source/tutorials.md @@ -20,7 +20,7 @@ docker run \ -v /etc/ascend_install.info:/etc/ascend_install.info \ -v /root/.cache:/root/.cache \ -p 8000:8000 \ --it quay.io/ascend/vllm-ascend:latest bash +-it ghcr.io/vllm-project/vllm-ascend:v0.7.1.rc1 bash ``` Setup environment variables: @@ -33,8 +33,9 @@ export VLLM_USE_MODELSCOPE=True export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 ``` -> [!NOTE] -> `max_split_size_mb` prevents the native allocator from splitting blocks larger than this size (in MB). This can reduce fragmentation and may allow some borderline workloads to complete without running out of memory. You can find more details [here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html). +:::{note} +`max_split_size_mb` prevents the native allocator from splitting blocks larger than this size (in MB). This can reduce fragmentation and may allow some borderline workloads to complete without running out of memory. You can find more details [here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html). 
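+For example, with the `max_split_size_mb:256` setting above, cached memory blocks larger than 256 MB will not be split to satisfy smaller allocations.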
+:::

Run the following script to execute offline inference on a single NPU:

@@ -82,12 +83,13 @@ docker run \
-p 8000:8000 \
-e VLLM_USE_MODELSCOPE=True \
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
--it quay.io/ascend/vllm-ascend:latest \
+-it ghcr.io/vllm-project/vllm-ascend:v0.7.1.rc1 \
vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
```

-> [!NOTE]
-> Add `--max_model_len` option to avoid ValueError that the Qwen2.5-7B model's max seq len (32768) is larger than the maximum number of tokens that can be stored in KV cache (26240).
+:::{note}
+Add the `--max_model_len` option to avoid a ValueError caused by the Qwen2.5-7B model's max seq len (32768) being larger than the maximum number of tokens that can be stored in the KV cache (26240). This limit differs across NPU series depending on the HBM size, so please adjust the value to one suitable for your NPU series.
+:::

If your service start successfully, you can see the info shown below:

@@ -144,7 +146,7 @@ docker run \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-p 8000:8000 \
--it quay.io/ascend/vllm-ascend:latest bash
+-it ghcr.io/vllm-project/vllm-ascend:v0.7.1.rc1 bash
```

Setup environment variables:
diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md
index 0af4aeaf..4e7d0fa0 100644
--- a/docs/source/user_guide/release_notes.md
+++ b/docs/source/user_guide/release_notes.md
@@ -2,19 +2,33 @@

## v0.7.1.rc1

-We are excited to announce the release candidate of v0.7.1 for vllm-ascend. vllm-ascend is a community maintained hardware plugin for running vLLM on the Ascend NPU. With this release, users can now enjoy the latest features and improvements of vLLM on the Ascend NPU.
+🎉 Hello, World!

-Note that this is a release candidate, and there may be some bugs or issues. We appreciate your feedback and suggestions [here](https://github.com/vllm-project/vllm-ascend/issues/19)
+We are excited to announce the first release candidate of v0.7.1 for vllm-ascend.
+
+vLLM Ascend Plugin (vllm-ascend) is a community maintained hardware plugin for running vLLM on the Ascend NPU. With this release, users can now enjoy the latest features and improvements of vLLM on the Ascend NPU.
+
+Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.1rc1) to start the journey. Note that this is a release candidate, and there may be some bugs or issues. We appreciate your feedback and suggestions [here](https://github.com/vllm-project/vllm-ascend/issues/19).

### Highlights

-- The first release which official supports the Ascend NPU on vLLM originally. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/) to start the journey.
+- Initial support for the Ascend NPU on vLLM. [#3](https://github.com/vllm-project/vllm-ascend/pull/3)
+- DeepSeek is now supported. [#88](https://github.com/vllm-project/vllm-ascend/pull/88) [#68](https://github.com/vllm-project/vllm-ascend/pull/68)
+- Qwen, Llama series and other popular models are also supported; you can find more details [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html).
+
+### Core
+
+- Added the Ascend quantization config option; the implementation is coming soon. [#7](https://github.com/vllm-project/vllm-ascend/pull/7) [#73](https://github.com/vllm-project/vllm-ascend/pull/73)
+- Added silu_and_mul and rope ops and added mix ops into the attention layer. [#18](https://github.com/vllm-project/vllm-ascend/pull/18)

-### Other changes
+### Other

-- Added the Ascend quantization config option, the implementation will comming soon.
+- [CI] Enable Ascend CI to actively monitor and improve quality for vLLM on Ascend. [#3](https://github.com/vllm-project/vllm-ascend/pull/3)
+- [Docker] Add vllm-ascend container image. [#64](https://github.com/vllm-project/vllm-ascend/pull/64)
+- [Docs] Add a [live doc](https://vllm-ascend.readthedocs.org). [#55](https://github.com/vllm-project/vllm-ascend/pull/55)

### Known issues

-- This release relies on an unreleased torch_npu version. Please [install](https://vllm-ascend.readthedocs.io/en/latest/installation.html) it manually.
+- This release relies on an unreleased torch_npu version. It is already installed in the official container image. Please [install](https://vllm-ascend.readthedocs.io/en/v0.7.1.rc1/installation.html) it manually if you are using a non-container environment.
- There are logs like `No platform deteced, vLLM is running on UnspecifiedPlatform` or `Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")` shown when runing vllm-ascend. It actually doesn't affect any functionality and performance. You can just ignore it. And it has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/12432) which will be included in v0.7.3 soon.
+- There are logs like `# CPU blocks: 35064, # CPU blocks: 2730` shown when running vllm-ascend, which should read `# NPU blocks:`. It doesn't affect any functionality or performance, so you can just ignore it. It has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/13378), which will be included in v0.7.3 soon.
diff --git a/docs/source/user_guide/supported_models.md b/docs/source/user_guide/supported_models.md
index edf3df6c..48b9cee5 100644
--- a/docs/source/user_guide/supported_models.md
+++ b/docs/source/user_guide/supported_models.md
@@ -5,6 +5,8 @@
| Qwen 2.5 | ✅ ||
| Mistral | | Need test |
| DeepSeek v2.5 | |Need test |
+| DeepSeek v3 | ✅ ||
+| DeepSeek Distill (Qwen/Llama) | ✅ ||
| LLama3.1/3.2 | ✅ ||
| Gemma-2 | |Need test|
| baichuan | |Need test|