[Core] Upgrade vllm to v0.6.3.post1 #69

Merged
merged 60 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
All 60 commits are by ZeldaHuang.

273d920  add node id to BackendSim (Nov 4, 2024)
40952a4  update unitests (Nov 4, 2024)
a26ead8  update (Nov 4, 2024)
1d9c438  update (Nov 11, 2024)
6044d19  Merge branch 'main' into vllm_upgrade (Dec 11, 2024)
a1bd218  update (Dec 16, 2024)
05b5499  update (Dec 17, 2024)
c44810e  Merge branch 'main' into vllm_upgrade (Dec 17, 2024)
8ddd0a5  update (Dec 18, 2024)
f607c90  update (Dec 18, 2024)
b23214d  update (Dec 18, 2024)
ffc42a8  Merge branch 'main' into vllm_upgrade (Dec 18, 2024)
e29b97a  update (Dec 19, 2024)
e9d1eff  update (Dec 19, 2024)
fc73288  update (Dec 19, 2024)
1b82ce1  update (Dec 19, 2024)
fc18695  update (Dec 19, 2024)
abb36ab  update (Dec 19, 2024)
66f729f  update (Dec 19, 2024)
f1b0592  update (Dec 20, 2024)
80101cc  update (Dec 21, 2024)
23bb00e  update (Dec 21, 2024)
0b8ce21  update (Dec 21, 2024)
7adfc5f  update (Dec 21, 2024)
74fe40c  update (Dec 21, 2024)
13e8512  update (Dec 21, 2024)
cefdf22  update (Dec 22, 2024)
2689ccf  update (Dec 22, 2024)
8cb0e42  Merge branch 'main' into vllm_upgrade (Jan 8, 2025)
1ce5dd6  update (Jan 8, 2025)
b54ed06  update (Jan 8, 2025)
2a605bf  update (Jan 8, 2025)
caadca3  update (Jan 8, 2025)
9ab272f  update (Jan 8, 2025)
db76d24  update (Jan 8, 2025)
93016b8  update (Jan 8, 2025)
511c792  update (Jan 9, 2025)
6d67b88  update (Jan 9, 2025)
cc6467a  Merge branch 'main' into vllm_upgrade (Jan 10, 2025)
d0e4fb0  update (Jan 10, 2025)
12536d7  update (Jan 10, 2025)
effec8a  update (Jan 10, 2025)
d7d07ef  remove engine-use-ray (Jan 10, 2025)
eb28c0d  fix pylint (Jan 10, 2025)
cd921b8  fix pylint (Jan 10, 2025)
4f15edb  update dependence (Jan 10, 2025)
d27615d  update (Jan 14, 2025)
86abc2c  fix (Jan 14, 2025)
8d0401c  update (Jan 15, 2025)
9ea2359  update (Jan 15, 2025)
d1f221f  update (Jan 16, 2025)
a93165e  Merge branch 'main' into vllm_upgrade (Jan 16, 2025)
18489f2  update (Jan 16, 2025)
fdbfc75  update (Jan 20, 2025)
623fdf4  update (Jan 21, 2025)
c4b165f  Merge branch 'main' into vllm_upgrade (Jan 21, 2025)
1b6811e  update (Jan 21, 2025)
b36301c  update readme (Jan 21, 2025)
6c3b23c  update (Jan 23, 2025)
9a73da9  update (Jan 23, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/offline_inference.yml
@@ -20,7 +20,7 @@ jobs:
offline_inference:
needs: cancel_previous_workflows
runs-on: [self-hosted]
timeout-minutes: 5
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
- name: Run offline inference example
4 changes: 2 additions & 2 deletions .github/workflows/pylint.yml
@@ -20,7 +20,7 @@ jobs:
pylint_test:
needs: cancel_previous_workflows
runs-on: [self-hosted]
timeout-minutes: 5
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
- name: Analysing the code with pylint
@@ -29,4 +29,4 @@ jobs:
-v ${PWD}:/workspace \
-w /workspace \
registry.cn-beijing.aliyuncs.com/llumnix/llumnix-dev:20240909_action_678a439 \
bash -c "pip install -e . > /dev/null && make lint"
bash -c "pip install -e .[vllm] > /dev/null && make lint"
2 changes: 1 addition & 1 deletion Makefile
@@ -62,7 +62,7 @@ test: check_pytest_installed
.PHONY: unit_test
unit_test: check_pytest_installed
@pytest -v --ignore=third_party --ignore=tests/e2e_test --disable-warnings

.PHONY: offline_test
offline_test:
@python examlpes/offline_inference.py
1 change: 1 addition & 0 deletions README.md
@@ -10,6 +10,7 @@ Efficient and easy <i>multi-instance</i> LLM serving

## 🔥 Latest News

- [2025.1] We updated vLLM to version v0.6.3.post1
- [2024.11] Llumnix v0.1.0 launched!
- [2024.7] We officially released the first version of Llumnix.
- [2024.6] We released our OSDI '24 [research paper](https://arxiv.org/abs/2406.03243) on arxiv.
16 changes: 7 additions & 9 deletions benchmark/benchmark_serving.py
@@ -84,14 +84,12 @@ async def query_model_vllm(prompt, verbose, ip_ports):

async with aiohttp.ClientSession(timeout=timeout) as session:
best_of = 1
use_beam_search = False
output_len = expected_response_len
request_dict = {
"prompt": prompt,
"n": 1,
"best_of": best_of,
"use_beam_search": use_beam_search,
"temperature": 0.0 if use_beam_search else 1.0,
"temperature": 1.0,
"top_k": 1,
"max_tokens": max(output_len, 1),
"ignore_eos": True,
@@ -815,18 +813,18 @@ def main():
except FileNotFoundError:
os.mknod(file_name)
with open(file_name, 'w') as f:
results.append({"qps": args.qps,
results.append({"qps": args.qps,
"cv": args.coefficient_variation,
"request_ids": request_ids,
"request_ids": request_ids,
"request_lens": request_lens,
"request_latencies": request_latencies,
"prefill_token_latencies": prefill_token_latencies,
"request_latencies": request_latencies,
"prefill_token_latencies": prefill_token_latencies,
"decode_token_latencies": decode_token_latencies,
"decode_sum_latencies": decode_sum_latencies,
"decode_sum_latencies": decode_sum_latencies,
"all_decode_token_latencies": all_decode_token_latencies,
"inference_latencies": inference_latencies,
"per_token_latencies_breakdown_dict": per_token_latencies_breakdown_dict,
"throughput": throughput,
"throughput": throughput,
"instance_num": avg_instance_num})
json.dump(results, f)

11 changes: 7 additions & 4 deletions docs/Arguments.md
@@ -1,4 +1,4 @@
Below, you can find an explanation of each argument for Llumnix, and the explanation of arguments for vLLM is shown in the following link: [vLLM Engine Arguments](https://docs.vllm.ai/en/v0.4.2/models/engine_args.html). Please note that Llumnix does not currently support all features of vLLM. The vLLM features that are not supported by Llumnix are listed at the end of this document.
Below, you can find an explanation of each argument for Llumnix, and the explanation of arguments for vLLM is shown in the following link: [vLLM Engine Arguments](https://docs.vllm.ai/en/v0.6.3.post1/models/engine_args.html). Please note that Llumnix does not currently support all features of vLLM. The vLLM features that are not supported by Llumnix are listed at the end of this document.

# Llumnix arguments

@@ -259,10 +259,13 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
`--enable-chunked-prefill`
- Llumnix does not support chunked prefill currently.

`--use-v2-block-manager`
- Llumnix does not support speculative decoding currently.

`--speculative-model`
- Llumnix does not support speculative decoding currently.

`--pipeline-parallel-size`
- Llumnix does not support pipeline parallel currently.

`--num-scheduler-steps`
- Llumnix does not support multi-step scheduling.

Besides, Llumnix does not support sampling algorithms whose number of output sequences is greater than one (vllm.SamplingParams.n > 1), such as beam search.
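
To make the constraint above concrete, a small hedged example of sampling parameters Llumnix accepts versus ones it rejects; it assumes the vLLM version referenced in this PR:

```python
from vllm import SamplingParams

# Accepted by Llumnix: one output sequence per request.
supported = SamplingParams(n=1, temperature=1.0, top_k=1, max_tokens=128)

# Not accepted: more than one output sequence per request (n > 1),
# which also rules out beam-search-style sampling.
unsupported = SamplingParams(n=4, temperature=1.0, max_tokens=128)
```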
7 changes: 3 additions & 4 deletions docs/Quickstart.md
@@ -4,7 +4,7 @@

Llumnix requires python `3.8.1~3.10.0` and is currently built on top of vLLM (version 0.4.2). Therefore, the installation requirements are almost identical to those of vLLM. You can view the specific installation requirements for vLLM at the following link:

[vLLM Installation](https://docs.vllm.ai/en/v0.4.2/getting_started/installation.html)
[vLLM Installation](https://docs.vllm.ai/en/v0.6.3.post1/getting_started/installation.html)

### Install from Pypi

@@ -49,7 +49,7 @@ python -m llumnix.entrypoints.vllm.api_server \
```

Upon starting the server, Llumnix's components are automatically configured.
In addition to the server arguments provided above, it's necessary to specify both the Llumnix arguments and the vLLM arguments. For detailed configuration options, please consult the documentation for [Llumnix arguments](./Arguments.md) and [vLLM arguments](https://docs.vllm.ai/en/v0.4.2/models/engine_args.html). Lluminx arguments from cli will override the corresponding configuration in config file.
In addition to the server arguments provided above, it's necessary to specify both the Llumnix arguments and the vLLM arguments. For detailed configuration options, please consult the documentation for [Llumnix arguments](./Arguments.md) and [vLLM arguments](https://docs.vllm.ai/en/v0.6.3.post1/models/engine_args.html). Llumnix arguments from the CLI will override the corresponding configuration in the config file.

2. Launch multiple servers and connect to the Llumnix cluster. Llumnix uses Ray to manage multiple vLLM servers and instances. You need to configure the following environment variables for Llumnix to correctly set up the cluster.
```
@@ -96,12 +96,11 @@ HEAD_NODE=1 python -m llumnix.entrypoints.vllm.api_server \
--initial-instances $INITIAL_INSTANCES \
--launch-ray-cluster \
--model $MODEL_PATH \
--engine-use-ray \
--worker-use-ray \
--max-model-len 4096 \
--migration-backend rayrpc \
```
`CONFIG_PATH` is the path to the configuration file for Llumnix, and we give an example configuration file [here](../configs/base.yml). `MODEL_PATH` defines the location of your model. `INITIAL_INSTANCES` determines the number of instances to be launched on the current node,
`CONFIG_PATH` is the path to the configuration file for Llumnix, and we give an example configuration file [here](../configs/base.yml). `MODEL_PATH` defines the location of your model. `INITIAL_INSTANCES` determines the number of instances to be launched on the current node,

Second, you can run the benchmark to evaluate the serving performance:

6 changes: 3 additions & 3 deletions docs/Supported_Models.md
@@ -1,6 +1,6 @@
# Supported Models

Llumnix serves as the request scheduling layer on top of the LLM backend engines. Therefore, all the models supported by the backend LLM engine should ideally be supported by Llumnix. We are also conducting full compatibility tests on different models.
Llumnix serves as the request scheduling layer on top of the LLM backend engines. Therefore, all the models supported by the backend LLM engine should ideally be supported by Llumnix. We are also conducting full compatibility tests on different models.

Currently, Llumnix is developed on top of the vLLM (version 0.4.2), making its supported models identical to those of vLLM. Up to now, our primary testing of Llumnix has been conducted on Qwen and Llama models, including:

@@ -10,10 +10,10 @@ Currently, Llumnix is developed on top of the vLLM (version 0.4.2), making its s
- Qwen
- Qwen1.5
- Qwen2
- More models (not officially tested): [vLLM Supported Models](https://docs.vllm.ai/en/v0.4.2/models/supported_models.html)
- More models (not officially tested): [vLLM Supported Models](https://docs.vllm.ai/en/v0.6.3.post1/models/supported_models.html)

# Supported Backends

Currently, Llumnix supports vLLM as its backend LLM engine. However, Llumnix is designed for extensibility to various backend LLM inference engines. We will incorporate more inference engines in the future.

- vLLM (v0.4.2)
- vLLM (v0.6.3.post1)
6 changes: 3 additions & 3 deletions examlpes/offline_inference.py
@@ -73,10 +73,10 @@ async def main():
for request in prompts:
request_id = random_uuid()
await manager.generate.remote(request_id=request_id,
server_info=server_info,
server_info=server_info,
prompt=request,
sampling_params=sampling_params,)
params=sampling_params,)

await output_task

asyncio.run(main())
11 changes: 6 additions & 5 deletions llumnix/backends/backend_interface.py
@@ -13,7 +13,7 @@

from abc import ABC, abstractmethod
from enum import Enum
from typing import Iterable, List, Union, Deque
from typing import Iterable, List, Union, Deque, Tuple

from llumnix.llumlet.request import LlumnixRequest, RequestStatus
from llumnix.server_info import ServerInfo
@@ -71,7 +71,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:

# Methods for migration
@abstractmethod
def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_stage_num_blocks: int) -> List[int]:
def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_stage_num_blocks: int) -> Tuple[List[int], List[int]]:
"""Retrieves the incremental block table for a given request.

This method is used to fetch a list of block numbers that represent the incremental
@@ -88,7 +88,7 @@ def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_st
need to be fetched in the current stage.

Returns:
A list of integers, where each integer represents a block number that indicates
A list of integers and its token ids, where each integer represents a block number that indicates
physical index of kv cache block tensor. These block numbers can then be used
to transfer to dstination instance.
"""
@@ -191,7 +191,8 @@ def pre_alloc(self,
request_id: str,
request_status: RequestStatus,
request_arrival_time: float,
block_num: int) -> List[int]:
block_num: int,
token_ids: List[int]) -> List[int]:
"""Pre-allocates cache blocks for a migrating request.

This method selects a specified number of free cache blocks to be reserved for an incoming
Expand All @@ -207,7 +208,7 @@ def pre_alloc(self,
request_status: The status (waiting/running) of the request.
request_arrival_time: The arrival time of the request.
block_num: The number of cache blocks that need to be pre-allocated for the request.

token_ids: The token IDs of the request.
Returns:
A list of integers where each integer represents the block table reserved for the migration request.
"""
3 changes: 2 additions & 1 deletion llumnix/backends/bladellm/llm_engine.py
@@ -308,7 +308,8 @@ def pre_alloc(self,
request_id: str,
request_status: RequestStatus,
request_arrival_time: float,
block_num: int) -> List[int]:
block_num: int,
token_ids: List[int]) -> List[int]:
pass

def add_running_request(self, backend_request: LlumnixRequest) -> None:
4 changes: 2 additions & 2 deletions llumnix/backends/migration_backend_interface.py
@@ -33,9 +33,9 @@ def migrate_cache(self, src_handle, src_blocks: List[int], dst_blocks: List[int]
raise NotImplementedError

@abstractmethod
def do_send(self, dst_handle, blocks: List[int]):
def do_send(self, dst_handle, blocks: List[int], virtuel_engine: int):
raise NotImplementedError

@abstractmethod
def do_recv(self, src_handle, blocks: List[int]):
def do_recv(self, src_handle, blocks: List[int], virtuel_engine: int):
raise NotImplementedError
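
For completeness, a hedged sketch of a concrete backend filling in the two widened methods; the bodies are placeholders, and the parameter keeps the interface's spelling (`virtuel_engine`):

```python
from typing import List


class DummyMigrationBackend:
    """Standalone sketch; a real backend would subclass the interface above,
    which also defines abstract methods outside this hunk."""

    def do_send(self, dst_handle, blocks: List[int], virtuel_engine: int):
        # Placeholder: serialize the KV blocks owned by the given virtual engine
        # and push them to the destination handle.
        return {"engine": virtuel_engine, "blocks": list(blocks), "to": dst_handle}

    def do_recv(self, src_handle, blocks: List[int], virtuel_engine: int):
        # Placeholder: pull the KV blocks for the given virtual engine from the
        # source handle into the local cache slots listed in `blocks`.
        return {"engine": virtuel_engine, "blocks": list(blocks), "from": src_handle}
```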