From ad7ed1cb6cc59f65cf5bfe1ffd2577d7070c38c4 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 12 Sep 2024 01:51:12 +0000
Subject: [PATCH 01/49] Rename request migration policy

---
 configs/base.yml                              |  2 +-
 docs/Arguments.md                             |  6 +++---
 llumnix/arg_utils.py                          |  3 ++-
 llumnix/config/default.py                     |  2 +-
 llumnix/llumlet/local_migration_scheduler.py  | 16 +++++++++-------
 .../llumlet/test_local_migration_scheduler.py |  4 ++--
 6 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/configs/base.yml b/configs/base.yml
index afce7127..35760933 100644
--- a/configs/base.yml
+++ b/configs/base.yml
@@ -18,7 +18,7 @@ MANAGER:
   ENABLE_MIGRATION: True
   ENABLE_DEFRAG: True
-  REQUEST_MIGRATION_POLICY: 'SJF'
+  REQUEST_MIGRATION_POLICY: 'SRF'
 
   MIGRATION_BACKEND: 'gloo'
   MIGRATION_CACHE_BLOCKS: 512
 
diff --git a/docs/Arguments.md b/docs/Arguments.md
index c8397bfa..2dbcac9c 100644
--- a/docs/Arguments.md
+++ b/docs/Arguments.md
@@ -17,7 +17,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
                 [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY]
                 [--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}]
                 [--migrate-out-threshold MIGRATE_OUT_THRESHOLD]
-                [--request-migration-policy {LCFS,SJF,LJF}]
+                [--request-migration-policy {LCFS,SRF,LRF,FWSR}]
                 [--enable-defrag ENABLE_DEFRAG]
                 [--enable-scaling]
                 [--min-instances MIN_INSTANCES]
@@ -89,8 +89,8 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 
 `--request-migration-policy`
 - Request migration policy.
-- Possible choices: LCFS, SJF, LJF
-- Default: "SJF"
+- Possible choices: LCFS, SRF, LRF, FWSR
+- Default: "SRF"
 
 `--enable-defrag`
 - Enable defragmentation through migration based on virtual usage.
diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
index dd80276d..158c24c2 100644
--- a/llumnix/arg_utils.py
+++ b/llumnix/arg_utils.py
@@ -244,7 +244,8 @@ def add_cli_args(
                             help='migrate out instance load threshold')
         parser.add_argument('--request-migration-policy',
                             type=str,
-                            choices=['LCFS', 'SJF', 'LJF'],
+                            default=None,
+                            choices=['LCFS', 'SRF', 'LRF', 'FWSR'],
                             help='request migration policy')
         parser.add_argument('--enable-defrag',
                             type=bool,
diff --git a/llumnix/config/default.py b/llumnix/config/default.py
index fb94443b..645ab72e 100644
--- a/llumnix/config/default.py
+++ b/llumnix/config/default.py
@@ -95,7 +95,7 @@
 # Migrate out instance load threshold
 _C.MANAGER.MIGRATE_OUT_THRESHOLD = 3.0
 # Request migration policy
-_C.MANAGER.REQUEST_MIGRATION_POLICY = 'SJF'
+_C.MANAGER.REQUEST_MIGRATION_POLICY = 'SRF'
 # Enable defragmentation through migration based on virtual usage
 _C.MANAGER.ENABLE_DEFRAG = False
 # Drop migration if the number of stages > max_stages
diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py
index e630d982..7a34f5d0 100644
--- a/llumnix/llumlet/local_migration_scheduler.py
+++ b/llumnix/llumlet/local_migration_scheduler.py
@@ -23,15 +23,17 @@ def __init__(self, request_migration_policy: str, backend_engine: BackendInterfa
         self.backend_engine = backend_engine
 
     def get_migrate_out_request(self, min_request_len=0, max_request_len=np.inf) -> Optional[LlumnixRequest]:
         # Requests that meet the strict pre-migration condition always have higher priority than other migration policies.
-        migrate_out_request = self.get_ready_migration_request(min_request_len, max_request_len)
+ migrate_out_request: LlumnixRequest = self.get_ready_migration_request(min_request_len, max_request_len) if migrate_out_request is None: if self.request_migration_policy == 'LCFS': - migrate_out_request = self.get_last_running_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'LJF': - migrate_out_request = self.get_longest_running_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'SJF': - migrate_out_request = self.get_shortest_running_request(min_request_len, max_request_len) + migrate_out_request = self._get_last_running_request(min_request_len, max_request_len) + elif self.request_migration_policy == 'LRF': + migrate_out_request = self._get_longest_running_request(min_request_len, max_request_len) + elif self.request_migration_policy == 'SRF': + migrate_out_request = self._get_shortest_running_request(min_request_len, max_request_len) + elif self.request_migration_policy == 'FWSR': + migrate_out_request = self._get_first_waiting_or_shortest_running_request(min_request_len, max_request_len) return migrate_out_request # The function is used to retrieve requests on the backend that have already met the expected_steps. diff --git a/tests/unit_test/llumlet/test_local_migration_scheduler.py b/tests/unit_test/llumlet/test_local_migration_scheduler.py index d585300d..4c72d9a0 100644 --- a/tests/unit_test/llumlet/test_local_migration_scheduler.py +++ b/tests/unit_test/llumlet/test_local_migration_scheduler.py @@ -60,9 +60,9 @@ def test_scheduler_policy(): scheduler.request_migration_policy = "LCFS" assert scheduler.get_migrate_out_request().request_id == "2" - scheduler.request_migration_policy = "LJF" + scheduler.request_migration_policy = "LRF" assert scheduler.get_migrate_out_request().request_id == "1" - scheduler.request_migration_policy = "SJF" + scheduler.request_migration_policy = "SRF" assert scheduler.get_migrate_out_request().request_id == "0" engine.add_request(request_id="3", length=2, expected_steps=1) From 3baf2a44ca72dd21be7e3f4607eda0e5a7d6b37d Mon Sep 17 00:00:00 2001 From: s5u13b Date: Fri, 13 Sep 2024 02:15:24 +0000 Subject: [PATCH 02/49] Support migrate waiting request --- llumnix/backends/backend_interface.py | 56 ++++++--- llumnix/backends/vllm/llm_engine.py | 32 ++++-- llumnix/backends/vllm/scheduler.py | 48 ++++++-- llumnix/backends/vllm/sequence.py | 27 ++++- llumnix/llumlet/llumlet.py | 24 ++-- llumnix/llumlet/local_migration_scheduler.py | 24 ++-- llumnix/llumlet/migration_coordinator.py | 106 ++++++++++++------ llumnix/llumlet/request.py | 26 ++++- .../unit_test/backends/vllm/test_migration.py | 16 ++- .../unit_test/backends/vllm/test_scheduler.py | 62 +++++++++- tests/unit_test/backends/vllm/utils.py | 4 +- .../llumlet/test_migration_coordinator.py | 77 ++++++++----- 12 files changed, 364 insertions(+), 138 deletions(-) diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py index 16a8ac1f..2fd42a04 100644 --- a/llumnix/backends/backend_interface.py +++ b/llumnix/backends/backend_interface.py @@ -13,9 +13,9 @@ from abc import ABC, abstractmethod from enum import Enum -from typing import Iterable, List, Union +from typing import Iterable, List, Union, Deque -from llumnix.llumlet.request import LlumnixRequest +from llumnix.llumlet.request import LlumnixRequest, RequestStatus from llumnix.server_info import ServerInfo class EngineState(str, Enum): @@ -99,12 +99,19 @@ def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_st raise 
NotImplementedError @abstractmethod - def get_running_queue(self) -> List[LlumnixRequest]: + def get_running_queue(self) -> Deque[LlumnixRequest]: """ Return backend's running queue. """ raise NotImplementedError + @abstractmethod + def get_waiting_queue(self) -> Deque[LlumnixRequest]: + """ + Return backend's waiting queue. + """ + raise NotImplementedError + @abstractmethod def remove_running_request(self, request_id: str) -> None: """ @@ -120,6 +127,20 @@ def remove_running_request(self, request_id: str) -> None: """ raise NotImplementedError + @abstractmethod + def remove_waiting_request(self, request_id: str) -> None: + """ + Removes a request from the backend's waiting queue. + + This method is responsible for safely halting and removing an active request from the waiting + queue of the backend engine. This action is performed in waiting request migration. + + Args: + request_id: A string identifier for the request that is to be removed from the waiting + queue. This ID uniquely identifies the request within the backend system. + """ + raise NotImplementedError + @abstractmethod def add_migrating_out_request_last_stage(self, backend_request: LlumnixRequest) -> None: """ @@ -164,17 +185,25 @@ def pop_migrating_out_requests_last_stage(self) -> List[LlumnixRequest]: raise NotImplementedError @abstractmethod - def pre_alloc(self, request_id: str, block_num: int) -> List[int]: + def pre_alloc(self, + request_id: str, + request_status: RequestStatus, + request_arrival_time: float, + block_num: int) -> List[int]: """Pre-allocates cache blocks for a migrating request. This method selects a specified number of free cache blocks to be reserved for an incoming migration request identified by the given request ID. It updates the pre-allocation cache dictionary with the allocated blocks, which ensures that these blocks are not used by - another process until the migration is finished. + another process until the migration is finished. For the waiting request, it only reserves + free cache blocks when the request is the earliest arrival one among the requests of dst instance's + waiting queue. Args: request_id: The unique identifier of the migration request for which cache blocks are to be pre-allocated. + request_status: The status (waiting/running) of the request. + request_arrival_time: The arrival time of the request. block_num: The number of cache blocks that need to be pre-allocated for the request. Returns: @@ -187,9 +216,8 @@ def add_running_request(self, backend_request: LlumnixRequest) -> None: """ Adds a backend request to the running queue for processing. - This method enqueues a backend request into engine running queue, marking it for - active processing. It is used when a suspend migrating request should be added back - to running queue. + This method enqueues a backend request into engine running queue. + It is used when a suspend migrating request should be added back to running queue. Args: backend_request: An object representing the backend request. The type of this @@ -199,19 +227,17 @@ def add_running_request(self, backend_request: LlumnixRequest) -> None: raise NotImplementedError @abstractmethod - def is_request_running(self, backend_request: LlumnixRequest) -> bool: - """Checks if a given backend request is currently in the running queue. + def add_waiting_request(self, backend_request: LlumnixRequest) -> None: + """ + Adds a backend request to the waiting queue for processing. 
- This method determines whether a backend request is present and actively being processed - in the running queue. + This method enqueues a backend request into engine waiting queue. + It is used when a suspend migrating request should be added back to waiting queue. Args: backend_request: An object representing the backend request. The type of this object is dependent on the backend implementation and the details of the request. - - Returns: - True if the backend request is currently in the running queue; False otherwise. """ raise NotImplementedError diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index bf583366..4b4b8655 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -13,7 +13,7 @@ import time import traceback -from typing import Any, List, Optional, Dict, Union, Iterable, Tuple +from typing import Any, List, Optional, Dict, Union, Iterable, Tuple, Deque from collections import defaultdict import threading import asyncio @@ -34,7 +34,7 @@ from llumnix.instance_info import InstanceInfo from llumnix.backends.backend_interface import BackendInterface, EngineState from llumnix.backends.vllm.scheduler import SchedulerLlumnix -from llumnix.backends.vllm.sequence import SequenceGroupLlumnix +from llumnix.backends.vllm.sequence import SequenceGroupLlumnix, RequestStatus from llumnix.backends.profiling import LatencyMemData from llumnix.server_info import ServerInfo from llumnix.internal_config import MigrationConfig @@ -295,9 +295,11 @@ def __init__( self.worker_handle_list = self.engine.model_executor.workers.copy() if len(self.worker_handle_list) + 1 == self.engine.parallel_config.world_size: self.worker_handle_list.insert(0, ray.get_actor(f"instance_{self.instance_id}", namespace="llumnix")) - self._run_workers("init_migration", instance_id=instance_id, migration_config=migration_config,\ - src_worker_handle_list=self.worker_handle_list, - placement_group=placement_group, node_id=node_id) + self._run_workers("init_migration", instance_id=instance_id, + migration_config=migration_config, + src_worker_handle_list=self.worker_handle_list, + placement_group=placement_group, + node_id=node_id) self.state = EngineState.INIT logger.info("engine ({}) current state {}".format(self.instance_id, self.state)) @@ -355,10 +357,10 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: async def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None: await dst_ray_actor.execute_engine_method.remote("_run_workers", - "migrate_cache", - dst_blocks=dst_blocks, - src_blocks=src_blocks, - src_worker_handle_list=self.worker_handle_list) + "migrate_cache", + dst_blocks=dst_blocks, + src_blocks=src_blocks, + src_worker_handle_list=self.worker_handle_list) def _run_workers(self, *args, **kwargs): # pylint: disable=protected-access @@ -373,15 +375,21 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: request_ids = set(request_id) return self.engine.abort_request(request_ids) - def get_running_queue(self) -> List[SequenceGroupLlumnix]: + def get_running_queue(self) -> Deque[SequenceGroupLlumnix]: return self.engine.scheduler.get_running_queue() + def get_waiting_queue(self) -> Deque[SequenceGroupLlumnix]: + return self.engine.scheduler.get_waiting_queue() + def get_request_incremental_blocks(self, *args, **kwargs) -> List[int]: return self.engine.scheduler.get_request_incremental_blocks(*args, **kwargs) def 
remove_running_request(self, *args, **kwargs) -> None: return self.engine.scheduler.remove_running_request(*args, **kwargs) + def remove_waiting_request(self, *args, **kwargs) -> None: + return self.engine.scheduler.remove_waiting_request(*args, **kwargs) + def add_migrating_out_request_last_stage(self, *args, **kwargs) -> None: return self.engine.scheduler.add_migrating_out_request_last_stage(*args, **kwargs) @@ -400,8 +408,8 @@ def should_abort_migration(self, *args, **kwargs) -> bool: def add_running_request(self, *args, **kwargs) -> None: return self.engine.scheduler.add_running_request(*args, **kwargs) - def is_request_running(self, *args, **kwargs) -> bool: - return self.engine.scheduler.is_request_running(*args, **kwargs) + def add_waiting_request(self, *args, **kwargs) -> None: + return self.engine.scheduler.add_waiting_request(*args, **kwargs) def free_dst_pre_alloc_cache(self, *args, **kwargs) -> None: return self.engine.scheduler.free_dst_pre_alloc_cache(*args, **kwargs) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index a14db0b3..46a225c3 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -13,19 +13,21 @@ from asyncio.log import logger import time -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Deque from collections import deque from vllm.core.block_manager_v1 import BlockSpaceManagerV1, BlockTable from vllm.core.scheduler import (Scheduler, PreemptionMode, SequenceStatus, SequenceGroupMetadata, SchedulerOutputs) +from vllm.core.policy import PolicyFactory from llumnix.instance_info import InstanceInfo from llumnix.logger import init_logger -from llumnix.llumlet.request import RequestInferenceType +from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus from llumnix.backends.vllm.sequence import SequenceGroupLlumnix logger = init_logger(__name__) + # TODO(ZeldaHuang): adapt prefix cache and sliding window, now use v1 manager class BlockManagerLlumnix(BlockSpaceManagerV1): def get_free_blocks(self, num_required_blocks: int) -> BlockTable: @@ -76,9 +78,12 @@ def _get_num_killed_requests(self) -> int: cnt += 1 return cnt - def get_running_queue(self): + def get_running_queue(self) -> Deque[SequenceGroupLlumnix]: return self.running + def get_waiting_queue(self) -> Deque[SequenceGroupLlumnix]: + return self.waiting + def get_all_request_ids(self) -> List[str]: request_ids : List[str] = [] for state_queue in [self.waiting, self.running, self.swapped]: @@ -86,7 +91,7 @@ def get_all_request_ids(self) -> List[str]: request_ids.append(seq_group.request_id) return request_ids - def get_request_incremental_blocks(self, backend_request: SequenceGroupLlumnix, pre_stage_num_blocks: int) -> List[int]: + def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_stage_num_blocks: int) -> List[int]: seq = backend_request.get_seqs()[0] blocks = self.block_manager.get_block_table(seq) return blocks[pre_stage_num_blocks:] @@ -94,9 +99,13 @@ def get_request_incremental_blocks(self, backend_request: SequenceGroupLlumnix, def remove_running_request(self, request_id: str) -> None: for seq_group in self.running: if seq_group.request_id == request_id: - seq = seq_group.get_seqs()[0] self.running.remove(seq_group) - seq.status = SequenceStatus.WAITING + break + + def remove_waiting_request(self, request_id: str) -> None: + for seq_group in self.waiting: + if seq_group.request_id == request_id: + self.waiting.remove(seq_group) 
break def add_migrating_out_request_last_stage(self, backend_request: SequenceGroupLlumnix) -> None: @@ -110,20 +119,34 @@ def pop_migrating_out_requests_last_stage(self) -> List[SequenceGroupLlumnix]: self.migrating_out_request_last_stage.clear() return migrating_out_request_last_stage - def pre_alloc(self, request_id: str, block_num: int) -> List[int]: + def pre_alloc(self, + request_id: str, + request_status: RequestStatus, + request_arrival_time: float, + block_num: int) -> List[int]: + # Only migrate waiting request when the waiting request is the earliest arrival one + # among the requests of dst instance's waiting queue. + if request_status == RequestStatus.WAITING: + if self.waiting and request_arrival_time > self.waiting[0].arrival_time: + return [] blocks = self.block_manager.get_free_blocks(block_num) + if len(blocks) < block_num: + return [] pre_blocks = self.pre_alloc_cache_dict.get(request_id, []) pre_blocks.extend(blocks) self.pre_alloc_cache_dict[request_id] = pre_blocks blocks = [block.block_number for block in blocks] return blocks - def add_running_request(self, backend_request: SequenceGroupLlumnix) -> None: - seq = backend_request.get_seqs()[0] - seq.status = SequenceStatus.RUNNING + def add_running_request(self, backend_request: LlumnixRequest) -> None: self.running.append(backend_request) - def is_request_running(self, backend_request: SequenceGroupLlumnix) -> bool: + def add_waiting_request(self, backend_request: LlumnixRequest) -> None: + self.waiting.append(backend_request) + fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs") + self.waiting = fcfs_policy.sort_by_priority(time.time(), self.waiting) + + def is_request_running(self, backend_request: LlumnixRequest) -> bool: return backend_request in self.running def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: @@ -132,6 +155,7 @@ def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: # pylint: disable=protected-access self.block_manager._free_block_table(blocks) else: + # TODO(s5u13b): Only effective with one-to-one migration restriction. # Clear all pre-allocated cache of dst instance when src instance encounters exception. request_ids = list(self.pre_alloc_cache_dict.keys()) for req_id in request_ids: @@ -193,6 +217,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_group_metadata_list, scheduler_outputs = super().schedule() self.update_instance_info_callback(self._get_instance_info([scheduled_seq_group.seq_group \ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups])) + for seq_group in self.waiting: + seq_group.try_schedule_times += 1 return seq_group_metadata_list, scheduler_outputs def _schedule_running(self, running_queue: deque, *args, **kwargs): diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index 3c41a5c6..146cf893 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -11,15 +11,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from vllm.sequence import SequenceGroup +import math -from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType +from vllm.sequence import SequenceGroup, SequenceStatus + +from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus class SequenceGroupLlumnix(SequenceGroup, LlumnixRequest): def __init__(self, request_id, server_info, expected_steps: int, *args, **kwargs) -> None: SequenceGroup.__init__(self, request_id, *args, **kwargs) LlumnixRequest.__init__(self, request_id, server_info, expected_steps) + self.try_schedule_times = 0 @property def prompt_len(self) -> int: @@ -41,3 +44,23 @@ def inference_type(self) -> RequestInferenceType: if self.is_prefill(): return RequestInferenceType.PREFILL return RequestInferenceType.DECODE + + @property + def finished(self) -> bool: + return self.get_seqs()[0].is_finished() + + @property + def arrival_time(self) -> float: + return self.metrics.arrival_time + + @property + def request_status(self) -> RequestStatus: + if self.get_seqs()[0].status == SequenceStatus.RUNNING: + return RequestStatus.RUNNING + elif self.get_seqs()[0].status == SequenceStatus.WAITING: + return RequestStatus.WAITING + + @property + def prefill_num_blocks(self) -> int: + # Get the prefill len of the waiting request. + return math.ceil(len(self.request_len) / len(self.get_seqs()[0].block_size)) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 5aa3e4c2..3c766b14 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -27,6 +27,7 @@ from llumnix.server_info import ServerInfo from llumnix.internal_config import MigrationConfig from llumnix.queue.queue_type import QueueType +from llumnix.llumlet.request import RequestStatus logger = init_logger(__name__) @@ -55,7 +56,7 @@ def __init__(self, self.backend_engine) self.log_requests = True - self.check_state_thread = asyncio.create_task(self.check_state()) + asyncio.create_task(self._check_state_loop()) # pylint: disable=broad-except except Exception as e: logger.error("Failed to initialize llumlet: {}".format(e)) @@ -118,7 +119,7 @@ def from_args(cls, llumlet = engine_class.remote(instance_id, output_queue_type, backend_type, migration_config, *args, **kwargs) return llumlet - async def check_state(self): + async def _check_state_loop(self): while True: await asyncio.sleep(1) if self.backend_engine.state == EngineState.CRASHED: @@ -137,19 +138,21 @@ async def migrate_out(self, dst_instance_name: str, num_requests: int) -> List[s while continue_migrate and len(migrated_request_list) < num_requests: t0 = time.time() migrate_out_request = self.migration_scheduler.get_migrate_out_request() - if migrate_out_request is not None: - logger.info("migrate_out {}".format(migrate_out_request.request_id)) if migrate_out_request is None: return migrated_request_list + assert migrate_out_request.request_status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting or running request" logger.info("{}->{} begin migrate out {}".format(self.instance_id, dst_instance_id, migrate_out_request.request_id)) - status = await self.migration_coordinator.migrate_out_multistage(migrate_in_ray_actor, migrate_out_request) + if migrate_out_request.request_status == RequestStatus.RUNNING: + status = await self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) + else: + status = await self.migration_coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) if status == 
MigrationStatus.FINISHED_DONE: await migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request) - self.backend_engine.free_src_request(migrate_out_request) + if migrate_out_request.request_status == RequestStatus.RUNNING: + self.backend_engine.free_src_request(migrate_out_request) migrated_request_list.append(migrate_out_request.request_id) - migrate_out_request.stage_timestamps.append(time.time()) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) - else: + elif status == MigrationStatus.FINISHED_SRC_ABORTED: migrate_out_request.reset_migration_args() await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) continue_migrate = False @@ -202,7 +205,10 @@ def clear_migration_states(self, is_migrate_in: bool) -> None: migrating_out_requests_last_stage = self.backend_engine.pop_migrating_out_requests_last_stage() for backend_request in migrating_out_requests_last_stage: logger.info("clear_migration_states: add request {} back to engine".format(backend_request.request_id)) - self.backend_engine.add_running_request(backend_request) + if backend_request.request_status == RequestStatus.RUNNING: + self.backend_engine.add_running_request(backend_request) + else: # backend_request.request_status == RequestStatus.WAITING + self.backend_engine.add_waiting_request(backend_request) def execute_migration_method(self, method, *args, **kwargs): executor = getattr(self.migration_coordinator, method) diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index 7a34f5d0..104358a9 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -11,12 +11,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Optional +from typing import Deque, Optional import numpy as np from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType from llumnix.backends.backend_interface import BackendInterface + class LocalMigrationScheduler: def __init__(self, request_migration_policy: str, backend_engine: BackendInterface) -> None: self.request_migration_policy = request_migration_policy @@ -48,28 +49,33 @@ def get_ready_migration_request(self, min_request_len, max_request_len): return request return None - def get_last_running_request(self, min_request_len, max_request_len): - running: List[LlumnixRequest] = self.backend_engine.get_running_queue() + def _get_last_running_request(self, min_request_len, max_request_len): + running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() for request in reversed(running): if request.inference_type == RequestInferenceType.DECODE \ and min_request_len <= request.request_len <= max_request_len: return request return None - def get_longest_running_request(self, min_request_len, max_request_len): - running: List[LlumnixRequest] = self.backend_engine.get_running_queue() + def _get_longest_running_request(self, min_request_len, max_request_len): + running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ and min_request_len <= request.request_len <= max_request_len longest_seq_group = max((request for request in running if condition(request)), \ - key=lambda request: request.request_len, default=None) + key=lambda request: request.request_len, default=None) return longest_seq_group - def get_shortest_running_request(self, min_request_len, max_request_len): - running: List[LlumnixRequest] = self.backend_engine.get_running_queue() + def _get_shortest_running_request(self, min_request_len, max_request_len): + running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ and min_request_len <= request.request_len <= max_request_len shortest_seq_group = min((request for request in running if condition(request)), \ - key=lambda request: request.request_len, default=None) + key=lambda request: request.request_len, default=None) return shortest_seq_group + + def _get_first_waiting_or_shortest_running_request(self, min_request_len, max_request_len): + waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() + waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1] + return waiting[0] if waiting else self._get_shortest_running_request(min_request_len, max_request_len) diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index 03b20cb2..777548b7 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -19,7 +19,7 @@ import ray from llumnix.logger import init_logger -from llumnix.llumlet.request import LlumnixRequest +from llumnix.llumlet.request import LlumnixRequest, RequestStatus from llumnix.backends.backend_interface import BackendInterface logger = init_logger(__name__) @@ -27,17 +27,15 @@ class MigrationStatus(enum.Enum): """Status of Migration.""" RUNNING = enum.auto() - # aborted by src instance - ABORTED_SRC = enum.auto() - # aborted by dst instance - ABORTED_DST = enum.auto() + FINISHED_DST_ABORTED = enum.auto() + FINISHED_SRC_ABORTED = enum.auto() FINISHED_DONE = enum.auto() @staticmethod def 
is_finished(status: "MigrationStatus") -> bool: return status in [ - MigrationStatus.ABORTED_SRC, - MigrationStatus.ABORTED_DST, + MigrationStatus.FINISHED_DST_ABORTED, + MigrationStatus.FINISHED_SRC_ABORTED, MigrationStatus.FINISHED_DONE ] @@ -49,20 +47,66 @@ def __init__(self, self.last_stage_max_blocks = last_stage_max_blocks self.max_stages = max_stages self.backend_engine = backend_engine + + async def migrate_out_running_request(self, + migrate_in_ray_actor: "ray.actor.ActorHandle", + migrate_out_request: LlumnixRequest) -> "MigrationStatus": + return await self._migrate_out_multistage(migrate_in_ray_actor, migrate_out_request) - async def migrate_out_onestage(self, migrate_in_ray_actor: "ray.actor.ActorHandle", migrate_out_request: LlumnixRequest, ) -> "MigrationStatus": - """one-stage live migration until last stage + async def migrate_out_waiting_request(self, + migrate_in_ray_actor: "ray.actor.ActorHandle", + migrate_out_request: LlumnixRequest) -> "MigrationStatus": + """one-stage migration for a waiting request + """ + self.backend_engine.remove_waiting_request(migrate_out_request.request_id) + self.backend_engine.add_migrating_out_request_last_stage(migrate_out_request) + prefill_num_blocks = migrate_out_request.prefill_num_blocks + dst_blocks = await migrate_in_ray_actor.execute_migration_method \ + .remote("migrate_in_pre_alloc", migrate_out_request.request_id, + migrate_out_request.request_status, + migrate_out_request.arrival_time, + prefill_num_blocks) + if len(dst_blocks) != prefill_num_blocks: + self.backend_engine.add_waiting_request(migrate_out_request) + self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) + return MigrationStatus.FINISHED_DST_ABORTED + + return MigrationStatus.FINISHED_DONE + + async def _migrate_out_multistage(self, + migrate_in_ray_actor: "ray.actor.ActorHandle", + migrate_out_request: LlumnixRequest) -> "MigrationStatus": + """Migrate out requests to a specified instance, return migrated request id. 
+ Args: + dst_instance_name: instance actor name, used to get ray actor handle + """ + stage_count = 0 + while stage_count < self.max_stages: + stage_count += 1 + status = await self._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) + if MigrationStatus.is_finished(status): + return status + # exceed max stages + return MigrationStatus.FINISHED_SRC_ABORTED + + async def _migrate_out_onestage(self, + migrate_in_ray_actor: "ray.actor.ActorHandle", + migrate_out_request: LlumnixRequest) -> "MigrationStatus": + """one-stage live migration until last stage for a running request """ pre_stage_num_blocks = sum(migrate_out_request.stage_num_blocks_list) incremental_blocks = self.backend_engine.get_request_incremental_blocks(migrate_out_request, pre_stage_num_blocks) # live migration, transfer all blocks except last one(currently updating) - migration_status = MigrationStatus.RUNNING is_last_stage = (len(incremental_blocks) <= self.last_stage_max_blocks) or migrate_out_request.blocking_migration if not is_last_stage: + migration_status = MigrationStatus.RUNNING src_blocks = incremental_blocks[:-1] stage_block_num = len(incremental_blocks) - 1 dst_blocks = await migrate_in_ray_actor.execute_migration_method \ - .remote("migrate_in_pre_alloc", migrate_out_request.request_id, stage_block_num) + .remote("migrate_in_pre_alloc", migrate_out_request.request_id, + migrate_out_request.request_status, + migrate_out_request.arrival_time, + stage_block_num) else: # last stage migration, stop inference, transfer all blocks migration_status = MigrationStatus.FINISHED_DONE @@ -71,15 +115,17 @@ async def migrate_out_onestage(self, migrate_in_ray_actor: "ray.actor.ActorHandl stage_block_num = len(incremental_blocks) src_blocks = incremental_blocks[:] dst_blocks = await migrate_in_ray_actor.execute_migration_method \ - .remote("migrate_in_pre_alloc", migrate_out_request.request_id, stage_block_num) + .remote("migrate_in_pre_alloc", migrate_out_request.request_id, + migrate_out_request.request_status, + migrate_out_request.arrival_time, + stage_block_num) if len(dst_blocks) != len(src_blocks): - # migrate-in instance failed to prev alloc + # migrate-in instance failed to pre alloc if is_last_stage: self.backend_engine.add_running_request(migrate_out_request) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) - migration_status = MigrationStatus.ABORTED_DST - return migration_status + return MigrationStatus.FINISHED_DST_ABORTED # do stage send/recv migrate_out_request.stage_timestamps.append(time.time()) migrate_out_request.stage_num_blocks_list.append(stage_block_num) @@ -87,31 +133,21 @@ async def migrate_out_onestage(self, migrate_in_ray_actor: "ray.actor.ActorHandl await self.backend_engine.send_blocks(migrate_in_ray_actor, src_blocks, dst_blocks) if not is_last_stage and migrate_out_request.should_abort_migration(): # migrate-out request abort by scheduler during send/recv - migration_status = MigrationStatus.ABORTED_SRC + return MigrationStatus.FINISHED_SRC_ABORTED return migration_status - async def migrate_out_multistage(self, migrate_in_ray_actor: "ray.actor.ActorHandle", migrate_out_request: LlumnixRequest) -> "MigrationStatus": - """Migrate out requests to a specified instance, return migrated request id. 
- Args: - dst_instance_name:instance actor name, used to get ray actor handle - """ - state_count = 0 - while state_count < self.max_stages: - state_count += 1 - status = await self.migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) - if MigrationStatus.is_finished(status): - return status - # exceed max stages - return MigrationStatus.ABORTED_SRC - - def migrate_in_pre_alloc(self, request_id: str, block_num: int) -> List[int]: + def migrate_in_pre_alloc(self, + request_id: str, + request_status: RequestStatus, + request_arrival_time: float, + block_num: int) -> List[int]: """prev alloc blocks to migrate in request """ - pre_alloc_blocks = self.backend_engine.pre_alloc(request_id ,block_num) - if len(pre_alloc_blocks) != block_num: - # failed to alloc, abort request - self.free_dst_pre_alloc_cache(request_id) + pre_alloc_blocks = self.backend_engine.pre_alloc(request_id, + request_status, + request_arrival_time, + block_num) return pre_alloc_blocks def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index 2319f52f..26a4af31 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -20,6 +20,10 @@ class RequestInferenceType(str, Enum): PREFILL = "prefill" DECODE = "decode" +class RequestStatus(str, Enum): + RUNNING = "running" + WAITING = "waiting" + class LlumnixRequest: def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int) -> None: self.request_id = request_id @@ -58,6 +62,22 @@ def prompt_len(self) -> int: @property def output_len(self) -> int: raise NotImplementedError + + @property + def finished(self) -> bool: + raise NotImplementedError + + @property + def arrival_time(self) -> float: + raise NotImplementedError + + @property + def request_status(self) -> RequestStatus: + raise NotImplementedError + + @property + def prefill_num_blocks(self) -> int: + raise NotImplementedError # Whether the migration of request is completed within one stage. For requests that have already reached # the expected steps, blocking_migration is True. 
@@ -66,7 +86,5 @@ def blocking_migration(self) -> bool: return self.output_len >= self.expected_steps def should_abort_migration(self) -> bool: - return self.output_len == 0 \ - or (self.last_preemption_time and self.last_preemption_time > self.stage_timestamps[-1]) \ - or self.inference_type == RequestInferenceType.PREFILL \ - or self.is_finished() + return self.finished \ + or (self.last_preemption_time is not None and self.last_preemption_time > self.stage_timestamps[-1]) diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 2a8ad19e..c8157258 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -19,12 +19,13 @@ from vllm import EngineArgs, SamplingParams from vllm.utils import random_uuid +from vllm.sequence import SequenceStatus from llumnix.backends.vllm.llm_engine import BackendVLLM from llumnix.llumlet.llumlet import Llumlet from llumnix.backends.utils import BackendType from llumnix.internal_config import MigrationConfig -from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType +from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus from llumnix.queue.queue_type import QueueType from tests.unit_test.queue.utils import request_output_queue_server @@ -51,6 +52,7 @@ def __init__(self): self.instance_id = "0" self.backend_engine = MockBackendVLLM() +# TODO(s5u13b): Test migrate waiting request. @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl']) @pytest.mark.asyncio async def test_migration_correctness(setup_ray_env, migration_backend): @@ -228,13 +230,17 @@ async def test_correctness(prompt): def test_clear_migration_states(): llumlet = MockLlumlet() - llumlet.backend_engine.pre_alloc("0", 1) + llumlet.backend_engine.pre_alloc("0", RequestStatus.RUNNING, 0.0, 1) num_gpu_blocks = 8 block_size = 4 llumlet.clear_migration_states(is_migrate_in=True) - assert len(llumlet.backend_engine.pre_alloc("0", num_gpu_blocks)) == num_gpu_blocks - _, seq_group = create_dummy_prompt("0",7,block_size) + assert len(llumlet.backend_engine.pre_alloc("0", RequestStatus.RUNNING, 0.0, num_gpu_blocks)) == num_gpu_blocks + _, seq_group = create_dummy_prompt("0",7,block_size,SequenceStatus.RUNNING) llumlet.backend_engine.add_migrating_out_request_last_stage(seq_group) llumlet.clear_migration_states(is_migrate_in=False) - assert len(llumlet.backend_engine.get_running_queue()) > 0 + assert len(llumlet.backend_engine.get_running_queue()) == 1 + _, seq_group = create_dummy_prompt("0",7,block_size,SequenceStatus.WAITING) + llumlet.backend_engine.add_migrating_out_request_last_stage(seq_group) + llumlet.clear_migration_states(is_migrate_in=False) + assert len(llumlet.backend_engine.get_waiting_queue()) == 1 diff --git a/tests/unit_test/backends/vllm/test_scheduler.py b/tests/unit_test/backends/vllm/test_scheduler.py index 1c1af7ac..10874edd 100644 --- a/tests/unit_test/backends/vllm/test_scheduler.py +++ b/tests/unit_test/backends/vllm/test_scheduler.py @@ -12,13 +12,14 @@ # limitations under the License. 
import math +import time from vllm.sequence import Sequence from vllm.sequence import Logprob from vllm.core.policy import PolicyFactory from llumnix.backends.vllm.scheduler import BlockManagerLlumnix -from llumnix.llumlet.request import RequestInferenceType +from llumnix.llumlet.request import RequestInferenceType, RequestStatus from .utils import create_dummy_prompt, initialize_scheduler, create_token_budget @@ -129,6 +130,25 @@ def test_scheduler_running_request(): scheduler.add_running_request(seq_group) assert scheduler.get_num_unfinished_seq_groups() == 4 +def test_scheduler_waiting_request(): + scheduler = initialize_scheduler() + num_seq_group = 4 + block_size = 4 + _, seq_group_0 = create_dummy_prompt("0", prompt_length=0, block_size=block_size) + for idx in range(1, num_seq_group + 1): + _, seq_group = create_dummy_prompt(str(idx), prompt_length=idx, block_size=block_size) + scheduler.add_seq_group(seq_group) + assert scheduler.get_num_unfinished_seq_groups() == 4 + scheduler.remove_waiting_request("1") + assert scheduler.get_num_unfinished_seq_groups() == 3 + _, seq_group = create_dummy_prompt("6", prompt_length=idx, block_size=block_size) + scheduler.add_waiting_request(seq_group) + assert scheduler.get_num_unfinished_seq_groups() == 4 + # Test if sort the waiting queue by arrival time in add_waiting_request. + scheduler.add_waiting_request(seq_group_0) + waiting_queue = scheduler.get_waiting_queue() + assert waiting_queue[0] == seq_group_0 + def test_scheduler_migrating_out_request_last_stage(): scheduler = initialize_scheduler() block_size = 4 @@ -142,13 +162,13 @@ def test_scheduler_migrating_out_request_last_stage(): def test_scheduler_pre_alloc(): # total 8 blocks scheduler = initialize_scheduler() - blocks = scheduler.pre_alloc("1", 2) + blocks = scheduler.pre_alloc("1", RequestStatus.RUNNING, 0.0, 2) assert len(blocks) == 2 assert len(scheduler.pre_alloc_cache_dict["1"]) == 2 - blocks = scheduler.pre_alloc("1", 4) + blocks = scheduler.pre_alloc("1", RequestStatus.RUNNING, 0.0, 4) assert len(blocks) == 4 assert len(scheduler.pre_alloc_cache_dict["1"]) == 6 - blocks = scheduler.pre_alloc("2,", 4) + blocks = scheduler.pre_alloc("2", RequestStatus.RUNNING, 0.0, 4) assert len(blocks) == 0 def test_schedule_running(): @@ -176,3 +196,37 @@ def test_schedule_running(): assert len(running_scheduled.decode_seq_groups) == 1 assert len(running_scheduled.prefill_seq_groups) == 0 assert len(remainig_running) == 1 + + # test pre alloc waiting condition + # total 8 blocks + scheduler = initialize_scheduler() + before_arrival = time.time() + _, seq_group = create_dummy_prompt("1", prompt_length=1, block_size=2, expected_steps=math.inf) + after_arrival = time.time() + blocks = scheduler.pre_alloc("2", RequestStatus.WAITING, after_arrival, 2) + assert len(blocks) == 2 + scheduler.add_waiting_request(seq_group) + blocks = scheduler.pre_alloc("3", RequestStatus.WAITING, after_arrival, 2) + assert len(blocks) == 0 + blocks = scheduler.pre_alloc("4", RequestStatus.WAITING, before_arrival, 2) + assert len(blocks) == 2 + +def test_try_schedule_times(): + # total 8 blocks + scheduler = initialize_scheduler() + _, seq_group_1 = create_dummy_prompt("1", prompt_length=8, block_size=1) + _, seq_group_2 = create_dummy_prompt("2", prompt_length=8, block_size=1) + scheduler.add_seq_group(seq_group_1) + scheduler.add_seq_group(seq_group_2) + waiting_queue = scheduler.get_waiting_queue() + assert len(waiting_queue) == 2 + assert seq_group_1.try_schedule_times == 0 + assert seq_group_2.try_schedule_times 
== 0 + scheduler.schedule() + # seq_group_2 cannot be scheduled due to lack of blocks + assert seq_group_1.try_schedule_times == 0 + assert seq_group_2.try_schedule_times == 1 + scheduler.schedule() + # seq_group_1 is preempted to waiting queue + assert seq_group_1.try_schedule_times == 1 + assert seq_group_2.try_schedule_times == 2 diff --git a/tests/unit_test/backends/vllm/utils.py b/tests/unit_test/backends/vllm/utils.py index bc8d1f09..887bdd93 100644 --- a/tests/unit_test/backends/vllm/utils.py +++ b/tests/unit_test/backends/vllm/utils.py @@ -18,7 +18,7 @@ from vllm import SamplingParams from vllm.lora.request import LoRARequest -from vllm.sequence import Logprob, Sequence +from vllm.sequence import Logprob, Sequence, SequenceStatus from vllm.config import SchedulerConfig, CacheConfig from vllm.core.scheduler import SchedulingBudget @@ -45,6 +45,7 @@ def create_dummy_prompt( request_id: str, prompt_length: int, block_size: Optional[int] = None, + status: SequenceStatus = SequenceStatus.WAITING, lora_request: Optional[LoRARequest] = None, use_beam_search: bool = False, best_of: int = 1, @@ -63,6 +64,7 @@ def create_dummy_prompt( request_id, server_info, expected_steps, [prompt], SamplingParams(use_beam_search=use_beam_search, best_of=best_of), time.time(), lora_request) + seq_group.get_seqs()[0].status = status return prompt, seq_group diff --git a/tests/unit_test/llumlet/test_migration_coordinator.py b/tests/unit_test/llumlet/test_migration_coordinator.py index 8a1a4d44..3386620b 100644 --- a/tests/unit_test/llumlet/test_migration_coordinator.py +++ b/tests/unit_test/llumlet/test_migration_coordinator.py @@ -38,7 +38,7 @@ async def test_migrate_out_onestage(setup_ray_env): migrate_out_request = MagicMock() # Create an instance of MigrationCoordinator - coordinator = MigrationCoordinator(backend_engine, 1, 3) + coordinator = MigrationCoordinator(backend_engine, last_stage_max_blocks=1, max_stages=3) # Mock method return values and test data src_blocks = [1, 2, 3] @@ -49,7 +49,7 @@ async def test_migrate_out_onestage(setup_ray_env): migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) # Test normal migration scenario - status = await coordinator.migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) + status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) assert status == MigrationStatus.RUNNING # Test the last stage of migration @@ -59,20 +59,21 @@ async def test_migrate_out_onestage(setup_ray_env): migrate_out_request.should_abort_migration.return_value = False migrate_out_request.blocking_migration = False migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) - status = await coordinator.migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) + status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) assert status == MigrationStatus.FINISHED_DONE migrate_out_request = MagicMock() - # Test migration aborted scenario + # Test migration dst aborted scenario src_blocks = [1, 2, 3] dst_blocks = [] backend_engine.get_request_incremental_blocks.return_value = src_blocks migrate_out_request.should_abort_migration.return_value = False migrate_out_request.blocking_migration = False migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) - status = await coordinator.migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) - assert status == 
MigrationStatus.ABORTED_DST
+    status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request)
+    assert status == MigrationStatus.FINISHED_DST_ABORTED
 
+    # Test migration src aborted scenario
     migrate_out_request = MagicMock()
     src_blocks = [1, 2, 3]
     dst_blocks = [1, 2]
@@ -80,23 +81,13 @@ async def test_migrate_out_onestage(setup_ray_env):
     migrate_out_request.should_abort_migration.return_value = True
     migrate_out_request.blocking_migration = False
     migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks)
-    status = await coordinator.migrate_out_onestage(migrate_in_ray_actor, migrate_out_request)
-    assert status == MigrationStatus.ABORTED_SRC
+    status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request)
+    assert status == MigrationStatus.FINISHED_SRC_ABORTED
 
-    migrate_out_request = MagicMock()
-    src_blocks = [1, 2, 3]
-    dst_blocks = [1, 2]
-    backend_engine.get_request_incremental_blocks.return_value = src_blocks
-    migrate_out_request.should_abort_migration.return_value = False
-    migrate_out_request.blocking_migration = True
-    migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks)
-    status = await coordinator.migrate_out_onestage(migrate_in_ray_actor, migrate_out_request)
-    assert status == MigrationStatus.ABORTED_DST
-
-# setup_ray_env should be passed after migrate_out_onestage
-@patch.object(MigrationCoordinator, 'migrate_out_onestage')
+# setup_ray_env should be passed after _migrate_out_onestage
+@patch.object(MigrationCoordinator, '_migrate_out_onestage')
 @pytest.mark.asyncio
-async def test_migrate_out_multistage(_, setup_ray_env):
+async def test_migrate_out_running_request(_, setup_ray_env):
     # Create mock objects
     backend_engine = MagicMock(spec=BackendInterface)
     migrate_in_ray_actor = MagicMock()
@@ -110,16 +101,41 @@ async def test_migrate_out_multistage(_, setup_ray_env):
     migrate_in_ray_actor.execute_engine_method.remote = MagicMock()
     migrate_in_ray_actor.execute_engine_method.remote.return_value = ray_remote_call.remote([1])
     migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote([1])
-    coordinator.migrate_out_onestage.side_effect = [MigrationStatus.FINISHED_DONE]
-    status = await coordinator.migrate_out_multistage(migrate_in_ray_actor, migrate_out_request)
-    assert coordinator.migrate_out_onestage.call_count == 1
+    coordinator._migrate_out_onestage.side_effect = [MigrationStatus.FINISHED_DONE]
+    status = await coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request)
+    assert coordinator._migrate_out_onestage.call_count == 1
     assert status == MigrationStatus.FINISHED_DONE
 
     max_stages = 3
-    coordinator.migrate_out_onestage.side_effect = [MigrationStatus.RUNNING,
-                                                    MigrationStatus.RUNNING,
-                                                    MigrationStatus.RUNNING,
-                                                    MigrationStatus.RUNNING]
-    status = await coordinator.migrate_out_multistage(migrate_in_ray_actor, migrate_out_request)
-    assert coordinator.migrate_out_onestage.call_count == max_stages + 1
-    assert status == MigrationStatus.ABORTED_SRC
+    coordinator._migrate_out_onestage.side_effect = [MigrationStatus.RUNNING,
+                                                     MigrationStatus.RUNNING,
+                                                     MigrationStatus.RUNNING,
+                                                     MigrationStatus.RUNNING]
+    status = await coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request)
+    assert coordinator._migrate_out_onestage.call_count == max_stages + 1
+    assert status == MigrationStatus.FINISHED_SRC_ABORTED
+
+@pytest.mark.asyncio
+async def test_migrate_out_waiting_request(setup_ray_env):
+    # Create mock objects
+    backend_engine = MagicMock(spec=BackendInterface)
+    migrate_in_ray_actor = MagicMock()
+    migrate_out_request = MagicMock()
+
+    # Create an instance of MigrationCoordinator
+    coordinator = MigrationCoordinator(backend_engine, last_stage_max_blocks=1, max_stages=3)
+
+    # Test FINISHED_DONE
+    migrate_out_request.prefill_num_blocks = 3
+    dst_blocks = [1, 2, 3]
+    migrate_in_ray_actor.execute_engine_method = MagicMock()
+    migrate_in_ray_actor.execute_engine_method.remote = MagicMock()
+    migrate_in_ray_actor.execute_engine_method.remote.return_value = ray_remote_call.remote(dst_blocks)
+    migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks)
+    status = await coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request)
+    assert status == MigrationStatus.FINISHED_DONE
+
+    # Test FINISHED_DST_ABORTED
+    migrate_out_request.prefill_num_blocks = 2
+    status = await coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request)
+    assert status == MigrationStatus.FINISHED_DST_ABORTED

From ce20d3933b43f47fbbf0422c6a2403e8f04ce55a Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Fri, 13 Sep 2024 08:59:16 +0000
Subject: [PATCH 03/49] Fix

---
 llumnix/backends/vllm/llm_engine.py          |  5 +++-
 llumnix/backends/vllm/scheduler.py           | 29 +++++++++++++++++---
 llumnix/backends/vllm/sequence.py            | 23 +++++++++++++---
 llumnix/llumlet/llumlet.py                   |  8 ++++--
 llumnix/llumlet/local_migration_scheduler.py |  2 +-
 llumnix/llumlet/migration_coordinator.py     |  9 ++++--
 llumnix/llumlet/request.py                   |  4 ++-
 7 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py
index 4b4b8655..8eeaf49c 100644
--- a/llumnix/backends/vllm/llm_engine.py
+++ b/llumnix/backends/vllm/llm_engine.py
@@ -353,7 +353,10 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None:
         pre_alloc_blocks = self.engine.scheduler.pre_alloc_cache_dict.pop(backend_request.request_id)
         self.engine.scheduler.block_manager.add_block_table(pre_alloc_blocks, seq.seq_id)
         backend_request.reset_migration_args()
-        self.add_running_request(backend_request)
+        if backend_request.status == RequestStatus.RUNNING:
+            self.add_running_request(backend_request)
+        else: # backend_request.status == RequestStatus.WAITING
+            self.add_waiting_request(backend_request)
 
     async def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None:
         await dst_ray_actor.execute_engine_method.remote("_run_workers",
diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py
index 46a225c3..cc00f02e 100644
--- a/llumnix/backends/vllm/scheduler.py
+++ b/llumnix/backends/vllm/scheduler.py
@@ -13,17 +13,19 @@
 
 from asyncio.log import logger
 import time
-from typing import Dict, List, Optional, Tuple, Deque
+from typing import Dict, List, Optional, Tuple, Deque, Union
 from collections import deque
 
 from vllm.core.block_manager_v1 import BlockSpaceManagerV1, BlockTable
 from vllm.core.scheduler import (Scheduler, PreemptionMode, SequenceStatus, SequenceGroupMetadata, SchedulerOutputs)
 from vllm.core.policy import PolicyFactory
+from vllm.sequence import SequenceGroup
 
 from llumnix.instance_info import InstanceInfo
 from llumnix.logger import init_logger
 from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus
-from llumnix.backends.vllm.sequence import SequenceGroupLlumnix
+from llumnix.backends.vllm.sequence import SequenceGroupLlumnix,
SequenceStatusLlumnix + logger = init_logger(__name__) @@ -100,12 +102,15 @@ def remove_running_request(self, request_id: str) -> None: for seq_group in self.running: if seq_group.request_id == request_id: self.running.remove(seq_group) + self._set_status(seq_group, status_to=SequenceStatusLlumnix.RUNNING_MIGRATING) break def remove_waiting_request(self, request_id: str) -> None: for seq_group in self.waiting: if seq_group.request_id == request_id: self.waiting.remove(seq_group) + self._set_status(seq_group, status_to=SequenceStatusLlumnix.WAITING_MIGRATING) + seq_group.waiting_migrating = True break def add_migrating_out_request_last_stage(self, backend_request: SequenceGroupLlumnix) -> None: @@ -139,16 +144,32 @@ def pre_alloc(self, return blocks def add_running_request(self, backend_request: LlumnixRequest) -> None: + self._set_status(backend_request, status_to=SequenceStatus.RUNNING) self.running.append(backend_request) def add_waiting_request(self, backend_request: LlumnixRequest) -> None: + self._set_status(backend_request, status_to=SequenceStatus.WAITING) self.waiting.append(backend_request) fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs") self.waiting = fcfs_policy.sort_by_priority(time.time(), self.waiting) - def is_request_running(self, backend_request: LlumnixRequest) -> bool: - return backend_request in self.running + def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: + # Change seq status to running, but request status is still waiting_migrating. + if seq_group.waiting_migrating: + # For the waiting request migrated in, blocks have already been allocated when pre alloc. + self._set_status(seq_group, status_to=SequenceStatus.RUNNING) + seq_group.waiting_migrating = False + else: + super()._allocate_and_set_running(seq_group) + + def _set_status(self, + seq_group: SequenceGroup, + status_to: Union[SequenceStatus, SequenceStatusLlumnix], + status_from: Union[SequenceStatus, SequenceStatusLlumnix] = None): + for seq in seq_group.get_seqs(status=status_from): + seq.status = status_to + @scheduler_lock def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: if request_id: blocks = self.pre_alloc_cache_dict.pop(request_id, []) diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index 146cf893..06b01060 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -12,12 +12,19 @@ # limitations under the License. 
import math +import enum from vllm.sequence import SequenceGroup, SequenceStatus from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus +class SequenceStatusLlumnix(enum.Enum): + # src running request migrating to dst + RUNNING_MIGRATING = enum.auto() + # src waiting request migrating to dst + WAITING_MIGRATING = enum.auto() + class SequenceGroupLlumnix(SequenceGroup, LlumnixRequest): def __init__(self, request_id, server_info, expected_steps: int, *args, **kwargs) -> None: SequenceGroup.__init__(self, request_id, *args, **kwargs) @@ -54,13 +61,21 @@ def arrival_time(self) -> float: return self.metrics.arrival_time @property - def request_status(self) -> RequestStatus: - if self.get_seqs()[0].status == SequenceStatus.RUNNING: + def status(self) -> RequestStatus: + status = self.get_seqs()[0].status + assert status in [SequenceStatus.RUNNING, SequenceStatus.WAITING, + SequenceStatusLlumnix.RUNNING_MIGRATING, SequenceStatusLlumnix.WAITING_MIGRATING], \ + "Only RUNNING, WAITING, RUNNING_MIGRATING, WAITING_MIGRATING are expected status for LlumnixRequest" + if status == SequenceStatus.RUNNING: + return RequestStatus.RUNNING + elif status == SequenceStatus.WAITING: + return RequestStatus.WAITING + elif status == SequenceStatusLlumnix.RUNNING_MIGRATING: return RequestStatus.RUNNING - elif self.get_seqs()[0].status == SequenceStatus.WAITING: + elif status == SequenceStatusLlumnix.WAITING_MIGRATING: return RequestStatus.WAITING @property def prefill_num_blocks(self) -> int: # Get the prefill len of the waiting request. - return math.ceil(len(self.request_len) / len(self.get_seqs()[0].block_size)) + return math.ceil(self.request_len / self.get_seqs()[0].block_size) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 3c766b14..b90dacc2 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -15,6 +15,7 @@ import traceback from typing import List, Union, Iterable import time +import traceback import ray from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy, NodeAffinitySchedulingStrategy @@ -163,6 +164,9 @@ async def migrate_out(self, dst_instance_name: str, num_requests: int) -> List[s except ray.exceptions.RayActorError: logger.info("[migrate_out] instance {} is dead".format(dst_instance_name[len("instance_"):])) raise + except Exception as e: + logger.error("unexpected exception occurs: {}".format(e)) + logger.error("exception traceback: {}".format(traceback.format_exc())) return migrated_request_list def get_instance_info(self) -> InstanceInfo: @@ -205,9 +209,9 @@ def clear_migration_states(self, is_migrate_in: bool) -> None: migrating_out_requests_last_stage = self.backend_engine.pop_migrating_out_requests_last_stage() for backend_request in migrating_out_requests_last_stage: logger.info("clear_migration_states: add request {} back to engine".format(backend_request.request_id)) - if backend_request.request_status == RequestStatus.RUNNING: + if backend_request.status == RequestStatus.RUNNING: self.backend_engine.add_running_request(backend_request) - else: # backend_request.request_status == RequestStatus.WAITING + else: # backend_request.status == RequestStatus.WAITING self.backend_engine.add_waiting_request(backend_request) def execute_migration_method(self, method, *args, **kwargs): diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index 104358a9..f4b51b18 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ 
b/llumnix/llumlet/local_migration_scheduler.py @@ -78,4 +78,4 @@ def _get_shortest_running_request(self, min_request_len, max_request_len): def _get_first_waiting_or_shortest_running_request(self, min_request_len, max_request_len): waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1] - return waiting[0] if waiting else self._get_shortest_running_request(min_request_len, max_request_len) + return waiting[0] if waiting and waiting[0].request_len > 0 else self._get_shortest_running_request(min_request_len, max_request_len) diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index 777548b7..345e2160 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -63,7 +63,7 @@ async def migrate_out_waiting_request(self, prefill_num_blocks = migrate_out_request.prefill_num_blocks dst_blocks = await migrate_in_ray_actor.execute_migration_method \ .remote("migrate_in_pre_alloc", migrate_out_request.request_id, - migrate_out_request.request_status, + migrate_out_request.status, migrate_out_request.arrival_time, prefill_num_blocks) if len(dst_blocks) != prefill_num_blocks: @@ -104,7 +104,7 @@ async def _migrate_out_onestage(self, stage_block_num = len(incremental_blocks) - 1 dst_blocks = await migrate_in_ray_actor.execute_migration_method \ .remote("migrate_in_pre_alloc", migrate_out_request.request_id, - migrate_out_request.request_status, + migrate_out_request.status, migrate_out_request.arrival_time, stage_block_num) else: @@ -116,7 +116,7 @@ async def _migrate_out_onestage(self, src_blocks = incremental_blocks[:] dst_blocks = await migrate_in_ray_actor.execute_migration_method \ .remote("migrate_in_pre_alloc", migrate_out_request.request_id, - migrate_out_request.request_status, + migrate_out_request.status, migrate_out_request.arrival_time, stage_block_num) @@ -148,6 +148,9 @@ def migrate_in_pre_alloc(self, request_status, request_arrival_time, block_num) + if len(pre_alloc_blocks) != block_num: + # failed to alloc, abort request + self.free_dst_pre_alloc_cache(request_id) return pre_alloc_blocks def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index 26a4af31..d5456309 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -37,6 +37,8 @@ def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int self.stage_timestamps = [] self.stage_num_blocks_list = [] + self.waiting_migrating = False + def reset_migration_args(self): self.last_preemption_time = None self.stage_timestamps = [] @@ -72,7 +74,7 @@ def arrival_time(self) -> float: raise NotImplementedError @property - def request_status(self) -> RequestStatus: + def status(self) -> RequestStatus: raise NotImplementedError @property From 3cd6b67fcf175b8d1d63f19b609f79f6438063a0 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Fri, 13 Sep 2024 09:47:08 +0000 Subject: [PATCH 04/49] Change request migration policy FWSR to FWJ --- docs/Arguments.md | 4 ++-- llumnix/arg_utils.py | 2 +- llumnix/llumlet/local_migration_scheduler.py | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/Arguments.md b/docs/Arguments.md index 2dbcac9c..eaf3b949 100644 --- a/docs/Arguments.md +++ b/docs/Arguments.md @@ -17,7 +17,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY] 
[--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}] [--migrate-out-threshold MIGRATE_OUT_THRESHOLD] - [--request-migration-policy {LCFS,SRF,LRF,FWSR}] + [--request-migration-policy {LCFS,SRF,LRF,FWJ}] [--enable-defrag ENABLE_DEFRAG] [--enable-scaling] [--min-instances MIN_INSTANCES] @@ -89,7 +89,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] `--request-migration-policy` - Request migration policy. -- Possible choices: LCFS, SRF, LRF, FWSR +- Possible choices: LCFS, SRF, LRF, FWJ - Default: "SRF" `--enable-defrag` diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index 158c24c2..81334a12 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -245,7 +245,7 @@ def add_cli_args( parser.add_argument('--request-migration-policy', type=str, default=None, - choices=['LCFS', 'SRF', 'LRF', 'FWSR'], + choices=['LCFS', 'SRF', 'LRF', 'FWJ'], help='request migration policy') parser.add_argument('--enable-defrag', type=bool, diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index f4b51b18..12c27c93 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -33,7 +33,7 @@ def get_migrate_out_request(self, min_request_len=0, max_request_len=np.inf) -> migrate_out_request = self._get_longest_running_request(min_request_len, max_request_len) elif self.request_migration_policy == 'SRF': migrate_out_request = self._get_shortest_running_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'FWSR': + elif self.request_migration_policy == 'FWJ': migrate_out_request = self._get_first_waiting_or_shortest_running_request(min_request_len, max_request_len) return migrate_out_request @@ -57,7 +57,7 @@ def _get_last_running_request(self, min_request_len, max_request_len): return request return None - def _get_longest_running_request(self, min_request_len, max_request_len): + def _get_longest_running_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ and min_request_len <= request.request_len <= max_request_len @@ -66,7 +66,7 @@ def _get_longest_running_request(self, min_request_len, max_request_len): key=lambda request: request.request_len, default=None) return longest_seq_group - def _get_shortest_running_request(self, min_request_len, max_request_len): + def _get_shortest_running_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ and min_request_len <= request.request_len <= max_request_len @@ -75,7 +75,7 @@ def _get_shortest_running_request(self, min_request_len, max_request_len): key=lambda request: request.request_len, default=None) return shortest_seq_group - def _get_first_waiting_or_shortest_running_request(self, min_request_len, max_request_len): + def _get_first_waiting_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1] - return waiting[0] if waiting and waiting[0].request_len > 0 else self._get_shortest_running_request(min_request_len, max_request_len) + return waiting[0] if waiting and 
waiting[0].request_len > 0 else None From 5dc75f3119d3615a4fe70f12ae91115e3e8f51a0 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 01:38:58 +0000 Subject: [PATCH 05/49] add EWF request migration policy unittest --- docs/Arguments.md | 4 ++-- llumnix/arg_utils.py | 2 +- llumnix/llumlet/local_migration_scheduler.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/Arguments.md b/docs/Arguments.md index eaf3b949..6917f0d7 100644 --- a/docs/Arguments.md +++ b/docs/Arguments.md @@ -17,7 +17,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY] [--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}] [--migrate-out-threshold MIGRATE_OUT_THRESHOLD] - [--request-migration-policy {LCFS,SRF,LRF,FWJ}] + [--request-migration-policy {LCFS,SRF,LRF,EWF}] [--enable-defrag ENABLE_DEFRAG] [--enable-scaling] [--min-instances MIN_INSTANCES] @@ -89,7 +89,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] `--request-migration-policy` - Request migration policy. -- Possible choices: LCFS, SRF, LRF, FWJ +- Possible choices: LCFS, SRF, LRF, EWF - Default: "SRF" `--enable-defrag` diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index 81334a12..e461819b 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -245,7 +245,7 @@ def add_cli_args( parser.add_argument('--request-migration-policy', type=str, default=None, - choices=['LCFS', 'SRF', 'LRF', 'FWJ'], + choices=['LCFS', 'SRF', 'LRF', 'EWF'], help='request migration policy') parser.add_argument('--enable-defrag', type=bool, diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index 12c27c93..1d6cbd3f 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -33,8 +33,8 @@ def get_migrate_out_request(self, min_request_len=0, max_request_len=np.inf) -> migrate_out_request = self._get_longest_running_request(min_request_len, max_request_len) elif self.request_migration_policy == 'SRF': migrate_out_request = self._get_shortest_running_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'FWJ': - migrate_out_request = self._get_first_waiting_or_shortest_running_request(min_request_len, max_request_len) + elif self.request_migration_policy == 'EWF': + migrate_out_request = self._get_first_waiting_request(min_request_len, max_request_len) return migrate_out_request # The function is used to retrieve requests on the backend that have already met the expected_steps. 
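A side note on the if/elif chain this patch touches: it grows with every policy added or renamed. A table-driven alternative is sketched below; the selector functions are empty placeholders invented for illustration, not the scheduler's actual private methods, and the project itself keeps the explicit chain.

def _last_running(min_len, max_len):
    return None  # placeholder selector

def _longest_running(min_len, max_len):
    return None  # placeholder selector

def _shortest_running(min_len, max_len):
    return None  # placeholder selector

def _first_waiting(min_len, max_len):
    return None  # placeholder selector

# One policy name per selector; renaming a policy becomes a one-line change.
POLICY_SELECTORS = {
    'LCFS': _last_running,
    'LRF': _longest_running,
    'SRF': _shortest_running,
    'EWF': _first_waiting,
}

def get_migrate_out_request(policy, min_len=0, max_len=float('inf')):
    selector = POLICY_SELECTORS.get(policy)
    return selector(min_len, max_len) if selector is not None else None

assert get_migrate_out_request('EWF') is None  # placeholders all return None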
From f06a4caa570031597c8f4bc0de5e33c51dd53044 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 01:57:42 +0000 Subject: [PATCH 06/49] Remove SequenceStatusLlumnix --- llumnix/backends/vllm/scheduler.py | 12 +++++------- llumnix/backends/vllm/sequence.py | 15 ++------------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index cc00f02e..d778d6d6 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -24,7 +24,7 @@ from llumnix.instance_info import InstanceInfo from llumnix.logger import init_logger from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus -from llumnix.backends.vllm.sequence import SequenceGroupLlumnix, SequenceStatusLlumnix +from llumnix.backends.vllm.sequence import SequenceGroupLlumnix logger = init_logger(__name__) @@ -102,14 +102,12 @@ def remove_running_request(self, request_id: str) -> None: for seq_group in self.running: if seq_group.request_id == request_id: self.running.remove(seq_group) - self._set_status(seq_group, status_to=SequenceStatusLlumnix.RUNNING_MIGRATING) break def remove_waiting_request(self, request_id: str) -> None: for seq_group in self.waiting: if seq_group.request_id == request_id: self.waiting.remove(seq_group) - self._set_status(seq_group, status_to=SequenceStatusLlumnix.WAITING_MIGRATING) seq_group.waiting_migrating = True break @@ -164,12 +162,11 @@ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: def _set_status(self, seq_group: SequenceGroup, - status_to: Union[SequenceStatus, SequenceStatusLlumnix], - status_from: Union[SequenceStatus, SequenceStatusLlumnix] = None): + status_to: SequenceStatus, + status_from: SequenceStatus = None): for seq in seq_group.get_seqs(status=status_from): seq.status = status_to - @scheduler_lock def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: if request_id: blocks = self.pre_alloc_cache_dict.pop(request_id, []) @@ -186,7 +183,8 @@ def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: def free_src_request(self, backend_request: SequenceGroupLlumnix) -> None: seq = backend_request.get_seqs()[0] - logger.info("free seq {}".format(seq.seq_id)) + logger.info("free request: {}".format(backend_request.request_id)) + logger.info("free seq: {}".format(seq.seq_id)) self.free_seq(seq) def _get_instance_info(self, scheduled_seq_groups: List[SequenceGroupLlumnix]) -> InstanceInfo: diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index 06b01060..ef86b471 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -19,12 +19,6 @@ from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus -class SequenceStatusLlumnix(enum.Enum): - # src running request migrating to dst - RUNNING_MIGRATING = enum.auto() - # src waiting request migrating to dst - WAITING_MIGRATING = enum.auto() - class SequenceGroupLlumnix(SequenceGroup, LlumnixRequest): def __init__(self, request_id, server_info, expected_steps: int, *args, **kwargs) -> None: SequenceGroup.__init__(self, request_id, *args, **kwargs) @@ -63,17 +57,12 @@ def arrival_time(self) -> float: @property def status(self) -> RequestStatus: status = self.get_seqs()[0].status - assert status in [SequenceStatus.RUNNING, SequenceStatus.WAITING, - SequenceStatusLlumnix.RUNNING_MIGRATING, SequenceStatusLlumnix.WAITING_MIGRATING], \ - "Only RUNNING, WAITING, RUNNING_MIGRATING, 
WAITING_MIGRATING are expected status for LlumnixRequest" + assert status in [SequenceStatus.RUNNING, SequenceStatus.WAITING], \ + "Only RUNNING, WAITING are expected status for LlumnixRequest" if status == SequenceStatus.RUNNING: return RequestStatus.RUNNING elif status == SequenceStatus.WAITING: return RequestStatus.WAITING - elif status == SequenceStatusLlumnix.RUNNING_MIGRATING: - return RequestStatus.RUNNING - elif status == SequenceStatusLlumnix.WAITING_MIGRATING: - return RequestStatus.WAITING @property def prefill_num_blocks(self) -> int: From f479a6605e6fece8ae0b444d5ec1ac12b1e25528 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 02:46:06 +0000 Subject: [PATCH 07/49] Fix seq keyerror --- llumnix/backends/vllm/llm_engine.py | 1 + llumnix/backends/vllm/scheduler.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 8eeaf49c..f400aa78 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -356,6 +356,7 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: if backend_request.status == RequestStatus.RUNNING: self.add_running_request(backend_request) else: # backend_request.status == RequestStatus.WAITING + backend_request.waiting_migrating = True self.add_waiting_request(backend_request) async def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None: diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index d778d6d6..08d16371 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -108,7 +108,6 @@ def remove_waiting_request(self, request_id: str) -> None: for seq_group in self.waiting: if seq_group.request_id == request_id: self.waiting.remove(seq_group) - seq_group.waiting_migrating = True break def add_migrating_out_request_last_stage(self, backend_request: SequenceGroupLlumnix) -> None: From 091c7b08018682dc0e6d6f36044d66b347911081 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 02:46:52 +0000 Subject: [PATCH 08/49] pylint --- llumnix/backends/backend_interface.py | 6 +++--- llumnix/backends/vllm/scheduler.py | 7 ++++--- llumnix/backends/vllm/sequence.py | 8 ++++---- llumnix/llumlet/llumlet.py | 2 ++ llumnix/llumlet/local_migration_scheduler.py | 10 +++++----- llumnix/llumlet/migration_coordinator.py | 2 +- llumnix/llumlet/request.py | 2 +- tests/unit_test/llumlet/test_migration_coordinator.py | 6 +++--- 8 files changed, 23 insertions(+), 20 deletions(-) diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py index 2fd42a04..b4a9dcb0 100644 --- a/llumnix/backends/backend_interface.py +++ b/llumnix/backends/backend_interface.py @@ -196,7 +196,7 @@ def pre_alloc(self, migration request identified by the given request ID. It updates the pre-allocation cache dictionary with the allocated blocks, which ensures that these blocks are not used by another process until the migration is finished. For the waiting request, it only reserves - free cache blocks when the request is the earliest arrival one among the requests of dst instance's + free cache blocks when the request is the earliest arrival one among the requests of dst instance's waiting queue. Args: @@ -216,7 +216,7 @@ def add_running_request(self, backend_request: LlumnixRequest) -> None: """ Adds a backend request to the running queue for processing. 
- This method enqueues a backend request into engine running queue. + This method enqueues a backend request into engine running queue. It is used when a suspend migrating request should be added back to running queue. Args: @@ -231,7 +231,7 @@ def add_waiting_request(self, backend_request: LlumnixRequest) -> None: """ Adds a backend request to the waiting queue for processing. - This method enqueues a backend request into engine waiting queue. + This method enqueues a backend request into engine waiting queue. It is used when a suspend migrating request should be added back to waiting queue. Args: diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 08d16371..5effc27e 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -124,9 +124,9 @@ def pop_migrating_out_requests_last_stage(self) -> List[SequenceGroupLlumnix]: def pre_alloc(self, request_id: str, request_status: RequestStatus, - request_arrival_time: float, + request_arrival_time: float, block_num: int) -> List[int]: - # Only migrate waiting request when the waiting request is the earliest arrival one + # Only migrate waiting request when the waiting request is the earliest arrival one # among the requests of dst instance's waiting queue. if request_status == RequestStatus.WAITING: if self.waiting and request_arrival_time > self.waiting[0].arrival_time: @@ -146,6 +146,7 @@ def add_running_request(self, backend_request: LlumnixRequest) -> None: def add_waiting_request(self, backend_request: LlumnixRequest) -> None: self._set_status(backend_request, status_to=SequenceStatus.WAITING) + # pylint: disable=E0203 self.waiting.append(backend_request) fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs") self.waiting = fcfs_policy.sort_by_priority(time.time(), self.waiting) @@ -159,7 +160,7 @@ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: else: super()._allocate_and_set_running(seq_group) - def _set_status(self, + def _set_status(self, seq_group: SequenceGroup, status_to: SequenceStatus, status_from: SequenceStatus = None): diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index ef86b471..1b226ba1 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -12,7 +12,6 @@ # limitations under the License. 
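A toy illustration of the earliest-arrival guard in the pre_alloc hunk above (WaitingRequest and the numbers are invented for the example): destination blocks are reserved for a migrating waiting request only if it would not queue-jump the request at the head of the destination's waiting queue.

from dataclasses import dataclass

@dataclass
class WaitingRequest:
    request_id: str
    arrival_time: float

def pre_alloc_allowed(dst_waiting_queue, incoming_arrival_time):
    # Refuse pre-allocation when the destination already holds an
    # earlier-arrived waiting request at the head of its queue.
    if dst_waiting_queue and incoming_arrival_time > dst_waiting_queue[0].arrival_time:
        return False
    return True

queue = [WaitingRequest("a", arrival_time=10.0)]
assert pre_alloc_allowed(queue, incoming_arrival_time=9.0)       # earlier arrival: allowed
assert not pre_alloc_allowed(queue, incoming_arrival_time=11.0)  # later arrival: refused
assert pre_alloc_allowed([], incoming_arrival_time=99.0)         # empty queue: allowed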
import math -import enum from vllm.sequence import SequenceGroup, SequenceStatus @@ -60,9 +59,10 @@ def status(self) -> RequestStatus: assert status in [SequenceStatus.RUNNING, SequenceStatus.WAITING], \ "Only RUNNING, WAITING are expected status for LlumnixRequest" if status == SequenceStatus.RUNNING: - return RequestStatus.RUNNING - elif status == SequenceStatus.WAITING: - return RequestStatus.WAITING + request_status = RequestStatus.RUNNING + else: # status == SequenceStatus.WAITING + request_status = RequestStatus.WAITING + return request_status @property def prefill_num_blocks(self) -> int: diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index b90dacc2..6f57714a 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -164,9 +164,11 @@ async def migrate_out(self, dst_instance_name: str, num_requests: int) -> List[s except ray.exceptions.RayActorError: logger.info("[migrate_out] instance {} is dead".format(dst_instance_name[len("instance_"):])) raise + # pylint: disable=W0703 except Exception as e: logger.error("unexpected exception occurs: {}".format(e)) logger.error("exception traceback: {}".format(traceback.format_exc())) + raise return migrated_request_list def get_instance_info(self) -> InstanceInfo: diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index 1d6cbd3f..4e27f998 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -53,14 +53,14 @@ def _get_last_running_request(self, min_request_len, max_request_len): running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() for request in reversed(running): if request.inference_type == RequestInferenceType.DECODE \ - and min_request_len <= request.request_len <= max_request_len: + and min_request_len < request.request_len < max_request_len: return request return None def _get_longest_running_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ - and min_request_len <= request.request_len <= max_request_len + and min_request_len < request.request_len < max_request_len longest_seq_group = max((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) @@ -69,13 +69,13 @@ def _get_longest_running_request(self, min_request_len, max_request_len) -> Opti def _get_shortest_running_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ - and min_request_len <= request.request_len <= max_request_len + and min_request_len < request.request_len < max_request_len shortest_seq_group = min((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) return shortest_seq_group - + def _get_first_waiting_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1] - return waiting[0] if waiting and waiting[0].request_len > 0 else None + return waiting[0] if waiting and min_request_len < waiting[0] < max_request_len else None diff --git 
a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index 345e2160..e192db5d 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -137,7 +137,7 @@ async def _migrate_out_onestage(self, return migration_status - def migrate_in_pre_alloc(self, + def migrate_in_pre_alloc(self, request_id: str, request_status: RequestStatus, request_arrival_time: float, diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index d5456309..ea462c74 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -64,7 +64,7 @@ def prompt_len(self) -> int: @property def output_len(self) -> int: raise NotImplementedError - + @property def finished(self) -> bool: raise NotImplementedError diff --git a/tests/unit_test/llumlet/test_migration_coordinator.py b/tests/unit_test/llumlet/test_migration_coordinator.py index 3386620b..535a3452 100644 --- a/tests/unit_test/llumlet/test_migration_coordinator.py +++ b/tests/unit_test/llumlet/test_migration_coordinator.py @@ -120,10 +120,10 @@ def test_migrate_out_waiting_request(): backend_engine = MagicMock(spec=BackendInterface) migrate_in_ray_actor = MagicMock() migrate_out_request = MagicMock() - + # Create an instance of MigrationCoordinator coordinator = MigrationCoordinator(backend_engine, last_stage_max_blocks=1, max_stages=3) - + # Test FINISHED_DONE migrate_out_request.prefill_num_blocks = 3 dst_blocks = [1, 2, 3] @@ -133,7 +133,7 @@ def test_migrate_out_waiting_request(): migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) status = coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) assert status == MigrationStatus.FINISHED_DONE - + # Test FINISHED_ABORTED migrate_out_request.prefill_num_blocks = 2 status = coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) From 35dd88535d5c456851735400cb19b0f323bb6873 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 06:10:24 +0000 Subject: [PATCH 09/49] Support multiple requests migration --- llumnix/llumlet/llumlet.py | 54 ++++++++++---------- llumnix/llumlet/local_migration_scheduler.py | 40 +++++++-------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 6f57714a..f22b1f4e 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -131,36 +131,38 @@ async def _check_state_loop(self): ray.kill(self_actor) async def migrate_out(self, dst_instance_name: str, num_requests: int) -> List[str]: + migrate_out_requests = self.migration_scheduler.get_migrate_out_requests() + if len(migrate_out_requests) == 0: + return [] + migrated_request_list = [] + for migrate_out_request in migrate_out_requests: + migrated_request_list.extend(await self._migrate_out_one_request(migrate_out_request, dst_instance_name, num_requests)) + return migrated_request_list + + async def _migrate_out_one_request(self, migrate_out_request, dst_instance_name: str): try: migrate_in_ray_actor = ray.get_actor(dst_instance_name, namespace='llumnix') dst_instance_id = dst_instance_name[len("instance_"):] + logger.info("{}->{} begin migrate out".format(self.instance_id, dst_instance_id)) migrated_request_list = [] - continue_migrate = True - while continue_migrate and len(migrated_request_list) < num_requests: - t0 = time.time() - migrate_out_request = self.migration_scheduler.get_migrate_out_request() - if migrate_out_request is None: - return 
migrated_request_list - assert migrate_out_request.request_status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting or running request" - logger.info("{}->{} begin migrate out {}".format(self.instance_id, dst_instance_id, migrate_out_request.request_id)) - if migrate_out_request.request_status == RequestStatus.RUNNING: - status = await self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) - else: - status = await self.migration_coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) - if status == MigrationStatus.FINISHED_DONE: - await migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request) - if migrate_out_request.request_status == RequestStatus.RUNNING: - self.backend_engine.free_src_request(migrate_out_request) - migrated_request_list.append(migrate_out_request.request_id) - self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) - elif status == MigrationStatus.FINISHED_SRC_ABORTED: - migrate_out_request.reset_migration_args() - await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) - continue_migrate = False - t1 = time.time() - logger.info("{}->{} migrate done, migrate request {}, status:{}, len:{} blocks, cost:{} ms" \ - .format(self.instance_id, dst_instance_id, migrated_request_list, status, \ - sum(migrate_out_request.stage_num_blocks_list), (t1 - t0)*1000)) + assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting/running request" + if migrate_out_request.status == RequestStatus.RUNNING: + status = self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) + else: + status = self.migration_coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) + if status == MigrationStatus.FINISHED_DONE: + ray.get(migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request)) + if migrate_out_request.status == RequestStatus.RUNNING: + self.backend_engine.free_src_request(migrate_out_request) + migrated_request_list.append(migrate_out_request.request_id) + self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) + elif status == MigrationStatus.FINISHED_SRC_ABORTED: + migrate_out_request.reset_migration_args() + ray.get(migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id)) + t1 = time.time() + logger.info("{}->{} migrate done, migrate request {}, migration status: {}, len: {} blocks, cost: {} ms" \ + .format(self.instance_id, dst_instance_id, migrated_request_list, status, \ + sum(migrate_out_request.stage_num_blocks_list), (t1 - t0)*1000)) except ray.exceptions.RayActorError: logger.info("[migrate_out] instance {} is dead".format(dst_instance_name[len("instance_"):])) raise diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index 4e27f998..467c7a66 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
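Compressed into a runnable sketch, the two terminal branches of _migrate_out_one_request above behave as follows (StubInstance and finish_one_migration are invented stand-ins for the Ray actors and the Llumlet method, and the real code additionally frees the source request only when it was running): FINISHED_DONE commits the request on the destination and frees it on the source, while FINISHED_SRC_ABORTED resets migration state and releases the destination's pre-allocated cache.

from enum import Enum, auto

class MigrationStatus(Enum):
    FINISHED_DONE = auto()
    FINISHED_SRC_ABORTED = auto()

class StubInstance:
    def __init__(self):
        self.calls = []
    def commit_dst_request(self, request):
        self.calls.append(("commit", request["id"]))
    def free_src_request(self, request):
        self.calls.append(("free_src", request["id"]))
    def free_dst_pre_alloc_cache(self, request_id):
        self.calls.append(("free_dst", request_id))

def finish_one_migration(status, src, dst, request):
    migrated = []
    if status is MigrationStatus.FINISHED_DONE:
        dst.commit_dst_request(request)
        src.free_src_request(request)
        migrated.append(request["id"])
    elif status is MigrationStatus.FINISHED_SRC_ABORTED:
        request["migration_args_reset"] = True  # mirrors reset_migration_args()
        dst.free_dst_pre_alloc_cache(request["id"])
    return migrated

src, dst = StubInstance(), StubInstance()
assert finish_one_migration(MigrationStatus.FINISHED_DONE, src, dst, {"id": "r0"}) == ["r0"]
assert finish_one_migration(MigrationStatus.FINISHED_SRC_ABORTED, src, dst, {"id": "r1"}) == []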
-from typing import Deque, Optional
+from typing import Deque, List
 import numpy as np
 from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType
@@ -23,19 +23,19 @@ def __init__(self, request_migration_policy: str, backend_engine: BackendInterfa
 self.request_migration_policy = request_migration_policy
 self.backend_engine = backend_engine
- def get_migrate_out_request(self, min_request_len=0, max_request_len=np.inf) -> Optional[LlumnixRequest]:
+ def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) -> List[LlumnixRequest]:
 # Requests meet the strict pre-migration always have higher priority than other migration policy.
- migrate_out_request: LlumnixRequest = self.get_ready_migration_request(min_request_len, max_request_len)
- if migrate_out_request is None:
+ migrate_out_requests: LlumnixRequest = self.get_ready_migration_request(min_request_len, max_request_len)
+ if len(migrate_out_request) == 0:
 if self.request_migration_policy == 'LCFS':
- migrate_out_request = self._get_last_running_request(min_request_len, max_request_len)
+ migrate_out_requests = self._get_last_running_request(min_request_len, max_request_len)
 elif self.request_migration_policy == 'LRF':
- migrate_out_request = self._get_longest_running_request(min_request_len, max_request_len)
+ migrate_out_requests = self._get_longest_running_request(min_request_len, max_request_len)
 elif self.request_migration_policy == 'SRF':
- migrate_out_request = self._get_shortest_running_request(min_request_len, max_request_len)
+ migrate_out_requests = self._get_shortest_running_request(min_request_len, max_request_len)
 elif self.request_migration_policy == 'EWF':
- migrate_out_request = self._get_first_waiting_request(min_request_len, max_request_len)
+ migrate_out_requests = self._get_first_waiting_request(min_request_len, max_request_len)
 return migrate_out_requests
 # The function is used to retrieve requests on the backend that have already met the expected_steps.
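The selection helpers referenced here now hand back lists of candidates instead of a single optional request. A runnable toy version of the shortest-running selection, keeping the same decode-only condition and open-interval length filter (ToyRequest is invented for the example):

from dataclasses import dataclass

@dataclass
class ToyRequest:
    request_id: str
    request_len: int
    is_decode: bool = True

def shortest_running(running, min_len=0, max_len=float("inf")):
    # Only decode-phase requests whose length lies strictly inside (min_len, max_len).
    candidates = [r for r in running if r.is_decode and min_len < r.request_len < max_len]
    shortest = min(candidates, key=lambda r: r.request_len, default=None)
    return [shortest] if shortest is not None else []

running = [ToyRequest("a", 48), ToyRequest("b", 16), ToyRequest("c", 96)]
assert [r.request_id for r in shortest_running(running)] == ["b"]
assert shortest_running(running, min_len=100) == []  # nothing passes the filter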
# TODO(xinyi): Currently, the function is only used for Prefill-decoding disaggregation, @@ -46,36 +46,34 @@ def get_ready_migration_request(self, min_request_len, max_request_len): if request.output_len >= request.expected_steps \ and request.inference_type == RequestInferenceType.DECODE \ and min_request_len <= request.request_len <= max_request_len: - return request - return None + return [request] + return [] def _get_last_running_request(self, min_request_len, max_request_len): running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() for request in reversed(running): if request.inference_type == RequestInferenceType.DECODE \ and min_request_len < request.request_len < max_request_len: - return request - return None + return [request] + return [] - def _get_longest_running_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: + def _get_longest_running_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ and min_request_len < request.request_len < max_request_len - longest_seq_group = max((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) - return longest_seq_group + return [longest_seq_group] if longest_seq_group != None else [] - def _get_shortest_running_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: + def _get_shortest_running_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ and min_request_len < request.request_len < max_request_len - shortest_seq_group = min((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) - return shortest_seq_group + return [shortest_seq_group] if shortest_seq_group != None else [] - def _get_first_waiting_request(self, min_request_len, max_request_len) -> Optional[LlumnixRequest]: + def _get_first_waiting_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1] - return waiting[0] if waiting and min_request_len < waiting[0] < max_request_len else None + return [waiting[0]] if waiting and min_request_len < waiting[0].request_len < max_request_len else [] From 1c4c7e32acdee08641a55c55bf695bbcc6006e75 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 06:44:46 +0000 Subject: [PATCH 10/49] Support EWSR request migration policy --- docs/Arguments.md | 4 ++-- llumnix/arg_utils.py | 2 +- llumnix/llm_engine_manager.py | 9 +++++---- llumnix/llumlet/llumlet.py | 15 ++++++++------- llumnix/llumlet/local_migration_scheduler.py | 20 ++++++++++++++------ 5 files changed, 30 insertions(+), 20 deletions(-) diff --git a/docs/Arguments.md b/docs/Arguments.md index 6917f0d7..e5a8354a 100644 --- a/docs/Arguments.md +++ b/docs/Arguments.md @@ -17,7 +17,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY] [--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}] [--migrate-out-threshold MIGRATE_OUT_THRESHOLD] - [--request-migration-policy {LCFS,SRF,LRF,EWF}] + [--request-migration-policy 
{LCFS,SRF,LRF,EWF,EWSR}] [--enable-defrag ENABLE_DEFRAG] [--enable-scaling] [--min-instances MIN_INSTANCES] @@ -89,7 +89,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] `--request-migration-policy` - Request migration policy. -- Possible choices: LCFS, SRF, LRF, EWF +- Possible choices: LCFS, SRF, LRF, EWF, EWSR - Default: "SRF" `--enable-defrag` diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index e461819b..016f3e0e 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -245,7 +245,7 @@ def add_cli_args( parser.add_argument('--request-migration-policy', type=str, default=None, - choices=['LCFS', 'SRF', 'LRF', 'EWF'], + choices=['LCFS', 'SRF', 'LRF', 'EWF', 'EWSR'], help='request migration policy') parser.add_argument('--enable-defrag', type=bool, diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py index 7b47728b..c2ff2f01 100644 --- a/llumnix/llm_engine_manager.py +++ b/llumnix/llm_engine_manager.py @@ -220,12 +220,12 @@ async def _clear_request_instance_loop(self, interval: float): async def _push_migrations(self) -> None: # Push migrate when the instance_info have updated a certain number of times. if self.enable_pd_disagg: - asyncio.create_task(self._migrate(PairMigrationConstraints.PREFILL_2_DECODING, math.inf)) - asyncio.create_task(self._migrate(PairMigrationConstraints.DECODING_2_DECODING, 1)) + asyncio.create_task(self._migrate(PairMigrationConstraints.PREFILL_2_DECODING)) + asyncio.create_task(self._migrate(PairMigrationConstraints.DECODING_2_DECODING)) else: - asyncio.create_task(self._migrate(PairMigrationConstraints.NO_CONSTRAINTS, 1)) + asyncio.create_task(self._migrate(PairMigrationConstraints.NO_CONSTRAINTS)) - async def _migrate(self, pair_migration_type: PairMigrationConstraints, migrate_in_num_requests: int) -> None: + async def _migrate(self, pair_migration_type: PairMigrationConstraints) -> None: async def migrate_done_callback(ret, migrate_instance_pair: Tuple[str, str]) -> None: if migrate_instance_pair[0] in self.instance_migrating: self.instance_migrating[migrate_instance_pair[0]] = False @@ -256,6 +256,7 @@ def migrate_done_callback_wrapper(migrate_instance_pair: Tuple[str, str], fut) - ret = fut.result() loop = asyncio.get_event_loop() loop.create_task(migrate_done_callback(ret, migrate_instance_pair)) + migrate_instance_pairs = self.global_scheduler.pair_migration(pair_migration_type) try: migration_tasks = [] diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index f22b1f4e..809ff502 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -130,7 +130,7 @@ async def _check_state_loop(self): self_actor = ray.get_actor(self.actor_name) ray.kill(self_actor) - async def migrate_out(self, dst_instance_name: str, num_requests: int) -> List[str]: + async def migrate_out(self, dst_instance_name: str) -> List[str]: migrate_out_requests = self.migration_scheduler.get_migrate_out_requests() if len(migrate_out_requests) == 0: return [] @@ -141,24 +141,25 @@ async def migrate_out(self, dst_instance_name: str, num_requests: int) -> List[s async def _migrate_out_one_request(self, migrate_out_request, dst_instance_name: str): try: + t0 = time.time() migrate_in_ray_actor = ray.get_actor(dst_instance_name, namespace='llumnix') dst_instance_id = dst_instance_name[len("instance_"):] - logger.info("{}->{} begin migrate out".format(self.instance_id, dst_instance_id)) migrated_request_list = [] - assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out 
waiting/running request" + assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting or running request" + logger.info("{}->{} begin migrate out".format(self.instance_id, dst_instance_id)) if migrate_out_request.status == RequestStatus.RUNNING: - status = self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) + status = await self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) else: - status = self.migration_coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) + status = await self.migration_coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) if status == MigrationStatus.FINISHED_DONE: - ray.get(migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request)) + await migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request) if migrate_out_request.status == RequestStatus.RUNNING: self.backend_engine.free_src_request(migrate_out_request) migrated_request_list.append(migrate_out_request.request_id) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) elif status == MigrationStatus.FINISHED_SRC_ABORTED: migrate_out_request.reset_migration_args() - ray.get(migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id)) + await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) t1 = time.time() logger.info("{}->{} migrate done, migrate request {}, migration status: {}, len: {} blocks, cost: {} ms" \ .format(self.instance_id, dst_instance_id, migrated_request_list, status, \ diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index 467c7a66..0574ca81 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -25,7 +25,7 @@ def __init__(self, request_migration_policy: str, backend_engine: BackendInterfa def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) -> List[LlumnixRequest]: # Requests meet the strict pre-migration always have higher prioirity than other migration policy. - migrate_out_requests: LlumnixRequest = self.get_ready_migration_request(min_request_len, max_request_len) + migrate_out_requests: List[LlumnixRequest] = self.get_required_migration_request(min_request_len, max_request_len) if len(migrate_out_request) == 0: if self.request_migration_policy == 'LCFS': migrate_out_requests = self._get_last_running_request(min_request_len, max_request_len) @@ -34,20 +34,23 @@ def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) -> elif self.request_migration_policy == 'SRF': migrate_out_requests = self._get_shortest_running_request(min_request_len, max_request_len) elif self.request_migration_policy == 'EWF': - migrate_out_requests = self._get_first_waiting_request(min_request_len, max_request_len) + migrate_out_requests = self._get_earliest_waiting_request(min_request_len, max_request_len) + elif self.request_migration_policy == 'EWSR': + migrate_out_requests = self._get_earliest_waiting_and_shortest_running_requests(min_request_len, max_request_len) return migrate_out_requests # The function is used to retrieve requests on the backend that have already met the expected_steps. 
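The EWSR branch added here composes two selections, and the helper introduced just below simply concatenates them: the earliest waiting request (if it has been scheduled at least once) followed by the shortest running request. A toy sketch of that composition, with plain dicts standing in for requests:

def earliest_waiting(waiting):
    tried = [r for r in waiting if r.get("try_schedule_times", 0) >= 1]
    return tried[:1]

def shortest_running(running):
    return [min(running, key=lambda r: r["len"])] if running else []

def earliest_waiting_and_shortest_running(waiting, running):
    # Waiting candidate first, then the running candidate.
    return earliest_waiting(waiting) + shortest_running(running)

waiting = [{"id": "w0", "len": 8, "try_schedule_times": 2}]
running = [{"id": "r0", "len": 32}, {"id": "r1", "len": 4}]
picked = earliest_waiting_and_shortest_running(waiting, running)
assert [r["id"] for r in picked] == ["w0", "r1"]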
# TODO(xinyi): Currently, the function is only used for Prefill-decoding disaggregation, # and only selects request that migrates from the prefill instance to the decoding instance. - def get_ready_migration_request(self, min_request_len, max_request_len): + def get_required_migration_request(self, min_request_len, max_request_len): running: List[LlumnixRequest] = self.backend_engine.get_running_queue() + required_migration_requests = [] for request in reversed(running): if request.output_len >= request.expected_steps \ and request.inference_type == RequestInferenceType.DECODE \ and min_request_len <= request.request_len <= max_request_len: - return [request] - return [] + required_migration_requests.append(request) + return required_migration_requests def _get_last_running_request(self, min_request_len, max_request_len): running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() @@ -73,7 +76,12 @@ def _get_shortest_running_request(self, min_request_len, max_request_len) -> Lis key=lambda request: request.request_len, default=None) return [shortest_seq_group] if shortest_seq_group != None else [] - def _get_first_waiting_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: + def _get_earliest_waiting_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1] return [waiting[0]] if waiting and min_request_len < waiting[0].request_len < max_request_len else [] + + def _get_earliest_waiting_and_shortest_running_requests(self, min_request_len, max_request_len) -> List[LlumnixRequest]: + waiting_requests = self._get_earliest_waiting_request(min_request_len, max_request_len) + running_requests = self._get_shortest_running_request(min_request_len, max_request_len) + return waiting_requests + running_requests From 4caae80aac3f39ec94acb47355028a1de89ecf17 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 07:14:56 +0000 Subject: [PATCH 11/49] Add policy explanation in arg help --- configs/base.yml | 2 +- docs/Arguments.md | 6 +++--- llumnix/arg_utils.py | 22 ++++++++++++++++---- llumnix/config/default.py | 2 +- llumnix/llumlet/local_migration_scheduler.py | 20 +++++++++--------- 5 files changed, 33 insertions(+), 19 deletions(-) diff --git a/configs/base.yml b/configs/base.yml index 35760933..6da8ef72 100644 --- a/configs/base.yml +++ b/configs/base.yml @@ -18,7 +18,7 @@ MANAGER: ENABLE_MIGRATION: True ENABLE_DEFRAG: True - REQUEST_MIGRATION_POLICY: 'SRF' + REQUEST_MIGRATION_POLICY: 'SR' MIGRATION_BACKEND: 'gloo' MIGRATION_CACHE_BLOCKS: 512 diff --git a/docs/Arguments.md b/docs/Arguments.md index e5a8354a..47c577e6 100644 --- a/docs/Arguments.md +++ b/docs/Arguments.md @@ -17,7 +17,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY] [--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}] [--migrate-out-threshold MIGRATE_OUT_THRESHOLD] - [--request-migration-policy {LCFS,SRF,LRF,EWF,EWSR}] + [--request-migration-policy {LCR,SR,LR,FCW,FCWSR}] [--enable-defrag ENABLE_DEFRAG] [--enable-scaling] [--min-instances MIN_INSTANCES] @@ -89,8 +89,8 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] `--request-migration-policy` - Request migration policy. 
-- Possible choices: LCFS, SRF, LRF, EWF, EWSR -- Default: "SRF" +- Possible choices: LCR, SR, LR, FCW, FCWSR +- Default: "SR" `--enable-defrag` - Enable defragmentation through migration based on virtual usage. diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index 016f3e0e..0203c265 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -224,7 +224,11 @@ def add_cli_args( parser.add_argument('--dispatch-policy', type=str, choices=['balanced', 'load', 'queue', 'flood'], - help='request dispatch policy') + help='The request dispatch policy.\n\n' + '* "balanced" dispatch request to the instance with minimum requests dispatched.\n' + '* "load" dispatch request to the instance with lowest instance load.\n' + '* "queue" dispatch request to the instance with minimum waiting request queue length.\n' + '* "flood" dispatch request to the instance with maximum requests dispatched.\n') parser.add_argument('--num-available-dispatch-instances', type=int, help='number of available instances for dispatching') @@ -238,15 +242,25 @@ def add_cli_args( parser.add_argument('--pair-migration-policy', type=str, choices=['balanced', 'defrag_constrained', 'defrag_relaxed'], - help='pair migration policy') + help='The pair migration policy.\n\n' + '* "balanced" pair migration to make the instance load of instance more balanced.\n' + '* "defrag_constrained" pair migration without balanced constraint to ' + 'achieve defragmentation thoroughly (with instance constraints).\n' + '* "defrag_relaxed" pair migration to without balanced constraint ' + 'to achieve defragmentation thoroughly (without instance constraints).\n') parser.add_argument('--migrate-out-threshold', type=float, help='migrate out instance load threshold') parser.add_argument('--request-migration-policy', type=str, default=None, - choices=['LCFS', 'SRF', 'LRF', 'EWF', 'EWSR'], - help='request migration policy') + choices=['LCR', 'SR', 'LR', 'FCW', 'FCWSR'], + help='The request migration policy.\n\n' + '* "LCR" migrate the running request last come.\n' + '* "SR" migrate the running request shortest.\n' + '* "LR" migrate the running request longest.\n' + '* "FCW" migrate the waiting request first come.\n' + '* "FCWSR" migrate the waiting request first come and running request shortest.\n') parser.add_argument('--enable-defrag', type=bool, help='enable defragmentation through migration based on virtual usage') diff --git a/llumnix/config/default.py b/llumnix/config/default.py index 645ab72e..f6023bea 100644 --- a/llumnix/config/default.py +++ b/llumnix/config/default.py @@ -95,7 +95,7 @@ # Migrate out instance load threshold _C.MANAGER.MIGRATE_OUT_THRESHOLD = 3.0 # Request migration policy -_C.MANAGER.REQUEST_MIGRATION_POLICY = 'SRF' +_C.MANAGER.REQUEST_MIGRATION_POLICY = 'SR' # Enable defragmentation through migration based on virtual usage _C.MANAGER.ENABLE_DEFRAG = False # Drop migration if the number of stages > max_stages diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index 0574ca81..affa316f 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -27,16 +27,16 @@ def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) -> # Requests meet the strict pre-migration always have higher prioirity than other migration policy. 
migrate_out_requests: List[LlumnixRequest] = self.get_required_migration_request(min_request_len, max_request_len) if len(migrate_out_request) == 0: - if self.request_migration_policy == 'LCFS': + if self.request_migration_policy == 'LCR': migrate_out_requests = self._get_last_running_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'LRF': + elif self.request_migration_policy == 'LR': migrate_out_requests = self._get_longest_running_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'SRF': + elif self.request_migration_policy == 'SR': migrate_out_requests = self._get_shortest_running_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'EWF': - migrate_out_requests = self._get_earliest_waiting_request(min_request_len, max_request_len) - elif self.request_migration_policy == 'EWSR': - migrate_out_requests = self._get_earliest_waiting_and_shortest_running_requests(min_request_len, max_request_len) + elif self.request_migration_policy == 'FCW': + migrate_out_requests = self._get_first_waiting_request(min_request_len, max_request_len) + elif self.request_migration_policy == 'FCWSR': + migrate_out_requests = self._get_first_waiting_and_shortest_running_requests(min_request_len, max_request_len) return migrate_out_requests # The function is used to retrieve requests on the backend that have already met the expected_steps. @@ -76,12 +76,12 @@ def _get_shortest_running_request(self, min_request_len, max_request_len) -> Lis key=lambda request: request.request_len, default=None) return [shortest_seq_group] if shortest_seq_group != None else [] - def _get_earliest_waiting_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: + def _get_first_waiting_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1] return [waiting[0]] if waiting and min_request_len < waiting[0].request_len < max_request_len else [] - def _get_earliest_waiting_and_shortest_running_requests(self, min_request_len, max_request_len) -> List[LlumnixRequest]: - waiting_requests = self._get_earliest_waiting_request(min_request_len, max_request_len) + def _get_first_waiting_and_shortest_running_requests(self, min_request_len, max_request_len) -> List[LlumnixRequest]: + waiting_requests = self._get_first_waiting_request(min_request_len, max_request_len) running_requests = self._get_shortest_running_request(min_request_len, max_request_len) return waiting_requests + running_requests From 5fa147be5dfdbb83c6eb8a85fe995ada2877e583 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Sat, 14 Sep 2024 07:19:19 +0000 Subject: [PATCH 12/49] pylint --- llumnix/llumlet/local_migration_scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index affa316f..78b5479f 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -66,7 +66,7 @@ def _get_longest_running_request(self, min_request_len, max_request_len) -> List and min_request_len < request.request_len < max_request_len longest_seq_group = max((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) - return [longest_seq_group] if longest_seq_group != None else [] + return [longest_seq_group] if 
longest_seq_group is not None else [] def _get_shortest_running_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() @@ -74,7 +74,7 @@ def _get_shortest_running_request(self, min_request_len, max_request_len) -> Lis and min_request_len < request.request_len < max_request_len shortest_seq_group = min((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) - return [shortest_seq_group] if shortest_seq_group != None else [] + return [shortest_seq_group] if shortest_seq_group is not None else [] def _get_first_waiting_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue() From 49545df896c5465c0a813ea4e8b6091f462f692c Mon Sep 17 00:00:00 2001 From: s5u13b Date: Fri, 20 Sep 2024 08:00:56 +0000 Subject: [PATCH 13/49] Add eom --- llumnix/llumlet/llumlet.py | 15 +++++++++------ llumnix/llumlet/request.py | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 809ff502..911c907b 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -136,7 +136,10 @@ async def migrate_out(self, dst_instance_name: str) -> List[str]: return [] migrated_request_list = [] for migrate_out_request in migrate_out_requests: - migrated_request_list.extend(await self._migrate_out_one_request(migrate_out_request, dst_instance_name, num_requests)) + migrated_request = await self._migrate_out_one_request(migrate_out_request, dst_instance_name) + migrated_request_list.extend(migrated_request) + if len(migrated_request) == 0 and migrate_out_request.eom: + break return migrated_request_list async def _migrate_out_one_request(self, migrate_out_request, dst_instance_name: str): @@ -144,9 +147,9 @@ async def _migrate_out_one_request(self, migrate_out_request, dst_instance_name: t0 = time.time() migrate_in_ray_actor = ray.get_actor(dst_instance_name, namespace='llumnix') dst_instance_id = dst_instance_name[len("instance_"):] - migrated_request_list = [] - assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting or running request" logger.info("{}->{} begin migrate out".format(self.instance_id, dst_instance_id)) + migrated_request = [] + assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting/running request" if migrate_out_request.status == RequestStatus.RUNNING: status = await self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) else: @@ -155,14 +158,14 @@ async def _migrate_out_one_request(self, migrate_out_request, dst_instance_name: await migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request) if migrate_out_request.status == RequestStatus.RUNNING: self.backend_engine.free_src_request(migrate_out_request) - migrated_request_list.append(migrate_out_request.request_id) + migrated_request.append(migrate_out_request.request_id) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) elif status == MigrationStatus.FINISHED_SRC_ABORTED: migrate_out_request.reset_migration_args() await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) t1 = time.time() logger.info("{}->{} migrate done, migrate request {}, migration status: {}, len: 
{} blocks, cost: {} ms" \ - .format(self.instance_id, dst_instance_id, migrated_request_list, status, \ + .format(self.instance_id, dst_instance_id, migrated_request, status, \ sum(migrate_out_request.stage_num_blocks_list), (t1 - t0)*1000)) except ray.exceptions.RayActorError: logger.info("[migrate_out] instance {} is dead".format(dst_instance_name[len("instance_"):])) @@ -172,7 +175,7 @@ async def _migrate_out_one_request(self, migrate_out_request, dst_instance_name: logger.error("unexpected exception occurs: {}".format(e)) logger.error("exception traceback: {}".format(traceback.format_exc())) raise - return migrated_request_list + return migrated_request def get_instance_info(self) -> InstanceInfo: return self.backend_engine.engine.instance_info diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index ea462c74..1657bff6 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -36,8 +36,9 @@ def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int self.last_preemption_time = None self.stage_timestamps = [] self.stage_num_blocks_list = [] - self.waiting_migrating = False + # end-of-migration + self.eom = False def reset_migration_args(self): self.last_preemption_time = None From f46df4fcf7b127346f245c0aef7de42164e1f084 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 17 Oct 2024 08:15:03 +0000 Subject: [PATCH 14/49] Fix unit test --- llumnix/llm_engine_manager.py | 2 +- llumnix/llumlet/llumlet.py | 4 +-- llumnix/llumlet/local_migration_scheduler.py | 2 +- llumnix/llumlet/migration_coordinator.py | 2 +- .../unit_test/backends/vllm/test_migration.py | 12 +++---- .../unit_test/backends/vllm/test_simulator.py | 2 +- .../test_llm_engine_manager.py | 2 +- .../llumlet/test_engine_step_exception.py | 2 +- .../llumlet/test_local_migration_scheduler.py | 33 +++++++++++-------- .../llumlet/test_migration_coordinator.py | 9 ++--- 10 files changed, 38 insertions(+), 32 deletions(-) diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py index c2ff2f01..88d1dc72 100644 --- a/llumnix/llm_engine_manager.py +++ b/llumnix/llm_engine_manager.py @@ -268,7 +268,7 @@ def migrate_done_callback_wrapper(migrate_instance_pair: Tuple[str, str], fut) - self.instance_migrating[migrate_in_instance_id] = True migrate_in_instance_name = "instance_{}".format(migrate_in_instance_id) # Use asyncio.gather to wrap ray remote call to add done callback. 
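The comment above names a standard asyncio pattern: wrapping a single remote call in asyncio.gather yields a future that supports add_done_callback. A minimal standalone demonstration with plain coroutines in place of Ray actor calls (every name below is invented for the example):

import asyncio
from functools import partial

results = {}

async def fake_migrate_out(dst_instance_name):
    await asyncio.sleep(0)  # stand-in for the awaited remote call
    return ["request_0"]

def migrate_done_callback(migrate_instance_pair, fut):
    # fut.result() is a one-element list because gather wrapped a single
    # coroutine; with return_exceptions=True a failure shows up here as an
    # exception object instead of raising.
    results[migrate_instance_pair] = fut.result()

async def main():
    task = asyncio.gather(fake_migrate_out("instance_1"), return_exceptions=True)
    task.add_done_callback(partial(migrate_done_callback, ("instance_0", "instance_1")))
    await task

asyncio.run(main())
assert results[("instance_0", "instance_1")] == [["request_0"]]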
-            task = asyncio.gather(self.instances[migrate_out_instance_id].migrate_out.remote(migrate_in_instance_name, migrate_in_num_requests),
+            task = asyncio.gather(self.instances[migrate_out_instance_id].migrate_out.remote(migrate_in_instance_name),
                                   return_exceptions=True)
             task.add_done_callback(partial(migrate_done_callback_wrapper, migrate_instance_pair))
             migration_tasks.append(task)

diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index 911c907b..85006c00 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -28,7 +28,7 @@
 from llumnix.server_info import ServerInfo
 from llumnix.internal_config import MigrationConfig
 from llumnix.queue.queue_type import QueueType
-from llumnix.llumlet.request import RequestStatus
+from llumnix.llumlet.request import LlumnixRequest, RequestStatus
 
 logger = init_logger(__name__)
 
@@ -142,7 +142,7 @@ async def migrate_out(self, dst_instance_name: str) -> List[str]:
                 break
         return migrated_request_list
 
-    async def _migrate_out_one_request(self, migrate_out_request, dst_instance_name: str):
+    async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, dst_instance_name: str):
         try:
             t0 = time.time()
             migrate_in_ray_actor = ray.get_actor(dst_instance_name, namespace='llumnix')
diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py
index 78b5479f..e61d5185 100644
--- a/llumnix/llumlet/local_migration_scheduler.py
+++ b/llumnix/llumlet/local_migration_scheduler.py
@@ -26,7 +26,7 @@ def __init__(self, request_migration_policy: str, backend_engine: BackendInterfa
     def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) -> List[LlumnixRequest]:
         # Requests that meet the strict pre-migration condition always have higher priority than those selected by the migration policy.
         migrate_out_requests: List[LlumnixRequest] = self.get_required_migration_request(min_request_len, max_request_len)
-        if len(migrate_out_request) == 0:
+        if len(migrate_out_requests) == 0:
             if self.request_migration_policy == 'LCR':
                 migrate_out_requests = self._get_last_running_request(min_request_len, max_request_len)
             elif self.request_migration_policy == 'LR':
diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py
index e192db5d..ea58edea 100644
--- a/llumnix/llumlet/migration_coordinator.py
+++ b/llumnix/llumlet/migration_coordinator.py
@@ -78,7 +78,7 @@ async def _migrate_out_multistage(self,
                                   migrate_in_ray_actor: "ray.actor.ActorHandle",
                                   migrate_out_request: LlumnixRequest) -> "MigrationStatus":
         """Migrate out requests to a specified instance, return migrated request id.
Args: - dst_instance_name: instance actor name, used to get ray actor handle + migrate_in_ray_actor: instance actor name, used to get ray actor handle """ stage_count = 0 while stage_count < self.max_stages: diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index c8157258..4d4dd3c4 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -58,7 +58,7 @@ def __init__(self): async def test_migration_correctness(setup_ray_env, migration_backend): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) id_rank_map = {"0":0, "1":1} - migration_config = MigrationConfig("LCFS", migration_backend, 16, 1, 4, 5, 20) + migration_config = MigrationConfig("SR", migration_backend, 16, 1, 4, 5, 20) output_queue_type = QueueType.RAYQUEUE que, server_info = request_output_queue_server(output_queue_type) @@ -95,7 +95,7 @@ async def test_migration_correctness(setup_ray_env, migration_backend): llumlet_1.execute_engine_method.remote("_run_workers", "rebuild_migration_backend", id_rank_map, "llumnix")]) # empty instance migrate out - res = ray.get(llumlet_0.migrate_out.remote("instance_1", num_requests=math.inf)) + res = ray.get(llumlet_0.migrate_out.remote("instance_1")) assert not res # running without migration @@ -120,7 +120,7 @@ async def test_correctness(prompt): if len(running_queue) > 0 and running_queue[0].inference_type == RequestInferenceType.DECODE: break # migrate request - res = ray.get(llumlet_0.migrate_out.remote("instance_1", num_requests=math.inf)) + res = ray.get(llumlet_0.migrate_out.remote("instance_1")) assert len(res) == 1 request_output_queue = que @@ -148,7 +148,7 @@ async def test_correctness(prompt): async def test_pd_diaggregation_correctness(setup_ray_env, migration_backend): engine_args = EngineArgs(model="facebook/opt-125m",worker_use_ray=True) id_rank_map = {"0":0,"1":1} - migration_config = MigrationConfig("LCFS", migration_backend, 16, 1, 4, 5, 20) + migration_config = MigrationConfig("SR", migration_backend, 16, 1, 4, 5, 20) output_queue_type = QueueType.RAYQUEUE que, server_info = request_output_queue_server(output_queue_type) @@ -183,7 +183,7 @@ async def test_pd_diaggregation_correctness(setup_ray_env, migration_backend): ray.get([llumlet_0.execute_engine_method.remote("_run_workers","rebuild_migration_backend", id_rank_map, "llumnix"), llumlet_1.execute_engine_method.remote("_run_workers","rebuild_migration_backend", id_rank_map, "llumnix")]) # empty instance migrate out - res = ray.get(llumlet_0.migrate_out.remote("instance_1", num_requests=math.inf)) + res = ray.get(llumlet_0.migrate_out.remote("instance_1")) assert not res # running without migration @@ -206,7 +206,7 @@ async def test_correctness(prompt): ray.get(llumlet_0.generate.remote(request_id1, server_info, request_expected_steps_id1, prompt, sampling_params)) # migrate request for decoding while True: - res = ray.get(llumlet_0.migrate_out.remote("instance_1", num_requests = math.inf)) + res = ray.get(llumlet_0.migrate_out.remote("instance_1")) if len(res) == 1: break request_output_queue = que diff --git a/tests/unit_test/backends/vllm/test_simulator.py b/tests/unit_test/backends/vllm/test_simulator.py index 7fb94baa..77857665 100644 --- a/tests/unit_test/backends/vllm/test_simulator.py +++ b/tests/unit_test/backends/vllm/test_simulator.py @@ -71,7 +71,7 @@ async def test_backend(setup_ray_env): # TODO(ZeldaHuang): add tests for BackendSimVLLM methods # (currently 
BackendSimVLLM is just a wrapper of BackendVLLM) engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - migration_config = MigrationConfig("LCFS", "gloo", 16, 1, 4, 5, 20) + migration_config = MigrationConfig("SR", "gloo", 16, 1, 4, 5, 20) output_queue_type = QueueType.RAYQUEUE que, server_info = request_output_queue_server(output_queue_type) diff --git a/tests/unit_test/global_scheduler/test_llm_engine_manager.py b/tests/unit_test/global_scheduler/test_llm_engine_manager.py index 5c5fc644..f4f92d55 100644 --- a/tests/unit_test/global_scheduler/test_llm_engine_manager.py +++ b/tests/unit_test/global_scheduler/test_llm_engine_manager.py @@ -76,7 +76,7 @@ def abort(self, request_id): self.num_requests = len(self.request_id_set) return self.num_requests - def migrate_out(self, src_instance_name, dst_instance_name): + def migrate_out(self, dst_instance_name): self.num_migrate_out += 1 def get_num_migrate_out(self): diff --git a/tests/unit_test/llumlet/test_engine_step_exception.py b/tests/unit_test/llumlet/test_engine_step_exception.py index 56b58322..8709c014 100644 --- a/tests/unit_test/llumlet/test_engine_step_exception.py +++ b/tests/unit_test/llumlet/test_engine_step_exception.py @@ -51,7 +51,7 @@ async def raise_error_step(): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need at least 1 GPU to run the test.") def test_engine_step_exception(setup_ray_env): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - migration_config = MigrationConfig("LCFS", "rpc", 16, 1, 4, 5, 20) + migration_config = MigrationConfig("SR", "rpc", 16, 1, 4, 5, 20) node_id = ray.get_runtime_context().get_node_id() scheduling_strategy = NodeAffinitySchedulingStrategy(node_id=node_id, soft=False) diff --git a/tests/unit_test/llumlet/test_local_migration_scheduler.py b/tests/unit_test/llumlet/test_local_migration_scheduler.py index 4c72d9a0..a628941a 100644 --- a/tests/unit_test/llumlet/test_local_migration_scheduler.py +++ b/tests/unit_test/llumlet/test_local_migration_scheduler.py @@ -19,14 +19,19 @@ class MockRequest(LlumnixRequest): def __init__(self, request_id, length, expected_steps) -> None: super().__init__(request_id=request_id, server_info=None, expected_steps=expected_steps) self.length = length - self.status = RequestInferenceType.DECODE + self._status = RequestInferenceType.DECODE + self._finished = False def is_finished(self) -> bool: return False + @property + def finished(self) -> bool: + return self._finished + @property def inference_type(self) -> RequestInferenceType: - return self.status + return self._status @property def request_len(self) -> int: @@ -58,20 +63,20 @@ def test_scheduler_policy(): engine.add_request(request_id="1", length=3, expected_steps=math.inf) engine.add_request(request_id="2", length=2, expected_steps=math.inf) - scheduler.request_migration_policy = "LCFS" - assert scheduler.get_migrate_out_request().request_id == "2" - scheduler.request_migration_policy = "LRF" - assert scheduler.get_migrate_out_request().request_id == "1" - scheduler.request_migration_policy = "SRF" - assert scheduler.get_migrate_out_request().request_id == "0" + scheduler.request_migration_policy = "LCR" + assert scheduler.get_migrate_out_requests()[0].request_id == "2" + scheduler.request_migration_policy = "LR" + assert scheduler.get_migrate_out_requests()[0].request_id == "1" + scheduler.request_migration_policy = "SR" + assert scheduler.get_migrate_out_requests()[0].request_id == "0" engine.add_request(request_id="3", length=2, expected_steps=1) 
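
Request "3" above exercises the forced-migration path: a request whose generated length has reached its expected_steps must be migrated out (from the prefill instance to the decoding instance) no matter which migration policy is configured. A minimal sketch of that contract, with FakeRequest as an illustrative stand-in for the real LlumnixRequest (the real check additionally requires the request to be running in the decode phase):

import math

class FakeRequest:
    def __init__(self, request_id, expected_steps=math.inf, output_len=0):
        self.request_id = request_id
        self.expected_steps = expected_steps
        self.output_len = output_len

def required_migration(requests):
    # output_len >= expected_steps marks a request that has used up its
    # prefill-instance quota and must move on to a decode instance.
    return [r for r in requests if r.output_len >= r.expected_steps]

reqs = [FakeRequest("0"), FakeRequest("3", expected_steps=1, output_len=1)]
assert [r.request_id for r in required_migration(reqs)] == ["3"]
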
- request = scheduler.get_migrate_out_request() + request = scheduler.get_migrate_out_requests()[0] assert request.request_id == "3" assert request.output_len >= request.expected_steps and request.inference_type == RequestInferenceType.DECODE engine.add_request(request_id="4", length=3, expected_steps=math.inf) - scheduler.request_migration_policy = "LCFS" - request = scheduler.get_migrate_out_request() + scheduler.request_migration_policy = "LCR" + request = scheduler.get_migrate_out_requests()[0] assert request.request_id == "3" assert request.output_len >= request.expected_steps and request.inference_type == RequestInferenceType.DECODE @@ -79,11 +84,11 @@ def test_scheduler_should_abort_migration(): req_0 = MockRequest(request_id="0", length=1, expected_steps=math.inf) req_0.stage_timestamps = [1] assert req_0.should_abort_migration() is False - req_0.status = RequestInferenceType.PREFILL - assert req_0.should_abort_migration() is True - req_0.status = RequestInferenceType.DECODE req_0.last_preemption_time = 2 assert req_0.should_abort_migration() is True + req_0.last_preemption_time = None + req_0._finished = True + assert req_0.should_abort_migration() is True def test_blocking_migration(): req_0 = MockRequest(request_id="0", length=1, expected_steps=math.inf) diff --git a/tests/unit_test/llumlet/test_migration_coordinator.py b/tests/unit_test/llumlet/test_migration_coordinator.py index 535a3452..f0244836 100644 --- a/tests/unit_test/llumlet/test_migration_coordinator.py +++ b/tests/unit_test/llumlet/test_migration_coordinator.py @@ -81,7 +81,7 @@ async def test_migrate_out_onestage(setup_ray_env): migrate_out_request.should_abort_migration.return_value = True migrate_out_request.blocking_migration = False migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) - status = coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) + status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) assert status == MigrationStatus.FINISHED_SRC_ABORTED # setup_ray_env should be passed after _migrate_out_onestage @@ -115,7 +115,8 @@ async def test_migrate_out_running_request(_, setup_ray_env): assert coordinator._migrate_out_onestage.call_count == max_stages + 1 assert status == MigrationStatus.FINISHED_SRC_ABORTED -def test_migrate_out_waiting_request(): +@pytest.mark.asyncio +async def test_migrate_out_waiting_request(): # Create mock objects backend_engine = MagicMock(spec=BackendInterface) migrate_in_ray_actor = MagicMock() @@ -131,10 +132,10 @@ def test_migrate_out_waiting_request(): migrate_in_ray_actor.execute_engine_method.remote = MagicMock() migrate_in_ray_actor.execute_engine_method.remote.return_value = ray_remote_call.remote(dst_blocks) migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) - status = coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) + status = await coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) assert status == MigrationStatus.FINISHED_DONE # Test FINISHED_ABORTED migrate_out_request.prefill_num_blocks = 2 - status = coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) + status = await coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) assert status == MigrationStatus.FINISHED_DST_ABORTED From acb716f83d3c53fdb56e0e4690dcb402eab70188 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 17 Oct 
2024 08:22:39 +0000 Subject: [PATCH 15/49] Fix lint --- llumnix/backends/vllm/scheduler.py | 6 +++--- llumnix/llm_engine_manager.py | 1 - llumnix/llumlet/llumlet.py | 1 - llumnix/llumlet/migration_coordinator.py | 4 ++-- llumnix/llumlet/request.py | 3 --- .../llumlet/test_local_migration_scheduler.py | 19 ++++++++++++++----- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 5effc27e..81fc6892 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -13,7 +13,7 @@ from asyncio.log import logger import time -from typing import Dict, List, Optional, Tuple, Deque, Union +from typing import Dict, List, Optional, Tuple, Deque from collections import deque from vllm.core.block_manager_v1 import BlockSpaceManagerV1, BlockTable @@ -121,7 +121,7 @@ def pop_migrating_out_requests_last_stage(self) -> List[SequenceGroupLlumnix]: self.migrating_out_request_last_stage.clear() return migrating_out_request_last_stage - def pre_alloc(self, + def pre_alloc(self, request_id: str, request_status: RequestStatus, request_arrival_time: float, @@ -229,7 +229,7 @@ def _get_instance_info(self, scheduled_seq_groups: List[SequenceGroupLlumnix]) - # TODO(ZeldaHuang) adapt chunked-prefill instance_info.num_batched_tokens = sum([seq_group.request_len for seq_group in scheduled_seq_groups])\ if instance_info.inference_type == RequestInferenceType.PREFILL else len(instance_info.running_seq_lens) - instance_info.finished_request_ids = [seq_group.request_id for seq_group in self.running if seq_group.is_finished()] + instance_info.finished_request_ids = [seq_group.request_id for seq_group in self.running if seq_group.finished] return instance_info def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py index 88d1dc72..59bcec7b 100644 --- a/llumnix/llm_engine_manager.py +++ b/llumnix/llm_engine_manager.py @@ -15,7 +15,6 @@ import time import csv import os -import math from typing import Dict, List, Tuple, Union, Iterable from collections import defaultdict import traceback diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 85006c00..0472b3c2 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -15,7 +15,6 @@ import traceback from typing import List, Union, Iterable import time -import traceback import ray from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy, NodeAffinitySchedulingStrategy diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index ea58edea..dd320fb8 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -47,7 +47,7 @@ def __init__(self, self.last_stage_max_blocks = last_stage_max_blocks self.max_stages = max_stages self.backend_engine = backend_engine - + async def migrate_out_running_request(self, migrate_in_ray_actor: "ray.actor.ActorHandle", migrate_out_request: LlumnixRequest) -> "MigrationStatus": @@ -115,7 +115,7 @@ async def _migrate_out_onestage(self, stage_block_num = len(incremental_blocks) src_blocks = incremental_blocks[:] dst_blocks = await migrate_in_ray_actor.execute_migration_method \ - .remote("migrate_in_pre_alloc", migrate_out_request.request_id, + .remote("migrate_in_pre_alloc", migrate_out_request.request_id, migrate_out_request.status, migrate_out_request.arrival_time, stage_block_num) diff --git 
a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index 1657bff6..085dbd71 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -47,9 +47,6 @@ def reset_migration_args(self): # By default, there is no limit on the number of steps expected for the request. self.expected_steps = math.inf - def is_finished(self) -> bool: - raise NotImplementedError - @property def inference_type(self) -> RequestInferenceType: raise NotImplementedError diff --git a/tests/unit_test/llumlet/test_local_migration_scheduler.py b/tests/unit_test/llumlet/test_local_migration_scheduler.py index a628941a..c79b42d7 100644 --- a/tests/unit_test/llumlet/test_local_migration_scheduler.py +++ b/tests/unit_test/llumlet/test_local_migration_scheduler.py @@ -22,9 +22,6 @@ def __init__(self, request_id, length, expected_steps) -> None: self._status = RequestInferenceType.DECODE self._finished = False - def is_finished(self) -> bool: - return False - @property def finished(self) -> bool: return self._finished @@ -35,16 +32,28 @@ def inference_type(self) -> RequestInferenceType: @property def request_len(self) -> int: - return self.length + pass @property def prompt_len(self) -> int: - return self.length + pass @property def output_len(self) -> int: return self.length + @property + def arrival_time(self) -> float: + pass + + @property + def status(self) -> RequestStatus: + pass + + @property + def prefill_num_blocks(self) -> int: + pass + class MockeEngine(): def __init__(self) -> None: self.running = [] From 6306cd54ed817cb4f924026547701c17d4299909 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 17 Oct 2024 08:38:07 +0000 Subject: [PATCH 16/49] Fix unit test --- Makefile | 4 ++-- tests/unit_test/llumlet/test_local_migration_scheduler.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6bc87a9b..b8245bf4 100644 --- a/Makefile +++ b/Makefile @@ -29,14 +29,14 @@ lint: check_pylint_installed check_pytest_installed .PHONY: test test: check_pytest_installed - @pytest -x -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings + @pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings @python examlpes/offline_inference.py @pytest -v tests/e2e_test/test_e2e.py @pytest -v -x ./tests/e2e_test/test_migration.py .PHONY: unit_test unit_test: check_pytest_installed - @pytest -x -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings + @pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings .PHONY: offline_test offline_test: diff --git a/tests/unit_test/llumlet/test_local_migration_scheduler.py b/tests/unit_test/llumlet/test_local_migration_scheduler.py index c79b42d7..04dee326 100644 --- a/tests/unit_test/llumlet/test_local_migration_scheduler.py +++ b/tests/unit_test/llumlet/test_local_migration_scheduler.py @@ -13,7 +13,7 @@ import math from llumnix.llumlet.local_migration_scheduler import LocalMigrationScheduler -from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType +from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus class MockRequest(LlumnixRequest): def __init__(self, request_id, length, expected_steps) -> None: @@ -32,11 +32,11 @@ def inference_type(self) -> RequestInferenceType: @property def request_len(self) -> int: - pass + return self.length @property def prompt_len(self) -> int: - pass + return self.length @property def output_len(self) -> int: @@ -48,7 +48,7 @@ def arrival_time(self) -> float: @property def status(self) -> 
RequestStatus: - pass + return self._status @property def prefill_num_blocks(self) -> int: From 5d8fb86f5a3f230018820fc40578a018b5d9725c Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 17 Oct 2024 09:28:30 +0000 Subject: [PATCH 17/49] Add request migration policy tests --- llumnix/backends/vllm/utils.py | 3 +-- llumnix/llumlet/local_migration_scheduler.py | 3 ++- .../llumlet/test_local_migration_scheduler.py | 26 ++++++++++++++++--- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/llumnix/backends/vllm/utils.py b/llumnix/backends/vllm/utils.py index 8aafc9f1..7e49720a 100644 --- a/llumnix/backends/vllm/utils.py +++ b/llumnix/backends/vllm/utils.py @@ -48,8 +48,7 @@ def check_engine_args(engine_args: AsyncEngineArgs, engine_manager_args: EngineM engine_config = engine_args.create_engine_config() parallel_config = engine_config.parallel_config if parallel_config.world_size > 1 and migration_config.migration_backend == 'nccl': - # TODO(s5u13b): fix logger - print("Llumnix does not support TP or PP enabled model when the migration backend is nccl, change migration backend to gloo.") + logger.info("Llumnix does not support TP or PP enabled model when the migration backend is nccl, change migration backend to gloo.") engine_manager_args.migration_backend = 'gloo' detect_unsupported_feature(engine_args) diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index e61d5185..ba7c00ad 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -40,7 +40,7 @@ def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) -> return migrate_out_requests # The function is used to retrieve requests on the backend that have already met the expected_steps. - # TODO(xinyi): Currently, the function is only used for Prefill-decoding disaggregation, + # (xinyi): Currently, the function is only used for Prefill-decoding disaggregation, # and only selects request that migrates from the prefill instance to the decoding instance. 
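
Alongside this forced path, the scheduler now supports five selectable policies. Condensed into a dispatch table, the if/elif chain in get_migrate_out_requests looks roughly like the sketch below; the real code keeps the explicit branches, and the inline glosses describe observed behavior rather than official expansions of the abbreviations:

POLICY_SELECTORS = {
    'LCR': '_get_last_running_request',      # newest decoding request in the running queue
    'LR': '_get_longest_running_request',    # running request with the largest request_len
    'SR': '_get_shortest_running_request',   # running request with the smallest request_len
    'FCW': '_get_first_waiting_request',     # head of the already-tried waiting queue
    'FCWSR': '_get_first_waiting_and_shortest_running_requests',  # FCW candidate plus SR candidate
}

def select_migrate_out_requests(scheduler, policy, min_request_len, max_request_len):
    selector = getattr(scheduler, POLICY_SELECTORS[policy])
    return selector(min_request_len, max_request_len)
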
def get_required_migration_request(self, min_request_len, max_request_len): running: List[LlumnixRequest] = self.backend_engine.get_running_queue() @@ -84,4 +84,5 @@ def _get_first_waiting_request(self, min_request_len, max_request_len) -> List[L def _get_first_waiting_and_shortest_running_requests(self, min_request_len, max_request_len) -> List[LlumnixRequest]: waiting_requests = self._get_first_waiting_request(min_request_len, max_request_len) running_requests = self._get_shortest_running_request(min_request_len, max_request_len) + waiting_requests[0].eom = True return waiting_requests + running_requests diff --git a/tests/unit_test/llumlet/test_local_migration_scheduler.py b/tests/unit_test/llumlet/test_local_migration_scheduler.py index 04dee326..5f194a25 100644 --- a/tests/unit_test/llumlet/test_local_migration_scheduler.py +++ b/tests/unit_test/llumlet/test_local_migration_scheduler.py @@ -21,6 +21,8 @@ def __init__(self, request_id, length, expected_steps) -> None: self.length = length self._status = RequestInferenceType.DECODE self._finished = False + self.try_schedule_times = 0 + self.eom = False @property def finished(self) -> bool: @@ -57,13 +59,22 @@ def prefill_num_blocks(self) -> int: class MockeEngine(): def __init__(self) -> None: self.running = [] + self.waiting = [] def add_request(self, request_id, length, expected_steps) -> None: self.running.append(MockRequest(request_id, length, expected_steps)) + + def add_request_waiting(self, request_id, length, expected_steps) -> None: + request = MockRequest(request_id, length, expected_steps) + request.try_schedule_times += 1 + self.waiting.append(request) def get_running_queue(self): return self.running + def get_waiting_queue(self): + return self.waiting + def test_scheduler_policy(): engine = MockeEngine() scheduler = LocalMigrationScheduler("", engine) @@ -71,6 +82,8 @@ def test_scheduler_policy(): engine.add_request(request_id="0", length=1, expected_steps=math.inf) engine.add_request(request_id="1", length=3, expected_steps=math.inf) engine.add_request(request_id="2", length=2, expected_steps=math.inf) + engine.add_request_waiting(request_id="3", length=2, expected_steps=math.inf) + engine.add_request_waiting(request_id="4", length=2, expected_steps=math.inf) scheduler.request_migration_policy = "LCR" assert scheduler.get_migrate_out_requests()[0].request_id == "2" @@ -78,15 +91,20 @@ def test_scheduler_policy(): assert scheduler.get_migrate_out_requests()[0].request_id == "1" scheduler.request_migration_policy = "SR" assert scheduler.get_migrate_out_requests()[0].request_id == "0" + scheduler.request_migration_policy = "FCW" + assert scheduler.get_migrate_out_requests()[0].request_id == "3" + scheduler.request_migration_policy = "FCWSR" + assert scheduler.get_migrate_out_requests()[0].request_id == "3" + assert scheduler.get_migrate_out_requests()[1].request_id == "0" - engine.add_request(request_id="3", length=2, expected_steps=1) + engine.add_request(request_id="5", length=2, expected_steps=1) request = scheduler.get_migrate_out_requests()[0] - assert request.request_id == "3" + assert request.request_id == "5" assert request.output_len >= request.expected_steps and request.inference_type == RequestInferenceType.DECODE - engine.add_request(request_id="4", length=3, expected_steps=math.inf) + engine.add_request(request_id="6", length=3, expected_steps=math.inf) scheduler.request_migration_policy = "LCR" request = scheduler.get_migrate_out_requests()[0] - assert request.request_id == "3" + assert request.request_id == "5" 
assert request.output_len >= request.expected_steps and request.inference_type == RequestInferenceType.DECODE def test_scheduler_should_abort_migration(): From 09a264fc3045c9874e722be07e5b041a9b85c387 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 17 Oct 2024 12:10:38 +0000 Subject: [PATCH 18/49] Minor --- llumnix/llumlet/llumlet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 0472b3c2..8ef8534c 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -141,14 +141,14 @@ async def migrate_out(self, dst_instance_name: str) -> List[str]: break return migrated_request_list - async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, dst_instance_name: str): + async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, dst_instance_name: str) -> List[LlumnixRequest]: try: t0 = time.time() migrate_in_ray_actor = ray.get_actor(dst_instance_name, namespace='llumnix') dst_instance_id = dst_instance_name[len("instance_"):] logger.info("{}->{} begin migrate out".format(self.instance_id, dst_instance_id)) migrated_request = [] - assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting/running request" + assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting and running request" if migrate_out_request.status == RequestStatus.RUNNING: status = await self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) else: From d41568053a4fffe4a9e24fa7a2eefc047fbdd805 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 02:34:26 +0000 Subject: [PATCH 19/49] Fix migration unit test --- tests/unit_test/backends/vllm/test_migration.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 4d4dd3c4..6687003c 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -129,12 +129,8 @@ async def test_correctness(prompt): while not finished: request_outputs = await request_output_queue.get() for request_output in request_outputs: - origin_output = request_output.outputs[0] - finished = request_output.finished - if request_output.request_id != request_id1: - continue - output = request_output.outputs[0] - finished = request_output.finished + output = request_output.outputs[0] + finished = request_output.finished assert output.text == origin_output.text assert output.cumulative_logprob == origin_output.cumulative_logprob @@ -215,12 +211,8 @@ async def test_correctness(prompt): while not finished: request_outputs = await request_output_queue.get() for request_output in request_outputs: - origin_output = request_output.outputs[0] + output = request_output.outputs[0] finished = request_output.finished - if request_output.request_id != request_id1: - continue - output = request_output.outputs[0] - finished = request_output.finished assert output.text == origin_output.text assert output.cumulative_logprob == origin_output.cumulative_logprob From 8e6a09867d634563588a16f642e80eaf5a965358 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 06:58:43 +0000 Subject: [PATCH 20/49] Fix return type of backend engine step_async --- llumnix/backends/vllm/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index f400aa78..5913ee11 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -199,7 +199,7 @@ def _process_model_outputs( # TODO(ZeldaHuang): Use LlumnixRequestOutput to store llumnix output args. return request_outputs, server_infos - async def step_async(self) -> None: + async def step_async(self) -> Tuple[List[RequestOutput], List[ServerInfo]]: step_begin_time = time.time() request_outputs, server_infos = await super().step_async() for request_output in request_outputs: From 1ddf8369512571e7371868c2d9d74ea4a3880031 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 07:15:01 +0000 Subject: [PATCH 21/49] Add migrate waiting unit test --- .../unit_test/backends/vllm/test_migration.py | 113 ++++++++++++++---- .../llumlet/test_local_migration_scheduler.py | 2 +- 2 files changed, 90 insertions(+), 25 deletions(-) diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 6687003c..3dffe8a3 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -11,11 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List import asyncio import math +from unittest.mock import MagicMock import pytest import ray +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from vllm import EngineArgs, SamplingParams from vllm.utils import random_uuid @@ -25,7 +26,7 @@ from llumnix.llumlet.llumlet import Llumlet from llumnix.backends.utils import BackendType from llumnix.internal_config import MigrationConfig -from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus +from llumnix.llumlet.request import RequestInferenceType, RequestStatus from llumnix.queue.queue_type import QueueType from tests.unit_test.queue.utils import request_output_queue_server @@ -52,23 +53,60 @@ def __init__(self): self.instance_id = "0" self.backend_engine = MockBackendVLLM() +@ray.remote(num_cpus=1, max_concurrency=4) +class MockLlumletDoNotSchedule(Llumlet): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # stop the schedule in engine step loop + self.backend_engine.engine.scheduler.schedule = MagicMock() + + # For some reason, if MockScheduelrOutputs is defined outside, the constructor would raise error. + class MockScheduelrOutputs: + def __init__(self): + self.scheduled_seq_groups = [] + self.ignored_seq_groups = [] + self.num_batched_tokens = 0 + + def is_empty(self) -> bool: + return not self.scheduled_seq_groups + + scheduler_outputs = MockScheduelrOutputs() + self.backend_engine.engine.scheduler.schedule.return_value = ([], scheduler_outputs) + + self.step_async = self.backend_engine.engine.step_async + + async def step_async_try_schedule(): + request_outputs, server_infos = await self.step_async() + for seq_group in self.backend_engine.engine.scheduler.waiting: + seq_group.try_schedule_times += 1 + return request_outputs, server_infos + + self.backend_engine.engine.step_async = step_async_try_schedule + # TODO(s5u13b): Test migrate waiting request. 
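
The scheduling freeze used by MockLlumletDoNotSchedule can be isolated into a few lines: stub schedule() so it always reports an empty schedule, and nothing ever leaves the waiting queue, which is what lets try_schedule_times grow. A standalone sketch (EmptySchedulerOutputs is illustrative, not the vLLM SchedulerOutputs class):

from unittest.mock import MagicMock

class EmptySchedulerOutputs:
    scheduled_seq_groups = []
    ignored_seq_groups = []
    num_batched_tokens = 0

    def is_empty(self) -> bool:
        return not self.scheduled_seq_groups

scheduler = MagicMock()
scheduler.schedule.return_value = ([], EmptySchedulerOutputs())

# Every call now returns an empty schedule, so waiting requests stay waiting.
seq_group_metadata_list, scheduler_outputs = scheduler.schedule()
assert scheduler_outputs.is_empty()
assert not seq_group_metadata_list
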
@pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl']) +@pytest.mark.parametrize("migration_request_status", ['waiting', 'running']) @pytest.mark.asyncio -async def test_migration_correctness(setup_ray_env, migration_backend): +async def test_migration_correctness(setup_ray_env, migration_backend, migration_request_status): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - id_rank_map = {"0":0, "1":1} - migration_config = MigrationConfig("SR", migration_backend, 16, 1, 4, 5, 20) + id_rank_map = {"0": 0, "1": 1, "2": 2} + if migration_request_status == 'running': + request_migration_policy = "SR" + elif migration_request_status == 'waiting': + request_migration_policy = "FCW" + migration_config = MigrationConfig(request_migration_policy, migration_backend, 16, 1, 4, 5, 20) output_queue_type = QueueType.RAYQUEUE que, server_info = request_output_queue_server(output_queue_type) asyncio.create_task(que.run_server_loop()) + node_id = ray.get_runtime_context().get_node_id() + scheduling_strategy = NodeAffinitySchedulingStrategy(node_id=node_id, soft=False) llumlet_0: Llumlet = Llumlet.from_args( output_queue_type, False, - True, - ray.get_runtime_context().get_node_id(), + False, + node_id, "0", BackendType.VLLM, 1, @@ -78,25 +116,40 @@ async def test_migration_correctness(setup_ray_env, migration_backend): llumlet_1: Llumlet = Llumlet.from_args( output_queue_type, False, - True, - ray.get_runtime_context().get_node_id(), + False, + node_id, "1", BackendType.VLLM, 1, migration_config, engine_args) + llumlet_2: Llumlet = MockLlumletDoNotSchedule.options( + name='instance_2', + namespace='llumnix', + scheduling_strategy=scheduling_strategy).remote( + instance_id="2", + output_queue_type=output_queue_type, + backend_type=BackendType.VLLM, + migration_config=migration_config, + engine_args=engine_args, + node_id=node_id + ) + while True: - res = ray.get([llumlet_0.is_ready.remote(),llumlet_1.is_ready.remote()]) + res = ray.get([llumlet_0.is_ready.remote(), llumlet_1.is_ready.remote(), llumlet_2.is_ready.remote()]) if all(res): break ray.get([llumlet_0.execute_engine_method.remote("_run_workers", "rebuild_migration_backend", id_rank_map, "llumnix"), - llumlet_1.execute_engine_method.remote("_run_workers", "rebuild_migration_backend", id_rank_map, "llumnix")]) + llumlet_1.execute_engine_method.remote("_run_workers", "rebuild_migration_backend", id_rank_map, "llumnix"), + llumlet_2.execute_engine_method.remote("_run_workers", "rebuild_migration_backend", id_rank_map, "llumnix")]) # empty instance migrate out res = ray.get(llumlet_0.migrate_out.remote("instance_1")) assert not res + res = ray.get(llumlet_2.migrate_out.remote("instance_1")) + assert not res # running without migration async def test_correctness(prompt): @@ -112,16 +165,28 @@ async def test_correctness(prompt): origin_output = request_output.outputs[0] finished = request_output.finished - request_id1 = random_uuid() - ray.get(llumlet_0.generate.remote(request_id1, server_info, math.inf, prompt, sampling_params)) - # wait prefill done - while True: - running_queue: List[LlumnixRequest] = ray.get(llumlet_0.execute_engine_method.remote("get_running_queue")) - if len(running_queue) > 0 and running_queue[0].inference_type == RequestInferenceType.DECODE: - break - # migrate request - res = ray.get(llumlet_0.migrate_out.remote("instance_1")) - assert len(res) == 1 + if migration_request_status == 'running': + request_id1 = random_uuid() + ray.get(llumlet_0.generate.remote(request_id1, server_info, 
math.inf, prompt, sampling_params)) + # wait prefill done + while True: + running_queue = ray.get(llumlet_0.execute_engine_method.remote("get_running_queue")) + if len(running_queue) > 0 and running_queue[0].inference_type == RequestInferenceType.DECODE: + break + # migrate request + res = ray.get(llumlet_0.migrate_out.remote("instance_1")) + assert len(res) == 1 + elif migration_request_status == 'waiting': + request_id1 = random_uuid() + ray.get(llumlet_2.generate.remote(request_id1, server_info, math.inf, prompt, sampling_params)) + # wait try schedule done + while True: + waiting_queue = ray.get(llumlet_2.execute_engine_method.remote("get_waiting_queue")) + if len(waiting_queue) > 0 and waiting_queue[0].try_schedule_times >= 1: + break + # migrate request + res = ray.get(llumlet_2.migrate_out.remote("instance_1")) + assert len(res) == 1 request_output_queue = que output = None @@ -129,8 +194,8 @@ async def test_correctness(prompt): while not finished: request_outputs = await request_output_queue.get() for request_output in request_outputs: - output = request_output.outputs[0] - finished = request_output.finished + output = request_output.outputs[0] + finished = request_output.finished assert output.text == origin_output.text assert output.cumulative_logprob == origin_output.cumulative_logprob @@ -177,7 +242,7 @@ async def test_pd_diaggregation_correctness(setup_ray_env, migration_backend): if all(res): break ray.get([llumlet_0.execute_engine_method.remote("_run_workers","rebuild_migration_backend", id_rank_map, "llumnix"), - llumlet_1.execute_engine_method.remote("_run_workers","rebuild_migration_backend", id_rank_map, "llumnix")]) + llumlet_1.execute_engine_method.remote("_run_workers","rebuild_migration_backend", id_rank_map, "llumnix")]) # empty instance migrate out res = ray.get(llumlet_0.migrate_out.remote("instance_1")) assert not res diff --git a/tests/unit_test/llumlet/test_local_migration_scheduler.py b/tests/unit_test/llumlet/test_local_migration_scheduler.py index 5f194a25..8e5d1890 100644 --- a/tests/unit_test/llumlet/test_local_migration_scheduler.py +++ b/tests/unit_test/llumlet/test_local_migration_scheduler.py @@ -63,7 +63,7 @@ def __init__(self) -> None: def add_request(self, request_id, length, expected_steps) -> None: self.running.append(MockRequest(request_id, length, expected_steps)) - + def add_request_waiting(self, request_id, length, expected_steps) -> None: request = MockRequest(request_id, length, expected_steps) request.try_schedule_times += 1 From 1da0b81418bf6b7179c6b448eaf56070f0d9e2c1 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 07:26:48 +0000 Subject: [PATCH 22/49] Fix e2e test --- Makefile | 6 +++++- tests/e2e_test/test_e2e.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b8245bf4..18ac8b34 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,11 @@ e2e_test: .PHONY: bench_test bench_test: - @pytest -v ./tests/e2e_test/test_bench.py + @pytest -v ./tests/e2e_test/test_bench. 
+ +.PHONY: migration_test +bench_test: + @pytest -v ./tests/e2e_test/test_migration.py .PHONY: migration_test migration_test: diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index 741360f1..9b04108b 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -61,7 +61,7 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool f"--max-model-len {max_model_len} " f"--dispatch-policy {dispatch_policy} " f"--trust-remote-code " - f"--request-migration-policy LCFS " + f"--request-migration-policy SR " f"--migration-backend {migration_backend} " f"--migration-cache-blocks 32 " f"--tensor-parallel-size 1 " From 7b76479406e58520dbf46b34ae6d588a34966cd3 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 08:17:14 +0000 Subject: [PATCH 23/49] Fix unit test of dispatch scheduler --- .../unit_test/global_scheduler/test_dispatch_scheduler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/unit_test/global_scheduler/test_dispatch_scheduler.py b/tests/unit_test/global_scheduler/test_dispatch_scheduler.py index 8cee3a69..2398d97f 100644 --- a/tests/unit_test/global_scheduler/test_dispatch_scheduler.py +++ b/tests/unit_test/global_scheduler/test_dispatch_scheduler.py @@ -46,7 +46,10 @@ def test_add_instance_and_remove_instance(dispatch_scheduler): dispatch_scheduler.remove_instance('instance_2') assert dispatch_scheduler.num_instances == 1 - assert len(dispatch_scheduler.available_dispatch_instance_set) == 1 + if dispatch_scheduler.num_dispatch_instances >= 2: + assert len(dispatch_scheduler.available_dispatch_instance_set) == 1 + else: + assert len(dispatch_scheduler.available_dispatch_instance_set) == 0 dispatch_scheduler.remove_instance('instance_3') assert dispatch_scheduler.num_instances == 0 @@ -99,7 +102,7 @@ def test_dispatch_queue(): instance_info.instance_id = instance_id instance_info.num_waiting_requests = random.randint(1, 10) instance_info_dict[instance_id] = instance_info - if len(dispatch_scheduler.available_dispatch_instance_set) < dispatch_scheduler.num_dispatch_instances: + if len(dispatch_scheduler.available_dispatch_instance_set) < dispatch_scheduler.num_dispatch_instances: dispatch_scheduler.available_dispatch_instance_set.add(instance_id) instance_num_requests[instance_id] = 0 dispatch_scheduler.instance_num_requests = instance_num_requests From 343c671ad22842b12910c188bc8953bd0b1704da Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 08:36:08 +0000 Subject: [PATCH 24/49] Fix makefile --- Makefile | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 18ac8b34..2b83b11b 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,8 @@ test: check_pytest_installed @pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings @python examlpes/offline_inference.py @pytest -v tests/e2e_test/test_e2e.py - @pytest -v -x ./tests/e2e_test/test_migration.py + @pytest -v ./tests/e2e_test/test_bench.py + @pytest -v ./tests/e2e_test/test_migration.py .PHONY: unit_test unit_test: check_pytest_installed @@ -48,15 +49,11 @@ e2e_test: .PHONY: bench_test bench_test: - @pytest -v ./tests/e2e_test/test_bench. 
- -.PHONY: migration_test -bench_test: - @pytest -v ./tests/e2e_test/test_migration.py + @pytest -v ./tests/e2e_test/test_bench.py .PHONY: migration_test migration_test: - @pytest -v -x ./tests/e2e_test/test_migration.py + @pytest -v ./tests/e2e_test/test_migration.py #################### pygloo install for gloo migration backend begin #################### From 47ddfb9ffc66e6685bde8c2a4703df85b9bd8a68 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 08:54:38 +0000 Subject: [PATCH 25/49] Fix running request migration --- llumnix/llumlet/local_migration_scheduler.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index ba7c00ad..346d3427 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -14,7 +14,7 @@ from typing import Deque, List import numpy as np -from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType +from llumnix.llumlet.request import LlumnixRequest, RequestStatus, RequestInferenceType from llumnix.backends.backend_interface import BackendInterface @@ -55,14 +55,15 @@ def get_required_migration_request(self, min_request_len, max_request_len): def _get_last_running_request(self, min_request_len, max_request_len): running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() for request in reversed(running): - if request.inference_type == RequestInferenceType.DECODE \ + if request.status == RequestStatus.RUNNING and request.inference_type == RequestInferenceType.DECODE \ and min_request_len < request.request_len < max_request_len: return [request] return [] def _get_longest_running_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() - condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ + condition = lambda request : request.status == RequestStatus.RUNNING \ + and request.inference_type == RequestInferenceType.DECODE \ and min_request_len < request.request_len < max_request_len longest_seq_group = max((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) @@ -70,8 +71,9 @@ def _get_longest_running_request(self, min_request_len, max_request_len) -> List def _get_shortest_running_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]: running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue() - condition = lambda request : request.inference_type == RequestInferenceType.DECODE \ - and min_request_len < request.request_len < max_request_len + condition = lambda request : request.status == RequestStatus.RUNNING \ + and request.inference_type == RequestInferenceType.DECODE \ + and min_request_len < request.request_len < max_request_len shortest_seq_group = min((request for request in running if condition(request)), \ key=lambda request: request.request_len, default=None) return [shortest_seq_group] if shortest_seq_group is not None else [] From b809d0656b297f1e07a8c23797e264fac808617e Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 09:38:41 +0000 Subject: [PATCH 26/49] Fix and refine local migration scheduler --- llumnix/llumlet/local_migration_scheduler.py | 49 ++++++++++++------- .../llumlet/test_local_migration_scheduler.py | 9 ++-- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/llumnix/llumlet/local_migration_scheduler.py 
b/llumnix/llumlet/local_migration_scheduler.py
index 346d3427..a6191606 100644
--- a/llumnix/llumlet/local_migration_scheduler.py
+++ b/llumnix/llumlet/local_migration_scheduler.py
@@ -25,7 +25,7 @@ def __init__(self, request_migration_policy: str, backend_engine: BackendInterfa
 
     def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) -> List[LlumnixRequest]:
         # Requests that meet the strict pre-migration condition always have higher priority than those selected by the migration policy.
-        migrate_out_requests: List[LlumnixRequest] = self.get_required_migration_request(min_request_len, max_request_len)
+        migrate_out_requests: List[LlumnixRequest] = self.get_required_migration_request()
         if len(migrate_out_requests) == 0:
             if self.request_migration_policy == 'LCR':
                 migrate_out_requests = self._get_last_running_request(min_request_len, max_request_len)
@@ -42,46 +42,57 @@ def get_migrate_out_requests(self, min_request_len=0, max_request_len=np.inf) ->
 
     # The function is used to retrieve requests on the backend that have already met the expected_steps.
     # (xinyi): Currently, the function is only used for Prefill-decoding disaggregation,
     # and only selects request that migrates from the prefill instance to the decoding instance.
-    def get_required_migration_request(self, min_request_len, max_request_len):
+    def get_required_migration_request(self):
         running: List[LlumnixRequest] = self.backend_engine.get_running_queue()
         required_migration_requests = []
         for request in reversed(running):
-            if request.output_len >= request.expected_steps \
+            if request.status == RequestStatus.RUNNING \
                 and request.inference_type == RequestInferenceType.DECODE \
-                and min_request_len <= request.request_len <= max_request_len:
+                and request.output_len >= request.expected_steps:
                 required_migration_requests.append(request)
         return required_migration_requests
 
+    def _filter_running_queue(self, running, min_request_len, max_request_len):
+        filtered_running = [
+            request for request in running \
+                if request.status == RequestStatus.RUNNING \
+                and request.inference_type == RequestInferenceType.DECODE \
+                and min_request_len < request.request_len < max_request_len \
+        ]
+        return filtered_running
+
+    def _filter_waiting_queue(self, waiting, min_request_len, max_request_len):
+        filtered_waiting = [
+            request for request in waiting \
+                if request.status == RequestStatus.WAITING \
+                and request.try_schedule_times >= 1 \
+                and min_request_len < request.request_len < max_request_len \
+        ]
+        return filtered_waiting
+
     def _get_last_running_request(self, min_request_len, max_request_len):
         running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue()
-        for request in reversed(running):
-            if request.status == RequestStatus.RUNNING and request.inference_type == RequestInferenceType.DECODE \
-                and min_request_len < request.request_len < max_request_len:
-                return [request]
-        return []
+        filtered_running = self._filter_running_queue(running, min_request_len, max_request_len)
+        return [filtered_running[-1]] if filtered_running else []
 
     def _get_longest_running_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]:
         running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue()
-        condition = lambda request : request.status == RequestStatus.RUNNING \
-                                     and request.inference_type == RequestInferenceType.DECODE \
-                                     and min_request_len < request.request_len < max_request_len
-        longest_seq_group = max((request for request in running if condition(request)), \
+        filtered_running = self._filter_running_queue(running, min_request_len, max_request_len)
+        longest_seq_group = max((request for request in filtered_running), \
                                 key=lambda request: request.request_len, default=None)
         return [longest_seq_group] if longest_seq_group is not None else []
 
     def _get_shortest_running_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]:
         running: Deque[LlumnixRequest] = self.backend_engine.get_running_queue()
-        condition = lambda request : request.status == RequestStatus.RUNNING \
-                                     and request.inference_type == RequestInferenceType.DECODE \
-                                     and min_request_len < request.request_len < max_request_len
-        shortest_seq_group = min((request for request in running if condition(request)), \
+        filtered_running = self._filter_running_queue(running, min_request_len, max_request_len)
+        shortest_seq_group = min((request for request in filtered_running), \
                                  key=lambda request: request.request_len, default=None)
         return [shortest_seq_group] if shortest_seq_group is not None else []
 
     def _get_first_waiting_request(self, min_request_len, max_request_len) -> List[LlumnixRequest]:
         waiting: Deque[LlumnixRequest] = self.backend_engine.get_waiting_queue()
-        waiting = [seq_group for seq_group in waiting if seq_group.try_schedule_times >= 1]
-        return [waiting[0]] if waiting and min_request_len < waiting[0].request_len < max_request_len else []
+        filtered_waiting = self._filter_waiting_queue(waiting, min_request_len, max_request_len)
+        return [filtered_waiting[0]] if filtered_waiting else []
 
     def _get_first_waiting_and_shortest_running_requests(self, min_request_len, max_request_len) -> List[LlumnixRequest]:
         waiting_requests = self._get_first_waiting_request(min_request_len, max_request_len)
diff --git a/tests/unit_test/llumlet/test_local_migration_scheduler.py b/tests/unit_test/llumlet/test_local_migration_scheduler.py
index 8e5d1890..ecca2b71 100644
--- a/tests/unit_test/llumlet/test_local_migration_scheduler.py
+++ b/tests/unit_test/llumlet/test_local_migration_scheduler.py
@@ -16,10 +16,11 @@ from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, Reques
 class MockRequest(LlumnixRequest):
-    def __init__(self, request_id, length, expected_steps) -> None:
+    def __init__(self, request_id, length, expected_steps, status=RequestStatus.RUNNING) -> None:
         super().__init__(request_id=request_id, server_info=None, expected_steps=expected_steps)
         self.length = length
-        self._status = RequestInferenceType.DECODE
+        self._status = status
+        self._inference_type = RequestInferenceType.DECODE
         self._finished = False
         self.try_schedule_times = 0
         self.eom = False
@@ -30,7 +31,7 @@ def finished(self) -> bool:
 
     @property
     def inference_type(self) -> RequestInferenceType:
-        return self._status
+        return self._inference_type
 
     @property
     def request_len(self) -> int:
@@ -65,7 +66,7 @@ def add_request(self, request_id, length, expected_steps) -> None:
         self.running.append(MockRequest(request_id, length, expected_steps))
 
     def add_request_waiting(self, request_id, length, expected_steps) -> None:
-        request = MockRequest(request_id, length, expected_steps)
+        request = MockRequest(request_id, length, expected_steps, status=RequestStatus.WAITING)
         request.try_schedule_times += 1
         self.waiting.append(request)
 
From 6f7ef67520a4903b9e797388f723efb81159ad41 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Mon, 21 Oct 2024 11:20:40 +0000
Subject: [PATCH 27/49] Minors

---
 llumnix/backends/vllm/llm_engine.py | 6 +++---
 llumnix/backends/vllm/sequence.py   | 1 -
 llumnix/llumlet/request.py          | 1 +
 tests/e2e_test/test_e2e.py          | 2 +-
 4 files changed, 5 insertions(+), 5 
deletions(-) diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 5913ee11..16af3014 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -362,9 +362,9 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: async def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None: await dst_ray_actor.execute_engine_method.remote("_run_workers", "migrate_cache", - dst_blocks=dst_blocks, - src_blocks=src_blocks, - src_worker_handle_list=self.worker_handle_list) + dst_blocks=dst_blocks, + src_blocks=src_blocks, + src_worker_handle_list=self.worker_handle_list) def _run_workers(self, *args, **kwargs): # pylint: disable=protected-access diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index 1b226ba1..ce5ca429 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -22,7 +22,6 @@ class SequenceGroupLlumnix(SequenceGroup, LlumnixRequest): def __init__(self, request_id, server_info, expected_steps: int, *args, **kwargs) -> None: SequenceGroup.__init__(self, request_id, *args, **kwargs) LlumnixRequest.__init__(self, request_id, server_info, expected_steps) - self.try_schedule_times = 0 @property def prompt_len(self) -> int: diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index 085dbd71..b47f9112 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -36,6 +36,7 @@ def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int self.last_preemption_time = None self.stage_timestamps = [] self.stage_num_blocks_list = [] + self.try_schedule_times = 0 self.waiting_migrating = False # end-of-migration self.eom = False diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index 9b04108b..33375bd1 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -61,7 +61,7 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool f"--max-model-len {max_model_len} " f"--dispatch-policy {dispatch_policy} " f"--trust-remote-code " - f"--request-migration-policy SR " + f"--request-migration-policy LCR " f"--migration-backend {migration_backend} " f"--migration-cache-blocks 32 " f"--tensor-parallel-size 1 " From 683f22c55ded4a90e47b6e77c55703989a7655c8 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 21 Oct 2024 12:15:30 +0000 Subject: [PATCH 28/49] Fix migration test --- llumnix/backends/vllm/llm_engine.py | 4 ++-- llumnix/llumlet/llumlet.py | 9 +++++---- llumnix/llumlet/migration_coordinator.py | 1 + llumnix/llumlet/request.py | 13 ++++++++++--- tests/e2e_test/test_migration.py | 14 +++++++------- 5 files changed, 25 insertions(+), 16 deletions(-) diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 16af3014..5b127882 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -352,10 +352,10 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: logger.info("add seq {} to block table".format(seq.seq_id)) pre_alloc_blocks = self.engine.scheduler.pre_alloc_cache_dict.pop(backend_request.request_id) self.engine.scheduler.block_manager.add_block_table(pre_alloc_blocks, seq.seq_id) - backend_request.reset_migration_args() + backend_request.reset_migration_args_dst() if backend_request.status == RequestStatus.RUNNING: self.add_running_request(backend_request) - else: # backend_request.status 
== RequestStatus.WAITING + else: # RequestStatus.WAITING backend_request.waiting_migrating = True self.add_waiting_request(backend_request) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 8ef8534c..0c407d5d 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -159,9 +159,10 @@ async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, ds self.backend_engine.free_src_request(migrate_out_request) migrated_request.append(migrate_out_request.request_id) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) - elif status == MigrationStatus.FINISHED_SRC_ABORTED: - migrate_out_request.reset_migration_args() - await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) + else: # FINISHED_SRC_ABORTED or FINISHED_DST_ABORTED + migrate_out_request.reset_migration_args_src() + if status == MigrationStatus.FINISHED_SRC_ABORTED: + await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) t1 = time.time() logger.info("{}->{} migrate done, migrate request {}, migration status: {}, len: {} blocks, cost: {} ms" \ .format(self.instance_id, dst_instance_id, migrated_request, status, \ @@ -218,7 +219,7 @@ def clear_migration_states(self, is_migrate_in: bool) -> None: logger.info("clear_migration_states: add request {} back to engine".format(backend_request.request_id)) if backend_request.status == RequestStatus.RUNNING: self.backend_engine.add_running_request(backend_request) - else: # backend_request.status == RequestStatus.WAITING + else: # RequestStatus.WAITING self.backend_engine.add_waiting_request(backend_request) def execute_migration_method(self, method, *args, **kwargs): diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index dd320fb8..c58a4b98 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -126,6 +126,7 @@ async def _migrate_out_onestage(self, self.backend_engine.add_running_request(migrate_out_request) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) return MigrationStatus.FINISHED_DST_ABORTED + # do stage send/recv migrate_out_request.stage_timestamps.append(time.time()) migrate_out_request.stage_num_blocks_list.append(stage_block_num) diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index b47f9112..a32e1c3a 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -41,12 +41,19 @@ def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int # end-of-migration self.eom = False - def reset_migration_args(self): + def reset_migration_args_dst(self): + # By default, there is no limit on the number of steps expected for the request. + self.expected_steps = math.inf + + self.last_preemption_time = None + self.stage_timestamps = [] + self.stage_num_blocks_list = [] + self.try_schedule_times = 0 + + def reset_migration_args_src(self): self.last_preemption_time = None self.stage_timestamps = [] self.stage_num_blocks_list = [] - # By default, there is no limit on the number of steps expected for the request. 
- self.expected_steps = math.inf @property def inference_type(self) -> RequestInferenceType: diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index ddf7fb51..cbd4ee39 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -42,18 +42,18 @@ def parse_instance_log_file(log_files): speed = float(speed_match.group(1)) speed_dict[total_kv_cache_size].append(speed) - averger_speed = {} + average_speed = {} for transfer_size, speeds in speed_dict.items(): if len(speeds) <= 2: continue speeds.sort() trimmed_speeds = speeds[1:-1] - averger_speed[transfer_size] = sum(trimmed_speeds) / len(trimmed_speeds) + average_speed[transfer_size] = sum(trimmed_speeds) / len(trimmed_speeds) - assert len(averger_speed) > 0, "Migration should have occurred, but it was not detected. " + assert len(average_speed) > 0, "Migration should have occurred, but it was not detected. " - return averger_speed + return average_speed def parse_manager_log_file(log_file): df = pd.read_csv(log_file) @@ -99,13 +99,13 @@ async def run_bench_command(command): parse_manager_log_file("manager_instance.csv") - averger_speed = parse_instance_log_file(instance_output_logs) + average_speed = parse_instance_log_file(instance_output_logs) - sorted_keys = sorted(averger_speed.keys(), key=lambda x: float(x.split()[0])) + sorted_keys = sorted(average_speed.keys(), key=lambda x: float(x.split()[0])) data = [ ['migration_size'] + sorted_keys, - [f'{migration_backend}_speed(GB/s)'] + [f"{averger_speed[key]:.2f}" for key in sorted_keys] + [f'{migration_backend}_speed(GB/s)'] + [f"{average_speed[key]:.2f}" for key in sorted_keys] ] with open("performance.txt", "a", encoding="utf-8") as f: From 81a350a8626979c8d9025027c3d291d608dc72e5 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 01:32:38 +0000 Subject: [PATCH 29/49] Fix lint --- llumnix/llumlet/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index a32e1c3a..e474009b 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -49,7 +49,7 @@ def reset_migration_args_dst(self): self.stage_timestamps = [] self.stage_num_blocks_list = [] self.try_schedule_times = 0 - + def reset_migration_args_src(self): self.last_preemption_time = None self.stage_timestamps = [] From f9a49ee2d4e30e41452c6d29cc484eb2b3da1fc9 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 02:36:16 +0000 Subject: [PATCH 30/49] Fix free dst pre alloc cache bug --- llumnix/backends/vllm/scheduler.py | 2 ++ llumnix/llumlet/llumlet.py | 1 + 2 files changed, 3 insertions(+) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 81fc6892..e927830f 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -132,7 +132,9 @@ def pre_alloc(self, if self.waiting and request_arrival_time > self.waiting[0].arrival_time: return [] blocks = self.block_manager.get_free_blocks(block_num) + # Once dst instance cannot pre alloc, free the pre alloc cache proactively. 
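# --- Illustrative aside, not from the patch: a minimal, self-contained sketch
# of the pre-alloc bookkeeping this hunk changes. Only pre_alloc_cache_dict and
# free_dst_pre_alloc_cache mirror the scheduler above; the class name and the
# plain free-list model are assumptions.
class PreAllocSketch:
    def __init__(self, num_free_blocks: int) -> None:
        self.free_blocks = list(range(num_free_blocks))
        self.pre_alloc_cache_dict = {}  # request_id -> blocks reserved so far

    def pre_alloc(self, request_id: str, block_num: int) -> list:
        if block_num > len(self.free_blocks):
            # Cannot satisfy this stage: release the blocks reserved by earlier
            # stages so an aborted multi-stage migration does not leak
            # pre-allocated cache on the dst instance.
            self.free_dst_pre_alloc_cache(request_id)
            return []
        blocks = self.free_blocks[:block_num]
        del self.free_blocks[:block_num]
        self.pre_alloc_cache_dict.setdefault(request_id, []).extend(blocks)
        return blocks

    def free_dst_pre_alloc_cache(self, request_id: str) -> None:
        self.free_blocks.extend(self.pre_alloc_cache_dict.pop(request_id, []))
# --- end of illustrative aside.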
if len(blocks) < block_num: + self.free_dst_pre_alloc_cache(request_id) return [] pre_blocks = self.pre_alloc_cache_dict.get(request_id, []) pre_blocks.extend(blocks) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 0c407d5d..e7f86ee4 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -161,6 +161,7 @@ async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, ds self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) else: # FINISHED_SRC_ABORTED or FINISHED_DST_ABORTED migrate_out_request.reset_migration_args_src() + # If dst aborts itself, dst proactively frees the pre alloc cache during pre alloc. if status == MigrationStatus.FINISHED_SRC_ABORTED: await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) t1 = time.time() From d86ca8bffba15ad1a26aeb105a1c48d7b0a1780a Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 02:37:27 +0000 Subject: [PATCH 31/49] Add waiting request migration e2e test --- tests/e2e_test/test_e2e.py | 7 ++++--- tests/e2e_test/test_migration.py | 13 ++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index 33375bd1..7ca116ae 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -41,8 +41,9 @@ def parse_launch_mode(launch_mode: str): def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool = True, HEAD_NODE_IP: str = "127.0.0.1", ip: str = "127.0.0.1", port: int = 37000, instances_num = 1, dispatch_policy: str = "load", migration_backend = "gloo", model = "facebook/opt-125m", max_model_len: int = 2048, - launch_mode: str = 'eief', log_instance_info: bool = False, enable_pd_disagg: bool = False, - num_dispatch_instances: int = math.inf): + launch_mode: str = 'eief', log_instance_info: bool = False, + enable_pd_disagg: bool = False, num_dispatch_instances: int = math.inf, + request_migration_policy: str = 'SR'): disable_init_instance_by_manager, disable_fixed_node_init_instance = parse_launch_mode(launch_mode) command = ( f"RAY_DEDUP_LOGS=0 HEAD_NODE_IP={HEAD_NODE_IP} HEAD_NODE=1 " @@ -61,7 +62,7 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool f"--max-model-len {max_model_len} " f"--dispatch-policy {dispatch_policy} " f"--trust-remote-code " - f"--request-migration-policy LCR " + f"--request-migration-policy {request_migration_policy} " f"--migration-backend {migration_backend} " f"--migration-cache-blocks 32 " f"--tensor-parallel-size 1 " diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index cbd4ee39..efc25ccf 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -68,7 +68,13 @@ def parse_manager_log_file(log_file): @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl']) @pytest.mark.parametrize("enable_pd_disagg", [False, True]) +@pytest.mark.parametrize("migrated_request_status", ['running', 'waiting']) async def test_migration_benchmark(model, migration_backend, enable_pd_disagg): + if migrated_request_status == 'waiting' and migration_backend != 'rpc': + pytest.skip("When the migrated request status is waiting, only test the rpc migration backend.") + + request_migration_policy = 'SR' if migrated_request_status == 'running' else 'FCWSR' + base_port = 37037 instance_output_logs = [] @@ -79,8 +85,9 @@ async 
def test_migration_benchmark(model, migration_backend, enable_pd_disagg): instance_output_logs.append("instance_"+output_log) launch_command = generate_launch_command(result_filename=output_log, launch_ray_cluster=False, port=base_port+i, model=model, dispatch_policy="flood", migration_backend=migration_backend, - log_instance_info=True, enable_pd_disagg=enable_pd_disagg, - num_dispatch_instances=num_dispatch_instances) + log_instance_info=True, + enable_pd_disagg=enable_pd_disagg, num_dispatch_instances=num_dispatch_instances, + request_migration_policy=request_migration_policy) subprocess.run(launch_command, shell=True, check=True) await asyncio.sleep(60) @@ -95,7 +102,7 @@ async def run_bench_command(command): dataset_path="/mnt/dataset/sharegpt_gpt4/sharegpt_gpt4.jsonl" , qps=10) await asyncio.wait_for(run_bench_command(bench_command), timeout=60*30) - await asyncio.sleep(30) + await asyncio.sleep(20) parse_manager_log_file("manager_instance.csv") From 6e41b1cf6a7fce3e8ab7843baf9565dd77d13ca2 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 02:37:40 +0000 Subject: [PATCH 32/49] Minors --- llumnix/backends/vllm/sequence.py | 2 +- llumnix/llumlet/migration_coordinator.py | 3 +-- llumnix/llumlet/request.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index ce5ca429..ad6f49cc 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -59,7 +59,7 @@ def status(self) -> RequestStatus: "Only RUNNING, WAITING are expected status for LlumnixRequest" if status == SequenceStatus.RUNNING: request_status = RequestStatus.RUNNING - else: # status == SequenceStatus.WAITING + else: request_status = RequestStatus.WAITING return request_status diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index c58a4b98..96434f6a 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -60,12 +60,11 @@ async def migrate_out_waiting_request(self, """ self.backend_engine.remove_waiting_request(migrate_out_request.request_id) self.backend_engine.add_migrating_out_request_last_stage(migrate_out_request) - prefill_num_blocks = migrate_out_request.prefill_num_blocks dst_blocks = await migrate_in_ray_actor.execute_migration_method \ .remote("migrate_in_pre_alloc", migrate_out_request.request_id, migrate_out_request.status, migrate_out_request.arrival_time, - prefill_num_blocks) + migrate_out_request.prefill_num_blocks) if len(dst_blocks) != prefill_num_blocks: self.backend_engine.add_waiting_request(migrate_out_request) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index e474009b..0b864045 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -38,7 +38,8 @@ def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int self.stage_num_blocks_list = [] self.try_schedule_times = 0 self.waiting_migrating = False - # end-of-migration + + # end-of-migration, for multiple requests migration self.eom = False def reset_migration_args_dst(self): From ce21d502c48f3e3bddcdc93cd9ab0fe8d3fbcd5b Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 02:41:29 +0000 Subject: [PATCH 33/49] Fix test migration --- tests/e2e_test/test_migration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e_test/test_migration.py 
b/tests/e2e_test/test_migration.py index efc25ccf..68251f3a 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -69,7 +69,7 @@ def parse_manager_log_file(log_file): @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl']) @pytest.mark.parametrize("enable_pd_disagg", [False, True]) @pytest.mark.parametrize("migrated_request_status", ['running', 'waiting']) -async def test_migration_benchmark(model, migration_backend, enable_pd_disagg): +async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, migrated_request_status): if migrated_request_status == 'waiting' and migration_backend != 'rpc': pytest.skip("When the migrated request status is waiting, only test the rpc migration backend.") From 4d10eb4250cc5de415e2c7a3862d33288a1b7227 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 06:42:48 +0000 Subject: [PATCH 34/49] Fix some bugs --- llumnix/backends/backend_interface.py | 10 ++++++++-- llumnix/backends/vllm/llm_engine.py | 4 ++-- llumnix/backends/vllm/scheduler.py | 12 ++++++------ llumnix/backends/vllm/sequence.py | 2 +- llumnix/llumlet/llumlet.py | 2 +- llumnix/llumlet/migration_coordinator.py | 10 +++++++--- 6 files changed, 25 insertions(+), 15 deletions(-) diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py index b4a9dcb0..409775b8 100644 --- a/llumnix/backends/backend_interface.py +++ b/llumnix/backends/backend_interface.py @@ -113,7 +113,7 @@ def get_waiting_queue(self) -> Deque[LlumnixRequest]: raise NotImplementedError @abstractmethod - def remove_running_request(self, request_id: str) -> None: + def remove_running_request(self, request_id: str) -> bool: """ Removes a request from the backend's running queue. @@ -124,11 +124,14 @@ def remove_running_request(self, request_id: str) -> None: Args: request_id: A string identifier for the request that is to be removed from the running queue. This ID uniquely identifies the request within the backend system. + + Returns: + True if the request was successfully removed from the running queue, False otherwise. """ raise NotImplementedError @abstractmethod - def remove_waiting_request(self, request_id: str) -> None: + def remove_waiting_request(self, request_id: str) -> bool: """ Removes a request from the backend's waiting queue. @@ -138,6 +141,9 @@ def remove_waiting_request(self, request_id: str) -> None: Args: request_id: A string identifier for the request that is to be removed from the waiting queue. This ID uniquely identifies the request within the backend system. + + Returns: + True if the request was successfully removed from the waiting queue, False otherwise. 
""" raise NotImplementedError diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 5b127882..163a65e5 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -388,10 +388,10 @@ def get_waiting_queue(self) -> Deque[SequenceGroupLlumnix]: def get_request_incremental_blocks(self, *args, **kwargs) -> List[int]: return self.engine.scheduler.get_request_incremental_blocks(*args, **kwargs) - def remove_running_request(self, *args, **kwargs) -> None: + def remove_running_request(self, *args, **kwargs) -> bool: return self.engine.scheduler.remove_running_request(*args, **kwargs) - def remove_waiting_request(self, *args, **kwargs) -> None: + def remove_waiting_request(self, *args, **kwargs) -> bool: return self.engine.scheduler.remove_waiting_request(*args, **kwargs) def add_migrating_out_request_last_stage(self, *args, **kwargs) -> None: diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index e927830f..87484bcf 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -98,17 +98,19 @@ def get_request_incremental_blocks(self, backend_request: LlumnixRequest, pre_st blocks = self.block_manager.get_block_table(seq) return blocks[pre_stage_num_blocks:] - def remove_running_request(self, request_id: str) -> None: + def remove_running_request(self, request_id: str) -> bool: for seq_group in self.running: if seq_group.request_id == request_id: self.running.remove(seq_group) - break + return True + return False - def remove_waiting_request(self, request_id: str) -> None: + def remove_waiting_request(self, request_id: str) -> bool: for seq_group in self.waiting: if seq_group.request_id == request_id: self.waiting.remove(seq_group) - break + return True + return False def add_migrating_out_request_last_stage(self, backend_request: SequenceGroupLlumnix) -> None: self.migrating_out_request_last_stage.append(backend_request) @@ -143,11 +145,9 @@ def pre_alloc(self, return blocks def add_running_request(self, backend_request: LlumnixRequest) -> None: - self._set_status(backend_request, status_to=SequenceStatus.RUNNING) self.running.append(backend_request) def add_waiting_request(self, backend_request: LlumnixRequest) -> None: - self._set_status(backend_request, status_to=SequenceStatus.WAITING) # pylint: disable=E0203 self.waiting.append(backend_request) fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs") diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index ad6f49cc..7044ce56 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -66,4 +66,4 @@ def status(self) -> RequestStatus: @property def prefill_num_blocks(self) -> int: # Get the prefill len of the waiting request. 
- return math.ceil(self.request_len / self.get_seqs()[0].block_size) + return len(self.get_seqs()[0].logical_token_blocks) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index e7f86ee4..36f53944 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -157,8 +157,8 @@ async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, ds await migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request) if migrate_out_request.status == RequestStatus.RUNNING: self.backend_engine.free_src_request(migrate_out_request) - migrated_request.append(migrate_out_request.request_id) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) + migrated_request.append(migrate_out_request.request_id) else: # FINISHED_SRC_ABORTED or FINISHED_DST_ABORTED migrate_out_request.reset_migration_args_src() # If dst aborts itself, dst proactively frees the pre alloc cache during pre alloc. diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index 96434f6a..9b6b5fb5 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -58,14 +58,16 @@ async def migrate_out_waiting_request(self, migrate_out_request: LlumnixRequest) -> "MigrationStatus": """one-stage migration for a waiting request """ - self.backend_engine.remove_waiting_request(migrate_out_request.request_id) + found = self.backend_engine.remove_waiting_request(migrate_out_request.request_id) + if not found: + return MigrationStatus.FINISHED_SRC_ABORTED self.backend_engine.add_migrating_out_request_last_stage(migrate_out_request) dst_blocks = await migrate_in_ray_actor.execute_migration_method \ .remote("migrate_in_pre_alloc", migrate_out_request.request_id, migrate_out_request.status, migrate_out_request.arrival_time, migrate_out_request.prefill_num_blocks) - if len(dst_blocks) != prefill_num_blocks: + if len(dst_blocks) != migrate_out_request.prefill_num_blocks: self.backend_engine.add_waiting_request(migrate_out_request) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) return MigrationStatus.FINISHED_DST_ABORTED @@ -109,7 +111,9 @@ async def _migrate_out_onestage(self, else: # last stage migration, stop inference, transfer all blocks migration_status = MigrationStatus.FINISHED_DONE - self.backend_engine.remove_running_request(migrate_out_request.request_id) + found = self.backend_engine.remove_running_request(migrate_out_request.request_id) + if not found: + return MigrationStatus.FINISHED_SRC_ABORTED self.backend_engine.add_migrating_out_request_last_stage(migrate_out_request) stage_block_num = len(incremental_blocks) src_blocks = incremental_blocks[:] From 8f75b34a5b8baae4de876a0e272ad138ace3584e Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 07:06:40 +0000 Subject: [PATCH 35/49] Fix test migration e2e test --- llumnix/backends/vllm/migration_backend.py | 8 ++++---- tests/e2e_test/test_migration.py | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llumnix/backends/vllm/migration_backend.py b/llumnix/backends/vllm/migration_backend.py index 947d3e7e..4c5525cd 100644 --- a/llumnix/backends/vllm/migration_backend.py +++ b/llumnix/backends/vllm/migration_backend.py @@ -282,13 +282,13 @@ def get_migration_backend(migration_config: MigrationConfig, cache_engine: Cache .format(migration_config.migration_cache_blocks, cache_engine.num_gpu_blocks)) migration_config.migration_cache_blocks = 
cache_engine.num_gpu_blocks - target_col = None + target_migration_backend = None backend = migration_config.migration_backend if backend in ['nccl', 'gloo']: - target_col = RayColMigrationBackend(migration_config, cache_engine, local_rank, scheduling_strategy, + target_migration_backend = RayColMigrationBackend(migration_config, cache_engine, local_rank, scheduling_strategy, is_driver_worker, gpu_cache) else: - target_col = RayRpcMigrationBackend(migration_config, cache_engine, worker_rank, worker_handle_list, + target_migration_backend = RayRpcMigrationBackend(migration_config, cache_engine, worker_rank, worker_handle_list, scheduling_strategy, is_driver_worker, gpu_cache) - return target_col + return target_migration_backend diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 68251f3a..6d8b7003 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -68,12 +68,12 @@ def parse_manager_log_file(log_file): @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl']) @pytest.mark.parametrize("enable_pd_disagg", [False, True]) -@pytest.mark.parametrize("migrated_request_status", ['running', 'waiting']) +@pytest.mark.parametrize("migrated_request_status", ['waiting', 'running']) async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, migrated_request_status): if migrated_request_status == 'waiting' and migration_backend != 'rpc': pytest.skip("When the migrated request status is waiting, only test the rpc migration backend.") - request_migration_policy = 'SR' if migrated_request_status == 'running' else 'FCWSR' + request_migration_policy = 'SR' if migrated_request_status == 'running' else 'FCW' base_port = 37037 instance_output_logs = [] @@ -96,12 +96,14 @@ async def run_bench_command(command): await process.wait() assert process.returncode == 0 + tasks = [] for i in range(device_count//2): bench_command = generate_bench_command(ip_ports=f"127.0.0.1:{base_port+i}", model=model, num_prompts=300, dataset_type="sharegpt", dataset_path="/mnt/dataset/sharegpt_gpt4/sharegpt_gpt4.jsonl" , qps=10) - await asyncio.wait_for(run_bench_command(bench_command), timeout=60*30) + tasks.append(asyncio.create_task(run_bench_command(bench_command))) + await asyncio.gather(*tasks) await asyncio.sleep(20) parse_manager_log_file("manager_instance.csv") @@ -120,4 +122,4 @@ async def run_bench_command(command): shutdown_llumnix_service() clear_ray_state() - await asyncio.sleep(3) + await asyncio.sleep(10) From 506f9e38409bc18740cf17a3b4b5240c50c64aaa Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 11:01:55 +0000 Subject: [PATCH 36/49] Fix exceeds prompt limit bug --- llumnix/backends/vllm/scheduler.py | 16 +++++++++++----- tests/e2e_test/test_migration.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 87484bcf..3b79cbb3 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -20,6 +20,7 @@ from vllm.core.scheduler import (Scheduler, PreemptionMode, SequenceStatus, SequenceGroupMetadata, SchedulerOutputs) from vllm.core.policy import PolicyFactory from vllm.sequence import SequenceGroup +from vllm.core.interfaces import AllocStatus from llumnix.instance_info import InstanceInfo from llumnix.logger import init_logger @@ -131,7 +132,8 @@ def pre_alloc(self, # Only migrate waiting request when the waiting request is 
the earliest arrival one # among the requests of dst instance's waiting queue. if request_status == RequestStatus.WAITING: - if self.waiting and request_arrival_time > self.waiting[0].arrival_time: + if (self.waiting and request_arrival_time > self.waiting[0].arrival_time) \ + or block_num * self.cache_config.block_size > self.prompt_limit: return [] blocks = self.block_manager.get_free_blocks(block_num) # Once dst instance cannot pre alloc, free the pre alloc cache proactively. @@ -153,6 +155,11 @@ def add_waiting_request(self, backend_request: LlumnixRequest) -> None: fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs") self.waiting = fcfs_policy.sort_by_priority(time.time(), self.waiting) + def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: + if seq_group.waiting_migrating: + return AllocStatus.OK + return super().can_allocate(seq_group) + def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: # Change seq status to running, but request status is still waiting_migrating. if seq_group.waiting_migrating: @@ -229,8 +236,8 @@ def _get_instance_info(self, scheduled_seq_groups: List[SequenceGroupLlumnix]) - if scheduled_seq_groups: instance_info.inference_type = scheduled_seq_groups[-1].inference_type # TODO(ZeldaHuang) adapt chunked-prefill - instance_info.num_batched_tokens = sum([seq_group.request_len for seq_group in scheduled_seq_groups])\ - if instance_info.inference_type == RequestInferenceType.PREFILL else len(instance_info.running_seq_lens) + instance_info.num_batched_tokens = sum([seq_group.request_len for seq_group in scheduled_seq_groups]) \ + if instance_info.inference_type == RequestInferenceType.PREFILL else len(instance_info.running_seq_lens) instance_info.finished_request_ids = [seq_group.request_id for seq_group in self.running if seq_group.finished] return instance_info @@ -238,8 +245,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_group_metadata_list, scheduler_outputs = super().schedule() self.update_instance_info_callback(self._get_instance_info([scheduled_seq_group.seq_group \ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups])) - for seq_group in self.waiting: - seq_group.try_schedule_times += 1 + return seq_group_metadata_list, scheduler_outputs def _schedule_running(self, running_queue: deque, *args, **kwargs): diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 6d8b7003..882a8a2f 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -89,7 +89,7 @@ async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, m enable_pd_disagg=enable_pd_disagg, num_dispatch_instances=num_dispatch_instances, request_migration_policy=request_migration_policy) subprocess.run(launch_command, shell=True, check=True) - await asyncio.sleep(60) + await asyncio.sleep(30) async def run_bench_command(command): process = await asyncio.create_subprocess_shell(command) From f983fc095e991fe1e4642e3aa2a53b703792b979 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 22 Oct 2024 11:04:26 +0000 Subject: [PATCH 37/49] Fix lint --- llumnix/backends/backend_interface.py | 2 +- llumnix/backends/vllm/scheduler.py | 3 ++- llumnix/backends/vllm/sequence.py | 2 -- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py index 409775b8..28e1e802 100644 --- a/llumnix/backends/backend_interface.py +++ b/llumnix/backends/backend_interface.py @@ -141,7 +141,7 @@ 
def remove_waiting_request(self, request_id: str) -> bool: Args: request_id: A string identifier for the request that is to be removed from the waiting queue. This ID uniquely identifies the request within the backend system. - + Returns: True if the request was successfully removed from the waiting queue, False otherwise. """ diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 3b79cbb3..877f44ae 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -237,7 +237,8 @@ def _get_instance_info(self, scheduled_seq_groups: List[SequenceGroupLlumnix]) - instance_info.inference_type = scheduled_seq_groups[-1].inference_type # TODO(ZeldaHuang) adapt chunked-prefill instance_info.num_batched_tokens = sum([seq_group.request_len for seq_group in scheduled_seq_groups]) \ - if instance_info.inference_type == RequestInferenceType.PREFILL else len(instance_info.running_seq_lens) + if instance_info.inference_type == RequestInferenceType.PREFILL \ + else len(instance_info.running_seq_lens) instance_info.finished_request_ids = [seq_group.request_id for seq_group in self.running if seq_group.finished] return instance_info diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index 7044ce56..59a6ef08 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -11,8 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math - from vllm.sequence import SequenceGroup, SequenceStatus from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType, RequestStatus From b722d60440fd6fa58b0654ce2c06a64dd91c4661 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 23 Oct 2024 02:28:05 +0000 Subject: [PATCH 38/49] Fix unit test --- llumnix/backends/vllm/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 877f44ae..5024671c 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -246,7 +246,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_group_metadata_list, scheduler_outputs = super().schedule() self.update_instance_info_callback(self._get_instance_info([scheduled_seq_group.seq_group \ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups])) - + for seq_group in self.waiting: + seq_group.try_schedule_times += 1 return seq_group_metadata_list, scheduler_outputs def _schedule_running(self, running_queue: deque, *args, **kwargs): From bcec5555e2fe753baf019cb496e6042546f74b30 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 23 Oct 2024 04:31:32 +0000 Subject: [PATCH 39/49] Minors --- llumnix/backends/vllm/scheduler.py | 4 ---- llumnix/llumlet/migration_coordinator.py | 2 +- tests/e2e_test/test_migration.py | 20 +++++++++----------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 5024671c..3b7f7ed1 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -136,10 +136,6 @@ def pre_alloc(self, or block_num * self.cache_config.block_size > self.prompt_limit: return [] blocks = self.block_manager.get_free_blocks(block_num) - # Once dst instance cannot pre alloc, free the pre alloc cache proactively. 
- if len(blocks) < block_num: - self.free_dst_pre_alloc_cache(request_id) - return [] pre_blocks = self.pre_alloc_cache_dict.get(request_id, []) pre_blocks.extend(blocks) self.pre_alloc_cache_dict[request_id] = pre_blocks diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index 9b6b5fb5..dd766b50 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -115,8 +115,8 @@ async def _migrate_out_onestage(self, if not found: return MigrationStatus.FINISHED_SRC_ABORTED self.backend_engine.add_migrating_out_request_last_stage(migrate_out_request) - stage_block_num = len(incremental_blocks) src_blocks = incremental_blocks[:] + stage_block_num = len(incremental_blocks) dst_blocks = await migrate_in_ray_actor.execute_migration_method \ .remote("migrate_in_pre_alloc", migrate_out_request.request_id, migrate_out_request.status, diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 882a8a2f..3be50324 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -68,7 +68,7 @@ def parse_manager_log_file(log_file): @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl']) @pytest.mark.parametrize("enable_pd_disagg", [False, True]) -@pytest.mark.parametrize("migrated_request_status", ['waiting', 'running']) +@pytest.mark.parametrize("migrated_request_status", ['running', 'waiting']) async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, migrated_request_status): if migrated_request_status == 'waiting' and migration_backend != 'rpc': pytest.skip("When the migrated request status is waiting, only test the rpc migration backend.") @@ -109,16 +109,14 @@ async def run_bench_command(command): parse_manager_log_file("manager_instance.csv") average_speed = parse_instance_log_file(instance_output_logs) - - sorted_keys = sorted(average_speed.keys(), key=lambda x: float(x.split()[0])) - - data = [ - ['migration_size'] + sorted_keys, - [f'{migration_backend}_speed(GB/s)'] + [f"{average_speed[key]:.2f}" for key in sorted_keys] - ] - - with open("performance.txt", "a", encoding="utf-8") as f: - f.write(to_markdown_table(data)) + if migrated_request_status == 'running': + sorted_keys = sorted(average_speed.keys(), key=lambda x: float(x.split()[0])) + data = [ + ['migration_size'] + sorted_keys, + [f'{migration_backend}_speed(GB/s)'] + [f"{average_speed[key]:.2f}" for key in sorted_keys] + ] + with open("performance.txt", "a", encoding="utf-8") as f: + f.write(to_markdown_table(data)) shutdown_llumnix_service() clear_ray_state() From 2e66544b413e433adc9d8da724ed5564a79656ee Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 23 Oct 2024 04:45:33 +0000 Subject: [PATCH 40/49] Fix test migration --- tests/e2e_test/test_migration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 3be50324..8572fe57 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -89,6 +89,7 @@ async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, m enable_pd_disagg=enable_pd_disagg, num_dispatch_instances=num_dispatch_instances, request_migration_policy=request_migration_policy) subprocess.run(launch_command, shell=True, check=True) + await asyncio.sleep(5) await asyncio.sleep(30) async def run_bench_command(command): From be947b09a0f3722cca2fcfc29078f6aabe09cd37 Mon Sep 17 
00:00:00 2001 From: s5u13b Date: Wed, 23 Oct 2024 09:16:14 +0000 Subject: [PATCH 41/49] Fix status bugs --- llumnix/backends/vllm/llm_engine.py | 13 ++++++++---- llumnix/backends/vllm/scheduler.py | 20 ++++++++++--------- llumnix/backends/vllm/sequence.py | 8 +++++--- llumnix/llumlet/llumlet.py | 18 ++++++++++------- llumnix/llumlet/request.py | 11 +++++++++- tests/e2e_test/test_migration.py | 2 +- .../unit_test/backends/vllm/test_migration.py | 2 ++ .../unit_test/backends/vllm/test_scheduler.py | 6 +++--- 8 files changed, 52 insertions(+), 28 deletions(-) diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 163a65e5..f8c1a620 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -353,11 +353,16 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: pre_alloc_blocks = self.engine.scheduler.pre_alloc_cache_dict.pop(backend_request.request_id) self.engine.scheduler.block_manager.add_block_table(pre_alloc_blocks, seq.seq_id) backend_request.reset_migration_args_dst() - if backend_request.status == RequestStatus.RUNNING: - self.add_running_request(backend_request) - else: # RequestStatus.WAITING - backend_request.waiting_migrating = True + assert backend_request.status in [RequestStatus.WAITING_MIGRATING, RequestStatus.RUNNING_MIGRATING], \ + "The status of request migrated to dst instance should be \ + RequestStatus.WAITING_MIGRATING or RequestStatus.RUNNING_MIGRATING" + if backend_request.status == RequestStatus.WAITING_MIGRATING: + self.engine.scheduler.set_status(backend_request, status_to=SequenceStatus.WAITING) self.add_waiting_request(backend_request) + elif backend_request.status == RequestStatus.RUNNING_MIGRATING: + backend_request.reset_status() + self.engine.scheduler.set_status(backend_request, status_to=SequenceStatus.RUNNING) + self.add_running_request(backend_request) async def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None: await dst_ray_actor.execute_engine_method.remote("_run_workers", diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 3b7f7ed1..42d83760 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -103,6 +103,7 @@ def remove_running_request(self, request_id: str) -> bool: for seq_group in self.running: if seq_group.request_id == request_id: self.running.remove(seq_group) + seq_group.set_status(RequestStatus.RUNNING_MIGRATING) return True return False @@ -110,6 +111,7 @@ def remove_waiting_request(self, request_id: str) -> bool: for seq_group in self.waiting: if seq_group.request_id == request_id: self.waiting.remove(seq_group) + seq_group.set_status(RequestStatus.WAITING_MIGRATING) return True return False @@ -131,7 +133,7 @@ def pre_alloc(self, block_num: int) -> List[int]: # Only migrate waiting request when the waiting request is the earliest arrival one # among the requests of dst instance's waiting queue. 
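# Hedged sketch of the gate below as a standalone predicate; the function name
# and flat argument list are assumptions, while the two conditions mirror
# self.waiting, self.cache_config.block_size and self.prompt_limit:
def can_accept_waiting_migration(waiting_queue, request_arrival_time: float,
                                 block_num: int, block_size: int,
                                 prompt_limit: int) -> bool:
    if waiting_queue and request_arrival_time > waiting_queue[0].arrival_time:
        return False  # would jump ahead of an earlier-arrived waiting request
    return block_num * block_size <= prompt_limit  # prompt must fit the limit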
- if request_status == RequestStatus.WAITING: + if request_status == RequestStatus.WAITING_MIGRATING: if (self.waiting and request_arrival_time > self.waiting[0].arrival_time) \ or block_num * self.cache_config.block_size > self.prompt_limit: return [] @@ -152,23 +154,23 @@ def add_waiting_request(self, backend_request: LlumnixRequest) -> None: self.waiting = fcfs_policy.sort_by_priority(time.time(), self.waiting) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: - if seq_group.waiting_migrating: + if seq_group.status == RequestStatus.WAITING_MIGRATING: return AllocStatus.OK return super().can_allocate(seq_group) def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: # Change seq status to running, but request status is still waiting_migrating. - if seq_group.waiting_migrating: + if seq_group.status == RequestStatus.WAITING_MIGRATING: # For the waiting request migrated in, blocks have already been allocated when pre alloc. - self._set_status(seq_group, status_to=SequenceStatus.RUNNING) - seq_group.waiting_migrating = False + self.set_status(seq_group, status_to=SequenceStatus.RUNNING) + seq_group.reset_status() else: super()._allocate_and_set_running(seq_group) - def _set_status(self, - seq_group: SequenceGroup, - status_to: SequenceStatus, - status_from: SequenceStatus = None): + def set_status(self, + seq_group: SequenceGroup, + status_to: SequenceStatus, + status_from: SequenceStatus = None): for seq in seq_group.get_seqs(status=status_from): seq.status = status_to diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py index 59a6ef08..5964f96d 100644 --- a/llumnix/backends/vllm/sequence.py +++ b/llumnix/backends/vllm/sequence.py @@ -52,13 +52,15 @@ def arrival_time(self) -> float: @property def status(self) -> RequestStatus: + if self._status: + return self._status status = self.get_seqs()[0].status - assert status in [SequenceStatus.RUNNING, SequenceStatus.WAITING], \ - "Only RUNNING, WAITING are expected status for LlumnixRequest" if status == SequenceStatus.RUNNING: request_status = RequestStatus.RUNNING - else: + elif status == SequenceStatus.WAITING: request_status = RequestStatus.WAITING + else: + request_status = RequestStatus.FINISHED return request_status @property diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 36f53944..c19e780d 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -148,20 +148,21 @@ async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, ds dst_instance_id = dst_instance_name[len("instance_"):] logger.info("{}->{} begin migrate out".format(self.instance_id, dst_instance_id)) migrated_request = [] - assert migrate_out_request.status in [RequestStatus.WAITING, RequestStatus.RUNNING], "Only migrate out waiting and running request" if migrate_out_request.status == RequestStatus.RUNNING: status = await self.migration_coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) - else: + elif migrate_out_request.status == RequestStatus.WAITING: status = await self.migration_coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) + else: + return migrated_request if status == MigrationStatus.FINISHED_DONE: await migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request) - if migrate_out_request.status == RequestStatus.RUNNING: - self.backend_engine.free_src_request(migrate_out_request) + self.backend_engine.free_src_request(migrate_out_request) 
self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) migrated_request.append(migrate_out_request.request_id) else: # FINISHED_SRC_ABORTED or FINISHED_DST_ABORTED migrate_out_request.reset_migration_args_src() - # If dst aborts itself, dst proactively frees the pre alloc cache during pre alloc. + migrate_out_request.reset_status() + # If dst aborts itself, dst proactively frees the pre allocated cache in migrate_in_pre_alloc. if status == MigrationStatus.FINISHED_SRC_ABORTED: await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) t1 = time.time() @@ -218,9 +219,12 @@ def clear_migration_states(self, is_migrate_in: bool) -> None: migrating_out_requests_last_stage = self.backend_engine.pop_migrating_out_requests_last_stage() for backend_request in migrating_out_requests_last_stage: logger.info("clear_migration_states: add request {} back to engine".format(backend_request.request_id)) - if backend_request.status == RequestStatus.RUNNING: + assert backend_request.status in [RequestStatus.WAITING_MIGRATING, RequestStatus.RUNNING_MIGRATING], \ + "The status of request in migrating_out_requests_last_stage should be \ + RequestStatus.WAITING_MIGRATING or RequestStatus.RUNNING_MIGRATING" + if backend_request.status == RequestStatus.RUNNING_MIGRATING: self.backend_engine.add_running_request(backend_request) - else: # RequestStatus.WAITING + elif backend_request.status == RequestStatus.WAITING_MIGRATING: self.backend_engine.add_waiting_request(backend_request) def execute_migration_method(self, method, *args, **kwargs): diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index 0b864045..d92e6564 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -23,6 +23,9 @@ class RequestInferenceType(str, Enum): class RequestStatus(str, Enum): RUNNING = "running" WAITING = "waiting" + FINISHED = "finished" + RUNNING_MIGRATING = "running_migrating" + WAITING_MIGRATING = "waiting_migrating" class LlumnixRequest: def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int) -> None: @@ -37,7 +40,7 @@ def __init__(self, request_id: int, server_info: ServerInfo, expected_steps: int self.stage_timestamps = [] self.stage_num_blocks_list = [] self.try_schedule_times = 0 - self.waiting_migrating = False + self._status = None # end-of-migration, for multiple requests migration self.eom = False @@ -56,6 +59,12 @@ def reset_migration_args_src(self): self.stage_timestamps = [] self.stage_num_blocks_list = [] + def reset_status(self): + self._status = None + + def set_status(self, status: RequestStatus): + self._status = status + @property def inference_type(self) -> RequestInferenceType: raise NotImplementedError diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 8572fe57..7d4414c0 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -109,8 +109,8 @@ async def run_bench_command(command): parse_manager_log_file("manager_instance.csv") - average_speed = parse_instance_log_file(instance_output_logs) if migrated_request_status == 'running': + average_speed = parse_instance_log_file(instance_output_logs) sorted_keys = sorted(average_speed.keys(), key=lambda x: float(x.split()[0])) data = [ ['migration_size'] + sorted_keys, diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 3dffe8a3..865f28a7 100644 --- 
a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -294,10 +294,12 @@ def test_clear_migration_states(): llumlet.clear_migration_states(is_migrate_in=True) assert len(llumlet.backend_engine.pre_alloc("0", RequestStatus.RUNNING, 0.0, num_gpu_blocks)) == num_gpu_blocks _, seq_group = create_dummy_prompt("0",7,block_size,SequenceStatus.RUNNING) + seq_group.set_status(RequestStatus.RUNNING_MIGRATING) llumlet.backend_engine.add_migrating_out_request_last_stage(seq_group) llumlet.clear_migration_states(is_migrate_in=False) assert len(llumlet.backend_engine.get_running_queue()) == 1 _, seq_group = create_dummy_prompt("0",7,block_size,SequenceStatus.WAITING) + seq_group.set_status(RequestStatus.WAITING_MIGRATING) llumlet.backend_engine.add_migrating_out_request_last_stage(seq_group) llumlet.clear_migration_states(is_migrate_in=False) assert len(llumlet.backend_engine.get_waiting_queue()) == 1 diff --git a/tests/unit_test/backends/vllm/test_scheduler.py b/tests/unit_test/backends/vllm/test_scheduler.py index 10874edd..c8a03981 100644 --- a/tests/unit_test/backends/vllm/test_scheduler.py +++ b/tests/unit_test/backends/vllm/test_scheduler.py @@ -203,12 +203,12 @@ def test_schedule_running(): before_arrival = time.time() _, seq_group = create_dummy_prompt("1", prompt_length=1, block_size=2, expected_steps=math.inf) after_arrival = time.time() - blocks = scheduler.pre_alloc("2", RequestStatus.WAITING, after_arrival, 2) + blocks = scheduler.pre_alloc("2", RequestStatus.WAITING_MIGRATING, after_arrival, 2) assert len(blocks) == 2 scheduler.add_waiting_request(seq_group) - blocks = scheduler.pre_alloc("3", RequestStatus.WAITING, after_arrival, 2) + blocks = scheduler.pre_alloc("3", RequestStatus.WAITING_MIGRATING, after_arrival, 2) assert len(blocks) == 0 - blocks = scheduler.pre_alloc("4", RequestStatus.WAITING, before_arrival, 2) + blocks = scheduler.pre_alloc("4", RequestStatus.WAITING_MIGRATING, before_arrival, 2) assert len(blocks) == 2 def test_try_schedule_times(): From 4acb19938408a6695a1a1b0c38aa26ddaff1ccee Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 23 Oct 2024 11:12:50 +0000 Subject: [PATCH 42/49] Fix set status --- llumnix/backends/vllm/llm_engine.py | 2 -- llumnix/backends/vllm/scheduler.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index f8c1a620..09aeaf3c 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -357,11 +357,9 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: "The status of request migrated to dst instance should be \ RequestStatus.WAITING_MIGRATING or RequestStatus.RUNNING_MIGRATING" if backend_request.status == RequestStatus.WAITING_MIGRATING: - self.engine.scheduler.set_status(backend_request, status_to=SequenceStatus.WAITING) self.add_waiting_request(backend_request) elif backend_request.status == RequestStatus.RUNNING_MIGRATING: backend_request.reset_status() - self.engine.scheduler.set_status(backend_request, status_to=SequenceStatus.RUNNING) self.add_running_request(backend_request) async def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None: diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 42d83760..2e99b891 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -145,9 +145,11 @@ def pre_alloc(self, 
return blocks def add_running_request(self, backend_request: LlumnixRequest) -> None: + self._set_status(backend_request, status_to=SequenceStatus.RUNNING) self.running.append(backend_request) def add_waiting_request(self, backend_request: LlumnixRequest) -> None: + self._set_status(backend_request, status_to=SequenceStatus.WAITING) # pylint: disable=E0203 self.waiting.append(backend_request) fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs") @@ -162,12 +164,12 @@ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: # Change seq status to running, but request status is still waiting_migrating. if seq_group.status == RequestStatus.WAITING_MIGRATING: # For the waiting request migrated in, blocks have already been allocated when pre alloc. - self.set_status(seq_group, status_to=SequenceStatus.RUNNING) + self._set_status(seq_group, status_to=SequenceStatus.RUNNING) seq_group.reset_status() else: super()._allocate_and_set_running(seq_group) - def set_status(self, + def _set_status(self, seq_group: SequenceGroup, status_to: SequenceStatus, status_from: SequenceStatus = None): From 1b4224ca76c78000cf42e7a720e11ded7f6217e3 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 7 Nov 2024 08:41:39 +0000 Subject: [PATCH 43/49] Fix index error when FCWSR --- llumnix/llm_engine_manager.py | 5 +++-- llumnix/llumlet/local_migration_scheduler.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py index 59bcec7b..df77f7a1 100644 --- a/llumnix/llm_engine_manager.py +++ b/llumnix/llm_engine_manager.py @@ -230,14 +230,15 @@ async def migrate_done_callback(ret, migrate_instance_pair: Tuple[str, str]) -> self.instance_migrating[migrate_instance_pair[0]] = False if migrate_instance_pair[1] in self.instance_migrating: self.instance_migrating[migrate_instance_pair[1]] = False - if isinstance(ret, (ray.exceptions.RayActorError, KeyError)): + # TODO(s5u13b): Add more exception types for failover. + if isinstance(ret, (ray.exceptions.RayActorError, ray.exceptions.RayTaskError, KeyError)): has_error_pair = await self._check_instance_error(migrate_instance_pair) for i, has_error in enumerate(has_error_pair): # Instance without error should clear migration states. 
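# Hedged sketch of the per-instance failover step used below; the helper name
# and bool return convention are assumptions, while the exception tuple matches
# the patch (awaiting the Ray ObjectRef works inside an async actor method):
import ray

async def try_clear_migration_states(instance, is_migrate_in: bool) -> bool:
    """Return True if this instance also errored while clearing its states."""
    try:
        await instance.clear_migration_states.remote(is_migrate_in=is_migrate_in)
        return False
    except (ray.exceptions.RayActorError, ray.exceptions.RayTaskError, KeyError):
        return True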
if not has_error: try: await self.instances[migrate_instance_pair[i]].clear_migration_states.remote(is_migrate_in=bool(i)) - except (ray.exceptions.RayActorError, KeyError): + except (ray.exceptions.RayActorError, ray.exceptions.RayTaskError, KeyError): has_error = True for i, has_error in enumerate(has_error_pair): if has_error: diff --git a/llumnix/llumlet/local_migration_scheduler.py b/llumnix/llumlet/local_migration_scheduler.py index a6191606..4f30f850 100644 --- a/llumnix/llumlet/local_migration_scheduler.py +++ b/llumnix/llumlet/local_migration_scheduler.py @@ -97,5 +97,6 @@ def _get_first_waiting_request(self, min_request_len, max_request_len) -> List[L def _get_first_waiting_and_shortest_running_requests(self, min_request_len, max_request_len) -> List[LlumnixRequest]: waiting_requests = self._get_first_waiting_request(min_request_len, max_request_len) running_requests = self._get_shortest_running_request(min_request_len, max_request_len) - waiting_requests[0].eom = True + if waiting_requests: + waiting_requests[0].eom = True return waiting_requests + running_requests From c12e2c509bce0c5c8ff81a335b6be99f8acf0255 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 7 Nov 2024 10:46:30 +0000 Subject: [PATCH 44/49] Remove pd_disagg test in e2e migration test --- tests/e2e_test/test_e2e.py | 3 --- tests/e2e_test/test_migration.py | 5 +---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index 7ca116ae..b66014aa 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -42,7 +42,6 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool ip: str = "127.0.0.1", port: int = 37000, instances_num = 1, dispatch_policy: str = "load", migration_backend = "gloo", model = "facebook/opt-125m", max_model_len: int = 2048, launch_mode: str = 'eief', log_instance_info: bool = False, - enable_pd_disagg: bool = False, num_dispatch_instances: int = math.inf, request_migration_policy: str = 'SR'): disable_init_instance_by_manager, disable_fixed_node_init_instance = parse_launch_mode(launch_mode) command = ( @@ -67,8 +66,6 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool f"--migration-cache-blocks 32 " f"--tensor-parallel-size 1 " f"--request-output-queue-port {1234+port} " - f"{'--enable-pd-disagg ' if enable_pd_disagg else ''} " - f"{f'--num-dispatch-instances {num_dispatch_instances} ' if num_dispatch_instances != math.inf else ''} " f"{'--launch-ray-cluster ' if launch_ray_cluster else ''}" f"{'> instance_'+result_filename if len(result_filename)> 0 else ''} 2>&1 &" ) diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 7d4414c0..028a5f29 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -67,9 +67,8 @@ def parse_manager_log_file(log_file): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="at least 2 gpus required for migration bench") @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl']) -@pytest.mark.parametrize("enable_pd_disagg", [False, True]) @pytest.mark.parametrize("migrated_request_status", ['running', 'waiting']) -async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, migrated_request_status): +async def test_migration_benchmark(model, migration_backend, migrated_request_status): if migrated_request_status == 'waiting' and migration_backend != 'rpc': pytest.skip("When the 
migrated request status is waiting, only test the rpc migration backend.") @@ -79,14 +78,12 @@ async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, m instance_output_logs = [] device_count = torch.cuda.device_count() - num_dispatch_instances = device_count//2 if enable_pd_disagg else math.inf for i in range(device_count): output_log = f"{base_port+i}.out" instance_output_logs.append("instance_"+output_log) launch_command = generate_launch_command(result_filename=output_log, launch_ray_cluster=False, port=base_port+i, model=model, dispatch_policy="flood", migration_backend=migration_backend, log_instance_info=True, - enable_pd_disagg=enable_pd_disagg, num_dispatch_instances=num_dispatch_instances, request_migration_policy=request_migration_policy) subprocess.run(launch_command, shell=True, check=True) await asyncio.sleep(5) From 60a74c2355c0a50ec44ccf5db6c09fa504b4ba2b Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 7 Nov 2024 12:05:56 +0000 Subject: [PATCH 45/49] Fix lint --- tests/e2e_test/test_e2e.py | 1 - tests/e2e_test/test_migration.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index b66014aa..b37aa8f5 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import subprocess import asyncio import pytest diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 028a5f29..ed574afc 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import math import asyncio from collections import defaultdict import re From 56869047f61abb6b614e814058db659d7f7c1a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=97=E5=BF=98?= Date: Mon, 11 Nov 2024 19:13:51 +0800 Subject: [PATCH 46/49] Fix comments --- llumnix/backends/vllm/llm_engine.py | 2 +- llumnix/backends/vllm/scheduler.py | 3 +- llumnix/llumlet/llumlet.py | 8 +++--- llumnix/llumlet/migration_coordinator.py | 28 +++++++++---------- .../llumlet/test_migration_coordinator.py | 18 ++++++------ 5 files changed, 29 insertions(+), 30 deletions(-) diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 09aeaf3c..59b41fa7 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -358,7 +358,7 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None: RequestStatus.WAITING_MIGRATING or RequestStatus.RUNNING_MIGRATING" if backend_request.status == RequestStatus.WAITING_MIGRATING: self.add_waiting_request(backend_request) - elif backend_request.status == RequestStatus.RUNNING_MIGRATING: + else: # RUNNING_MIGRATING: backend_request.reset_status() self.add_running_request(backend_request) diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py index 2e99b891..4c6403ae 100644 --- a/llumnix/backends/vllm/scheduler.py +++ b/llumnix/backends/vllm/scheduler.py @@ -192,8 +192,7 @@ def free_dst_pre_alloc_cache(self, request_id: str = None) -> None: def free_src_request(self, backend_request: SequenceGroupLlumnix) -> None: seq = backend_request.get_seqs()[0] - logger.info("free request: {}".format(backend_request.request_id)) - logger.info("free seq: {}".format(seq.seq_id)) + logger.info("free request: {}, free seq: {}".format(backend_request.request_id, seq.seq_id)) self.free_seq(seq) def _get_instance_info(self, scheduled_seq_groups: List[SequenceGroupLlumnix]) -> InstanceInfo: diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index c19e780d..02d4bd99 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -154,16 +154,16 @@ async def _migrate_out_one_request(self, migrate_out_request: LlumnixRequest, ds status = await self.migration_coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request) else: return migrated_request - if status == MigrationStatus.FINISHED_DONE: + if status == MigrationStatus.FINISHED: await migrate_in_ray_actor.execute_engine_method.remote("commit_dst_request", migrate_out_request) self.backend_engine.free_src_request(migrate_out_request) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) migrated_request.append(migrate_out_request.request_id) - else: # FINISHED_SRC_ABORTED or FINISHED_DST_ABORTED + else: # ABORTED_SRC or ABORTED_DST migrate_out_request.reset_migration_args_src() migrate_out_request.reset_status() # If dst aborts itself, dst proactively frees the pre allocated cache in migrate_in_pre_alloc. 
- if status == MigrationStatus.FINISHED_SRC_ABORTED: + if status == MigrationStatus.ABORTED_SRC: await migrate_in_ray_actor.execute_migration_method.remote("free_dst_pre_alloc_cache", migrate_out_request.request_id) t1 = time.time() logger.info("{}->{} migrate done, migrate request {}, migration status: {}, len: {} blocks, cost: {} ms" \ @@ -224,7 +224,7 @@ def clear_migration_states(self, is_migrate_in: bool) -> None: RequestStatus.WAITING_MIGRATING or RequestStatus.RUNNING_MIGRATING" if backend_request.status == RequestStatus.RUNNING_MIGRATING: self.backend_engine.add_running_request(backend_request) - elif backend_request.status == RequestStatus.WAITING_MIGRATING: + else # WAITING_MIGRATING: self.backend_engine.add_waiting_request(backend_request) def execute_migration_method(self, method, *args, **kwargs): diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index dd766b50..224c41c3 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -27,16 +27,16 @@ class MigrationStatus(enum.Enum): """Status of Migration.""" RUNNING = enum.auto() - FINISHED_DST_ABORTED = enum.auto() - FINISHED_SRC_ABORTED = enum.auto() - FINISHED_DONE = enum.auto() + ABORTED_DST = enum.auto() + ABORTED_SRC = enum.auto() + FINISHED = enum.auto() @staticmethod def is_finished(status: "MigrationStatus") -> bool: return status in [ - MigrationStatus.FINISHED_DST_ABORTED, - MigrationStatus.FINISHED_SRC_ABORTED, - MigrationStatus.FINISHED_DONE + MigrationStatus.ABORTED_DST, + MigrationStatus.ABORTED_SRC, + MigrationStatus.FINISHED ] class MigrationCoordinator: @@ -60,7 +60,7 @@ async def migrate_out_waiting_request(self, """ found = self.backend_engine.remove_waiting_request(migrate_out_request.request_id) if not found: - return MigrationStatus.FINISHED_SRC_ABORTED + return MigrationStatus.ABORTED_SRC self.backend_engine.add_migrating_out_request_last_stage(migrate_out_request) dst_blocks = await migrate_in_ray_actor.execute_migration_method \ .remote("migrate_in_pre_alloc", migrate_out_request.request_id, @@ -70,9 +70,9 @@ async def migrate_out_waiting_request(self, if len(dst_blocks) != migrate_out_request.prefill_num_blocks: self.backend_engine.add_waiting_request(migrate_out_request) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) - return MigrationStatus.FINISHED_DST_ABORTED + return MigrationStatus.ABORTED_DST - return MigrationStatus.FINISHED_DONE + return MigrationStatus.FINISHED async def _migrate_out_multistage(self, migrate_in_ray_actor: "ray.actor.ActorHandle", @@ -88,7 +88,7 @@ async def _migrate_out_multistage(self, if MigrationStatus.is_finished(status): return status # exceed max stages - return MigrationStatus.FINISHED_SRC_ABORTED + return MigrationStatus.ABORTED_SRC async def _migrate_out_onestage(self, migrate_in_ray_actor: "ray.actor.ActorHandle", @@ -110,10 +110,10 @@ async def _migrate_out_onestage(self, stage_block_num) else: # last stage migration, stop inference, transfer all blocks - migration_status = MigrationStatus.FINISHED_DONE + migration_status = MigrationStatus.FINISHED found = self.backend_engine.remove_running_request(migrate_out_request.request_id) if not found: - return MigrationStatus.FINISHED_SRC_ABORTED + return MigrationStatus.ABORTED_SRC self.backend_engine.add_migrating_out_request_last_stage(migrate_out_request) src_blocks = incremental_blocks[:] stage_block_num = len(incremental_blocks) @@ -128,7 +128,7 @@ async def _migrate_out_onestage(self, if 
is_last_stage: self.backend_engine.add_running_request(migrate_out_request) self.backend_engine.remove_migrating_out_request_last_stage(migrate_out_request) - return MigrationStatus.FINISHED_DST_ABORTED + return MigrationStatus.ABORTED_DST # do stage send/recv migrate_out_request.stage_timestamps.append(time.time()) @@ -137,7 +137,7 @@ async def _migrate_out_onestage(self, await self.backend_engine.send_blocks(migrate_in_ray_actor, src_blocks, dst_blocks) if not is_last_stage and migrate_out_request.should_abort_migration(): # migrate-out request abort by scheduler during send/recv - return MigrationStatus.FINISHED_SRC_ABORTED + return MigrationStatus.ABORTED_SRC return migration_status diff --git a/tests/unit_test/llumlet/test_migration_coordinator.py b/tests/unit_test/llumlet/test_migration_coordinator.py index f0244836..fcdf0638 100644 --- a/tests/unit_test/llumlet/test_migration_coordinator.py +++ b/tests/unit_test/llumlet/test_migration_coordinator.py @@ -60,7 +60,7 @@ async def test_migrate_out_onestage(setup_ray_env): migrate_out_request.blocking_migration = False migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) - assert status == MigrationStatus.FINISHED_DONE + assert status == MigrationStatus.FINISHED migrate_out_request = MagicMock() # Test migration dst aborted scenario @@ -71,7 +71,7 @@ async def test_migrate_out_onestage(setup_ray_env): migrate_out_request.blocking_migration = False migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) - assert status == MigrationStatus.FINISHED_DST_ABORTED + assert status == MigrationStatus.ABORTED_DST # Test migration src aborted scenario migrate_out_request = MagicMock() @@ -82,7 +82,7 @@ async def test_migrate_out_onestage(setup_ray_env): migrate_out_request.blocking_migration = False migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks) status = await coordinator._migrate_out_onestage(migrate_in_ray_actor, migrate_out_request) - assert status == MigrationStatus.FINISHED_SRC_ABORTED + assert status == MigrationStatus.ABORTED_SRC # setup_ray_env should be passed after _migrate_out_onestage @patch.object(MigrationCoordinator, '_migrate_out_onestage') @@ -101,10 +101,10 @@ async def test_migrate_out_running_request(_, setup_ray_env): migrate_in_ray_actor.execute_engine_method.remote = MagicMock() migrate_in_ray_actor.execute_engine_method.remote.return_value = ray_remote_call.remote([1]) migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote([1]) - coordinator._migrate_out_onestage.side_effect = [MigrationStatus.FINISHED_DONE] + coordinator._migrate_out_onestage.side_effect = [MigrationStatus.FINISHED] status = await coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) assert coordinator._migrate_out_onestage.call_count == 1 - assert status == MigrationStatus.FINISHED_DONE + assert status == MigrationStatus.FINISHED max_stages = 3 coordinator._migrate_out_onestage.side_effect = [MigrationStatus.RUNNING, @@ -113,7 +113,7 @@ async def test_migrate_out_running_request(_, setup_ray_env): MigrationStatus.RUNNING] status = await coordinator.migrate_out_running_request(migrate_in_ray_actor, migrate_out_request) assert 
coordinator._migrate_out_onestage.call_count == max_stages + 1
-    assert status == MigrationStatus.FINISHED_SRC_ABORTED
+    assert status == MigrationStatus.ABORTED_SRC
 
 @pytest.mark.asyncio
 async def test_migrate_out_waiting_request():
@@ -125,7 +125,7 @@ async def test_migrate_out_waiting_request():
 
     # Create an instance of MigrationCoordinator
     coordinator = MigrationCoordinator(backend_engine, last_stage_max_blocks=1, max_stages=3)
-    # Test FINISHED_DONE
+    # Test FINISHED
     migrate_out_request.prefill_num_blocks = 3
     dst_blocks = [1, 2, 3]
     migrate_in_ray_actor.execute_engine_method = MagicMock()
     migrate_in_ray_actor.execute_engine_method.remote = MagicMock()
     migrate_in_ray_actor.execute_engine_method.remote.return_value = ray_remote_call.remote(dst_blocks)
     migrate_in_ray_actor.execute_migration_method.remote.return_value = ray_remote_call.remote(dst_blocks)
     status = await coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request)
-    assert status == MigrationStatus.FINISHED_DONE
+    assert status == MigrationStatus.FINISHED
 
-    # Test FINISHED_ABORTED
+    # Test ABORTED_DST
     migrate_out_request.prefill_num_blocks = 2
     status = await coordinator.migrate_out_waiting_request(migrate_in_ray_actor, migrate_out_request)
-    assert status == MigrationStatus.FINISHED_DST_ABORTED
+    assert status == MigrationStatus.ABORTED_DST

From ed171cf10b67b1acb79f8d6d8d357b9b4cffcdb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=81=97=E5=BF=98?=
Date: Mon, 11 Nov 2024 20:40:51 +0800
Subject: [PATCH 47/49] Fix

---
 Makefile                                        | 14 +++++++-------
 llumnix/arg_utils.py                            |  1 +
 llumnix/backends/vllm/migration_backend.py      |  2 +-
 llumnix/backends/vllm/worker.py                 |  6 ++----
 llumnix/global_scheduler/dispatch_scheduler.py  |  2 +-
 llumnix/global_scheduler/migration_scheduler.py |  4 ----
 llumnix/llumlet/llumlet.py                      |  2 +-
 tests/e2e_test/test_bench.py                    |  4 ++--
 tests/e2e_test/test_e2e.py                      |  4 ++--
 tests/e2e_test/test_migration.py                |  2 +-
 tests/e2e_test/utils.py                         |  2 +-
 11 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/Makefile b/Makefile
index bb0db511..8f75c380 100644
--- a/Makefile
+++ b/Makefile
@@ -29,15 +29,15 @@ lint: check_pylint_installed check_pytest_installed
 
 .PHONY: test
 test: check_pytest_installed
-	@pytest -v -x --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
+	@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
 	@python examlpes/offline_inference.py
-	@pytest -v -x tests/e2e_test/test_e2e.py
-	@pytest -v -x ./tests/e2e_test/test_bench.py
-	@pytest -v -x ./tests/e2e_test/test_migration.py
+	@pytest -v ./tests/e2e_test/test_e2e.py
+	@pytest -v ./tests/e2e_test/test_bench.py
+	@pytest -v ./tests/e2e_test/test_migration.py
 
 .PHONY: unit_test
 unit_test: check_pytest_installed
-	@pytest -v -x --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
+	@pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
 
 .PHONY: offline_test
 offline_test:
@@ -45,11 +45,11 @@ offline_test:
 
 .PHONY: e2e_test
 e2e_test:
-	@pytest -v -x tests/e2e_test/test_e2e.py
+	@pytest -v ./tests/e2e_test/test_e2e.py
 
 .PHONY: bench_test
 bench_test:
-	@pytest -v -x ./tests/e2e_test/test_bench.py
+	@pytest -v ./tests/e2e_test/test_bench.py
 
 .PHONY: migration_test
 migration_test:
diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
index 8d112a91..1c4c54b4 100644
--- a/llumnix/arg_utils.py
+++ b/llumnix/arg_utils.py
@@ -22,6 +22,7 @@
 from llumnix.config import LlumnixConfig, get_llumnix_config
 from llumnix.config.default import _C
 
+
 class LlumnixArgumentParser(argparse.ArgumentParser):
     def 
__init__(self, *args, **kwargs): self.cur_namespace = "llumnix" diff --git a/llumnix/backends/vllm/migration_backend.py b/llumnix/backends/vllm/migration_backend.py index 674af01d..950c1b31 100644 --- a/llumnix/backends/vllm/migration_backend.py +++ b/llumnix/backends/vllm/migration_backend.py @@ -288,7 +288,7 @@ def get_migration_backend(migration_config: MigrationConfig, cache_engine: Cache target_migration_backend = None backend = migration_config.migration_backend - assert backend in ['nccl', 'gloo', 'rpc'], "Unsupported backend: {} for VLLM".format(backend) + assert backend in ['nccl', 'gloo', 'rpc'], "Unsupported migration backend: {} for llumnix".format(backend) if backend in ['nccl', 'gloo']: target_migration_backend = RayColMigrationBackend(migration_config, cache_engine, local_rank, scheduling_strategy, diff --git a/llumnix/backends/vllm/worker.py b/llumnix/backends/vllm/worker.py index 2b0cab33..e38c3423 100644 --- a/llumnix/backends/vllm/worker.py +++ b/llumnix/backends/vllm/worker.py @@ -111,10 +111,8 @@ def migrate_cache(self, src_worker_handle_list, src_blocks: List[int], dst_block start_time = time.time() try: self.migration_backend.migrate_cache(src_worker_handle, src_blocks, dst_blocks) - # pylint: disable=broad-except - except Exception as e: - logger.info("[migrate_cache] self.rank: {}, src_worker_handle {}, meet error : {}" - .format(self.rank, src_worker_handle, e)) + except ray.exceptions.RayActorError: + logger.info("[migrate_cache] self.rank: {}, src_worker_handle {} is dead".format(self.rank, src_worker_handle)) end_time = time.time() total_kv_cache_size = len(src_blocks) * CacheEngine.get_cache_block_size( diff --git a/llumnix/global_scheduler/dispatch_scheduler.py b/llumnix/global_scheduler/dispatch_scheduler.py index 27458f26..51a0d36b 100644 --- a/llumnix/global_scheduler/dispatch_scheduler.py +++ b/llumnix/global_scheduler/dispatch_scheduler.py @@ -71,7 +71,7 @@ def remove_instance(self, instance_id: str) -> None: del self.instance_num_requests[instance_id] if instance_id in self.available_dispatch_instance_set: self.available_dispatch_instance_set.remove(instance_id) - + # TODO(KuilongCui): Check it when there is no decode instance. 
if self.num_instances >= self.num_dispatch_instances: free_instance_id = next(iter(self.instance_id_set - self.available_dispatch_instance_set)) self.available_dispatch_instance_set.add(free_instance_id) diff --git a/llumnix/global_scheduler/migration_scheduler.py b/llumnix/global_scheduler/migration_scheduler.py index 77fd9b25..3445b210 100644 --- a/llumnix/global_scheduler/migration_scheduler.py +++ b/llumnix/global_scheduler/migration_scheduler.py @@ -170,10 +170,8 @@ def pair_migration(self, migrate_instance_pairs = [] for i in range(min(len(sorted_src_instance_infos), len(sorted_dst_instance_infos))): load_diff_before_mig = sorted_src_instance_infos[i].instance_load_migrate - sorted_dst_instance_infos[i].instance_load_migrate - left_load_after_mig = self._compute_instance_load_after_migrate(sorted_src_instance_infos[i], is_migrate_in=False) right_load_after_mig = self._compute_instance_load_after_migrate(sorted_dst_instance_infos[i], is_migrate_in=True) - # Add some constrains to reduce unnecessary migrations if right_load_after_mig > self.migrate_out_load_threshold: continue @@ -186,14 +184,12 @@ def pair_migration(self, def _compute_instance_load_after_migrate(self, instance_info: InstanceInfo, is_migrate_in: bool) -> float: instance_info_after_migrate = copy.deepcopy(instance_info) num_blocks_last_running_request = instance_info_after_migrate.num_blocks_last_running_request - if is_migrate_in: instance_info_after_migrate.num_running_requests += 1 instance_info_after_migrate.num_free_gpu_blocks -= num_blocks_last_running_request else: instance_info_after_migrate.num_running_requests -= 1 instance_info_after_migrate.num_free_gpu_blocks += num_blocks_last_running_request - return self.instance_load_calculator.compute_instance_load(instance_info_after_migrate, action='migrate') class DefragConstrained(PairMigrationPolicy): diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 02d4bd99..3af73ac5 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -224,7 +224,7 @@ def clear_migration_states(self, is_migrate_in: bool) -> None: RequestStatus.WAITING_MIGRATING or RequestStatus.RUNNING_MIGRATING" if backend_request.status == RequestStatus.RUNNING_MIGRATING: self.backend_engine.add_running_request(backend_request) - else # WAITING_MIGRATING: + else: # WAITING_MIGRATING self.backend_engine.add_waiting_request(backend_request) def execute_migration_method(self, method, *args, **kwargs): diff --git a/tests/e2e_test/test_bench.py b/tests/e2e_test/test_bench.py index eb93fb89..5eba27d1 100644 --- a/tests/e2e_test/test_bench.py +++ b/tests/e2e_test/test_bench.py @@ -21,7 +21,7 @@ from .test_e2e import generate_launch_command, clear_ray_state # pylint: disable=unused-import -from .utils import to_markdown_table, clean_ray +from .utils import to_markdown_table, setup_ray_env def launch_llumnix_service(command): subprocess.run(command, shell=True, check=True) @@ -91,7 +91,7 @@ def get_markdown_data(key: str, head_name: str): @pytest.mark.asyncio @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="at least 1 gpus required for simple benchmark") @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) -async def test_simple_benchmark(clean_ray, model): +async def test_simple_benchmark(setup_ray_env, model): device_count = torch.cuda.device_count() base_port = 37037 for i in range(device_count): diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index f4b875b6..a3bf1977 100644 --- a/tests/e2e_test/test_e2e.py +++ 
b/tests/e2e_test/test_e2e.py @@ -20,7 +20,7 @@ from vllm import LLM, SamplingParams # pylint: disable=unused-import -from .utils import clean_ray +from .utils import setup_ray_env def parse_launch_mode(launch_mode: str): # 'eief' means that enable init instance by manager and enable fixed node init instance, and so on. @@ -140,7 +140,7 @@ def run_vllm(model, max_model_len, sampling_params): @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo']) @pytest.mark.parametrize("launch_mode", ['eief', 'eidf', 'dief', 'didf']) -async def test_e2e(clean_ray, model, migration_backend, launch_mode): +async def test_e2e(setup_ray_env, model, migration_backend, launch_mode): if migration_backend == 'gloo' and launch_mode != 'eief': pytest.skip("When the migration backend is gloo, the launch mode of llumnix can only be eief") max_model_len = 370 diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py index 47a12c43..ced1e0be 100644 --- a/tests/e2e_test/test_migration.py +++ b/tests/e2e_test/test_migration.py @@ -22,7 +22,7 @@ from .test_e2e import generate_launch_command from .test_bench import generate_bench_command, clear_ray_state, shutdown_llumnix_service # pylint: disable=unused-import -from .utils import to_markdown_table, clean_ray +from .utils import to_markdown_table, setup_ray_env size_pattern = re.compile(r'total_kv_cache_size:\s*([\d.]+)\s*(B|KB|MB|GB|KB|TB)') speed_pattern = re.compile(r'speed:\s*([\d.]+)GB/s') diff --git a/tests/e2e_test/utils.py b/tests/e2e_test/utils.py index 492eb2fd..1c38dcc8 100644 --- a/tests/e2e_test/utils.py +++ b/tests/e2e_test/utils.py @@ -33,7 +33,7 @@ def to_markdown_table(data): return table @pytest.fixture -def clean_ray(): +def setup_ray_env(): subprocess.run(["ray", "stop"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) subprocess.run(["ray", "start", "--head", "--disable-usage-stats", "--port=6379"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) From 193daef4aec0e4122fca19d38be0f0a16474e3d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=97=E5=BF=98?= Date: Mon, 11 Nov 2024 20:43:57 +0800 Subject: [PATCH 48/49] Fix unit test --- tests/unit_test/global_scheduler/test_dispatch_scheduler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/unit_test/global_scheduler/test_dispatch_scheduler.py b/tests/unit_test/global_scheduler/test_dispatch_scheduler.py index e82a5871..fedaa154 100644 --- a/tests/unit_test/global_scheduler/test_dispatch_scheduler.py +++ b/tests/unit_test/global_scheduler/test_dispatch_scheduler.py @@ -48,10 +48,7 @@ def test_add_instance_and_remove_instance(dispatch_scheduler, num_dispatch_insta dispatch_scheduler.remove_instance('instance_2') assert dispatch_scheduler.num_instances == 1 - if dispatch_scheduler.num_dispatch_instances >= 2: - assert len(dispatch_scheduler.available_dispatch_instance_set) == 1 - else: - assert len(dispatch_scheduler.available_dispatch_instance_set) == 0 + assert len(dispatch_scheduler.available_dispatch_instance_set) == 1 dispatch_scheduler.remove_instance('instance_3') assert dispatch_scheduler.num_instances == 0 From 343f6e871671f9259380ad24477b688c8ae04a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=97=E5=BF=98?= Date: Tue, 12 Nov 2024 10:30:11 +0800 Subject: [PATCH 49/49] Fix --- tests/unit_test/backends/vllm/test_migration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_test/backends/vllm/test_migration.py 
b/tests/unit_test/backends/vllm/test_migration.py
index ef6f764c..b74950c2 100644
--- a/tests/unit_test/backends/vllm/test_migration.py
+++ b/tests/unit_test/backends/vllm/test_migration.py
@@ -207,7 +207,7 @@ async def test_correctness(prompt):
 @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl'])
 @pytest.mark.asyncio
-async def test_pd_diaggregation_correctness(setup_ray_env, migration_backend):
-    engine_args = EngineArgs(model="facebook/opt-125m",worker_use_ray=True)
+async def test_pd_disaggregation_correctness(setup_ray_env, migration_backend):
+    engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
     id_rank_map = {"0":0, "1":1}
     migration_config = MigrationConfig("SR", migration_backend, 16, 1, 4, 5, 20, 2)
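
For reference, the migration-status contract that PATCH 46 settles on can be
summarized in a short, self-contained Python sketch. This is illustrative only:
run_one_stage stands in for MigrationCoordinator._migrate_out_onestage and the
toy driver at the bottom is hypothetical, while the enum members, is_finished,
and the max_stages fallback mirror the diffs above.

    import enum

    class MigrationStatus(enum.Enum):
        """Status of Migration (post-rename: aborted states name who aborted)."""
        RUNNING = enum.auto()
        ABORTED_DST = enum.auto()   # dst could not pre-allocate the requested blocks
        ABORTED_SRC = enum.auto()   # src dropped the request, or migration ran out of stages
        FINISHED = enum.auto()

        @staticmethod
        def is_finished(status: "MigrationStatus") -> bool:
            return status in (MigrationStatus.ABORTED_DST,
                              MigrationStatus.ABORTED_SRC,
                              MigrationStatus.FINISHED)

    def migrate_out_multistage(run_one_stage, max_stages: int = 3) -> MigrationStatus:
        # Same shape as MigrationCoordinator._migrate_out_multistage: repeat
        # one-stage migration until a terminal status is reached; exceeding
        # max_stages counts as a src-side abort.
        for _ in range(max_stages):
            status = run_one_stage()
            if MigrationStatus.is_finished(status):
                return status
        return MigrationStatus.ABORTED_SRC

    # Toy driver: two RUNNING stages (incremental blocks still growing),
    # then the last stage finishes and the dst request can be committed.
    stages = iter([MigrationStatus.RUNNING, MigrationStatus.RUNNING,
                   MigrationStatus.FINISHED])
    assert migrate_out_multistage(lambda: next(stages)) is MigrationStatus.FINISHED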