From 8d8984dba9e97a5f0e7897d446dd8d29d25d5168 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Fri, 10 Jan 2025 02:20:02 +0000
Subject: [PATCH] Refine simulator mode

---
 docs/Arguments.md                          |  4 +++
 llumnix/arg_utils.py                       |  7 +++++
 llumnix/backends/backend_interface.py      |  5 +++
 llumnix/backends/profiling.py              |  3 +-
 llumnix/backends/utils.py                  | 31 +++++++++----------
 llumnix/config/default.py                  |  2 ++
 llumnix/entrypoints/bladellm/api_server.py |  3 +-
 llumnix/entrypoints/vllm/api_server.py     |  3 +-
 llumnix/entrypoints/vllm/serve.py          |  1 +
 llumnix/llumlet/llumlet.py                 |  2 +-
 llumnix/manager.py                         |  3 +-
 .../global_scheduler/test_manager.py       |  2 +-
 12 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/docs/Arguments.md b/docs/Arguments.md
index 4374f3b2..f5be9210 100644
--- a/docs/Arguments.md
+++ b/docs/Arguments.md
@@ -38,6 +38,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
                 [--disable-log-requests-manager]
                 [--log-instance-info]
                 [--log-filename LOG_FILENAME]
+                [--simulator-mode]
                 [--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
                 [--gpu-type GPU_TYPE]
                 [--migration-backend {gloo,nccl,rayrpc,grpc,kvtransfer}]
@@ -181,6 +182,9 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Log filename.
 - Default: "server.log"
 
+`--simulator-mode`
+- Enable simulator mode.
+
 `--profiling-result-file-path`
 - Profiling result file path when using simulator.
 - Default: None
diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
index c882914a..c3a7d9ff 100644
--- a/llumnix/arg_utils.py
+++ b/llumnix/arg_utils.py
@@ -136,6 +136,7 @@ class ManagerArgs:
     disable_log_requests_manager: bool = None
     log_instance_info: bool = None
     log_filename: str = None
+    simulator_mode: bool = None
     profiling_result_file_path: str = None
 
     migration_backend: str = None
@@ -218,6 +219,9 @@ def check_args(cls, args: 'ManagerArgs', parser: argparse.ArgumentParser):
                 ("When using kvTransfer as migration backend, "
                  "do not set --migration-backend-transfer-type as empty.")
 
+        assert not args.simulator_mode or args.profiling_result_file_path is not None, \
+            "Please set --profiling-result-file-path when enabling simulator mode."
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument('--initial-instances',
@@ -309,6 +313,9 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument('--profiling-result-file-path',
                             type=str,
                             help='profiling result file path when using simulator')
+        parser.add_argument('--simulator-mode',
+                            action='store_true',
+                            help='enable simulator mode')
         parser.add_argument('--migration-backend',
                             type=str,
                             choices=['gloo','nccl','rayrpc','grpc','kvtransfer'],
diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py
index 257c2189..d8631b84 100644
--- a/llumnix/backends/backend_interface.py
+++ b/llumnix/backends/backend_interface.py
@@ -29,6 +29,11 @@ class EngineState(str, Enum):
 class BackendType(str, Enum):
     VLLM = "VLLM"
     BLADELLM = "BLADELLM"
+    SIM_VLLM = "SIM_VLLM"
+
+    @staticmethod
+    def is_sim_backend(status: "BackendType") -> bool:
+        return status in [BackendType.SIM_VLLM]
 
 
 class BackendInterface(ABC):
diff --git a/llumnix/backends/profiling.py b/llumnix/backends/profiling.py
index cf21fcc4..b79afcc1 100644
--- a/llumnix/backends/profiling.py
+++ b/llumnix/backends/profiling.py
@@ -178,7 +178,8 @@ def model_decode(x, a, b, c):
     return a * bs + b * tot_seq_len + c
 
 def get_latency_mem(backend_type: BackendType, profiling_database: ProfilingDatabase, **backend_args):
-    if backend_type == BackendType.VLLM:
+    assert BackendType.is_sim_backend(backend_type)
+    if backend_type == BackendType.SIM_VLLM:
         # TODO(ZeldaHuang): support multi-lora, more device, vision language model
         model_config = backend_args.get("model_config")
         _ = backend_args.get("cache_config")
diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py
index 501d8a54..8659c016 100644
--- a/llumnix/backends/utils.py
+++ b/llumnix/backends/utils.py
@@ -71,22 +71,12 @@ def init_backend_engine(instance_id: str,
                         profiling_result_file_path: str = None) -> BackendInterface:
     if backend_type == BackendType.VLLM:
         # pylint: disable=import-outside-toplevel
-        if profiling_result_file_path is None:
-            from llumnix.backends.vllm.llm_engine import BackendVLLM
-            backend_engine = BackendVLLM(instance_id,
-                                         placement_group,
-                                         request_output_queue_type,
-                                         migration_config,
-                                         engine_args)
-        else:
-            # pylint: disable=import-outside-toplevel
-            from llumnix.backends.vllm.simulator import BackendSimVLLM
-            backend_engine = BackendSimVLLM(instance_id,
-                                            placement_group,
-                                            request_output_queue_type,
-                                            migration_config,
-                                            engine_args,
-                                            profiling_result_file_path)
+        from llumnix.backends.vllm.llm_engine import BackendVLLM
+        backend_engine = BackendVLLM(instance_id,
+                                     placement_group,
+                                     request_output_queue_type,
+                                     migration_config,
+                                     engine_args)
     elif backend_type == BackendType.BLADELLM:
         # pylint: disable=import-outside-toplevel
         from llumnix.backends.bladellm.llm_engine import BackendBladeLLM
@@ -95,6 +85,15 @@ def init_backend_engine(instance_id: str,
                                         request_output_queue_type,
                                         migration_config,
                                         engine_args)
+    elif backend_type == BackendType.SIM_VLLM:
+        # pylint: disable=import-outside-toplevel
+        from llumnix.backends.vllm.simulator import BackendSimVLLM
+        backend_engine = BackendSimVLLM(instance_id,
+                                        placement_group,
+                                        request_output_queue_type,
+                                        migration_config,
+                                        engine_args,
+                                        profiling_result_file_path)
     else:
         raise ValueError(f'Unsupported backend: {backend_type}')
     return backend_engine
diff --git a/llumnix/config/default.py b/llumnix/config/default.py
index 078607b4..ec6d060e 100644
--- a/llumnix/config/default.py
+++ b/llumnix/config/default.py
@@ -69,6 +69,8 @@
 _C.MANAGER.LOG_INSTANCE_INFO = False
 # Log filename
 _C.MANAGER.LOG_FILENAME = "server.log"
+# Enable simulator mode
+_C.MANAGER.SIMULATOR_MODE = False
 # Profiling result file path when using simulator
 _C.MANAGER.PROFILING_RESULT_FILE_PATH = None
 # Enable port increment when deploying multiple servers
diff --git a/llumnix/entrypoints/bladellm/api_server.py b/llumnix/entrypoints/bladellm/api_server.py
index 56a563b8..537798f5 100644
--- a/llumnix/entrypoints/bladellm/api_server.py
+++ b/llumnix/entrypoints/bladellm/api_server.py
@@ -32,7 +32,8 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractEventLoop):
     llumnix_config = get_llumnix_config(bladellm_args.llumnix_config)
     entrypoints_args, manager_args, engine_args = get_args(llumnix_config, llumnix_parser, bladellm_args)
 
-    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM)
+    assert not manager_args.simulator_mode, "Simulator mode is only supported for the vLLM backend."
+    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.BLADELLM)
 
     setup_ray_cluster(entrypoints_args)
 
diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py
index d297dcd9..a1e1b955 100644
--- a/llumnix/entrypoints/vllm/api_server.py
+++ b/llumnix/entrypoints/vllm/api_server.py
@@ -182,7 +182,8 @@ async def is_ready() -> bool:
     cfg = get_llumnix_config(cli_args.config_file, cli_args)
     entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args)
 
-    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM)
+    backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM
+    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=backend_type)
 
     # Launch or connect to the ray cluster for multi-node serving.
     setup_ray_cluster(entrypoints_args)
diff --git a/llumnix/entrypoints/vllm/serve.py b/llumnix/entrypoints/vllm/serve.py
index fc865403..a73f1ce9 100644
--- a/llumnix/entrypoints/vllm/serve.py
+++ b/llumnix/entrypoints/vllm/serve.py
@@ -25,6 +25,7 @@
     cfg = get_llumnix_config(cli_args.config_file, cli_args)
     entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args)
 
-    launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=BackendType.VLLM)
+    backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM
+    launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=backend_type)
 
     # Assume that there is an existing ray cluster when using centralized deployment.
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index 378e717a..85c14ba1 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -81,7 +81,7 @@ def from_args(cls,
                   engine_args,
                   profiling_result_file_path: str = None):
         try:
-            assert backend_type in [backend_type.VLLM, backend_type.BLADELLM], \
+            assert backend_type in [backend_type.VLLM, backend_type.BLADELLM, backend_type.SIM_VLLM], \
                 f'unimplemented backend {backend_type}'
             num_gpus = 0
             if backend_type == backend_type.BLADELLM:
diff --git a/llumnix/manager.py b/llumnix/manager.py
index 69cd3cad..8b0d63f4 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -522,14 +522,13 @@ def _init_placement_group(self,
                               backend_type: BackendType,
                               init_server: bool = False,
                               block: bool = True) -> PlacementGroup:
-        if not self.manager_args.profiling_result_file_path:
+        if not BackendType.is_sim_backend(backend_type):
             # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor
             # num_gpus=world_size, for world_size Workers
             world_size = get_engine_world_size(engine_args, backend_type)
             placement_group = initialize_placement_group(placement_group_name, num_cpus=3+int(init_server),
                                                          num_gpus=world_size, detached=True, block=block)
         else:
-            assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM."
# num_cpus=1, for Llumlet + AsyncPutQueueActor placement_group = initialize_placement_group(placement_group_name, num_cpus=2+int(init_server), num_gpus=0, detached=True, block=block) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index a2dbcf89..518424a2 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -193,7 +193,7 @@ def test_init_instances_sim(ray_env, manager): import llumnix.backends.vllm.simulator llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.SIM_VLLM, engine_args)) num_instances = len(instances) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances
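
Note: the following is not part of the commit. It is a minimal, illustrative Python sketch of how the new --simulator-mode flag is expected to map onto BackendType.SIM_VLLM and how BackendType.is_sim_backend() is queried, mirroring the selection logic added to llumnix/entrypoints/vllm/api_server.py above. The select_backend_type helper is hypothetical and exists only for illustration.

# Illustrative sketch only -- select_backend_type is a hypothetical helper, not part of Llumnix.
from llumnix.backends.backend_interface import BackendType

def select_backend_type(simulator_mode: bool) -> BackendType:
    # Mirrors the entrypoint logic above: --simulator-mode swaps VLLM for SIM_VLLM.
    return BackendType.SIM_VLLM if simulator_mode else BackendType.VLLM

# Placement-group sizing (manager._init_placement_group) and engine construction
# (backends.utils.init_backend_engine) both branch on is_sim_backend().
assert BackendType.is_sim_backend(select_backend_type(True))
assert not BackendType.is_sim_backend(select_backend_type(False))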