From 8d8984dba9e97a5f0e7897d446dd8d29d25d5168 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Fri, 10 Jan 2025 02:20:02 +0000
Subject: [PATCH] Refine simulator mode

---
 docs/Arguments.md                          |  4 +++
 llumnix/arg_utils.py                       |  7 +++++
 llumnix/backends/backend_interface.py      |  5 +++
 llumnix/backends/profiling.py              |  3 +-
 llumnix/backends/utils.py                  | 31 +++++++++----------
 llumnix/config/default.py                  |  2 ++
 llumnix/entrypoints/bladellm/api_server.py |  3 +-
 llumnix/entrypoints/vllm/api_server.py     |  3 +-
 llumnix/entrypoints/vllm/serve.py          |  1 +
 llumnix/llumlet/llumlet.py                 |  2 +-
 llumnix/manager.py                         |  3 +-
 .../global_scheduler/test_manager.py       |  2 +-
 12 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/docs/Arguments.md b/docs/Arguments.md
index 4374f3b2..f5be9210 100644
--- a/docs/Arguments.md
+++ b/docs/Arguments.md
@@ -38,6 +38,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
                 [--disable-log-requests-manager]
                 [--log-instance-info]
                 [--log-filename LOG_FILENAME]
+                [--simulator-mode]
                 [--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
                 [--gpu-type GPU_TYPE]
                 [--migration-backend {gloo,nccl,rayrpc,grpc,kvtransfer}]
@@ -181,6 +182,9 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Log filename.
 - Default: "server.log"
 
+`--simulator-mode`
+- Enable simulator mode.
+
 `--profiling-result-file-path`
 - Profiling result file path when using simulator.
 - Default: None
diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
index c882914a..c3a7d9ff 100644
--- a/llumnix/arg_utils.py
+++ b/llumnix/arg_utils.py
@@ -136,6 +136,7 @@ class ManagerArgs:
     disable_log_requests_manager: bool = None
     log_instance_info: bool = None
     log_filename: str = None
+    simulator_mode: bool = None
     profiling_result_file_path: str = None
 
     migration_backend: str = None
@@ -218,6 +219,9 @@ def check_args(cls, args: 'ManagerArgs', parser: argparse.ArgumentParser):
                 ("When using kvTransfer as migration backend, "
                  "do not set --migration-backend-transfer-type as empty.")
 
+        assert not args.simulator_mode or args.profiling_result_file_path is not None, \
+            "Please set --profiling-result-file-path when enabling simulator mode."
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument('--initial-instances',
@@ -309,6 +313,9 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument('--profiling-result-file-path',
                             type=str,
                             help='profiling result file path when using simulator')
+        parser.add_argument('--simulator-mode',
+                            action='store_true',
+                            help='enable simulator mode')
         parser.add_argument('--migration-backend',
                             type=str,
                             choices=['gloo','nccl','rayrpc','grpc','kvtransfer'],
diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py
index 257c2189..d8631b84 100644
--- a/llumnix/backends/backend_interface.py
+++ b/llumnix/backends/backend_interface.py
@@ -29,6 +29,11 @@ class EngineState(str, Enum):
 class BackendType(str, Enum):
     VLLM = "VLLM"
     BLADELLM = "BLADELLM"
+    SIM_VLLM = "SIM_VLLM"
+
+    @staticmethod
+    def is_sim_backend(status: "BackendType") -> bool:
+        return status in [BackendType.SIM_VLLM]
 
 
 class BackendInterface(ABC):
diff --git a/llumnix/backends/profiling.py b/llumnix/backends/profiling.py
index cf21fcc4..b79afcc1 100644
--- a/llumnix/backends/profiling.py
+++ b/llumnix/backends/profiling.py
@@ -178,7 +178,8 @@ def model_decode(x, a, b, c):
     return a * bs + b * tot_seq_len + c
 
 def get_latency_mem(backend_type: BackendType, profiling_database: ProfilingDatabase, **backend_args):
-    if backend_type == BackendType.VLLM:
+    assert BackendType.is_sim_backend(backend_type)
+    if backend_type == BackendType.SIM_VLLM:
         # TODO(ZeldaHuang): support multi-lora, more device, vision language model
         model_config = backend_args.get("model_config")
         _ = backend_args.get("cache_config")
diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py
index 501d8a54..8659c016 100644
--- a/llumnix/backends/utils.py
+++ b/llumnix/backends/utils.py
@@ -71,22 +71,12 @@ def init_backend_engine(instance_id: str,
                         profiling_result_file_path: str = None) -> BackendInterface:
     if backend_type == BackendType.VLLM:
         # pylint: disable=import-outside-toplevel
-        if profiling_result_file_path is None:
-            from llumnix.backends.vllm.llm_engine import BackendVLLM
-            backend_engine = BackendVLLM(instance_id,
-                                         placement_group,
-                                         request_output_queue_type,
-                                         migration_config,
-                                         engine_args)
-        else:
-            # pylint: disable=import-outside-toplevel
-            from llumnix.backends.vllm.simulator import BackendSimVLLM
-            backend_engine = BackendSimVLLM(instance_id,
-                                            placement_group,
-                                            request_output_queue_type,
-                                            migration_config,
-                                            engine_args,
-                                            profiling_result_file_path)
+        from llumnix.backends.vllm.llm_engine import BackendVLLM
+        backend_engine = BackendVLLM(instance_id,
+                                     placement_group,
+                                     request_output_queue_type,
+                                     migration_config,
+                                     engine_args)
     elif backend_type == BackendType.BLADELLM:
         # pylint: disable=import-outside-toplevel
         from llumnix.backends.bladellm.llm_engine import BackendBladeLLM
@@ -95,6 +85,15 @@ def init_backend_engine(instance_id: str,
                                         request_output_queue_type,
                                         migration_config,
                                         engine_args)
+    elif backend_type == BackendType.SIM_VLLM:
+        # pylint: disable=import-outside-toplevel
+        from llumnix.backends.vllm.simulator import BackendSimVLLM
+        backend_engine = BackendSimVLLM(instance_id,
+                                        placement_group,
+                                        request_output_queue_type,
+                                        migration_config,
+                                        engine_args,
+                                        profiling_result_file_path)
     else:
         raise ValueError(f'Unsupported backend: {backend_type}')
     return backend_engine
diff --git a/llumnix/config/default.py b/llumnix/config/default.py
index 078607b4..ec6d060e 100644
--- a/llumnix/config/default.py
+++ b/llumnix/config/default.py
@@ -69,6 +69,8 @@
 _C.MANAGER.LOG_INSTANCE_INFO = False
 # Log filename
 _C.MANAGER.LOG_FILENAME = "server.log"
+# Enable simulator mode
+_C.MANAGER.SIMULATOR_MODE = False
 # Profiling result file path when using simulator
 _C.MANAGER.PROFILING_RESULT_FILE_PATH = None
 # Enable port increment when deploying multiple servers
diff --git a/llumnix/entrypoints/bladellm/api_server.py b/llumnix/entrypoints/bladellm/api_server.py
index 56a563b8..537798f5 100644
--- a/llumnix/entrypoints/bladellm/api_server.py
+++ b/llumnix/entrypoints/bladellm/api_server.py
@@ -32,7 +32,8 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractEventLoop):
     llumnix_config = get_llumnix_config(bladellm_args.llumnix_config)
     entrypoints_args, manager_args, engine_args = get_args(llumnix_config, llumnix_parser, bladellm_args)
 
-    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM)
+    assert not manager_args.simulator_mode, "Simulator mode is only supported for the vLLM backend."
+    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.BLADELLM)
 
     setup_ray_cluster(entrypoints_args)
 
diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py
index d297dcd9..a1e1b955 100644
--- a/llumnix/entrypoints/vllm/api_server.py
+++ b/llumnix/entrypoints/vllm/api_server.py
@@ -182,7 +182,8 @@ async def is_ready() -> bool:
     cfg = get_llumnix_config(cli_args.config_file, cli_args)
     entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args)
 
-    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM)
+    backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM
+    launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=backend_type)
 
     # Launch or connect to the ray cluster for multi-node serving.
     setup_ray_cluster(entrypoints_args)
diff --git a/llumnix/entrypoints/vllm/serve.py b/llumnix/entrypoints/vllm/serve.py
index fc865403..a73f1ce9 100644
--- a/llumnix/entrypoints/vllm/serve.py
+++ b/llumnix/entrypoints/vllm/serve.py
@@ -25,6 +25,7 @@
     cfg = get_llumnix_config(cli_args.config_file, cli_args)
     entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args)
 
-    launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=BackendType.VLLM)
+    backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM
+    launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=backend_type)
 
     # Assume that there is an existing ray cluster when using centralized deployment.
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index 378e717a..85c14ba1 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -81,7 +81,7 @@ def from_args(cls,
                   engine_args,
                   profiling_result_file_path: str = None):
         try:
-            assert backend_type in [backend_type.VLLM, backend_type.BLADELLM], \
+            assert backend_type in [backend_type.VLLM, backend_type.BLADELLM, backend_type.SIM_VLLM], \
                 f'unimplemented backend {backend_type}'
             num_gpus = 0
             if backend_type == backend_type.BLADELLM:
diff --git a/llumnix/manager.py b/llumnix/manager.py
index 69cd3cad..8b0d63f4 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -522,14 +522,13 @@ def _init_placement_group(self,
                               backend_type: BackendType,
                               init_server: bool = False,
                               block: bool = True) -> PlacementGroup:
-        if not self.manager_args.profiling_result_file_path:
+        if not BackendType.is_sim_backend(backend_type):
             # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor
             # num_gpus=world_size, for world_size Workers
             world_size = get_engine_world_size(engine_args, backend_type)
             placement_group = initialize_placement_group(placement_group_name, num_cpus=3+int(init_server),
                                                          num_gpus=world_size, detached=True, block=block)
         else:
-            assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM."
# num_cpus=1, for Llumlet + AsyncPutQueueActor placement_group = initialize_placement_group(placement_group_name, num_cpus=2+int(init_server), num_gpus=0, detached=True, block=block) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index a2dbcf89..518424a2 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -193,7 +193,7 @@ def test_init_instances_sim(ray_env, manager): import llumnix.backends.vllm.simulator llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.SIM_VLLM, engine_args)) num_instances = len(instances) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances
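
Note: the following is not part of the commit. It is a minimal, illustrative Python sketch of how the new --simulator-mode flag is expected to map onto BackendType.SIM_VLLM and how BackendType.is_sim_backend() is queried, mirroring the selection logic added to llumnix/entrypoints/vllm/api_server.py above. The select_backend_type helper is hypothetical and exists only for illustration.

# Illustrative sketch only -- select_backend_type is a hypothetical helper, not part of Llumnix.
from llumnix.backends.backend_interface import BackendType

def select_backend_type(simulator_mode: bool) -> BackendType:
    # Mirrors the entrypoint logic above: --simulator-mode swaps VLLM for SIM_VLLM.
    return BackendType.SIM_VLLM if simulator_mode else BackendType.VLLM

# Placement-group sizing (manager._init_placement_group) and engine construction
# (backends.utils.init_backend_engine) both branch on is_sim_backend().
assert BackendType.is_sim_backend(select_backend_type(True))
assert not BackendType.is_sim_backend(select_backend_type(False))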