Commit: Refine simulator mode

s5u13b committed Jan 10, 2025
1 parent fb1b841 commit 8d8984d
Showing 12 changed files with 43 additions and 23 deletions.
4 changes: 4 additions & 0 deletions docs/Arguments.md
@@ -38,6 +38,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
[--disable-log-requests-manager]
[--log-instance-info]
[--log-filename LOG_FILENAME]
[--simulator-mode]
[--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
[--gpu-type GPU_TYPE]
[--migration-backend {gloo,nccl,rayrpc,grpc,kvtransfer}]
@@ -181,6 +182,9 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
- Log filename.
- Default: "server.log"

`--simulator-mode`
- Enable simulator mode.

`--profiling-result-file-path`
- Profiling result file path when using simulator.
- Default: None
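
For illustration only (not part of this commit): with these options, enabling the simulator would typically look like `python -m llumnix.entrypoints.vllm.api_server --simulator-mode --profiling-result-file-path <profiling_result_file>`, where the file path is a placeholder and all other server and engine flags are omitted.
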
7 changes: 7 additions & 0 deletions llumnix/arg_utils.py
@@ -136,6 +136,7 @@ class ManagerArgs:
disable_log_requests_manager: bool = None
log_instance_info: bool = None
log_filename: str = None
simulator_mode: bool = None
profiling_result_file_path: str = None

migration_backend: str = None
@@ -218,6 +219,9 @@ def check_args(cls, args: 'ManagerArgs', parser: argparse.ArgumentParser):
("When using kvTransfer as migration backend, "
"do not set --migration-backend-transfer-type as empty.")

assert not args.simulator_mode or args.profiling_result_file_path is not None, \
"Set profiling_result_file_path args when enable simulator mode"

@staticmethod
def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument('--initial-instances',
@@ -309,6 +313,9 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument('--profiling-result-file-path',
type=str,
help='profiling result file path when using simulator')
parser.add_argument('--simulator-mode',
action='store_true',
help='enable simulator mode')
parser.add_argument('--migration-backend',
type=str,
choices=['gloo','nccl','rayrpc','grpc','kvtransfer'],
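
For context only (not part of this diff): a standalone argparse sketch of how the new flag and the check_args constraint interact. The profiling file name below is a placeholder.

```python
# Standalone sketch mirroring the new CLI wiring and check; not the actual llumnix code.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--simulator-mode', action='store_true',
                    help='enable simulator mode')
parser.add_argument('--profiling-result-file-path', type=str, default=None,
                    help='profiling result file path when using simulator')

args = parser.parse_args(['--simulator-mode',
                          '--profiling-result-file-path', 'profiling_result.pkl'])

# Same invariant that ManagerArgs.check_args enforces in this commit:
assert not args.simulator_mode or args.profiling_result_file_path is not None, \
    "Set profiling_result_file_path args when enable simulator mode"
```
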
5 changes: 5 additions & 0 deletions llumnix/backends/backend_interface.py
@@ -29,6 +29,11 @@ class EngineState(str, Enum):
class BackendType(str, Enum):
VLLM = "VLLM"
BLADELLM = "BLADELLM"
SIM_VLLM = "SIM_VLLM"

@staticmethod
def is_sim_backend(status: "BackendType") -> bool:
return status in [BackendType.SIM_VLLM]


class BackendInterface(ABC):
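
A small usage sketch (not part of this diff) of the new helper; the enum is restated so the snippet runs standalone.

```python
# Self-contained restatement of the enum above, plus checks showing what
# is_sim_backend returns for each backend type.
from enum import Enum

class BackendType(str, Enum):
    VLLM = "VLLM"
    BLADELLM = "BLADELLM"
    SIM_VLLM = "SIM_VLLM"

    @staticmethod
    def is_sim_backend(status: "BackendType") -> bool:
        return status in [BackendType.SIM_VLLM]

assert BackendType.is_sim_backend(BackendType.SIM_VLLM)
assert not BackendType.is_sim_backend(BackendType.VLLM)
assert not BackendType.is_sim_backend(BackendType.BLADELLM)
```
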
3 changes: 2 additions & 1 deletion llumnix/backends/profiling.py
@@ -178,7 +178,8 @@ def model_decode(x, a, b, c):
return a * bs + b * tot_seq_len + c

def get_latency_mem(backend_type: BackendType, profiling_database: ProfilingDatabase, **backend_args):
if backend_type == BackendType.VLLM:
assert BackendType.is_sim_backend(backend_type)
if backend_type == BackendType.SIM_VLLM:
# TODO(ZeldaHuang): support multi-lora, more device, vision language model
model_config = backend_args.get("model_config")
_ = backend_args.get("cache_config")
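
For intuition (not part of this diff): a worked example of the linear decode-latency model referenced in the hunk above. The coefficients are made up, and the unpacking of x into (batch_size, total_seq_len) is an assumption based on the visible return expression.

```python
# Worked example of the linear decode-latency model, with hypothetical coefficients;
# real values are fitted from the profiling result file.
def model_decode(x, a, b, c):
    bs, tot_seq_len = x  # assumed unpacking, consistent with the return expression above
    return a * bs + b * tot_seq_len + c

latency = model_decode((8, 4096), a=0.5, b=0.001, c=2.0)
print(latency)  # 0.5*8 + 0.001*4096 + 2.0 = 10.096 (arbitrary units)
```
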
31 changes: 15 additions & 16 deletions llumnix/backends/utils.py
@@ -71,22 +71,12 @@ def init_backend_engine(instance_id: str,
profiling_result_file_path: str = None) -> BackendInterface:
if backend_type == BackendType.VLLM:
# pylint: disable=import-outside-toplevel
if profiling_result_file_path is None:
from llumnix.backends.vllm.llm_engine import BackendVLLM
backend_engine = BackendVLLM(instance_id,
placement_group,
request_output_queue_type,
migration_config,
engine_args)
else:
# pylint: disable=import-outside-toplevel
from llumnix.backends.vllm.simulator import BackendSimVLLM
backend_engine = BackendSimVLLM(instance_id,
placement_group,
request_output_queue_type,
migration_config,
engine_args,
profiling_result_file_path)
from llumnix.backends.vllm.llm_engine import BackendVLLM
backend_engine = BackendVLLM(instance_id,
placement_group,
request_output_queue_type,
migration_config,
engine_args)
elif backend_type == BackendType.BLADELLM:
# pylint: disable=import-outside-toplevel
from llumnix.backends.bladellm.llm_engine import BackendBladeLLM
@@ -95,6 +85,15 @@
request_output_queue_type,
migration_config,
engine_args)
elif backend_type == BackendType.SIM_VLLM:
# pylint: disable=import-outside-toplevel
from llumnix.backends.vllm.simulator import BackendSimVLLM
backend_engine = BackendSimVLLM(instance_id,
placement_group,
request_output_queue_type,
migration_config,
engine_args,
profiling_result_file_path)
else:
raise ValueError(f'Unsupported backend: {backend_type}')
return backend_engine
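
The hunk above carries the main behavioral change: the simulator backend is now chosen by the explicit SIM_VLLM backend type rather than by whether profiling_result_file_path happens to be set. A stripped-down sketch (not llumnix code) of that dispatch, with stand-in classes in place of the real backends:

```python
# Stripped-down dispatch sketch; Fake* classes stand in for BackendVLLM / BackendSimVLLM,
# which need Ray, a placement group, and engine args.
from enum import Enum

class BackendType(str, Enum):
    VLLM = "VLLM"
    BLADELLM = "BLADELLM"
    SIM_VLLM = "SIM_VLLM"

class FakeBackendVLLM:
    pass

class FakeBackendSimVLLM:
    def __init__(self, profiling_result_file_path: str):
        self.profiling_result_file_path = profiling_result_file_path

def init_backend_engine(backend_type: BackendType, profiling_result_file_path: str = None):
    if backend_type == BackendType.VLLM:
        return FakeBackendVLLM()
    if backend_type == BackendType.SIM_VLLM:
        # Only the simulator consumes the profiling result file.
        return FakeBackendSimVLLM(profiling_result_file_path)
    raise ValueError(f'Unsupported backend: {backend_type}')

engine = init_backend_engine(BackendType.SIM_VLLM, "profiling_result_file")  # placeholder path
```
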
2 changes: 2 additions & 0 deletions llumnix/config/default.py
@@ -69,6 +69,8 @@
_C.MANAGER.LOG_INSTANCE_INFO = False
# Log filename
_C.MANAGER.LOG_FILENAME = "server.log"
# Enable simulator mode
_C.MANAGER.SIMULATOR_MODE = False
# Profiling result file path when using simulator
_C.MANAGER.PROFILING_RESULT_FILE_PATH = None
# Enable port increment when deploying multiple servers
3 changes: 2 additions & 1 deletion llumnix/entrypoints/bladellm/api_server.py
@@ -32,7 +32,8 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractE
llumnix_config = get_llumnix_config(bladellm_args.llumnix_config)
entrypoints_args, manager_args, engine_args = get_args(llumnix_config, llumnix_parser, bladellm_args)

launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM)
assert not manager_args.simulator_mode, "Only support the simulator mode for vLLM."
launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.BLADELLM)

setup_ray_cluster(entrypoints_args)

3 changes: 2 additions & 1 deletion llumnix/entrypoints/vllm/api_server.py
@@ -182,7 +182,8 @@ async def is_ready() -> bool:
cfg = get_llumnix_config(cli_args.config_file, cli_args)
entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args)

launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM)
backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM
launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=backend_type)

# Launch or connect to the ray cluster for multi-node serving.
setup_ray_cluster(entrypoints_args)
1 change: 1 addition & 0 deletions llumnix/entrypoints/vllm/serve.py
@@ -25,6 +25,7 @@
cfg = get_llumnix_config(cli_args.config_file, cli_args)
entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args)

backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM
launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=BackendType.VLLM)

# Assume that there is an existing ray cluster when using centralized deployment.
2 changes: 1 addition & 1 deletion llumnix/llumlet/llumlet.py
@@ -81,7 +81,7 @@ def from_args(cls,
engine_args,
profiling_result_file_path: str = None):
try:
assert backend_type in [backend_type.VLLM, backend_type.BLADELLM], \
assert backend_type in [backend_type.VLLM, backend_type.BLADELLM, backend_type.SIM_VLLM], \
f'unimplemented backend {backend_type}'
num_gpus = 0
if backend_type == backend_type.BLADELLM:
3 changes: 1 addition & 2 deletions llumnix/manager.py
@@ -522,14 +522,13 @@ def _init_placement_group(self,
backend_type: BackendType,
init_server: bool = False,
block: bool = True) -> PlacementGroup:
if not self.manager_args.profiling_result_file_path:
if not BackendType.is_sim_backend(backend_type):
# num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor
# num_gpus=world_size, for world_size Workers
world_size = get_engine_world_size(engine_args, backend_type)
placement_group = initialize_placement_group(placement_group_name,
num_cpus=3+int(init_server), num_gpus=world_size, detached=True, block=block)
else:
assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM."
# num_cpus=1, for Llumlet + AsyncPutQueueActor
placement_group = initialize_placement_group(placement_group_name,
num_cpus=2+int(init_server), num_gpus=0, detached=True, block=block)
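
For context (not llumnix's initialize_placement_group helper): a generic Ray sketch of the resource split described above, assuming a hypothetical world_size of 2 and init_server=False.

```python
# Generic Ray sketch of the two placement-group shapes; bundle sizes follow the
# comments in the hunk above, with world_size=2 chosen arbitrarily.
import ray
from ray.util.placement_group import placement_group

ray.init(ignore_reinit_error=True)

# Real engine: CPUs for Llumlet + AsyncPutQueueActor + ProxyActor,
# plus one GPU bundle per worker.
real_pg = placement_group([{"CPU": 3}] + [{"GPU": 1}] * 2, strategy="PACK")

# Simulator backend: CPU-only, since no GPU workers are launched.
sim_pg = placement_group([{"CPU": 2}], strategy="PACK")
```
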
2 changes: 1 addition & 1 deletion tests/unit_test/global_scheduler/test_manager.py
@@ -193,7 +193,7 @@ def test_init_instances_sim(ray_env, manager):
import llumnix.backends.vllm.simulator
llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim
engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
_, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
_, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.SIM_VLLM, engine_args))
num_instances = len(instances)
manager_args = ManagerArgs()
assert num_instances == manager_args.initial_instances
