From 034d76a20378b2a2c8b22050950a69b2739e56c7 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 17 Dec 2024 03:26:47 +0000 Subject: [PATCH 01/92] demo --- serve.py | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 serve.py diff --git a/serve.py b/serve.py new file mode 100644 index 00000000..762713ce --- /dev/null +++ b/serve.py @@ -0,0 +1,91 @@ +import asyncio +import time +import uvicorn +import argparse +import requests +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, Response, StreamingResponse +from contextlib import asynccontextmanager +import ray + + +from llumnix.queue.ray_queue_server import RayQueueServer + + +@asynccontextmanager +async def lifespan(fastapi_app: FastAPI): + asyncio.create_task(request_output_queue.run_server_loop()) + yield + request_output_queue.cleanup() + +app = FastAPI(lifespan=lifespan) +request_output_queue = RayQueueServer() + +@app.get("/is_ready") +async def is_ready() -> bool: + return True + +@app.post("/generate") +async def generate(request: Request) -> Response: + ret = {"text": ""} + return JSONResponse(ret) + +@app.get("/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + +@app.post("/generate_stream") +async def generate_stream(request: Request) -> StreamingResponse: + async def number_generator(): + for i in range(10): + t = time.time() + yield f"Number: {i}, Time: {t}; " + await asyncio.sleep(0.5) + return StreamingResponse(number_generator(), media_type="text/plain") + +class FastAPIServer: + def __init__(self, host: str, port: int): + self.host = host + self.port = port + + def run(self): + uvicorn.run(app, host=self.host, port=self.port) + + @classmethod + def from_args(cls, host: str, port: int): + fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints")(cls) + fastapi_server = fastapi_server_class.remote(host, port) + + return fastapi_server + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default='localhost') + parser.add_argument("--port", type=int, default=8000) + args = parser.parse_args() + + fastapi_server = FastAPIServer.from_args(args.host, args.port) + fastapi_server.run.remote() + + time.sleep(10) + + ip_address = f"{args.host}:{args.port}" + api_list = [ + "is_ready", + "generate", + "generate_stream", + "health", + ] + for api in api_list: + try: + url = f"http://{ip_address}/{api}" + if api in ["is_ready", "health"]: + response = requests.get(url) + else: + response = requests.post(url) + response.raise_for_status() + print(f"api: {api}, response: {response}, response.text: {response.text}") + except requests.RequestException as e: + print(f"Request failed: {e}") From b3ded9f04ee9588ee72a01e2baad93c6bae21993 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 17 Dec 2024 09:48:34 +0000 Subject: [PATCH 02/92] Serve demo done --- demo/client.py | 28 +++++ serve.py => demo/serve_demo.py | 5 +- demo/serve_demo1.py | 135 +++++++++++++++++++++++++ demo/serve_demo2.py | 43 ++++++++ llumnix/entrypoints/vllm/api_server.py | 2 +- 5 files changed, 209 insertions(+), 4 deletions(-) create mode 100644 demo/client.py rename serve.py => demo/serve_demo.py (96%) create mode 100644 demo/serve_demo1.py create mode 100644 demo/serve_demo2.py diff --git a/demo/client.py b/demo/client.py new file mode 100644 index 00000000..8d9f3c07 --- /dev/null +++ b/demo/client.py @@ -0,0 +1,28 @@ +import argparse +import requests + + +if __name__ == 
"__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default='localhost') + parser.add_argument("--port", type=int, default=8000) + args = parser.parse_args() + + ip_address = f"{args.host}:{args.port}" + api_list = [ + "is_ready", + "generate", + "generate_stream", + "health", + ] + for api in api_list: + try: + url = f"http://{ip_address}/{api}" + if api in ["is_ready", "health"]: + response = requests.get(url) + else: + response = requests.post(url) + response.raise_for_status() + print(f"api: {api}, response: {response}, response.text: {response.text}") + except requests.RequestException as e: + print(f"Request failed: {e}") diff --git a/serve.py b/demo/serve_demo.py similarity index 96% rename from serve.py rename to demo/serve_demo.py index 762713ce..cb291f5a 100644 --- a/serve.py +++ b/demo/serve_demo.py @@ -8,7 +8,6 @@ from contextlib import asynccontextmanager import ray - from llumnix.queue.ray_queue_server import RayQueueServer @@ -54,7 +53,7 @@ def run(self): @classmethod def from_args(cls, host: str, port: int): - fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints")(cls) + fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints", namespace="llumnix", lifetime="detached")(cls).options() fastapi_server = fastapi_server_class.remote(host, port) return fastapi_server @@ -69,7 +68,7 @@ def from_args(cls, host: str, port: int): fastapi_server = FastAPIServer.from_args(args.host, args.port) fastapi_server.run.remote() - time.sleep(10) + time.sleep(5) ip_address = f"{args.host}:{args.port}" api_list = [ diff --git a/demo/serve_demo1.py b/demo/serve_demo1.py new file mode 100644 index 00000000..89fc5d8a --- /dev/null +++ b/demo/serve_demo1.py @@ -0,0 +1,135 @@ +import asyncio +import time +import uvicorn +from uvicorn import Config, Server +import argparse +import requests +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, Response, StreamingResponse +from contextlib import asynccontextmanager +import ray +from ray import serve + +from llumnix.queue.zmq_server import ZmqServer +from llumnix.queue.zmq_client import ZmqClient +from llumnix.queue.utils import get_open_zmq_ipc_path +from llumnix.utils import random_uuid +from llumnix.server_info import ServerInfo + +from llumnix.queue.ray_queue_server import RayQueueServer + + +@asynccontextmanager +async def lifespan(fastapi_app: FastAPI): + # @@@ + # loop = asyncio.get_event_loop() + # loop.create_task(request_output_queue_server.run_server_loop()) + asyncio.create_task(request_output_queue_server.run_server_loop()) + yield + # @@@ + request_output_queue_server.cleanup() + +app = FastAPI(lifespan=lifespan) +# @@@ +request_output_queue = RayQueueServer() +request_output_queue_server = None + + +@app.get("/is_ready") +async def is_ready() -> bool: + return True + +@app.post("/generate") +async def generate(request: Request) -> Response: + ret = {"text": ""} + return JSONResponse(ret) + +@app.get("/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + +@app.post("/generate_stream") +async def generate_stream(request: Request) -> StreamingResponse: + async def number_generator(): + for i in range(10): + t = time.time() + yield f"Number: {i}, Time: {t}; " + await asyncio.sleep(0.5) + return StreamingResponse(number_generator(), media_type="text/plain") + + +class FastAPIServer: + def __init__(self, host: str, port: int): + self.host = host + self.port = port + rpc_path = get_open_zmq_ipc_path(self.host, 
8002) + global request_output_queue_server + request_output_queue_server = ZmqServer(rpc_path) + # loop = asyncio.get_event_loop() + # loop.create_task(request_output_queue_server.run_server_loop()) + + def run(self): + uvicorn.run(app, host=self.host, port=self.port) + # rpc_path = get_open_zmq_ipc_path(self.host, 8002) + # request_output_queue_server = ZmqServer(rpc_path) + # loop = asyncio.get_event_loop() + # loop.create_task(request_output_queue_server.run_server_loop()) + # config = Config(app=app, loop=loop, host=self.host, port=self.port) + # server = Server(config) + # loop.run_until_complete(server.serve()) + + @classmethod + def from_args(cls, host: str, port: int): + fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints")(cls) + fastapi_server = fastapi_server_class.remote(host, port) + + return fastapi_server + +async def wait_request_output_queue_server_ready(request_output_queue_client: ZmqClient, + server_info: ServerInfo): + time.sleep(5) + await request_output_queue_client.wait_for_server_rpc(server_info) + # request_output_queue_server.cleanup() + print("Request output queue server is ready.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default='172.23.75.202') + parser.add_argument("--port", type=int, default=8000) + args = parser.parse_args() + + ray.init(namespace="llumnix") + + # rpc_path = get_open_zmq_ipc_path(args.host, 8002) + # request_output_queue_server = ZmqServer(rpc_path) + request_output_queue_client = ZmqClient() + server_id = random_uuid() + server_info = ServerInfo(server_id, 'zmq', None, args.host, 8002) + + fastapi_server = FastAPIServer.from_args(args.host, args.port) + fastapi_server.run.remote() + + time.sleep(5) + + ip_address = f"{args.host}:{args.port}" + api_list = [ + "is_ready", + "generate", + "generate_stream", + "health", + ] + for api in api_list: + try: + url = f"http://{ip_address}/{api}" + if api in ["is_ready", "health"]: + response = requests.get(url) + else: + response = requests.post(url) + response.raise_for_status() + print(f"api: {api}, response: {response}, response.text: {response.text}") + except requests.RequestException as e: + print(f"Request failed: {e}") + + asyncio.run(wait_request_output_queue_server_ready(request_output_queue_client, server_info)) diff --git a/demo/serve_demo2.py b/demo/serve_demo2.py new file mode 100644 index 00000000..10e8a68d --- /dev/null +++ b/demo/serve_demo2.py @@ -0,0 +1,43 @@ +import argparse +import time +import ray +from fastapi import FastAPI +import uvicorn + +# @@@ +# from llumnix.queue.ray_queue_server import RayQueueServer +from ray.util.queue import Queue as RayQueue + +app = FastAPI() +# @@@ +# request_output_queue = RayQueueServer() +request_output_queue = RayQueue() + + +class FastAPIServer: + def __init__(self, host: str, port: int): + self.host = host + self.port = port + + def run(self): + uvicorn.run(app, host=self.host, port=self.port) + + @classmethod + def from_args(cls, host: str, port: int): + fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints")(cls) + fastapi_server = fastapi_server_class.remote(host, port) + + return fastapi_server + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default='localhost') + parser.add_argument("--port", type=int, default=8000) + args = parser.parse_args() + + ray.init(namespace="llumnix") + + fastapi_server = FastAPIServer.from_args(args.host, args.port) + + time.sleep(5) diff --git 
a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py
index 46cbf842..680211b0 100644
--- a/llumnix/entrypoints/vllm/api_server.py
+++ b/llumnix/entrypoints/vllm/api_server.py
@@ -168,7 +168,7 @@ async def generate_benchmark(request: Request) -> Response:
 
 
 @app.get("/is_ready")
-async def is_ready():
+async def is_ready() -> bool:
     return await llumnix_client.is_ready()
 
 

From 7159ca672613397687951df64874c5a2eb36a3f4 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 18 Dec 2024 03:33:44 +0000
Subject: [PATCH 03/92] Add entrypoints actor unit test

---
 .../vllm/api_server_manager_service.py        | 117 ++++++++++++++++++
 .../entrypoints/vllm/test_api_server.py       |  13 +-
 2 files changed, 126 insertions(+), 4 deletions(-)
 create mode 100644 tests/unit_test/entrypoints/vllm/api_server_manager_service.py

diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
new file mode 100644
index 00000000..6671bb80
--- /dev/null
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2024, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at

+# http://www.apache.org/licenses/LICENSE-2.0

+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+import argparse
+import uvicorn
+import ray
+from ray.util.queue import Queue as RayQueue
+from fastapi.responses import JSONResponse, Response
+
+from vllm.outputs import CompletionOutput, RequestOutput
+
+import llumnix.entrypoints.vllm.api_server
+import llumnix.llm_engine_manager
+from llumnix.arg_utils import EngineManagerArgs
+from llumnix.server_info import ServerInfo, RequestTimestamps
+from llumnix.utils import random_uuid
+from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType
+from llumnix.entrypoints.setup import LlumnixEntrypointsContext
+from llumnix.entrypoints.vllm.client import LlumnixClientVLLM
+
+app = llumnix.entrypoints.vllm.api_server.app
+engine_manager = None
+MANAGER_ACTOR_NAME = llumnix.llm_engine_manager.MANAGER_ACTOR_NAME
+ENTRYPOINTS_ACTOR_NAME = "entrypoints"
+request_output_queue = RayQueue()
+
+
+@ray.remote(num_cpus=0, lifetime="detached")
+class MockLLMEngineManagerService:
+    def __init__(self, request_output_queue_type: QueueType, args: 'Namespace'):
+        self._num_generates = 0
+        self._num_aborts = 0
+        self.request_output_queue = init_request_output_queue_client(request_output_queue_type)
+        self.init_api_server(args.host, args.port, request_output_queue_type)
+        self.api_server.run.remote()
+
+    def init_api_server(self, host: str, port: int, request_output_queue_type: QueueType):
+        # Use the host/port parameters here; the module-level `args` does not exist
+        # inside the detached actor process and would raise a NameError.
+        self.api_server = FastAPIServer.options(name=ENTRYPOINTS_ACTOR_NAME,
+                                                namespace='llumnix').remote(host, port, request_output_queue_type)
+
+    async def generate(self, request_id, server_info, *args, **kwargs):
+        self._num_generates += 1
+        completion_output = CompletionOutput(0, "", [], 0.0, None)
+        request_output = RequestOutput(request_id, "", [], None, [completion_output], finished=True)
+        request_output.request_timestamps = RequestTimestamps()
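+        # Send the mocked output back to the requesting server through the configured
+        # queue client (zmq or rayqueue), addressed by server_info.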
+ await self.request_output_queue.put_nowait([request_output], server_info) + + async def abort(self, request_id): + self._num_aborts += 1 + + def testing_stats(self): + return {"num_aborted_requests": self._num_aborts} + +@ray.remote(num_cpus=1, lifetime="detached") +class FastAPIServer: + def __init__(self, host: str, port: int, request_output_queue_type: QueueType): + self.host = host + self.port = port + ip = '127.0.0.1' + port = 1234 + global engine_manager + engine_manager = ray.get_actor(MANAGER_ACTOR_NAME, namespace="llumnix") + request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type) + ray_queue_server = None + if request_output_queue_type == QueueType.RAYQUEUE: + ray_queue_server = request_output_queue + server_info = ServerInfo(random_uuid(), request_output_queue_type, ray_queue_server, ip, port) + llumnix_context = LlumnixEntrypointsContext(engine_manager, + {'0': None}, + request_output_queue, + server_info, + None, + None) + llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(llumnix_context) + + def run(self): + uvicorn.run( + app, + host=self.host, + port=self.port, + log_level="debug", + timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE) + +def init_manager_service(request_output_queue_type: QueueType, args: 'Namespace'): + engine_manager = MockLLMEngineManagerService.options(name=MANAGER_ACTOR_NAME, + namespace='llumnix').remote(request_output_queue_type, args) + return engine_manager + +@app.get("/stats") +def stats() -> Response: + """Get the statistics of the engine.""" + return JSONResponse(ray.get(engine_manager.testing_stats.remote())) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--request-output-queue-type", type=str, choices=["zmq", "rayqueue"]) + parser = EngineManagerArgs.add_cli_args(parser) + args = parser.parse_args() + + request_output_queue_type = QueueType(args.request_output_queue_type) + engine_manager = init_manager_service(request_output_queue_type, args) + + import time + time.sleep(5) diff --git a/tests/unit_test/entrypoints/vllm/test_api_server.py b/tests/unit_test/entrypoints/vllm/test_api_server.py index bf6689bf..35100d42 100644 --- a/tests/unit_test/entrypoints/vllm/test_api_server.py +++ b/tests/unit_test/entrypoints/vllm/test_api_server.py @@ -45,11 +45,16 @@ def _query_server_generate(prompt: str) -> dict: def _query_server_generate_benchmark(prompt: str) -> dict: return _query_server(prompt, interface='generate_benchmark') -@pytest.fixture(params=["zmq", "rayqueue"]) +@pytest.fixture(params=[("zmq", "manager_service"), ("rayqueue", "manager_service"), ("zmq", "manager"), ("rayqueue", "manager")]) def api_server(request): - request_output_queue_type = QueueType(request.param) - script_path = Path(__file__).parent.joinpath( - "api_server_manager.py").absolute() + request_output_queue_type = QueueType(request.param[0]) + print(f"{request.param[0]}-{request.param[1]}") + if request.param[1] == "manager": + script_path = Path(__file__).parent.joinpath( + "api_server_manager.py").absolute() + else: + script_path = Path(__file__).parent.joinpath( + "api_server_manager_service.py").absolute() commands = [ sys.executable, "-u", From a2d4dd2c44fc9af77516019daad5bc1f178fa3c3 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 18 Dec 2024 05:41:26 +0000 Subject: [PATCH 04/92] Change unit test time limit --- 
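Note: PATCH 03 doubles the api_server fixture matrix -- ("zmq", "manager_service"),
("rayqueue", "manager_service"), ("zmq", "manager"), ("rayqueue", "manager") -- so the
unit-test job now boots twice as many API-server subprocesses, which is presumably why
the 30-minute ceiling needed raising. Each subprocess also pays a fixed warm-up sleep
before the first request; a polling helper (hypothetical, not part of this series) built
on the existing /is_ready endpoint would trim that cost:

    import time
    import requests

    def wait_until_ready(base_url: str, timeout: float = 30.0) -> None:
        # Poll /is_ready until the server actor answers, instead of a fixed sleep.
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                if requests.get(f"{base_url}/is_ready", timeout=1.0).ok:
                    return
            except requests.RequestException:
                pass
            time.sleep(0.5)
        raise TimeoutError(f"server at {base_url} never became ready")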
.github/workflows/unit_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index 7bb0052d..0e7ab0c0 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -20,7 +20,7 @@ jobs: unit_tests: needs: cancel_previous_workflows runs-on: [self-hosted] - timeout-minutes: 30 + timeout-minutes: 45 steps: - name: Checkout uses: actions/checkout@v4 From 2e2b9942e1fb1a3fe3d17368bfdf755ce792f233 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 18 Dec 2024 07:22:15 +0000 Subject: [PATCH 05/92] Fix pylint hang due to jobs concurrency --- Makefile | 8 ++++---- llumnix/backends/utils.py | 5 +---- .../entrypoints/vllm/api_server_manager_service.py | 11 +++++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index c3618524..887e342f 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,8 @@ install: .PHONY: lint lint: check_pylint_installed check_pytest_installed - @pylint --rcfile=.pylintrc -s n --jobs=128 ./llumnix - + @pylint --rcfile=.pylintrc -s n --jobs=128 ./llumnix + @pylint --rcfile=.pylintrc \ --disable=protected-access,super-init-not-called,unused-argument,redefined-outer-name,invalid-name \ -s n --jobs=128 ./tests @@ -53,7 +53,7 @@ proto-clean: .PHONY: test test: check_pytest_installed - @pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings + @pytest -v --ignore=third_party --ignore=tests/e2e_test --disable-warnings @python examlpes/offline_inference.py @pytest -v -x -s --tb=long ./tests/e2e_test/test_e2e.py @pytest -v -x -s --tb=long ./tests/e2e_test/test_bench.py @@ -61,7 +61,7 @@ test: check_pytest_installed .PHONY: unit_test unit_test: check_pytest_installed - @pytest -v --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings + @pytest -v --ignore=third_party --ignore=tests/e2e_test --disable-warnings .PHONY: offline_test offline_test: diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py index 8976128d..090ac103 100644 --- a/llumnix/backends/utils.py +++ b/llumnix/backends/utils.py @@ -88,10 +88,7 @@ def initialize_placement_group( """Initialize the distributed cluster probably with Ray. Args: - parallel_config: The configurations for parallel execution. - engine_use_ray: Whether to use Ray for async engine. - ray_address: The address of the Ray cluster. If None, uses - the default Ray cluster address. + world_size: The number of workers in Llumlet. Returns: A tuple of (`distributed_init_method`, `placement_group`). The diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py index 6671bb80..10f802a0 100644 --- a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py +++ b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py @@ -12,6 +12,7 @@ # limitations under the License. import argparse +import time import uvicorn import ray from ray.util.queue import Queue as RayQueue @@ -32,7 +33,6 @@ engine_manager = None MANAGER_ACTOR_NAME = llumnix.llm_engine_manager.MANAGER_ACTOR_NAME ENTRYPOINTS_ACTOR_NAME = "entrypoints" -request_output_queue = RayQueue() @ray.remote(num_cpus=0, lifetime="detached") @@ -110,8 +110,11 @@ def stats() -> Response: parser = EngineManagerArgs.add_cli_args(parser) args = parser.parse_args() + # magic actor, without this actor, FastAPIServer cannot initialize correctly. 
+ # If this actor is placed globally, pylint will hangs if testing api_server_manager and api_server_service concurrently (--jobs > 1). + request_output_queue = RayQueue() + request_output_queue_type = QueueType(args.request_output_queue_type) engine_manager = init_manager_service(request_output_queue_type, args) - - import time - time.sleep(5) + + time.sleep(2) From fed2f599d977adff1d804c5cff1de13da79f285d Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 19 Dec 2024 08:17:17 +0000 Subject: [PATCH 06/92] placement group demo done --- demo/manager_service_demo.py | 196 +++++++++++++++++++++++++++++++++++ demo/placement_group_demo.py | 106 +++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 demo/manager_service_demo.py create mode 100644 demo/placement_group_demo.py diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py new file mode 100644 index 00000000..8472e082 --- /dev/null +++ b/demo/manager_service_demo.py @@ -0,0 +1,196 @@ +import asyncio +from queue import Queue +from typing import Dict +from functools import partial +import uvicorn +import ray +from ray.util.placement_group import PlacementGroup +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +from llumnix.utils import random_uuid + +WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS = 1.0 +AUTO_DEPLOYMENT_INTERVAL_SECONDS = 10.0 + + +def get_entrypoints_name(instance_id: str) -> str: + return f"entrypoints_{instance_id}" + +def get_instance_name(instance_id: str) -> str: + return f"instance_{instance_id}" + +def initialize_placement_group(lifetime: str = None) -> PlacementGroup: + # Any better expression? + placement_group_specs = ([{"CPU": 1}, {"CPU": 1, "GPU": 4}]) + placement_group = ray.util.placement_group( + placement_group_specs, "STRICT_PACK", lifetime=lifetime) + return placement_group + + +class FastAPIServer: + def __init__(self, instance_id: str, host: str, port: int): + self.host = host + self.port = port + self.entrypoints_name = get_entrypoints_name(instance_id) + + def run(self) -> None: + uvicorn.run(app, host=self.host, port=self.port) + + @classmethod + def from_args(cls, + instance_id: str, + host: str, + port: int, + placement_group: PlacementGroup, + lifetime: str = None): + entrypoints_name = get_entrypoints_name(instance_id) + fastapi_server_class = ray.remote(num_cpus=1, + name=entrypoints_name, + namespace="llumnix", + lifetime=lifetime)(cls).options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_bundle_index=0, + ) + ) + fastapi_server = fastapi_server_class.remote(instance_id, host, port) + return fastapi_server + + +class Llumlet: + def __init__(self, instance_id: str): + self.instance_name = get_instance_name(instance_id) + + @classmethod + def from_args(cls, + instance_id: str, + placement_group: PlacementGroup, + lifetime: str = None): + instance_name = get_instance_name(instance_id) + llumlet_class = ray.remote(num_cpus=1, + num_gpus=4, + name=instance_name, + namespace="llumnix", + lifetime=lifetime)(cls).options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_bundle_index=1, + ) + ) + llumlet = llumlet_class.remote(instance_id) + return llumlet + + def ready(self) -> bool: + return True + + +class LLMEngineManager: + def __init__(self): + self.host = "localhost" + self.port = 8000 + self.last_pending_pg = None + self.pgs: Dict[str, PlacementGroup] = {} + self.servers: Dict[str, FastAPIServer] = {} + self.instances: 
Dict[str, Llumlet] = {} + asyncio.create_task(self._auto_scale_up_loop()) + asyncio.create_task(self._auto_scale_down_loop()) + + async def _auto_scale_down_loop(self) -> None: + def instance_ready_callback(instance_id: str, fut): + ret = fut.result()[0] + if isinstance(ret, ray.exceptions.RayActorError): + self.scale_down(instance_id) + + for instance_id, instance in self.instances.items(): + task = asyncio.gather(instance.ready().remote(), return_exceptions=True) + task.add_done_callback(partial(instance_ready_callback, instance_id)) + tasks.append(task) + await asyncio.gather(*tasks, return_exceptions=True) + + async def _auto_scale_up_loop(self) -> None: + def instance_ready_callback(instance_id: str, fut): + ret = fut.result()[0] + if not isinstance(ret, ray.exceptions.RayActorError): + self.scale_up(instance_id, new_pgs[instance_id], new_servers[instance_id], new_instances[instance_id]) + else: + self.remove_placement_group(new_pgs[instance_id]) + self.kill_server(new_servers[instance_id]) + + while True: + # 1. Get new placement group continuously until wait placement group ready timeouts. + new_pg_queue = Queue() + while True: + new_pg = initialize_placement_group(lifetime="detached") if not self.last_pending_pg else self.last_pending_pg + try: + await asyncio.wait_for(new_pg.ready(), timeout=WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) + new_pg_queue.put(new_pg) + except asyncio.TimeoutError: + self.last_pending_pg = new_pg + break + # 2. Deploy 1 entrypoints and 1 instance to 1 placement group (for all new placement groups). + new_pgs: Dict[str, PlacementGroup] = {} + new_servers: Dict[str, FastAPIServer] = {} + new_instances: Dict[str, Llumlet] = {} + while not new_pg_queue.empty(): + instance_id = random_uuid() + new_pg = new_pg_queue.get() + new_pgs[instance_id] = new_pg + new_servers[instance_id] = FastAPIServer.from_args(instance_id, self.host, self.port, new_pg, lifetime="detached") + new_instances[instance_id] = Llumlet.from_args(instance_id, new_pg, lifetime="detached") + # 3. Wait for all instances ready. (With the assumption that once instance ready, entrypoints is ready too.) + tasks = [] + for instance_id, instance in new_instances.items(): + task = asyncio.gather(instance.ready().remote(), return_exceptions=True) + task.add_done_callback(partial(instance_ready_callback, isntance_id)) + tasks.append(task) + await asyncio.gather(*tasks, return_exceptions=True) + + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + + def scale_up(self, + instance_id: str, + placement_group: PlacementGroup, + server: FastAPIServer, + instance: Llumlet) -> None: + self.pgs[instance_id] = placement_group + self.servers[instance_id] = server + self.instances[instance_id] = instance + + def scale_down(self, instance_id: str) -> None: + self.kill_server(self.servers[instance_id]) + self.remove_placement_group(self.pgs[instance_id]) + if instance_id in self.pgs: + del self.pgs[instance_id] + if instance_id in self.servers: + del self.servers[instance_id] + if instance_id in self.instances: + del self.instances[instance_id] + + def remove_placement_group(self, + placement_group: PlacementGroup) -> None: + try: + ray.util.remove_placement_group(placement_group) + except Exception as e: + print(f"try to remove placement group {instance_id}") + + def kill_server(self, + server: FastAPIServer) -> None: + try: + ray.kill(server[instance_id]) + # Exception that killing a died actor. 
+ except Exception as e: + print(f"try to kill api server {instance_id}") + + @classmethod + def from_args(cls): + engine_manager_class = ray.remote(num_cpus=1, + max_restarts=-1, + name="manager", + namespace="llumnix", + lifetime="detached")(cls).options() + engine_manager = engine_manager_class.remote() + return engine_manager + + +if __name__ == "__main__": + engine_manager = LLMEngineManager.from_args() diff --git a/demo/placement_group_demo.py b/demo/placement_group_demo.py new file mode 100644 index 00000000..9a6a41b1 --- /dev/null +++ b/demo/placement_group_demo.py @@ -0,0 +1,106 @@ +import time +import asyncio +import subprocess +import ray +from ray.util import placement_group_table +from ray.util.state import list_actors + +from manager_service_demo import (initialize_placement_group, + FastAPIServer, + Llumlet) + + +def test_actor_if_pg_died(life_time_pg, lifetime_llumlet): + print(f"### placement group lifetime: {life_time_pg}, llumlet lifetime: {lifetime_llumlet}") + print("### create placement group and llumlet") + placement_group = initialize_placement_group(lifetime=life_time_pg) + llumlet = Llumlet.from_args("0", placement_group, lifetime=lifetime_llumlet) + print(f"placement group state: {placement_group_table(placement_group)}") + print(f"llumlet state: {list_actors()}") + print("### sleep 1s") + time.sleep(5) + print(f"llumlet state: {list_actors()}") + print("### remove placement group") + ray.util.remove_placement_group(placement_group) + print(f"placement group state: {placement_group_table(placement_group)}") + print(f"llumlet state: {list_actors()}") + +def test_pg_if_actor_died(life_time_pg, lifetime_llumlet): + print(f"### placement group lifetime: {life_time_pg}, llumlet lifetime: {lifetime_llumlet}") + print("### create placement group and llumlet") + placement_group = initialize_placement_group(lifetime=life_time_pg) + llumlet = Llumlet.from_args("0", placement_group, lifetime=lifetime_llumlet) + print(f"placement group state: {placement_group_table(placement_group)}") + print(f"llumlet state: {list_actors()}") + print("### sleep 5s") + time.sleep(5) + print(f"llumlet state: {list_actors()}") + print("### kill llumlet") + ray.kill(llumlet) + print(f"placement group state: {placement_group_table(placement_group)}") + print(f"llumlet state: {list_actors()}") + print("### remove placement group") + ray.util.remove_placement_group(placement_group) + +def test_pending(life_time_pg, lifetime_llumlet): + print(f"### placement group lifetime: {life_time_pg}, llumlet lifetime: {lifetime_llumlet}") + print("### create placement group and llumlet") + placement_group1 = initialize_placement_group(lifetime=life_time_pg) + llumlet1 = Llumlet.from_args("0", placement_group1, lifetime=lifetime_llumlet) + time.sleep(5) + print(f"placement group 1 state: {placement_group_table(placement_group1)}") + print(f"llumlet 1 state: {list_actors()}") + print("### create placement group and llumlet") + placement_group2 = initialize_placement_group(lifetime=life_time_pg) + llumlet2 = Llumlet.from_args("1", placement_group2, lifetime=lifetime_llumlet) + time.sleep(5) + print(f"placement group 2 state: {placement_group_table(placement_group2)}") + print(f"llumlet 2 state: {list_actors()}") + print("### kill llumlet") + ray.kill(llumlet1) + time.sleep(5) + print(f"placement group 2 state: {placement_group_table(placement_group2)}") + print(f"llumlet 2 state: {list_actors()}") + print("### remove placement group") + ray.util.remove_placement_group(placement_group1) + time.sleep(5) + 
print(f"placement group 2 state: {placement_group_table(placement_group2)}") + print(f"llumlet 2 state: {list_actors()}") + ray.util.remove_placement_group(placement_group2) + ray.kill(llumlet2) + +async def test_pg_ready(): + placement_group1 = initialize_placement_group() + try: + await asyncio.wait_for(placement_group1.ready(), timeout=5.0) + print("placement group 1 ready") + except asyncio.TimeoutError: + print("wait placement group 1 timeout") + placement_group2 = initialize_placement_group() + try: + await asyncio.wait_for(placement_group2.ready(), timeout=5.0) + print("placement group 2 ready") + except asyncio.TimeoutError: + print("wait placement group 2 timeout") + ray.util.remove_placement_group(placement_group1) + try: + await asyncio.wait_for(placement_group2.ready(), timeout=5.0) + print("placement group 2 ready") + except asyncio.TimeoutError: + print("wait placement group 2 timeout") + + +if __name__ == "__main__": + # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet=None) + # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet="detached") + # test_actor_if_pg_died(life_time_pg="detached", lifetime_llumlet=None) + # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet="detached") + + # test_pg_if_actor_died(life_time_pg=None, lifetime_llumlet=None) + # test_pg_if_actor_died(life_time_pg=None, lifetime_llumlet="detached") + # test_pg_if_actor_died(life_time_pg="detached", lifetime_llumlet=None) + # test_pg_if_actor_died(life_time_pg=None, lifetime_llumlet="detached") + + # test_pending(life_time_pg=None, lifetime_llumlet=None) + + asyncio.run(test_pg_ready()) From 7aa6cda2a5a9f4a3608c0e971dcf886c6d5fb3fc Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 19 Dec 2024 10:37:13 +0000 Subject: [PATCH 07/92] Doing manager service demo --- demo/manager_service_demo.py | 81 ++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index 8472e082..06d3f5b7 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -1,26 +1,30 @@ import asyncio from queue import Queue +import time from typing import Dict from functools import partial import uvicorn +from fastapi import FastAPI import ray from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from ray.util.state import list_actors +from ray.util.queue import Queue as RayQueue from llumnix.utils import random_uuid WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS = 1.0 AUTO_DEPLOYMENT_INTERVAL_SECONDS = 10.0 +app = FastAPI() -def get_entrypoints_name(instance_id: str) -> str: - return f"entrypoints_{instance_id}" +def get_server_name(instance_id: str) -> str: + return f"server_{instance_id}" def get_instance_name(instance_id: str) -> str: return f"instance_{instance_id}" def initialize_placement_group(lifetime: str = None) -> PlacementGroup: - # Any better expression? 
placement_group_specs = ([{"CPU": 1}, {"CPU": 1, "GPU": 4}]) placement_group = ray.util.placement_group( placement_group_specs, "STRICT_PACK", lifetime=lifetime) @@ -31,7 +35,8 @@ class FastAPIServer: def __init__(self, instance_id: str, host: str, port: int): self.host = host self.port = port - self.entrypoints_name = get_entrypoints_name(instance_id) + self.server_name = get_server_name(instance_id) + print("FastAPIServer created") def run(self) -> None: uvicorn.run(app, host=self.host, port=self.port) @@ -43,9 +48,9 @@ def from_args(cls, port: int, placement_group: PlacementGroup, lifetime: str = None): - entrypoints_name = get_entrypoints_name(instance_id) + server_name = get_server_name(instance_id) fastapi_server_class = ray.remote(num_cpus=1, - name=entrypoints_name, + name=server_name, namespace="llumnix", lifetime=lifetime)(cls).options( scheduling_strategy=PlacementGroupSchedulingStrategy( @@ -60,6 +65,7 @@ def from_args(cls, class Llumlet: def __init__(self, instance_id: str): self.instance_name = get_instance_name(instance_id) + print("Llumlet created") @classmethod def from_args(cls, @@ -86,48 +92,60 @@ def ready(self) -> bool: class LLMEngineManager: def __init__(self): + print("create LLMEngineManager") self.host = "localhost" self.port = 8000 - self.last_pending_pg = None + self.last_pending_pg: PlacementGroup = None self.pgs: Dict[str, PlacementGroup] = {} self.servers: Dict[str, FastAPIServer] = {} self.instances: Dict[str, Llumlet] = {} asyncio.create_task(self._auto_scale_up_loop()) asyncio.create_task(self._auto_scale_down_loop()) + print("LLMEngineManager created") async def _auto_scale_down_loop(self) -> None: def instance_ready_callback(instance_id: str, fut): ret = fut.result()[0] if isinstance(ret, ray.exceptions.RayActorError): + print(f"instance {instance_id} died, scale down") self.scale_down(instance_id) for instance_id, instance in self.instances.items(): - task = asyncio.gather(instance.ready().remote(), return_exceptions=True) + task = asyncio.gather(instance.ready.remote(), return_exceptions=True) task.add_done_callback(partial(instance_ready_callback, instance_id)) tasks.append(task) await asyncio.gather(*tasks, return_exceptions=True) + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + async def _auto_scale_up_loop(self) -> None: def instance_ready_callback(instance_id: str, fut): ret = fut.result()[0] if not isinstance(ret, ray.exceptions.RayActorError): + print(f"instance {instance_id} ready, scale up") self.scale_up(instance_id, new_pgs[instance_id], new_servers[instance_id], new_instances[instance_id]) + new_servers[instance_id].run.remote() else: - self.remove_placement_group(new_pgs[instance_id]) - self.kill_server(new_servers[instance_id]) + print(f"instance {instace_id} died, abort") + self.remove_placement_group(new_pgs[instance_id], instance_id) + self.kill_server(new_servers[instance_id], instance_id) while True: - # 1. Get new placement group continuously until wait placement group ready timeouts. + # 1. Get new placement group continuously until wait placement group ready timeout. new_pg_queue = Queue() while True: new_pg = initialize_placement_group(lifetime="detached") if not self.last_pending_pg else self.last_pending_pg try: await asyncio.wait_for(new_pg.ready(), timeout=WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) + print("initialize new process group") new_pg_queue.put(new_pg) except asyncio.TimeoutError: + print("new placement group ready timeout") self.last_pending_pg = new_pg break - # 2. 
Deploy 1 entrypoints and 1 instance to 1 placement group (for all new placement groups). + print("Get new placement group done") + + # 2. Deploy 1 server and 1 instance to 1 placement group (for all new placement groups). new_pgs: Dict[str, PlacementGroup] = {} new_servers: Dict[str, FastAPIServer] = {} new_instances: Dict[str, Llumlet] = {} @@ -137,13 +155,17 @@ def instance_ready_callback(instance_id: str, fut): new_pgs[instance_id] = new_pg new_servers[instance_id] = FastAPIServer.from_args(instance_id, self.host, self.port, new_pg, lifetime="detached") new_instances[instance_id] = Llumlet.from_args(instance_id, new_pg, lifetime="detached") - # 3. Wait for all instances ready. (With the assumption that once instance ready, entrypoints is ready too.) + await new_instances[instance_id].ready.remote() + print("Deploy server and instance done") + + # 3. Wait for all instances ready. (With the assumption that once instance ready, server is ready too.) tasks = [] for instance_id, instance in new_instances.items(): - task = asyncio.gather(instance.ready().remote(), return_exceptions=True) - task.add_done_callback(partial(instance_ready_callback, isntance_id)) + task = asyncio.gather(instance.ready.remote(), return_exceptions=True) + task.add_done_callback(partial(instance_ready_callback, instance_id)) tasks.append(task) await asyncio.gather(*tasks, return_exceptions=True) + print("Wait all instances ready done") await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) @@ -152,31 +174,39 @@ def scale_up(self, placement_group: PlacementGroup, server: FastAPIServer, instance: Llumlet) -> None: + print(f"add placement group {instance_id}") self.pgs[instance_id] = placement_group + print(f"add server {instance_id}") self.servers[instance_id] = server + print(f"add instance {instance_id}") self.instances[instance_id] = instance def scale_down(self, instance_id: str) -> None: - self.kill_server(self.servers[instance_id]) - self.remove_placement_group(self.pgs[instance_id]) + self.kill_server(self.servers[instance_id], instance_id) + self.remove_placement_group(self.pgs[instance_id], instance_id) if instance_id in self.pgs: + print(f"del placement group {instance_id}") del self.pgs[instance_id] if instance_id in self.servers: + print(f"del server {instance_id}") del self.servers[instance_id] if instance_id in self.instances: + print(f"del instance {instance_id}") del self.instances[instance_id] def remove_placement_group(self, - placement_group: PlacementGroup) -> None: + placement_group: PlacementGroup, + instance_id: str) -> None: try: ray.util.remove_placement_group(placement_group) except Exception as e: print(f"try to remove placement group {instance_id}") def kill_server(self, - server: FastAPIServer) -> None: + server: FastAPIServer, + instance_id: str) -> None: try: - ray.kill(server[instance_id]) + ray.kill(server) # Exception that killing a died actor. 
except Exception as e: print(f"try to kill api server {instance_id}") @@ -187,10 +217,19 @@ def from_args(cls): max_restarts=-1, name="manager", namespace="llumnix", - lifetime="detached")(cls).options() + lifetime="detached")(cls) engine_manager = engine_manager_class.remote() return engine_manager if __name__ == "__main__": + ray.init() + + # magic actor + request_output_queue = RayQueue(actor_options={ + "namespace": "llumnix", + "name": "magic_queue" + }) engine_manager = LLMEngineManager.from_args() + + time.sleep(1000) From f6ac47aee4ae519b9a7fd3a7d0f035009f4af270 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 07:57:17 +0000 Subject: [PATCH 08/92] Add pg api test --- demo/placement_group_demo.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/demo/placement_group_demo.py b/demo/placement_group_demo.py index 9a6a41b1..11fcde83 100644 --- a/demo/placement_group_demo.py +++ b/demo/placement_group_demo.py @@ -3,7 +3,8 @@ import subprocess import ray from ray.util import placement_group_table -from ray.util.state import list_actors +from ray.util.state import (list_actors, + list_placement_groups) from manager_service_demo import (initialize_placement_group, FastAPIServer, @@ -88,6 +89,22 @@ async def test_pg_ready(): print("placement group 2 ready") except asyncio.TimeoutError: print("wait placement group 2 timeout") + +def test_pg_api(): + placement_group1 = initialize_placement_group() + placement_group2 = initialize_placement_group() + time.sleep(3) + all_pgs = list_placement_groups() + print(f"all placement groups: {all_pgs}") + all_pgs_detail = list_placement_groups(detail=True) + print(f"all placement groups (detail): {all_pgs_detail}") + pending_pgs = list_placement_groups(filters=[("state", "=", "PENDING")]) + print(f"pending placement groups: {pending_pgs}") + created_pgs = list_placement_groups(filters=[("state", "=", "CREATED")]) + print(f"created placement groups: {created_pgs}") + + print(f"placement group 1 state: {placement_group_table(placement_group1)}") + print(f"placement group 2 state: {placement_group_table(placement_group2)}") if __name__ == "__main__": @@ -103,4 +120,6 @@ async def test_pg_ready(): # test_pending(life_time_pg=None, lifetime_llumlet=None) - asyncio.run(test_pg_ready()) + # asyncio.run(test_pg_ready()) + + test_pg_api() From 075394b4d18446219cd0eb87473c71b65f82707f Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 08:21:31 +0000 Subject: [PATCH 09/92] New service codes done & Pylint --- demo/manager_service_demo.py | 179 ++++++++++++++++++++++++++--------- demo/placement_group_demo.py | 8 +- demo/serve_demo.py | 19 ++-- demo/serve_demo1.py | 22 +++-- demo/serve_demo2.py | 10 +- 5 files changed, 169 insertions(+), 69 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index 06d3f5b7..f4433045 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -8,26 +8,31 @@ import ray from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from ray.util.state import list_actors from ray.util.queue import Queue as RayQueue from llumnix.utils import random_uuid +PLACEMENT_GROUP_NAME_PREFIX = "pg_" +SERVER_NAME_PREFIX = "server_" +INSTANCE_NAME_PREFIX = "instance_" WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS = 1.0 -AUTO_DEPLOYMENT_INTERVAL_SECONDS = 10.0 +AUTO_DEPLOYMENT_INTERVAL_SECONDS = 1.0 app = FastAPI() +def get_placement_group_name(instance_id: str) -> str: + return 
f"{PLACEMENT_GROUP_NAME_PREFIX}{instance_id}" def get_server_name(instance_id: str) -> str: - return f"server_{instance_id}" + return f"{SERVER_NAME_PREFIX}{instance_id}" def get_instance_name(instance_id: str) -> str: - return f"instance_{instance_id}" + return f"{INSTANCE_NAME_PREFIX}{instance_id}" -def initialize_placement_group(lifetime: str = None) -> PlacementGroup: +def initialize_placement_group(lifetime: str = None, instance_id: str = None) -> PlacementGroup: placement_group_specs = ([{"CPU": 1}, {"CPU": 1, "GPU": 4}]) + placement_group_name = get_placement_group_name(instance_id) placement_group = ray.util.placement_group( - placement_group_specs, "STRICT_PACK", lifetime=lifetime) + placement_group_specs, "STRICT_PACK", lifetime=lifetime, name=placement_group_name) return placement_group @@ -85,7 +90,7 @@ def from_args(cls, ) llumlet = llumlet_class.remote(instance_id) return llumlet - + def ready(self) -> bool: return True @@ -101,34 +106,37 @@ def __init__(self): self.instances: Dict[str, Llumlet] = {} asyncio.create_task(self._auto_scale_up_loop()) asyncio.create_task(self._auto_scale_down_loop()) + asyncio.create_task(self._check_deployment_loop()) print("LLMEngineManager created") - + async def _auto_scale_down_loop(self) -> None: def instance_ready_callback(instance_id: str, fut): ret = fut.result()[0] if isinstance(ret, ray.exceptions.RayActorError): print(f"instance {instance_id} died, scale down") - self.scale_down(instance_id) + self._scale_down(instance_id) - for instance_id, instance in self.instances.items(): - task = asyncio.gather(instance.ready.remote(), return_exceptions=True) - task.add_done_callback(partial(instance_ready_callback, instance_id)) - tasks.append(task) - await asyncio.gather(*tasks, return_exceptions=True) + while True: + tasks = [] + for instance_id, instance in self.instances.items(): + task = asyncio.gather(instance.ready.remote(), return_exceptions=True) + task.add_done_callback(partial(instance_ready_callback, instance_id)) + tasks.append(task) + await asyncio.gather(*tasks, return_exceptions=True) - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) - async def _auto_scale_up_loop(self) -> None: + async def _auto_scale_up_loop_removed(self) -> None: def instance_ready_callback(instance_id: str, fut): ret = fut.result()[0] if not isinstance(ret, ray.exceptions.RayActorError): print(f"instance {instance_id} ready, scale up") - self.scale_up(instance_id, new_pgs[instance_id], new_servers[instance_id], new_instances[instance_id]) + self._scale_up(instance_id, new_pgs[instance_id], new_servers[instance_id], new_instances[instance_id]) new_servers[instance_id].run.remote() else: - print(f"instance {instace_id} died, abort") - self.remove_placement_group(new_pgs[instance_id], instance_id) - self.kill_server(new_servers[instance_id], instance_id) + print(f"instance {instance_id} died, abort") + self._remove_placement_group(new_pgs[instance_id], instance_id) + self._kill_server(new_servers[instance_id], instance_id) while True: # 1. Get new placement group continuously until wait placement group ready timeout. 
@@ -137,13 +145,12 @@ def instance_ready_callback(instance_id: str, fut): new_pg = initialize_placement_group(lifetime="detached") if not self.last_pending_pg else self.last_pending_pg try: await asyncio.wait_for(new_pg.ready(), timeout=WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) - print("initialize new process group") new_pg_queue.put(new_pg) except asyncio.TimeoutError: - print("new placement group ready timeout") + print("Wait new placement group ready timeout") self.last_pending_pg = new_pg break - print("Get new placement group done") + print("Get new placement group ready done") # 2. Deploy 1 server and 1 instance to 1 placement group (for all new placement groups). new_pgs: Dict[str, PlacementGroup] = {} @@ -155,7 +162,6 @@ def instance_ready_callback(instance_id: str, fut): new_pgs[instance_id] = new_pg new_servers[instance_id] = FastAPIServer.from_args(instance_id, self.host, self.port, new_pg, lifetime="detached") new_instances[instance_id] = Llumlet.from_args(instance_id, new_pg, lifetime="detached") - await new_instances[instance_id].ready.remote() print("Deploy server and instance done") # 3. Wait for all instances ready. (With the assumption that once instance ready, server is ready too.) @@ -168,22 +174,95 @@ def instance_ready_callback(instance_id: str, fut): print("Wait all instances ready done") await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) - - def scale_up(self, - instance_id: str, - placement_group: PlacementGroup, - server: FastAPIServer, - instance: Llumlet) -> None: + + async def _auto_scale_up_loop(self) -> None: + while True: + pending_pg_states = ray.util.state.list_placement_groups(filters=[("state", "=", "PENDING")]) + while len(pending_pg_states) > 1: + self._remove_placement_group(ray.util.get_placement_group(pending_pg_states.pop()["name"])) + new_pg = initialize_placement_group(lifetime="detached") if len(pending_pg_states) == 0 else pending_pg_states[0] + try: + await asyncio.wait_for(new_pg.ready(), timeout=WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) + print("Get new placement group ready done") + instance_id = random_uuid() + self._initialize_server_and_instance(instance_id, new_pg) + print("Deploy server and instance to placement group done") + except asyncio.TimeoutError: + print("Wait new placement group ready timeout") + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + + async def _check_deployment_loop(self) -> None: + while True: + curr_pgs: Dict[str, PlacementGroup] = {} + curr_servers: Dict[str, PlacementGroup] = {} + curr_instances: Dict[str, Llumlet] = {} + created_pg_states = ray.util.state.list_placement_groups(filters=[("state", "=", "CREATED")]) + for created_pg_state in created_pg_states: + instance_id = created_pg_state["name"].split("_")[-1] + curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"]) + alive_actor_states = ray.util.state.list_actors(filters=[("state", "=", "ALIVE")]) + for alive_actor_state in alive_actor_states: + if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): + instance_id = alive_actor_state["name"].split("_")[-1] + curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"]) + elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX): + instance_id = alive_actor_state["name"].split("_")[-1] + curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"]) + + assert len(curr_pgs) > max(len(curr_servers), len(curr_instances)) + + for instance_id in curr_pgs: + if instance_id not in curr_servers or instance_id not in curr_instances: + 
self._scale_down(instance_id) + + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + + def _initialize_server_and_instance(self, instance_id: str, placement_group: PlacementGroup): + def instance_ready_callback(instance_id: str, fut): + ret = fut.result()[0] + if not isinstance(ret, ray.exceptions.RayActorError): + print(f"instance {instance_id} ready, scale up") + self._scale_up(instance_id, placement_group, new_server, new_instance) + new_server.run.remote() + else: + print(f"instance {instance_id} died, abort scale up") + self._remove_placement_group(placement_group, instance_id) + self._kill_server(new_server, instance_id) + + new_server = FastAPIServer.from_args(instance_id, self.host, self.port, placement_group, lifetime="detached") + new_instance = Llumlet.from_args(instance_id, placement_group, lifetime="detached") + instance_ready_task = asyncio.gather(new_instance.ready.remote(), return_exceptions=True) + instance_ready_task.add_done_callback(partial(instance_ready_callback, instance_id)) + asyncio.create_task(instance_ready_task) + + def _scale_up(self, + instance_id: str, + placement_group: PlacementGroup, + server: FastAPIServer, + instance: Llumlet) -> None: print(f"add placement group {instance_id}") self.pgs[instance_id] = placement_group print(f"add server {instance_id}") self.servers[instance_id] = server print(f"add instance {instance_id}") self.instances[instance_id] = instance - - def scale_down(self, instance_id: str) -> None: - self.kill_server(self.servers[instance_id], instance_id) - self.remove_placement_group(self.pgs[instance_id], instance_id) + + def _scale_down(self, instance_id: str) -> None: + try: + server = ray.get_actor(get_server_name(instance_id)) + self._kill_server(server, instance_id) + except ValueError: + pass + try: + instance = ray.get_actor(get_instance_name(instance_id)) + self._kill_instance(instance, instance_id) + except ValueError: + pass + try: + placement_group = ray.util.get_placement_group(get_placement_group_name(instance_id)) + self._remove_placement_group(placement_group, instance_id) + except ValueError: + pass if instance_id in self.pgs: print(f"del placement group {instance_id}") del self.pgs[instance_id] @@ -194,23 +273,35 @@ def scale_down(self, instance_id: str) -> None: print(f"del instance {instance_id}") del self.instances[instance_id] - def remove_placement_group(self, - placement_group: PlacementGroup, - instance_id: str) -> None: + def _remove_placement_group(self, + placement_group: PlacementGroup, + instance_id: str = None) -> None: try: ray.util.remove_placement_group(placement_group) - except Exception as e: + # pylint: disable=broad-except + except Exception: print(f"try to remove placement group {instance_id}") - - def kill_server(self, - server: FastAPIServer, - instance_id: str) -> None: + + def _kill_server(self, + server: FastAPIServer, + instance_id: str = None) -> None: try: ray.kill(server) # Exception that killing a died actor. - except Exception as e: + # pylint: disable=broad-except + except Exception: print(f"try to kill api server {instance_id}") + def _kill_instance(self, + instance: Llumlet, + instance_id: str = None) -> None: + try: + ray.kill(instance) + # Exception that killing a died actor. 
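+            # (i.e. ray.kill raises if the instance actor has already exited;
+            # cleanup just logs the attempt and continues.)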
+ # pylint: disable=broad-except + except Exception: + print(f"try to kill instance {instance_id}") + @classmethod def from_args(cls): engine_manager_class = ray.remote(num_cpus=1, @@ -230,6 +321,6 @@ def from_args(cls): "namespace": "llumnix", "name": "magic_queue" }) - engine_manager = LLMEngineManager.from_args() + manager = LLMEngineManager.from_args() time.sleep(1000) diff --git a/demo/placement_group_demo.py b/demo/placement_group_demo.py index 11fcde83..9ee80d0e 100644 --- a/demo/placement_group_demo.py +++ b/demo/placement_group_demo.py @@ -1,13 +1,11 @@ import time import asyncio -import subprocess import ray from ray.util import placement_group_table from ray.util.state import (list_actors, list_placement_groups) from manager_service_demo import (initialize_placement_group, - FastAPIServer, Llumlet) @@ -15,7 +13,7 @@ def test_actor_if_pg_died(life_time_pg, lifetime_llumlet): print(f"### placement group lifetime: {life_time_pg}, llumlet lifetime: {lifetime_llumlet}") print("### create placement group and llumlet") placement_group = initialize_placement_group(lifetime=life_time_pg) - llumlet = Llumlet.from_args("0", placement_group, lifetime=lifetime_llumlet) + _ = Llumlet.from_args("0", placement_group, lifetime=lifetime_llumlet) print(f"placement group state: {placement_group_table(placement_group)}") print(f"llumlet state: {list_actors()}") print("### sleep 1s") @@ -89,7 +87,7 @@ async def test_pg_ready(): print("placement group 2 ready") except asyncio.TimeoutError: print("wait placement group 2 timeout") - + def test_pg_api(): placement_group1 = initialize_placement_group() placement_group2 = initialize_placement_group() @@ -119,7 +117,7 @@ def test_pg_api(): # test_pg_if_actor_died(life_time_pg=None, lifetime_llumlet="detached") # test_pending(life_time_pg=None, lifetime_llumlet=None) - + # asyncio.run(test_pg_ready()) test_pg_api() diff --git a/demo/serve_demo.py b/demo/serve_demo.py index cb291f5a..84ffaecd 100644 --- a/demo/serve_demo.py +++ b/demo/serve_demo.py @@ -1,16 +1,17 @@ import asyncio import time -import uvicorn import argparse +from contextlib import asynccontextmanager +import uvicorn import requests from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse -from contextlib import asynccontextmanager import ray from llumnix.queue.ray_queue_server import RayQueueServer +# pylint: disable=unused-argument @asynccontextmanager async def lifespan(fastapi_app: FastAPI): asyncio.create_task(request_output_queue.run_server_loop()) @@ -18,12 +19,13 @@ async def lifespan(fastapi_app: FastAPI): request_output_queue.cleanup() app = FastAPI(lifespan=lifespan) -request_output_queue = RayQueueServer() +request_output_queue = None @app.get("/is_ready") async def is_ready() -> bool: return True +# pylint: disable=unused-argument @app.post("/generate") async def generate(request: Request) -> Response: ret = {"text": ""} @@ -34,6 +36,7 @@ async def health() -> Response: """Health check.""" return Response(status_code=200) +# pylint: disable=unused-argument @app.post("/generate_stream") async def generate_stream(request: Request) -> StreamingResponse: async def number_generator(): @@ -47,7 +50,7 @@ class FastAPIServer: def __init__(self, host: str, port: int): self.host = host self.port = port - + def run(self): uvicorn.run(app, host=self.host, port=self.port) @@ -65,9 +68,11 @@ def from_args(cls, host: str, port: int): parser.add_argument("--port", type=int, default=8000) args = parser.parse_args() - fastapi_server = 
FastAPIServer.from_args(args.host, args.port) - fastapi_server.run.remote() - + request_output_queue = RayQueueServer() + + server = FastAPIServer.from_args(args.host, args.port) + server.run.remote() + time.sleep(5) ip_address = f"{args.host}:{args.port}" diff --git a/demo/serve_demo1.py b/demo/serve_demo1.py index 89fc5d8a..745d10e4 100644 --- a/demo/serve_demo1.py +++ b/demo/serve_demo1.py @@ -1,14 +1,12 @@ import asyncio import time -import uvicorn -from uvicorn import Config, Server import argparse +from contextlib import asynccontextmanager +import uvicorn import requests from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse -from contextlib import asynccontextmanager import ray -from ray import serve from llumnix.queue.zmq_server import ZmqServer from llumnix.queue.zmq_client import ZmqClient @@ -19,6 +17,7 @@ from llumnix.queue.ray_queue_server import RayQueueServer +# pylint: disable=unused-argument @asynccontextmanager async def lifespan(fastapi_app: FastAPI): # @@@ @@ -31,7 +30,7 @@ async def lifespan(fastapi_app: FastAPI): app = FastAPI(lifespan=lifespan) # @@@ -request_output_queue = RayQueueServer() +request_output_queue = None request_output_queue_server = None @@ -39,6 +38,7 @@ async def lifespan(fastapi_app: FastAPI): async def is_ready() -> bool: return True +# pylint: disable=unused-argument @app.post("/generate") async def generate(request: Request) -> Response: ret = {"text": ""} @@ -49,6 +49,7 @@ async def health() -> Response: """Health check.""" return Response(status_code=200) +# pylint: disable=unused-argument @app.post("/generate_stream") async def generate_stream(request: Request) -> StreamingResponse: async def number_generator(): @@ -82,10 +83,11 @@ def run(self): @classmethod def from_args(cls, host: str, port: int): fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints")(cls) - fastapi_server = fastapi_server_class.remote(host, port) + server = fastapi_server_class.remote(host, port) - return fastapi_server + return server +# pylint: disable=redefined-outer-name async def wait_request_output_queue_server_ready(request_output_queue_client: ZmqClient, server_info: ServerInfo): time.sleep(5) @@ -99,9 +101,11 @@ async def wait_request_output_queue_server_ready(request_output_queue_client: Zm parser.add_argument("--host", type=str, default='172.23.75.202') parser.add_argument("--port", type=int, default=8000) args = parser.parse_args() - + ray.init(namespace="llumnix") + request_output_queue = RayQueueServer() + # rpc_path = get_open_zmq_ipc_path(args.host, 8002) # request_output_queue_server = ZmqServer(rpc_path) request_output_queue_client = ZmqClient() @@ -110,7 +114,7 @@ async def wait_request_output_queue_server_ready(request_output_queue_client: Zm fastapi_server = FastAPIServer.from_args(args.host, args.port) fastapi_server.run.remote() - + time.sleep(5) ip_address = f"{args.host}:{args.port}" diff --git a/demo/serve_demo2.py b/demo/serve_demo2.py index 10e8a68d..11b152e3 100644 --- a/demo/serve_demo2.py +++ b/demo/serve_demo2.py @@ -11,7 +11,7 @@ app = FastAPI() # @@@ # request_output_queue = RayQueueServer() -request_output_queue = RayQueue() +request_output_queue = None class FastAPIServer: @@ -35,9 +35,11 @@ def from_args(cls, host: str, port: int): parser.add_argument("--host", type=str, default='localhost') parser.add_argument("--port", type=int, default=8000) args = parser.parse_args() - + ray.init(namespace="llumnix") - - fastapi_server = FastAPIServer.from_args(args.host, args.port) 
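# A hedged sketch (editor's addition, not part of the patch): the actor wrapping
# performed by FastAPIServer.from_args, written out with .options() so the actor
# name and lifetime are explicit. EchoServer is a placeholder class, and
# ray.init(namespace="llumnix") is assumed to have been called already.
import ray

@ray.remote(num_cpus=1)
class EchoServer:
    def ping(self) -> str:
        return "pong"

server = EchoServer.options(name="entrypoints", lifetime="detached").remote()
print(ray.get(server.ping.remote()))  # prints "pong"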
+ + request_output_queue = RayQueue() + + server = FastAPIServer.from_args(args.host, args.port) time.sleep(5) From c94b5448ea01be689717995d6b7ac73ae9662537 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 09:45:16 +0000 Subject: [PATCH 10/92] Refine codes --- demo/manager_service_demo.py | 253 ++++++++++++++++------------------- 1 file changed, 115 insertions(+), 138 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index f4433045..a15e8783 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -1,6 +1,6 @@ import asyncio -from queue import Queue import time +import traceback from typing import Dict from functools import partial import uvicorn @@ -11,6 +11,9 @@ from ray.util.queue import Queue as RayQueue from llumnix.utils import random_uuid +from llumnix.logger import init_logger + +logger = init_logger(__name__) PLACEMENT_GROUP_NAME_PREFIX = "pg_" SERVER_NAME_PREFIX = "server_" @@ -28,13 +31,55 @@ def get_server_name(instance_id: str) -> str: def get_instance_name(instance_id: str) -> str: return f"{INSTANCE_NAME_PREFIX}{instance_id}" -def initialize_placement_group(lifetime: str = None, instance_id: str = None) -> PlacementGroup: +def initialize_placement_group(instance_id: str = None, lifetime: str = None) -> PlacementGroup: placement_group_specs = ([{"CPU": 1}, {"CPU": 1, "GPU": 4}]) placement_group_name = get_placement_group_name(instance_id) placement_group = ray.util.placement_group( placement_group_specs, "STRICT_PACK", lifetime=lifetime, name=placement_group_name) return placement_group +def remove_placement_group(instance_id: str = None) -> bool: + placement_group = ray.util.get_placement_group(get_placement_group_name(instance_id)) + if not placement_group: + return False + try: + ray.util.remove_placement_group(placement_group) + # pylint: disable=broad-except + except Exception: + return False + return True + +def kill_server(instance_id: str = None) -> bool: + try: + server = ray.get_actor(get_server_name(instance_id)) + except ValueError: + return False + try: + ray.kill(server) + # pylint: disable=broad-except + except Exception: + return False + return True + +def kill_instance(instance_id: str = None) -> bool: + try: + instance = ray.get_actor(get_instance_name(instance_id)) + except ValueError: + return False + try: + ray.kill(instance) + return True + # pylint: disable=broad-except + except Exception: + return False + +def actor_exists(actor_name: str) -> bool: + try: + ray.get_actor(actor_name) + return True + except ValueError: + return False + class FastAPIServer: def __init__(self, instance_id: str, host: str, port: int): @@ -106,7 +151,7 @@ def __init__(self): self.instances: Dict[str, Llumlet] = {} asyncio.create_task(self._auto_scale_up_loop()) asyncio.create_task(self._auto_scale_down_loop()) - asyncio.create_task(self._check_deployment_loop()) + asyncio.create_task(self._check_deployment_correctness_loop()) print("LLMEngineManager created") async def _auto_scale_down_loop(self) -> None: @@ -117,105 +162,78 @@ def instance_ready_callback(instance_id: str, fut): self._scale_down(instance_id) while True: - tasks = [] - for instance_id, instance in self.instances.items(): - task = asyncio.gather(instance.ready.remote(), return_exceptions=True) - task.add_done_callback(partial(instance_ready_callback, instance_id)) - tasks.append(task) - await asyncio.gather(*tasks, return_exceptions=True) - - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) - - async def 
_auto_scale_up_loop_removed(self) -> None: - def instance_ready_callback(instance_id: str, fut): - ret = fut.result()[0] - if not isinstance(ret, ray.exceptions.RayActorError): - print(f"instance {instance_id} ready, scale up") - self._scale_up(instance_id, new_pgs[instance_id], new_servers[instance_id], new_instances[instance_id]) - new_servers[instance_id].run.remote() - else: - print(f"instance {instance_id} died, abort") - self._remove_placement_group(new_pgs[instance_id], instance_id) - self._kill_server(new_servers[instance_id], instance_id) + try: + tasks = [] + for instance_id, instance in self.instances.items(): + task = asyncio.gather(instance.ready.remote(), return_exceptions=True) + task.add_done_callback(partial(instance_ready_callback, instance_id)) + tasks.append(task) + await asyncio.gather(*tasks, return_exceptions=True) - while True: - # 1. Get new placement group continuously until wait placement group ready timeout. - new_pg_queue = Queue() - while True: - new_pg = initialize_placement_group(lifetime="detached") if not self.last_pending_pg else self.last_pending_pg - try: - await asyncio.wait_for(new_pg.ready(), timeout=WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) - new_pg_queue.put(new_pg) - except asyncio.TimeoutError: - print("Wait new placement group ready timeout") - self.last_pending_pg = new_pg - break - print("Get new placement group ready done") - - # 2. Deploy 1 server and 1 instance to 1 placement group (for all new placement groups). - new_pgs: Dict[str, PlacementGroup] = {} - new_servers: Dict[str, FastAPIServer] = {} - new_instances: Dict[str, Llumlet] = {} - while not new_pg_queue.empty(): - instance_id = random_uuid() - new_pg = new_pg_queue.get() - new_pgs[instance_id] = new_pg - new_servers[instance_id] = FastAPIServer.from_args(instance_id, self.host, self.port, new_pg, lifetime="detached") - new_instances[instance_id] = Llumlet.from_args(instance_id, new_pg, lifetime="detached") - print("Deploy server and instance done") - - # 3. Wait for all instances ready. (With the assumption that once instance ready, server is ready too.) 
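# A hedged sketch (editor's addition, not part of the patch) of the bounded wait
# on placement-group readiness used in this loop: pg.ready() returns an awaitable
# ObjectRef, so asyncio.wait_for can put a timeout on it. The bundle shape and
# timeout are illustrative only.
import asyncio
import ray

async def try_create_pg(timeout_s: float = 1.0) -> bool:
    pg = ray.util.placement_group([{"CPU": 1}], "STRICT_PACK")
    try:
        await asyncio.wait_for(pg.ready(), timeout=timeout_s)
        return True
    except asyncio.TimeoutError:
        # Drop the pending group so it does not keep reserving resources.
        ray.util.remove_placement_group(pg)
        return False

ray.init()
print(asyncio.run(try_create_pg()))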
- tasks = [] - for instance_id, instance in new_instances.items(): - task = asyncio.gather(instance.ready.remote(), return_exceptions=True) - task.add_done_callback(partial(instance_ready_callback, instance_id)) - tasks.append(task) - await asyncio.gather(*tasks, return_exceptions=True) - print("Wait all instances ready done") - - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + # pylint: disable=broad-except + except Exception as e: + logger.error("unexpected exception occurs: {}".format(e)) + logger.error("exception traceback: {}".format(traceback.format_exc())) async def _auto_scale_up_loop(self) -> None: while True: - pending_pg_states = ray.util.state.list_placement_groups(filters=[("state", "=", "PENDING")]) - while len(pending_pg_states) > 1: - self._remove_placement_group(ray.util.get_placement_group(pending_pg_states.pop()["name"])) - new_pg = initialize_placement_group(lifetime="detached") if len(pending_pg_states) == 0 else pending_pg_states[0] try: - await asyncio.wait_for(new_pg.ready(), timeout=WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) - print("Get new placement group ready done") + pending_pg_states = ray.util.state.list_placement_groups(filters=[("state", "=", "PENDING")]) + for pending_pg_state in pending_pg_states: + instance_id = pending_pg_state["name"].split("_")[-1] + self._scale_down(instance_id) instance_id = random_uuid() + new_pg = initialize_placement_group(instance_id, lifetime="detached") + await new_pg.ready() + print("Get new placement group ready done") self._initialize_server_and_instance(instance_id, new_pg) - print("Deploy server and instance to placement group done") - except asyncio.TimeoutError: - print("Wait new placement group ready timeout") - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + print("Deploy server and instance to new placement group done") + # pylint: disable=broad-except + except Exception as e: + logger.error("unexpected exception occurs: {}".format(e)) + logger.error("exception traceback: {}".format(traceback.format_exc())) - async def _check_deployment_loop(self) -> None: + async def _check_deployment_correctness_loop(self) -> None: while True: - curr_pgs: Dict[str, PlacementGroup] = {} - curr_servers: Dict[str, PlacementGroup] = {} - curr_instances: Dict[str, Llumlet] = {} - created_pg_states = ray.util.state.list_placement_groups(filters=[("state", "=", "CREATED")]) - for created_pg_state in created_pg_states: - instance_id = created_pg_state["name"].split("_")[-1] - curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"]) - alive_actor_states = ray.util.state.list_actors(filters=[("state", "=", "ALIVE")]) - for alive_actor_state in alive_actor_states: - if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): - instance_id = alive_actor_state["name"].split("_")[-1] - curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"]) - elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX): - instance_id = alive_actor_state["name"].split("_")[-1] - curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"]) - - assert len(curr_pgs) > max(len(curr_servers), len(curr_instances)) - - for instance_id in curr_pgs: - if instance_id not in curr_servers or instance_id not in curr_instances: - self._scale_down(instance_id) + try: + curr_pgs: Dict[str, PlacementGroup] = {} + curr_servers: Dict[str, PlacementGroup] = {} + curr_instances: Dict[str, Llumlet] = {} + created_pg_states = 
ray.util.state.list_placement_groups(filters=[("state", "=", "CREATED")]) + for created_pg_state in created_pg_states: + instance_id = created_pg_state["name"].split("_")[-1] + curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"]) + alive_actor_states = ray.util.state.list_actors(filters=[("state", "=", "ALIVE")]) + for alive_actor_state in alive_actor_states: + if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): + instance_id = alive_actor_state["name"].split("_")[-1] + curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"]) + elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX): + instance_id = alive_actor_state["name"].split("_")[-1] + curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"]) + + assert len(curr_pgs) > max(len(curr_servers), len(curr_instances)) + + for instance_id in curr_pgs: + if instance_id not in curr_servers or instance_id not in curr_instances: + self._scale_down(instance_id) + if instance_id in curr_pgs: + curr_pgs.pop(instance_id) + if instance_id in curr_servers: + curr_servers.pop(instance_id) + if instance_id in curr_instances: + curr_instances.pop(instance_id) + + self.pgs = curr_pgs + self.servers = curr_servers + self.instance = curr_instances - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + # pylint: disable=broad-except + except Exception as e: + logger.error("unexpected exception occurs: {}".format(e)) + logger.error("exception traceback: {}".format(traceback.format_exc())) def _initialize_server_and_instance(self, instance_id: str, placement_group: PlacementGroup): def instance_ready_callback(instance_id: str, fut): @@ -226,8 +244,8 @@ def instance_ready_callback(instance_id: str, fut): new_server.run.remote() else: print(f"instance {instance_id} died, abort scale up") - self._remove_placement_group(placement_group, instance_id) - self._kill_server(new_server, instance_id) + remove_placement_group(instance_id) + kill_server(instance_id) new_server = FastAPIServer.from_args(instance_id, self.host, self.port, placement_group, lifetime="detached") new_instance = Llumlet.from_args(instance_id, placement_group, lifetime="detached") @@ -248,21 +266,9 @@ def _scale_up(self, self.instances[instance_id] = instance def _scale_down(self, instance_id: str) -> None: - try: - server = ray.get_actor(get_server_name(instance_id)) - self._kill_server(server, instance_id) - except ValueError: - pass - try: - instance = ray.get_actor(get_instance_name(instance_id)) - self._kill_instance(instance, instance_id) - except ValueError: - pass - try: - placement_group = ray.util.get_placement_group(get_placement_group_name(instance_id)) - self._remove_placement_group(placement_group, instance_id) - except ValueError: - pass + kill_server(instance_id) + kill_instance(instance_id) + remove_placement_group(instance_id) if instance_id in self.pgs: print(f"del placement group {instance_id}") del self.pgs[instance_id] @@ -273,35 +279,6 @@ def _scale_down(self, instance_id: str) -> None: print(f"del instance {instance_id}") del self.instances[instance_id] - def _remove_placement_group(self, - placement_group: PlacementGroup, - instance_id: str = None) -> None: - try: - ray.util.remove_placement_group(placement_group) - # pylint: disable=broad-except - except Exception: - print(f"try to remove placement group {instance_id}") - - def _kill_server(self, - server: FastAPIServer, - instance_id: str = None) -> None: - try: - ray.kill(server) - # Exception 
that killing a died actor. - # pylint: disable=broad-except - except Exception: - print(f"try to kill api server {instance_id}") - - def _kill_instance(self, - instance: Llumlet, - instance_id: str = None) -> None: - try: - ray.kill(instance) - # Exception that killing a died actor. - # pylint: disable=broad-except - except Exception: - print(f"try to kill instance {instance_id}") - @classmethod def from_args(cls): engine_manager_class = ray.remote(num_cpus=1, From 1c7630992c2eec6c27e828e4a33761ec6bb6be4d Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 11:11:29 +0000 Subject: [PATCH 11/92] Test pg ready --- demo/placement_group_demo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/demo/placement_group_demo.py b/demo/placement_group_demo.py index 9ee80d0e..7fdd28b9 100644 --- a/demo/placement_group_demo.py +++ b/demo/placement_group_demo.py @@ -87,6 +87,9 @@ async def test_pg_ready(): print("placement group 2 ready") except asyncio.TimeoutError: print("wait placement group 2 timeout") + placement_group3 = initialize_placement_group() + ray.util.remove_placement_group(placement_group3) + await placement_group3.ready() def test_pg_api(): placement_group1 = initialize_placement_group() @@ -104,7 +107,6 @@ def test_pg_api(): print(f"placement group 1 state: {placement_group_table(placement_group1)}") print(f"placement group 2 state: {placement_group_table(placement_group2)}") - if __name__ == "__main__": # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet=None) # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet="detached") @@ -118,6 +120,6 @@ def test_pg_api(): # test_pending(life_time_pg=None, lifetime_llumlet=None) - # asyncio.run(test_pg_ready()) + asyncio.run(test_pg_ready()) - test_pg_api() + # test_pg_api() From 76c10b3c96879ce03cc2376b44fe4b3819f55114 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 11:35:04 +0000 Subject: [PATCH 12/92] Test threading uvicorn run --- demo/serve_demo.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/demo/serve_demo.py b/demo/serve_demo.py index 84ffaecd..d8eeef66 100644 --- a/demo/serve_demo.py +++ b/demo/serve_demo.py @@ -1,5 +1,6 @@ import asyncio import time +import threading import argparse from contextlib import asynccontextmanager import uvicorn @@ -50,9 +51,15 @@ class FastAPIServer: def __init__(self, host: str, port: int): self.host = host self.port = port + self.run_loop_thread = threading.Thread( + target=self._run_loop, args=(), daemon=True, name="run_loop" + ) - def run(self): + def _run_loop(self): uvicorn.run(app, host=self.host, port=self.port) + + def run(self): + self.run_loop_thread.start() @classmethod def from_args(cls, host: str, port: int): From 1bbe1e59107e52a7489a3a46ab318046492af0af Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 11:38:22 +0000 Subject: [PATCH 13/92] Refine --- demo/manager_service_demo.py | 39 ++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index a15e8783..0da66974 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -9,6 +9,7 @@ from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.queue import Queue as RayQueue +from ray.util import placement_group_table from llumnix.utils import random_uuid from llumnix.logger import init_logger @@ -33,6 +34,8 @@ def 
get_instance_name(instance_id: str) -> str: def initialize_placement_group(instance_id: str = None, lifetime: str = None) -> PlacementGroup: placement_group_specs = ([{"CPU": 1}, {"CPU": 1, "GPU": 4}]) + if instance_id is None: + instance_id = random_uuid() placement_group_name = get_placement_group_name(instance_id) placement_group = ray.util.placement_group( placement_group_specs, "STRICT_PACK", lifetime=lifetime, name=placement_group_name) @@ -43,7 +46,9 @@ def remove_placement_group(instance_id: str = None) -> bool: if not placement_group: return False try: + # asynchronous ray.util.remove_placement_group(placement_group) + print(f"remove placement group {instance_id}") # pylint: disable=broad-except except Exception: return False @@ -56,6 +61,7 @@ def kill_server(instance_id: str = None) -> bool: return False try: ray.kill(server) + print(f"kill server {instance_id}") # pylint: disable=broad-except except Exception: return False @@ -68,6 +74,7 @@ def kill_instance(instance_id: str = None) -> bool: return False try: ray.kill(instance) + print(f"kill instance {instance_id}") return True # pylint: disable=broad-except except Exception: @@ -87,9 +94,15 @@ def __init__(self, instance_id: str, host: str, port: int): self.port = port self.server_name = get_server_name(instance_id) print("FastAPIServer created") + self.run_loop_thread = threading.Thread( + target=self._run_loop, args=(), daemon=True, name="run_loop" + ) - def run(self) -> None: + def _run_loop(self): uvicorn.run(app, host=self.host, port=self.port) + + def run(self): + self.run_loop_thread.start() @classmethod def from_args(cls, @@ -111,6 +124,9 @@ def from_args(cls, fastapi_server = fastapi_server_class.remote(instance_id, host, port) return fastapi_server + def ready(self) -> bool: + return True + class Llumlet: def __init__(self, instance_id: str): @@ -158,7 +174,7 @@ async def _auto_scale_down_loop(self) -> None: def instance_ready_callback(instance_id: str, fut): ret = fut.result()[0] if isinstance(ret, ray.exceptions.RayActorError): - print(f"instance {instance_id} died, scale down") + print(f"server/instance {instance_id} died, scale down") self._scale_down(instance_id) while True: @@ -170,6 +186,13 @@ def instance_ready_callback(instance_id: str, fut): tasks.append(task) await asyncio.gather(*tasks, return_exceptions=True) + tasks = [] + for instance_id, server in self.servers.items(): + task = asyncio.gather(server.ready.remote(), return_exceptions=True) + task.add_done_callback(partial(instance_ready_callback, instance_id)) + tasks.append(task) + await asyncio.gather(*tasks, return_exceptions=True) + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) # pylint: disable=broad-except except Exception as e: @@ -185,7 +208,12 @@ async def _auto_scale_up_loop(self) -> None: self._scale_down(instance_id) instance_id = random_uuid() new_pg = initialize_placement_group(instance_id, lifetime="detached") - await new_pg.ready() + try: + await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) + except asyncio.TimeoutError: + print("Get new placement group ready timeout") + ayncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + continue print("Get new placement group ready done") self._initialize_server_and_instance(instance_id, new_pg) print("Deploy server and instance to new placement group done") @@ -240,12 +268,11 @@ def instance_ready_callback(instance_id: str, fut): ret = fut.result()[0] if not isinstance(ret, ray.exceptions.RayActorError): print(f"instance {instance_id} ready, scale up") - 
self._scale_up(instance_id, placement_group, new_server, new_instance) new_server.run.remote() + self._scale_up(instance_id, placement_group, new_server, new_instance) else: print(f"instance {instance_id} died, abort scale up") - remove_placement_group(instance_id) - kill_server(instance_id) + self._scale_down(instance_id) new_server = FastAPIServer.from_args(instance_id, self.host, self.port, placement_group, lifetime="detached") new_instance = Llumlet.from_args(instance_id, placement_group, lifetime="detached") From 39b885d029429068f1ea507257eb6fe1e9b54caf Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 11:44:15 +0000 Subject: [PATCH 14/92] Fix --- demo/manager_service_demo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index 0da66974..b977dba7 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -212,7 +212,7 @@ async def _auto_scale_up_loop(self) -> None: await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) except asyncio.TimeoutError: print("Get new placement group ready timeout") - ayncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + await ayncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) continue print("Get new placement group ready done") self._initialize_server_and_instance(instance_id, new_pg) @@ -253,6 +253,7 @@ async def _check_deployment_correctness_loop(self) -> None: if instance_id in curr_instances: curr_instances.pop(instance_id) + # (TODO: s5u13b): double check self.pgs = curr_pgs self.servers = curr_servers self.instance = curr_instances From 1abb28ea9355de99c79c5d0ca9b4a08329213a3d Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 11:55:44 +0000 Subject: [PATCH 15/92] Add actor demo --- demo/actor_demo.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 demo/actor_demo.py diff --git a/demo/actor_demo.py b/demo/actor_demo.py new file mode 100644 index 00000000..e182d66a --- /dev/null +++ b/demo/actor_demo.py @@ -0,0 +1,24 @@ +import ray + +from manager_service_demo import (initialize_placement_group, + Llumlet, + get_instance_name) + +from llumnix.utils import random_uuid + + +def test_get_died_actor(): + placement_group = initialize_placement_group() + instance_id = random_uuid() + llumlet = Llumlet.from_args(instance_id, placement_group) + ray.get(llumlet.ready.remote()) + ray.get_actor(get_instance_name(instance_id), namespace="llumnix") + ray.kill(llumlet) + try: + ray.get_actor(get_instance_name(instance_id)) + print("Get died actor successfully") + except ValueError: + print("Get died actor failed") + +if __name__ == "__main__": + test_get_died_actor() From 135e4ab9d7009d94ffaba1b4ee4776b701a49721 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 11:56:43 +0000 Subject: [PATCH 16/92] Specify get_actor namespace --- demo/manager_service_demo.py | 16 ++++++++-------- demo/serve_demo.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index b977dba7..dfae1f1f 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -9,7 +9,6 @@ from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.queue import Queue as RayQueue -from ray.util import placement_group_table from llumnix.utils import random_uuid from llumnix.logger import init_logger @@ -56,7 +55,7 @@ def remove_placement_group(instance_id: 
str = None) -> bool: def kill_server(instance_id: str = None) -> bool: try: - server = ray.get_actor(get_server_name(instance_id)) + server = ray.get_actor(get_server_name(instance_id), namespace="llumnix") except ValueError: return False try: @@ -69,7 +68,7 @@ def kill_server(instance_id: str = None) -> bool: def kill_instance(instance_id: str = None) -> bool: try: - instance = ray.get_actor(get_instance_name(instance_id)) + instance = ray.get_actor(get_instance_name(instance_id), namespace="llumnix") except ValueError: return False try: @@ -82,7 +81,7 @@ def kill_instance(instance_id: str = None) -> bool: def actor_exists(actor_name: str) -> bool: try: - ray.get_actor(actor_name) + ray.get_actor(actor_name, namespace="llumnix") return True except ValueError: return False @@ -100,7 +99,7 @@ def __init__(self, instance_id: str, host: str, port: int): def _run_loop(self): uvicorn.run(app, host=self.host, port=self.port) - + def run(self): self.run_loop_thread.start() @@ -236,10 +235,10 @@ async def _check_deployment_correctness_loop(self) -> None: for alive_actor_state in alive_actor_states: if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): instance_id = alive_actor_state["name"].split("_")[-1] - curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"]) + curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX): instance_id = alive_actor_state["name"].split("_")[-1] - curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"]) + curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") assert len(curr_pgs) > max(len(curr_servers), len(curr_instances)) @@ -328,4 +327,5 @@ def from_args(cls): }) manager = LLMEngineManager.from_args() - time.sleep(1000) + while True: + time.sleep(100) diff --git a/demo/serve_demo.py b/demo/serve_demo.py index d8eeef66..c57cd29e 100644 --- a/demo/serve_demo.py +++ b/demo/serve_demo.py @@ -57,7 +57,7 @@ def __init__(self, host: str, port: int): def _run_loop(self): uvicorn.run(app, host=self.host, port=self.port) - + def run(self): self.run_loop_thread.start() From 21c1552a32372c4258fc0658e3af6aec30d684c3 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 23 Dec 2024 12:02:32 +0000 Subject: [PATCH 17/92] Fix TODO --- demo/manager_service_demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index dfae1f1f..303a98e6 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -252,7 +252,7 @@ async def _check_deployment_correctness_loop(self) -> None: if instance_id in curr_instances: curr_instances.pop(instance_id) - # (TODO: s5u13b): double check + # TODO(s5u13b): double check self.pgs = curr_pgs self.servers = curr_servers self.instance = curr_instances From 217c3e4dac88f6ac57a5b6dfe2a803ba1b2cd68c Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 02:33:14 +0000 Subject: [PATCH 18/92] Testing service --- demo/manager_service_demo.py | 54 +++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index 303a98e6..7de88f82 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -1,6 +1,7 @@ import asyncio import time import traceback +import threading from typing import Dict from functools import partial import uvicorn @@ -9,12 +10,13 @@ from ray.util.placement_group import 
PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.queue import Queue as RayQueue +from ray.util.state import (list_actors, + list_placement_groups) + from llumnix.utils import random_uuid from llumnix.logger import init_logger -logger = init_logger(__name__) - PLACEMENT_GROUP_NAME_PREFIX = "pg_" SERVER_NAME_PREFIX = "server_" INSTANCE_NAME_PREFIX = "instance_" @@ -166,7 +168,7 @@ def __init__(self): self.instances: Dict[str, Llumlet] = {} asyncio.create_task(self._auto_scale_up_loop()) asyncio.create_task(self._auto_scale_down_loop()) - asyncio.create_task(self._check_deployment_correctness_loop()) + # asyncio.create_task(self._check_deployment_correctness_loop()) print("LLMEngineManager created") async def _auto_scale_down_loop(self) -> None: @@ -195,13 +197,14 @@ def instance_ready_callback(instance_id: str, fut): await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) # pylint: disable=broad-except except Exception as e: - logger.error("unexpected exception occurs: {}".format(e)) - logger.error("exception traceback: {}".format(traceback.format_exc())) + print("unexpected exception occurs: {}".format(e)) + print("exception traceback: {}".format(traceback.format_exc())) async def _auto_scale_up_loop(self) -> None: while True: try: - pending_pg_states = ray.util.state.list_placement_groups(filters=[("state", "=", "PENDING")]) + pending_pg_states = list_placement_groups(filters=[("state", "=", "PENDING")]) + print(f"pending_pg_states: {pending_pg_states}") for pending_pg_state in pending_pg_states: instance_id = pending_pg_state["name"].split("_")[-1] self._scale_down(instance_id) @@ -211,15 +214,15 @@ async def _auto_scale_up_loop(self) -> None: await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) except asyncio.TimeoutError: print("Get new placement group ready timeout") - await ayncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) + await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) continue print("Get new placement group ready done") self._initialize_server_and_instance(instance_id, new_pg) print("Deploy server and instance to new placement group done") # pylint: disable=broad-except except Exception as e: - logger.error("unexpected exception occurs: {}".format(e)) - logger.error("exception traceback: {}".format(traceback.format_exc())) + print("unexpected exception occurs: {}".format(e)) + print("exception traceback: {}".format(traceback.format_exc())) async def _check_deployment_correctness_loop(self) -> None: while True: @@ -227,11 +230,11 @@ async def _check_deployment_correctness_loop(self) -> None: curr_pgs: Dict[str, PlacementGroup] = {} curr_servers: Dict[str, PlacementGroup] = {} curr_instances: Dict[str, Llumlet] = {} - created_pg_states = ray.util.state.list_placement_groups(filters=[("state", "=", "CREATED")]) + created_pg_states = list_placement_groups(filters=[("state", "=", "CREATED")]) for created_pg_state in created_pg_states: instance_id = created_pg_state["name"].split("_")[-1] curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"]) - alive_actor_states = ray.util.state.list_actors(filters=[("state", "=", "ALIVE")]) + alive_actor_states = list_actors(filters=[("state", "=", "ALIVE")]) for alive_actor_state in alive_actor_states: if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): instance_id = alive_actor_state["name"].split("_")[-1] @@ -240,7 +243,7 @@ async def _check_deployment_correctness_loop(self) -> None: instance_id = 
alive_actor_state["name"].split("_")[-1] curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") - assert len(curr_pgs) > max(len(curr_servers), len(curr_instances)) + assert len(curr_pgs) >= max(len(curr_servers), len(curr_instances)) for instance_id in curr_pgs: if instance_id not in curr_servers or instance_id not in curr_instances: @@ -260,25 +263,23 @@ async def _check_deployment_correctness_loop(self) -> None: await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) # pylint: disable=broad-except except Exception as e: - logger.error("unexpected exception occurs: {}".format(e)) - logger.error("exception traceback: {}".format(traceback.format_exc())) + print("unexpected exception occurs: {}".format(e)) + print("exception traceback: {}".format(traceback.format_exc())) def _initialize_server_and_instance(self, instance_id: str, placement_group: PlacementGroup): - def instance_ready_callback(instance_id: str, fut): - ret = fut.result()[0] - if not isinstance(ret, ray.exceptions.RayActorError): + async def wait_instance_ready(instance_id: str): + try: + await new_instance.ready.remote() print(f"instance {instance_id} ready, scale up") new_server.run.remote() self._scale_up(instance_id, placement_group, new_server, new_instance) - else: + except ray.exceptions.RayActorError: print(f"instance {instance_id} died, abort scale up") self._scale_down(instance_id) new_server = FastAPIServer.from_args(instance_id, self.host, self.port, placement_group, lifetime="detached") new_instance = Llumlet.from_args(instance_id, placement_group, lifetime="detached") - instance_ready_task = asyncio.gather(new_instance.ready.remote(), return_exceptions=True) - instance_ready_task.add_done_callback(partial(instance_ready_callback, instance_id)) - asyncio.create_task(instance_ready_task) + asyncio.create_task(wait_instance_ready(instance_id)) def _scale_up(self, instance_id: str, @@ -297,14 +298,15 @@ def _scale_down(self, instance_id: str) -> None: kill_instance(instance_id) remove_placement_group(instance_id) if instance_id in self.pgs: - print(f"del placement group {instance_id}") - del self.pgs[instance_id] + print(f"pop placement group {instance_id}") + # Don't use del here. 
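# (Editor's note: a hedged guess at the rationale, not part of the patch.)
# For a key that is present, dict.pop() and del behave the same; the difference
# shows up under concurrent removal. With both the auto-scale-down loop and the
# deployment-check loop able to scale down the same instance, pop() with a
# default stays a no-op on a second removal, whereas del raises KeyError:
#
#     self.pgs.pop(instance_id, None)   # safe if another loop already removed it
#     del self.pgs[instance_id]         # raises KeyError the second time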
+ self.pgs.pop(instance_id) if instance_id in self.servers: - print(f"del server {instance_id}") - del self.servers[instance_id] + print(f"pop server {instance_id}") + self.servers.pop(instance_id) if instance_id in self.instances: print(f"del instance {instance_id}") - del self.instances[instance_id] + self.instances.pop(instance_id) @classmethod def from_args(cls): From 6f2a80978317193a7660759827e9ec4f3c1f34ab Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 03:46:56 +0000 Subject: [PATCH 19/92] Fix _check_deployment_correctness_loop --- demo/manager_service_demo.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index 7de88f82..c8f54283 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -15,13 +15,13 @@ from llumnix.utils import random_uuid -from llumnix.logger import init_logger PLACEMENT_GROUP_NAME_PREFIX = "pg_" SERVER_NAME_PREFIX = "server_" INSTANCE_NAME_PREFIX = "instance_" WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS = 1.0 AUTO_DEPLOYMENT_INTERVAL_SECONDS = 1.0 +CHECK_DEPLOYMENT_CORRECTNESS_INTERVAL_SECONDS = 5.0 app = FastAPI() @@ -167,8 +167,8 @@ def __init__(self): self.servers: Dict[str, FastAPIServer] = {} self.instances: Dict[str, Llumlet] = {} asyncio.create_task(self._auto_scale_up_loop()) - asyncio.create_task(self._auto_scale_down_loop()) - # asyncio.create_task(self._check_deployment_correctness_loop()) + # asyncio.create_task(self._auto_scale_down_loop()) + asyncio.create_task(self._check_deployment_correctness_loop()) print("LLMEngineManager created") async def _auto_scale_down_loop(self) -> None: @@ -225,6 +225,11 @@ async def _auto_scale_up_loop(self) -> None: print("exception traceback: {}".format(traceback.format_exc())) async def _check_deployment_correctness_loop(self) -> None: + async def detect_correctness_task(instance_id: str): + await asyncio.sleep(CHECK_DEPLOYMENT_CORRECTNESS_INTERVAL_SECONDS) + if instance_id in self.pgs and (instance_id not in self.servers or instance_id not in self.instances): + self._scale_down(instance_id) + while True: try: curr_pgs: Dict[str, PlacementGroup] = {} @@ -245,20 +250,11 @@ async def _check_deployment_correctness_loop(self) -> None: assert len(curr_pgs) >= max(len(curr_servers), len(curr_instances)) + tasks = [] for instance_id in curr_pgs: if instance_id not in curr_servers or instance_id not in curr_instances: - self._scale_down(instance_id) - if instance_id in curr_pgs: - curr_pgs.pop(instance_id) - if instance_id in curr_servers: - curr_servers.pop(instance_id) - if instance_id in curr_instances: - curr_instances.pop(instance_id) - - # TODO(s5u13b): double check - self.pgs = curr_pgs - self.servers = curr_servers - self.instance = curr_instances + tasks.append(asyncio.create_task(detect_correctness_task(instance_id))) + await asyncio.gather(*tasks, return_exceptions=True) await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) # pylint: disable=broad-except @@ -305,7 +301,7 @@ def _scale_down(self, instance_id: str) -> None: print(f"pop server {instance_id}") self.servers.pop(instance_id) if instance_id in self.instances: - print(f"del instance {instance_id}") + print(f"pop instance {instance_id}") self.instances.pop(instance_id) @classmethod From 4fe40b8da1ebbf5d360cf2352d9481d008314087 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 05:48:21 +0000 Subject: [PATCH 20/92] Func _check_deployment_states_correctness_loop done --- demo/manager_service_demo.py | 49 
++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index c8f54283..546e989e 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -2,7 +2,7 @@ import time import traceback import threading -from typing import Dict +from typing import Dict, Tuple from functools import partial import uvicorn from fastapi import FastAPI @@ -157,6 +157,28 @@ def ready(self) -> bool: return True +def get_curr_deployment_states() -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIServer], Dict[str, Llumlet]]: + curr_pgs: Dict[str, PlacementGroup] = {} + curr_servers: Dict[str, PlacementGroup] = {} + curr_instances: Dict[str, Llumlet] = {} + + created_pg_states = list_placement_groups(filters=[("state", "=", "CREATED")]) + for created_pg_state in created_pg_states: + instance_id = created_pg_state["name"].split("_")[-1] + curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"]) + + alive_actor_states = list_actors(filters=[("state", "=", "ALIVE")]) + for alive_actor_state in alive_actor_states: + if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): + instance_id = alive_actor_state["name"].split("_")[-1] + curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") + elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX): + instance_id = alive_actor_state["name"].split("_")[-1] + curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") + + return curr_pgs, curr_servers, curr_instances + + class LLMEngineManager: def __init__(self): print("create LLMEngineManager") @@ -168,7 +190,7 @@ def __init__(self): self.instances: Dict[str, Llumlet] = {} asyncio.create_task(self._auto_scale_up_loop()) # asyncio.create_task(self._auto_scale_down_loop()) - asyncio.create_task(self._check_deployment_correctness_loop()) + asyncio.create_task(self._check_deployment_states_correctness_loop()) print("LLMEngineManager created") async def _auto_scale_down_loop(self) -> None: @@ -214,6 +236,7 @@ async def _auto_scale_up_loop(self) -> None: await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) except asyncio.TimeoutError: print("Get new placement group ready timeout") + ray.util.remove_placement_group(new_pg) await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) continue print("Get new placement group ready done") @@ -224,29 +247,17 @@ async def _auto_scale_up_loop(self) -> None: print("unexpected exception occurs: {}".format(e)) print("exception traceback: {}".format(traceback.format_exc())) - async def _check_deployment_correctness_loop(self) -> None: + async def _check_deployment_states_correctness_loop(self) -> None: async def detect_correctness_task(instance_id: str): + print(f"detect instance {instance_id}") await asyncio.sleep(CHECK_DEPLOYMENT_CORRECTNESS_INTERVAL_SECONDS) - if instance_id in self.pgs and (instance_id not in self.servers or instance_id not in self.instances): + curr_pgs, curr_servers, curr_instances = get_curr_deployment_states() + if instance_id in curr_pgs and (instance_id not in curr_servers or instance_id not in curr_instances): self._scale_down(instance_id) while True: try: - curr_pgs: Dict[str, PlacementGroup] = {} - curr_servers: Dict[str, PlacementGroup] = {} - curr_instances: Dict[str, Llumlet] = {} - created_pg_states = list_placement_groups(filters=[("state", "=", "CREATED")]) - for created_pg_state in created_pg_states: - instance_id = 
created_pg_state["name"].split("_")[-1] - curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"]) - alive_actor_states = list_actors(filters=[("state", "=", "ALIVE")]) - for alive_actor_state in alive_actor_states: - if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): - instance_id = alive_actor_state["name"].split("_")[-1] - curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") - elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX): - instance_id = alive_actor_state["name"].split("_")[-1] - curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") + curr_pgs, curr_servers, curr_instances = get_curr_deployment_states() assert len(curr_pgs) >= max(len(curr_servers), len(curr_instances)) From 2764e536416d1d06fe26dbede3dee0a291249cd3 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 06:10:47 +0000 Subject: [PATCH 21/92] Test restart done --- demo/manager_service_demo.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index 546e989e..f0420bf2 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -157,7 +157,7 @@ def ready(self) -> bool: return True -def get_curr_deployment_states() -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIServer], Dict[str, Llumlet]]: +def get_curr_deployment() -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIServer], Dict[str, Llumlet]]: curr_pgs: Dict[str, PlacementGroup] = {} curr_servers: Dict[str, PlacementGroup] = {} curr_instances: Dict[str, Llumlet] = {} @@ -188,9 +188,10 @@ def __init__(self): self.pgs: Dict[str, PlacementGroup] = {} self.servers: Dict[str, FastAPIServer] = {} self.instances: Dict[str, Llumlet] = {} + self._connect_to_existing_deployment() asyncio.create_task(self._auto_scale_up_loop()) - # asyncio.create_task(self._auto_scale_down_loop()) - asyncio.create_task(self._check_deployment_states_correctness_loop()) + asyncio.create_task(self._auto_scale_down_loop()) + asyncio.create_task(self._check_deployment_states_loop()) print("LLMEngineManager created") async def _auto_scale_down_loop(self) -> None: @@ -247,17 +248,17 @@ async def _auto_scale_up_loop(self) -> None: print("unexpected exception occurs: {}".format(e)) print("exception traceback: {}".format(traceback.format_exc())) - async def _check_deployment_states_correctness_loop(self) -> None: + async def _check_deployment_states_loop(self) -> None: async def detect_correctness_task(instance_id: str): print(f"detect instance {instance_id}") await asyncio.sleep(CHECK_DEPLOYMENT_CORRECTNESS_INTERVAL_SECONDS) - curr_pgs, curr_servers, curr_instances = get_curr_deployment_states() + curr_pgs, curr_servers, curr_instances = get_curr_deployment() if instance_id in curr_pgs and (instance_id not in curr_servers or instance_id not in curr_instances): self._scale_down(instance_id) while True: try: - curr_pgs, curr_servers, curr_instances = get_curr_deployment_states() + curr_pgs, curr_servers, curr_instances = get_curr_deployment() assert len(curr_pgs) >= max(len(curr_servers), len(curr_instances)) @@ -288,6 +289,13 @@ async def wait_instance_ready(instance_id: str): new_instance = Llumlet.from_args(instance_id, placement_group, lifetime="detached") asyncio.create_task(wait_instance_ready(instance_id)) + def _connect_to_existing_deployment(self): + self.pgs, self.servers, self.instances = get_curr_deployment() + correct_instance_id_set = 
set(self.pgs.keys()).intersection(self.servers.keys(), self.instances.keys()) + print(f"connect to instances: {correct_instance_id_set}") + for instance_id in correct_instance_id_set: + self._scale_up(instance_id, self.pgs[instance_id], self.servers[instance_id], self.instances[instance_id]) + def _scale_up(self, instance_id: str, placement_group: PlacementGroup, From 907050c2f5ada3adb32cab752406cb0fdd3a3db1 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 06:52:49 +0000 Subject: [PATCH 22/92] Simplify deployment --- examlpes/offline_inference.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py index 4b6bc5d3..c3f550f3 100644 --- a/examlpes/offline_inference.py +++ b/examlpes/offline_inference.py @@ -51,6 +51,13 @@ ray.get(manager.scale_up.remote(instance_ids, llumlets)) +# Create llumlets. +llumlet_ids: List[str] = None +llumlets: List[Llumlet] = None +llumlet_ids, llumlets = ray.get(engine_manager.init_llumlets.remote( + engine_args, QueueType("rayqueue"), BackendType.VLLM, 1, +)) + # The requests‘ outputs will be put to the request_output_queue no matter which instance it's running in. server_id = random_uuid() request_output_queue = RayQueueServer() From fb688794420f11de24e5f3db30079a3880572e59 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 07:22:19 +0000 Subject: [PATCH 23/92] Rename manager --- examlpes/offline_inference.py | 2 +- llumnix/llumlet/llumlet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py index c3f550f3..053caf0d 100644 --- a/examlpes/offline_inference.py +++ b/examlpes/offline_inference.py @@ -54,7 +54,7 @@ # Create llumlets. llumlet_ids: List[str] = None llumlets: List[Llumlet] = None -llumlet_ids, llumlets = ray.get(engine_manager.init_llumlets.remote( +llumlet_ids, llumlets = ray.get(manager.init_llumlets.remote( engine_args, QueueType("rayqueue"), BackendType.VLLM, 1, )) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index d656ed82..76fd0e3e 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -123,7 +123,7 @@ def from_args(cls, logger.error("Failed to initialize llumlet: {}".format(e)) logger.error("exception traceback: {}".format(traceback.format_exc())) - return llumlet + return llumlet_class async def _check_engine_state_loop(self): while True: From aa9db568d1c77227363046ade105915f0405d3c2 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 08:10:52 +0000 Subject: [PATCH 24/92] Fix unit test of llumlet --- tests/unit_test/llumlet/test_engine_step_exception.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit_test/llumlet/test_engine_step_exception.py b/tests/unit_test/llumlet/test_engine_step_exception.py index b5ea1749..28a92a8d 100644 --- a/tests/unit_test/llumlet/test_engine_step_exception.py +++ b/tests/unit_test/llumlet/test_engine_step_exception.py @@ -62,6 +62,7 @@ def test_engine_step_exception(ray_env): origin_free_memory, _ = torch.cuda.mem_get_info() actor_name = "instance_0" + placement_group = initialize_placement_group(instance_id="0", world_size=1, detached=True) llumlet = MockLlumlet.options(name=actor_name, namespace='llumnix', scheduling_strategy=scheduling_strategy).remote( instance_id="0", From c3bb8d036b1840afcf371bbe0b83d7aaf605bf93 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 08:39:50 +0000 Subject: [PATCH 25/92] Fix backends unit test --- llumnix/llumlet/llumlet.py | 2 +- 
tests/unit_test/llumlet/test_engine_step_exception.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 76fd0e3e..d656ed82 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -123,7 +123,7 @@ def from_args(cls, logger.error("Failed to initialize llumlet: {}".format(e)) logger.error("exception traceback: {}".format(traceback.format_exc())) - return llumlet_class + return llumlet async def _check_engine_state_loop(self): while True: diff --git a/tests/unit_test/llumlet/test_engine_step_exception.py b/tests/unit_test/llumlet/test_engine_step_exception.py index 28a92a8d..b5ea1749 100644 --- a/tests/unit_test/llumlet/test_engine_step_exception.py +++ b/tests/unit_test/llumlet/test_engine_step_exception.py @@ -62,7 +62,6 @@ def test_engine_step_exception(ray_env): origin_free_memory, _ = torch.cuda.mem_get_info() actor_name = "instance_0" - placement_group = initialize_placement_group(instance_id="0", world_size=1, detached=True) llumlet = MockLlumlet.options(name=actor_name, namespace='llumnix', scheduling_strategy=scheduling_strategy).remote( instance_id="0", From c4acd925d29c943e2782452878c7dab5352cf667 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 09:04:59 +0000 Subject: [PATCH 26/92] Fix offline test --- examlpes/offline_inference.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py index 053caf0d..edca0a40 100644 --- a/examlpes/offline_inference.py +++ b/examlpes/offline_inference.py @@ -52,12 +52,14 @@ ray.get(manager.scale_up.remote(instance_ids, llumlets)) # Create llumlets. -llumlet_ids: List[str] = None +instance_ids: List[str] = None llumlets: List[Llumlet] = None -llumlet_ids, llumlets = ray.get(manager.init_llumlets.remote( +instance_ids, llumlets = ray.get(manager.init_llumlets.remote( engine_args, QueueType("rayqueue"), BackendType.VLLM, 1, )) +ray.get(manager.scale_up.remote(instance_ids, llumlets)) + # The requests‘ outputs will be put to the request_output_queue no matter which instance it's running in. server_id = random_uuid() request_output_queue = RayQueueServer() From cb9137fc7a1e0d3b59468b6a6e05b9034f81bd86 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 09:53:28 +0000 Subject: [PATCH 27/92] Fix e2e test --- tests/e2e_test/test_e2e.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index 3fa3b3a2..d9e90ea9 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -83,6 +83,8 @@ async def test_e2e(ray_env, shutdown_llumnix_service, model, migration_backend): ray.shutdown() await asyncio.sleep(5) + + # TODO(s5u13b): Fix ray autoscaler failure. # generate llumnix outputs ip = "127.0.0.1" From 08b8c19a87a0adfed4b3ba5c31c0396aa8146d16 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 24 Dec 2024 10:38:08 +0000 Subject: [PATCH 28/92] Remove pg --- tests/e2e_test/test_e2e.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py index d9e90ea9..3fa3b3a2 100644 --- a/tests/e2e_test/test_e2e.py +++ b/tests/e2e_test/test_e2e.py @@ -83,8 +83,6 @@ async def test_e2e(ray_env, shutdown_llumnix_service, model, migration_backend): ray.shutdown() await asyncio.sleep(5) - - # TODO(s5u13b): Fix ray autoscaler failure. 
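# A hedged sketch (editor's addition, not part of the patch): the driver pattern
# used by the offline example in these patches, i.e. create instances through
# the manager actor and then register them with scale_up. StubManager is a toy
# stand-in so the snippet runs on its own; in the real code init_llumlets
# returns instance ids together with Llumlet actor handles.
import ray

@ray.remote
class StubManager:
    def init_llumlets(self, n: int):
        ids = [f"instance_{i}" for i in range(n)]
        return ids, [f"llumlet-{iid}" for iid in ids]

    def scale_up(self, ids, llumlets):
        return len(ids)

ray.init()
manager = StubManager.remote()
instance_ids, llumlets = ray.get(manager.init_llumlets.remote(2))
print(ray.get(manager.scale_up.remote(instance_ids, llumlets)))  # prints 2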

     # generate llumnix outputs
     ip = "127.0.0.1"

From eda4e1cce5b15c5eb399c1779c02e2ccdd88cb76 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Tue, 24 Dec 2024 11:16:53 +0000
Subject: [PATCH 29/92] Fix lint & Support pg management in manager

---
 llumnix/llm_engine_manager.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index 83f89c37..94a037cf 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -510,6 +510,7 @@ def init_llumlets(self,
                 engine_manager_args.create_migration_config(),
                 placement_group,
                 engine_args,
+                placement_group,
                 *args,
                 **kwargs
             )

From fca07ae5e228b975059ad2770e1a2f1e47c3c43e Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Tue, 24 Dec 2024 11:24:16 +0000
Subject: [PATCH 30/92] Fix offline test

---
 llumnix/llm_engine_manager.py | 1 +
 llumnix/llumlet/llumlet.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index 94a037cf..8268a097 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -511,6 +511,7 @@ def init_llumlets(self,
                 placement_group,
                 engine_args,
                 placement_group,
+                engine_args,
                 *args,
                 **kwargs
             )
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index d656ed82..2913dcd5 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -87,6 +87,7 @@ def from_args(cls,
         num_gpus = world_size
         instance_name = get_instance_name(instance_id)
         if backend_type in [backend_type.VLLM, backend_type.BLADELLM]:
+            kwargs["placement_group"] = placement_group
             llumlet_class = ray.remote(num_cpus=1,
                                        num_gpus=num_gpus,
                                        name=instance_name,

From adb7576b473e42c27c3b0d0b76aba615daf0bddb Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Tue, 24 Dec 2024 11:49:16 +0000
Subject: [PATCH 31/92] Add TODO & Fix get_instance_name

---
 llumnix/llm_engine_manager.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index 8268a097..309aff43 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -39,6 +39,7 @@
                            INSTANCE_NAME_PREFIX)
 from llumnix.queue.queue_type import QueueType
 from llumnix.backends.utils import initialize_placement_group
+from llumnix.utils import get_instance_name

 logger = init_logger(__name__)

From 31da3f52f8d932f27aedc561c8b3f997f62064ac Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Tue, 24 Dec 2024 12:06:52 +0000
Subject: [PATCH 32/92] Fix backends unit test

---
 tests/unit_test/backends/vllm/test_migration.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py
index c0c808cb..b96ee93b 100644
--- a/tests/unit_test/backends/vllm/test_migration.py
+++ b/tests/unit_test/backends/vllm/test_migration.py
@@ -208,6 +208,9 @@ async def test_pd_diaggregation_correctness(ray_env, migration_backend):
     request_output_queue_type = QueueType.RAYQUEUE
     que, server_info = request_output_queue_server(request_output_queue_type)
     asyncio.create_task(que.run_server_loop())
+
+    placement_group_0 = initialize_placement_group(instance_id="0", world_size=1, detached=True)
+    placement_group_1 = initialize_placement_group(instance_id="1", world_size=1, detached=True)

     llumlet_0 = init_llumlet(request_output_queue_type, "0", migration_config, engine_args)
     llumlet_1 = init_llumlet(request_output_queue_type, "1", migration_config, engine_args)

From 8a971787a3a4677e44203a3cf34e8adfc0def2e4 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Tue, 24 Dec 2024 12:08:26 +0000
Subject: [PATCH 33/92] Fix lint

---
 tests/unit_test/backends/vllm/test_migration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py
index b96ee93b..a36f56cc 100644
--- a/tests/unit_test/backends/vllm/test_migration.py
+++ b/tests/unit_test/backends/vllm/test_migration.py
@@ -208,7 +208,7 @@ async def test_pd_diaggregation_correctness(ray_env, migration_backend):
     request_output_queue_type = QueueType.RAYQUEUE
     que, server_info = request_output_queue_server(request_output_queue_type)
     asyncio.create_task(que.run_server_loop())
-    
+

     placement_group_0 = initialize_placement_group(instance_id="0", world_size=1, detached=True)
     placement_group_1 = initialize_placement_group(instance_id="1", world_size=1, detached=True)

From e87ef82aa6a48f88173ff3ff106914f7aa460078 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 25 Dec 2024 09:51:02 +0000
Subject: [PATCH 34/92] Use pg when using simulator

---
 llumnix/llm_engine_manager.py | 2 +-
 llumnix/llumlet/llumlet.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index 309aff43..12392640 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -506,7 +506,7 @@ def init_llumlets(self,
                 llumlet = Llumlet.from_args(
                     request_output_queue_type,
                     instance_id,
-                    backend_type,
+                    BackendType.SIM_VLLM,
                     world_size,
                     engine_manager_args.create_migration_config(),
                     placement_group,
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index 2913dcd5..d656ed82 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -87,7 +87,6 @@ def from_args(cls,
         num_gpus = world_size
         instance_name = get_instance_name(instance_id)
         if backend_type in [backend_type.VLLM, backend_type.BLADELLM]:
-            kwargs["placement_group"] = placement_group
             llumlet_class = ray.remote(num_cpus=1,
                                        num_gpus=num_gpus,
                                        name=instance_name,

From 2aa1f02e6bdcfb349de1441841b7eae2b4805e8f Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 25 Dec 2024 10:43:42 +0000
Subject: [PATCH 35/92] Fix global scheduler unit test

---
 llumnix/llm_engine_manager.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index 12392640..4614b835 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -39,7 +39,6 @@
                            INSTANCE_NAME_PREFIX)
 from llumnix.queue.queue_type import QueueType
 from llumnix.backends.utils import initialize_placement_group
-from llumnix.utils import get_instance_name

 logger = init_logger(__name__)

From 467af6039f5d8560462141df57ccfcf083a2d043 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 25 Dec 2024 10:54:52 +0000
Subject: [PATCH 36/92] Fix offline test

---
 llumnix/llm_engine_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index 4614b835..8268a097 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -505,7 +505,7 @@ def init_llumlets(self,
                 llumlet = Llumlet.from_args(
                     request_output_queue_type,
                     instance_id,
-                    BackendType.SIM_VLLM,
+                    backend_type,
                     world_size,
                     engine_manager_args.create_migration_config(),
                     placement_group,

From 55dc598abae6c9db91d8a483e343b78f304a51e0 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 26 Dec 2024 05:56:54 +0000
Subject: [PATCH 37/92] Refine logger

---
 llumnix/backends/bladellm/llm_engine.py | 2 +-
 llumnix/backends/utils.py | 2 +-
 llumnix/backends/vllm/llm_engine.py | 2 +-
 llumnix/backends/vllm/migration_backend.py | 8 +-
 llumnix/backends/vllm/scheduler.py | 2 +-
 llumnix/backends/vllm/utils.py | 2 +-
 llumnix/backends/vllm/worker.py | 8 +-
 llumnix/entrypoints/bladellm/client.py | 6 +-
 llumnix/entrypoints/bladellm/utils.py | 4 +-
 llumnix/entrypoints/setup.py | 22 +--
 llumnix/entrypoints/vllm/api_server.py | 6 +-
 llumnix/entrypoints/vllm/client.py | 8 +-
 .../global_scheduler/dispatch_scheduler.py | 2 +-
 llumnix/llm_engine_manager.py | 126 +++++++++---------
 llumnix/llumlet/llumlet.py | 7 +-
 15 files changed, 104 insertions(+), 103 deletions(-)

diff --git a/llumnix/backends/bladellm/llm_engine.py b/llumnix/backends/bladellm/llm_engine.py
index 7dff0caf..33988d77 100644
--- a/llumnix/backends/bladellm/llm_engine.py
+++ b/llumnix/backends/bladellm/llm_engine.py
@@ -150,7 +150,7 @@ async def _loop(self):
             await super()._loop()
         # pylint: disable=broad-except
         except Exception as e:
-            logger.error("Error in engine loop: {}".format(e))
+            logger.error("error in engine loop: {}".format(e))
             logger.error("exception traceback: {}".format(traceback.format_exc()))

         previous_state = self.state
diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py
index 090ac103..e9584448 100644
--- a/llumnix/backends/utils.py
+++ b/llumnix/backends/utils.py
@@ -56,7 +56,7 @@ async def put_nowait_to_servers(self,
                 logger.info("server {} is dead".format(server_id))
                 if self.request_output_queue_type == QueueType.ZMQ:
                     logger.info("request output queue ip: {}, port: {}".format(server_info.request_output_queue_ip,
-                                                                               server_info.request_output_queue_port))
+                                                                                server_info.request_output_queue_port))
                 req_outputs = list(server_request_outputs.values())[idx]
                 request_ids = [req_output.request_id for req_output in req_outputs]
                 self.engine_actor_handle.abort_request.remote(request_ids)
diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py
index d580696f..921c7f5b 100644
--- a/llumnix/backends/vllm/llm_engine.py
+++ b/llumnix/backends/vllm/llm_engine.py
@@ -279,7 +279,7 @@ async def _start_engine_step_loop(self) -> None:
                     await asyncio.sleep(NO_OUTPUTS_STEP_INTERVAL)
         # pylint: disable=broad-except
         except Exception as e:
-            logger.error("Error in engine loop: {}".format(e))
+            logger.error("error in engine loop: {}".format(e))
             logger.error("exception traceback: {}".format(traceback.format_exc()))
             self._run_workers("shutdown")
diff --git a/llumnix/backends/vllm/migration_backend.py b/llumnix/backends/vllm/migration_backend.py
index 39978b6e..368b8b49 100644
--- a/llumnix/backends/vllm/migration_backend.py
+++ b/llumnix/backends/vllm/migration_backend.py
@@ -57,7 +57,7 @@ def __init__(self, migration_config: MigrationConfig, cache_engine: CacheEngine,
             self.rpc_dtype = self.cache_engine.dtype
         else:
             self.rpc_dtype = torch.float32
-            logger.warning("Detect numpy unsupported dtype: {}. Using torch.float32.".format(self.cache_engine.dtype))
+            logger.warning("Detect numpy unsupported dtype: {}, using torch.float32.".format(self.cache_engine.dtype))

         self.is_driver_worker = is_driver_worker
         self.gpu_cache = gpu_cache
@@ -189,7 +189,7 @@ def init_group(world_size, rank, backend, group_name):
         try:
             init_group(world_size, rank, self.backend, group_name)
         except FunctionTimedOut:
-            logger.info("create migration backend fail (group_name: {}, world_size: {}, rank: {}, backbend: {})."
+            logger.info("create migration backend failed (group_name: {}, world_size: {}, rank: {}, backbend: {})."
                         .format(group_name, world_size, rank, self.backend))
             return False

@@ -227,7 +227,7 @@ def warmup(self) -> bool:
                 col.allreduce(self.dummy_cache[0], self.group_name)
             # pylint: disable=W0703
             except Exception as e:
-                logger.info("warmup migration backend failed (group_name: {}, world_size: {}, rank: {}, backbend: {}), err: {}."
+                logger.error("warmup migration backend failed (group_name: {}, world_size: {}, rank: {}, backbend: {}), err: {}."
                              .format(self.group_name, self.global_world_size, self.global_rank, self.backend, e))
                 return False

@@ -276,7 +276,7 @@ def do_recv(self, src_handle, blocks: List[int]):
         self.migration_stream.synchronize()

 def get_migration_backend(migration_config: MigrationConfig, cache_engine: CacheEngine, worker_handle_list, scheduling_strategy,
-                        is_driver_worker, gpu_cache, worker_rank, local_rank) -> MigrationBackendBase:
+                          is_driver_worker, gpu_cache, worker_rank, local_rank) -> MigrationBackendBase:
     if cache_engine.num_gpu_blocks < migration_config.migration_buffer_blocks:
         logger.warning("migration_buffer_blocks({}) is larger than num_gpu_blocks({}), reducing it to num_gpu_blocks."
                        .format(migration_config.migration_buffer_blocks, cache_engine.num_gpu_blocks))
diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py
index ea0991f7..874b5e1e 100644
--- a/llumnix/backends/vllm/scheduler.py
+++ b/llumnix/backends/vllm/scheduler.py
@@ -195,7 +195,7 @@ def free_dst_pre_alloc_cache(self, request_id: str = None) -> None:

     def free_src_request(self, backend_request: SequenceGroupLlumnix) -> None:
         seq = backend_request.get_seqs()[0]
-        logger.info("free request: {}, free seq: {}".format(backend_request.request_id, seq.seq_id))
+        logger.info("free request: {}, seq: {}".format(backend_request.request_id, seq.seq_id))
         self.free_seq(seq)

     def _get_instance_info(self, scheduled_seq_groups: List[SequenceGroupLlumnix]) -> InstanceInfo:
diff --git a/llumnix/backends/vllm/utils.py b/llumnix/backends/vllm/utils.py
index 7e49720a..6f113e2e 100644
--- a/llumnix/backends/vllm/utils.py
+++ b/llumnix/backends/vllm/utils.py
@@ -48,7 +48,7 @@ def check_engine_args(engine_args: AsyncEngineArgs, engine_manager_args: EngineM
     engine_config = engine_args.create_engine_config()
     parallel_config = engine_config.parallel_config
     if parallel_config.world_size > 1 and migration_config.migration_backend == 'nccl':
-        logger.info("Llumnix does not support TP or PP enabled model when the migration backend is nccl, change migration backend to gloo.")
+        logger.warning("Llumnix does not support TP or PP when the migration backend is nccl, change migration backend to gloo.")
         engine_manager_args.migration_backend = 'gloo'
     detect_unsupported_feature(engine_args)
diff --git a/llumnix/backends/vllm/worker.py b/llumnix/backends/vllm/worker.py
index d18c993f..8479d12f 100644
--- a/llumnix/backends/vllm/worker.py
+++ b/llumnix/backends/vllm/worker.py
@@ -60,16 +60,16 @@ def reserve_memory_for_migration(self, migration_config: MigrationConfig, model_
         if migration_config.migration_backend == "nccl" and parallel_config.world_size == 1:
             device = torch.device(f"cuda:{self.local_rank}")
             _, total_memory = torch.cuda.mem_get_info(device)
-            migrate_ratio = math.ceil(dummy_cache_size / total_memory * 10000) / 10000
-            cache_config.gpu_memory_utilization -= migrate_ratio
+            migration_memory_ratio = math.ceil(dummy_cache_size / total_memory * 10000) / 10000
+            cache_config.gpu_memory_utilization -= migration_memory_ratio
             if cache_config.gpu_memory_utilization <= 0:
                 raise ValueError("Nccl migration backend take {:.4f} gpu memory, which is greater than gpu_memory_utilization {:.4f}. "
                                  "try to increase gpu-memory-utilization or reduce migration-cache-blocks."
-                                 .format(migrate_ratio, cache_config.gpu_memory_utilization))
+                                 .format(migration_memory_ratio, cache_config.gpu_memory_utilization))

             logger.info("nccl migration backend take {:.4f} gpu memory, left gpu_memory_utilization {:.4f} for kv cache."
-                        .format(migrate_ratio, cache_config.gpu_memory_utilization))
+                        .format(migration_memory_ratio, cache_config.gpu_memory_utilization))

         return dummy_cache_size
diff --git a/llumnix/entrypoints/bladellm/client.py b/llumnix/entrypoints/bladellm/client.py
index 67a40af6..729b3874 100644
--- a/llumnix/entrypoints/bladellm/client.py
+++ b/llumnix/entrypoints/bladellm/client.py
@@ -35,6 +35,9 @@

 WAIT_MANAGER_INTERVAL = 5

+# TODO(KuilongCui): Update LlumnixCient of BladeLLM.
+
+
 class LlumnixClientBladeLLM(MultiProcessingLLMClient):
     def __init__(self, args: ServingArgs, llumnix_context: LlumnixEntrypointsContext, loop: asyncio.AbstractEventLoop):
         super().__init__(args, -1)
@@ -56,7 +59,6 @@ async def background_process_outputs(self):
                 continue
             await self.request_streams[request_id].put(request_output)
             if request_output.is_finished:
-                logger.info("Client Recv: {}".format(request_output))
                 del self.entrypoint_id2llumnix_id[self.llumnix_id2entrypoint_id[request_id]]
                 del self.llumnix_id2entrypoint_id[request_id]
                 del self.request_streams[request_id]
@@ -110,7 +112,7 @@ async def _manager_generate(self, request, request_id: str) -> LLMResponse:
                 return await asyncio.create_task(self._manager_generate(request, request_id))
             except (ray.exceptions.RayActorError, KeyError):
                 if instance_id in self.llumnix_context.instances:
-                    logger.info("[manager_generate] instance {} is dead".format(instance_id))
+                    logger.info("[_manager_generate] instance {} is dead".format(instance_id))
                     del self.llumnix_context.instances[instance_id]
                     del self.llumnix_context.instance_num_requests[instance_id]
                 return await asyncio.create_task(self._manager_generate(request, request_id))
diff --git a/llumnix/entrypoints/bladellm/utils.py b/llumnix/entrypoints/bladellm/utils.py
index 3b9f8d14..2a6359f8 100644
--- a/llumnix/entrypoints/bladellm/utils.py
+++ b/llumnix/entrypoints/bladellm/utils.py
@@ -35,8 +35,8 @@ def check_engine_args(engine_args: ServingArgs, engine_manager_args: EngineManag
     migration_config = engine_manager_args.create_migration_config()
     if (engine_args.tensor_parallel_size > 1 or engine_args.tensor_parallel_size > 1) and \
         migration_config.migration_backend == 'nccl':
-        logger.info("Llumnix does not support TP or PP enabled model when the migration backend is nccl, \
-                    change migration backend to gloo.")
+        logger.warning("Llumnix does not support TP or PP when the migration backend is nccl, \
+                    change migration backend to gloo.")
         engine_manager_args.migration_backend = 'gloo'
     detect_unsupported_feature(engine_args)
diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py
index 378f1205..2232a019 100644
--- a/llumnix/entrypoints/setup.py
+++ b/llumnix/entrypoints/setup.py
@@ -66,11 +66,11 @@ def launch_ray_cluster(port: int) -> subprocess.CompletedProcess:
         # Stop the existing ray processes on the node first.
         subprocess.run(['ray', 'stop'], check=True, text=True, capture_output=True)
     except subprocess.CalledProcessError as e:
-        logger.info("'ray stop' failed with: \n{}".format(e.stderr))
+        logger.error("'ray stop' failed with: \n{}".format(e.stderr))
         sys.exit(1)
     # Need to specify the head node ip through environment variable currently.
     if head_node_ip is None:
-        logger.info("Environment variable 'HEAD_NODE_IP' should be set for ray cluster launch.")
+        logger.error("Environment variable 'HEAD_NODE_IP' should be set for ray cluster launch.")
         sys.exit(1)
     ray_start_command = None
     if 'HEAD_NODE' in os.environ:
@@ -78,7 +78,7 @@ def launch_ray_cluster(port: int) -> subprocess.CompletedProcess:
         try:
             result = subprocess.run(['ray', 'start', '--head', f'--port={port}'], check=True, text=True, capture_output=True)
         except subprocess.CalledProcessError as e:
-            logger.info("'{}' failed with: \n{}".format(ray_start_command, e.stderr))
+            logger.error("'{}' failed with: \n{}".format(ray_start_command, e.stderr))
             sys.exit(1)
     else:
         ray_start_command = f"ray start --address={head_node_ip}:{port} --node-ip-address={node_ip_address}"
@@ -89,10 +89,10 @@ def launch_ray_cluster(port: int) -> subprocess.CompletedProcess:
                 break
             except subprocess.CalledProcessError as e:
                 if attempt < MAX_RESTARTS:
-                    print("Execute '{}' repeatedly until the head node starts...".format(ray_start_command))
+                    logger.warning("execute '{}' repeatedly until the head node starts".format(ray_start_command))
                     time.sleep(RESTART_INTERVALS)
                 else:
-                    logger.info("'{}' failed after {} attempts with: \n{}".format(ray_start_command, attempt, e.stderr))
+                    logger.error("'{}' failed after {} attempts with: \n{}".format(ray_start_command, attempt, e.stderr))
                     sys.exit(1)
     logger.info("'{}' succeeed with: \n{}".format(ray_start_command, result.stdout))
     return result
@@ -120,10 +120,10 @@ def retry_manager_method_sync(ray_call, method_name, *args, **kwargs):
             break
         except ray.exceptions.RayActorError:
             if attempt < MAX_TASK_RETRIES - 1:
-                logger.info("Manager is unavailable, sleep {}s, and retry {} again...".format(RETRIES_INTERVALS, method_name))
+                logger.warning("manager is unavailable, sleep {}s, and retry {} again".format(RETRIES_INTERVALS, method_name))
                 time.sleep(RETRIES_INTERVALS)
             else:
-                logger.info("After {} times retries, manager is still unavailable".format(MAX_TASK_RETRIES))
+                logger.error("manager is still unavailable after {} times retries".format(MAX_TASK_RETRIES))
                 raise
     return ret
@@ -134,10 +134,10 @@ async def retry_manager_method_async(ray_call, method_name, *args, **kwargs):
             break
         except ray.exceptions.RayActorError:
             if attempt < MAX_TASK_RETRIES - 1:
-                logger.info("Manager is unavailable, sleep {}s, and retry {} again...".format(RETRIES_INTERVALS, method_name))
+                logger.warning("manager is unavailable, sleep {}s, and retry {} again".format(RETRIES_INTERVALS, method_name))
                 await asyncio.sleep(RETRIES_INTERVALS)
             else:
-                logger.info("After {} times retries, manager is still unavailable".format(MAX_TASK_RETRIES))
+                logger.error("manager is still unavailable after {} times retries".format(MAX_TASK_RETRIES))
                 raise
     return ret
@@ -145,10 +145,10 @@ def init_manager(engine_manager_args: EngineManagerArgs) -> LLMEngineManager:
     # Only one instance create the manager actor, the other instances get the existing manager actor through ray.
     try:
         manager = LLMEngineManager.from_args(engine_manager_args, None)
-        logger.info("Init LLMEngineManager on current node")
+        logger.info("Init LLMEngineManager on current node.")
     except ValueError:
         manager = ray.get_actor(MANAGER_ACTOR_NAME, namespace='llumnix')
-        logger.info("Get existing LLMEngineManager")
+        logger.info("Get existing LLMEngineManager.")
     return manager

 def init_llumnix_components(engine_manager_args: EngineManagerArgs,
diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py
index 680211b0..894b4d06 100644
--- a/llumnix/entrypoints/vllm/api_server.py
+++ b/llumnix/entrypoints/vllm/api_server.py
@@ -147,8 +147,8 @@ async def generate_benchmark(request: Request) -> Response:

     if llumnix_client.log_requests:
         llumnix_client.num_finished_requests += 1
-        logger.info("entrypoints finished request {}.".format(request_id))
-        logger.info("num_finished_requests {}.".format(llumnix_client.num_finished_requests))
+        logger.info("entrypoints finished request {}".format(request_id))
+        logger.info("num_finished_requests {}".format(llumnix_client.num_finished_requests))

     generation = final_output.outputs[0].text
     num_output_tokens = len(final_output.outputs[0].token_ids)
@@ -195,7 +195,7 @@ async def is_ready() -> bool:
     llumnix_client = LlumnixClientVLLM(llumnix_entrypoints_context)

     # Start the api server after all the components of llumnix are ready.
-    logger.info("Start Api Server on '{}:{}'".format(cfg.SERVER.HOST, cfg.SERVER.PORT))
+    logger.info("Start api server on '{}:{}'.".format(cfg.SERVER.HOST, cfg.SERVER.PORT))
     uvicorn.run(app,
                 host=cfg.SERVER.HOST,
                 port=cfg.SERVER.PORT,
diff --git a/llumnix/entrypoints/vllm/client.py b/llumnix/entrypoints/vllm/client.py
index d72cfc79..68ffa9d9 100644
--- a/llumnix/entrypoints/vllm/client.py
+++ b/llumnix/entrypoints/vllm/client.py
@@ -42,6 +42,8 @@ async def generate(self,
                        **kwargs) -> AsyncStream:
         if sampling_params.n > 1 or sampling_params.use_beam_search:
             raise ValueError("Unsupported feature: multiple sequence decoding")
+
+        logger.info("[generate] entrypoints received request {}".format(request_id))

         results_generator = AsyncStream(request_id)
         self.request_streams[request_id] = results_generator
@@ -85,10 +87,10 @@ async def _generate_by_instance(self,
                     instance_id = min(self.instance_num_requests, key=self.instance_num_requests.get)
                     self.instance_num_requests[instance_id] += 1
                     await self.instances[instance_id].generate.remote(request_id, server_info, prompt, sampling_params, *args, **kwargs)
-                    logger.info("LLMEngineManager is unavailable temporarily, dispatch request {} to instance {}".format(
+                    logger.warning("LLMEngineManager is unavailable temporarily, dispatch request {} to instance {}".format(
                         request_id, instance_id))
                 else:
-                    logger.info("LLMEngineManager is unavailable temporarily, but there is no instance behind this api server, "
+                    logger.warning("LLMEngineManager is unavailable temporarily, but there is no instance behind this api server, "
                         "sleep {}s, waiting for manager available".format(WAIT_MANAGER_INTERVAL))
                     await asyncio.sleep(WAIT_MANAGER_INTERVAL)
                 return await asyncio.create_task(self.generate(prompt, sampling_params, request_id, *args, **kwargs))
@@ -110,7 +112,7 @@ async def abort(self, request_id: str) -> None:
             logger.info("abort request: {}.".format(request_id))
             await self.manager.abort.remote(request_id)
         except ray.exceptions.RayActorError:
-            logger.info("manager is unavailable")
+            logger.warning("manager is unavailable")

     async def is_ready(self) -> bool:
         ready_status = await self.manager.is_ready.remote()
diff --git a/llumnix/global_scheduler/dispatch_scheduler.py b/llumnix/global_scheduler/dispatch_scheduler.py
index 059a4a1e..7adf0008 100644
--- a/llumnix/global_scheduler/dispatch_scheduler.py
+++ b/llumnix/global_scheduler/dispatch_scheduler.py
@@ -49,7 +49,7 @@ def dispatch(self) -> str:
         if self.num_requests % 100 == 0:
             logger.info("self.num_requests: {}".format(self.num_requests))
             for instance_id, num_requests in self.instance_num_requests.items():
-                logger.info("Instance {} num_dispatched_requests: {}".format(instance_id, num_requests))
+                logger.info("instance {} num_dispatched_requests: {}".format(instance_id, num_requests))
         return dispatch_instance_id

     def update_instance_infos(self,
diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index 8268a097..f3fb0548 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -44,7 +44,7 @@

 MANAGER_ACTOR_NAME = 'manager'
 CLEAR_REQUEST_INSTANCE_INTERVAL = 3600
-NO_INSTANCE_RETRY_INTERVAL = 5.0
+NO_INSTANCE_RETRY_INTERVAL = 1.0
 WAIT_ALL_MIGRATIONS_DONE_INTERVAL = 1.0

 # TODO(s5u13b): Fix the logger when manager failover.
@@ -111,8 +111,8 @@ def __init__(self,

     async def generate(self, request_id: str, server_info: ServerInfo, *args, **kwargs,) -> None:
         while self.num_instances == 0:
-            logger.info("[generate] no instance available temporarily, sleep {}s, "
-                        "and retry generate request {} again".format(NO_INSTANCE_RETRY_INTERVAL, request_id))
+            logger.warning("[generate] no instance available temporarily, sleep {}s, "
+                           "and regenerate request {}".format(NO_INSTANCE_RETRY_INTERVAL, request_id))
             await asyncio.sleep(NO_INSTANCE_RETRY_INTERVAL)
         instance_id, request_expected_steps = self.global_scheduler.dispatch()
@@ -121,7 +121,7 @@ async def generate(self, request_id: str, server_info: ServerInfo, *args, **kwar
                 server_info.request_timestamps.manager_generate_timestamp = time.time()
             await self.instances[instance_id].generate.remote(request_id, server_info, request_expected_steps, *args, **kwargs)
             if self.log_requests:
-                logger.info("[generate] received request {}.".format(request_id))
+                logger.info("[generate] manager received request {}".format(request_id))
                 logger.info("[generate] dispath request {} to instance {}".format(request_id, instance_id))
             self.request_instance[request_id] = instance_id
         except (ray.exceptions.RayActorError, KeyError):
@@ -159,30 +159,6 @@ def abort_done_callback(instance_id: str, request_ids: List[str], fut):
             tasks.append(task)
         await asyncio.gather(*tasks, return_exceptions=True)

-    async def _get_request_instance(self) -> None:
-        def get_request_instance_done_callback(instance_id: str, fut):
-            ret = fut.result()[0]
-            if not isinstance(ret, ray.exceptions.RayActorError):
-                instance_requests.append(ret)
-                instance_ids.append(instance_id)
-            else:
-                logger.info("[_get_request_instance] instance {} is dead".format(instance_id))
-                self.scale_down(instance_id)
-
-        instance_requests = []
-        instance_ids = []
-        tasks = []
-        for instance_id, instance_actor_handle in self.instances.items():
-            task = asyncio.gather(instance_actor_handle.get_instance_info.remote(), return_exceptions=True)
-            task.add_done_callback(partial(get_request_instance_done_callback, instance_id))
-            tasks.append(task)
-        await asyncio.gather(*tasks, return_exceptions=True)
-        logger.info("[_get_request_instance] instance_ids: {}".format(instance_ids))
-        logger.info("[_get_request_instance] instance_requests: {}".format(instance_requests))
-        for (instance_id, requests) in zip(instance_ids, instance_requests):
-            for request_id in requests:
-                self.request_instance[request_id] = instance_id
-
     async def _update_instance_info_loop(self, interval: float) -> None:
         def update_instance_info_done_callback(instance_id: str, fut):
             ret = fut.result()[0]
@@ -191,6 +167,7 @@ def update_instance_info_done_callback(instance_id: str, fut):
                 instance_infos.append(ret)
                 self.global_scheduler.update_instance_infos([ret])
             else:
+                logger.info("[_update_instance_info_loop] instance {} is dead".format(instance_id))
                 self.scale_down(instance_id)
             logger.info("[_update_instance_info_loop] dead instances: {}.".format(ret))
             logger.info("[_update_instance_info_loop] dead instances: {}.".format(self.instances))
@@ -218,13 +195,6 @@ def update_instance_info_done_callback(instance_id: str, fut):
             logger.error("[_update_instance_info_loop] unexpected exception occurs: {}".format(e))
             logger.error("[_update_instance_info_loop] exception traceback: {}".format(traceback.format_exc()))

-    async def _clear_request_instance_loop(self, interval: float):
-        await self._get_request_instance()
-        # Clear the request_instance at a certain interval to prevent memory leaking.
-        while True:
-            await asyncio.sleep(interval)
-            self.request_instance = {}
-
     async def _push_migrations(self) -> None:
         if self.enable_pd_disagg:
             asyncio.create_task(self._migrate(PairMigrationConstraints.PREFILL_2_DECODING))
@@ -287,7 +257,7 @@ def migrate_done_callback_wrapper(migrate_instance_pair: Tuple[str, str], fut) -
             logger.error("[_migrate] unexpected exception occurs: {}".format(e))
             logger.error("[_migrate] exception traceback: {}".format(traceback.format_exc()))

-    async def rebuild_migrate_backend(self) -> None:
+    async def _rebuild_migrate_backend(self) -> None:
         # Wait for all instances to finish migration
         while any(self.instance_migrating.values()):
             await asyncio.sleep(WAIT_ALL_MIGRATIONS_DONE_INTERVAL)
@@ -342,7 +312,7 @@ async def run_task(alive_instances: List[str], task_name: str, *args, **kwargs):
                 src_filter=lambda instance_info: instance_info.instance_id in alive_instances,
                 dst_filter=lambda instance_info: instance_info.instance_id in alive_instances)

-        logger.info("[rebuild_migrate_backend] rebuild {} migrate backend done, group_name: {}, alive instance ({}): {}"
+        logger.info("[rebuild_migrate_backend] rebuild {} migration backend done, group_name: {}, alive instance ({}): {}"
                     .format(self.engine_manager_args.migration_backend, group_name, len(alive_instances), alive_instances))

         # Restore migrate config
@@ -373,7 +343,7 @@ def scale_up(self, instance_id: Union[str, Iterable[str]], llumlet_actor_handles
         # for RPC, the Ray actor handle is used for the migration cache, so there is no need to rebuild the group.
         if self.enable_migration and self.engine_manager_args.migration_backend in ['gloo', 'nccl'] \
             and indeed_update and no_pending_instance:
-            asyncio.create_task(self.rebuild_migrate_backend())
+            asyncio.create_task(self._rebuild_migrate_backend())

         return self.num_instances
@@ -413,7 +383,7 @@ def scale_down(self, instance_id: Union[str, Iterable[str]], rebuild_migrate_bac
             if self.engine_manager_args.migration_backend == 'gloo':
                 clear_gloo_backend_state()
         elif indeed_update and no_pending_instance and rebuild_migrate_backend:
-            asyncio.create_task(self.rebuild_migrate_backend())
+            asyncio.create_task(self._rebuild_migrate_backend())

         return self.num_instances
@@ -425,8 +395,7 @@ def connect_to_instances_done_callback(instance_id: str, instance_actor_handle:
                 scale_up_instance_actor_handles.append(instance_actor_handle)
                 logger.info("[_connect_to_instances] connect to instance {}.".format(instance_id))
             else:
-                logger.info("[_connect_to_instances] connect to instance {} abort, "
-                            "which may be not ready or alive, err: {}".format(instance_id, e))
+                logger.warning("[_connect_to_instances] connect to instance {} failed, exception: {}".format(instance_id, ret))

         # Must set True despite set namespance to llumnix.
         actor_names_dict = ray.util.list_named_actors(all_namespaces=True)
@@ -446,26 +415,6 @@ def connect_to_instances_done_callback(instance_id: str, instance_actor_handle:
         # The only function that can add instance actor handles to manager.
         self.scale_up(scale_up_instance_ids, scale_up_instance_actor_handles)

-    async def _check_instance_error(self, migrate_instance_pairs: Tuple[str, str]) -> List[bool]:
-        def check_instance_error_done_callback(idx: int, instance_id: str, fut):
-            ret = fut.result()[0]
-            if not isinstance(ret, (ray.exceptions.RayActorError, KeyError)):
-                logger.info("[_check_instance_error] instance {} is alive".format(instance_id))
-                results[idx] = False
-            else:
-                logger.info("[_check_instance_error] instance {} is dead".format(instance_id))
-                results[idx] = True
-
-        results = [None, None]
-        tasks = []
-        for idx, instance_id in enumerate(migrate_instance_pairs):
-            task = asyncio.gather(self.instances[instance_id].is_ready.remote(), return_exceptions=True)
-            task.add_done_callback(partial(check_instance_error_done_callback, idx, instance_id))
-            tasks.append(task)
-        await asyncio.gather(*tasks, return_exceptions=True)
-
-        return results
-
     @classmethod
     def from_args(cls,
                   engine_manager_args: EngineManagerArgs,
@@ -518,7 +467,6 @@ def init_llumlets(self,
             else:
                 assert backend_type == backend_type.VLLM, f'unimplemented backend SIM_{backend_type}'
                 # num_cpus=1, for Llumlet + AsyncPutQueueActor
-                logger.info("[init_llumlets] use simulator backend")
                 placement_group = initialize_placement_group(instance_id, num_cpus=2, num_gpus=0, detached=True)
                 llumlet = Llumlet.from_args(
                     request_output_queue_type,
@@ -537,15 +485,63 @@ def init_llumlets(self,

         return instance_ids, llumlets

-    def get_actor_name(self) -> str:
-        return self.actor_name
-
     async def is_ready(self) -> bool:
         """Called by api server, return true when all the instances have been successfully created."""
         tasks = [llumlet.is_ready.remote() for llumlet in self.instances.values()]
         is_ready_list = await asyncio.gather(*tasks)
         return all(is_ready_list)

+    async def _check_instance_error(self, migrate_instance_pairs: Tuple[str, str]) -> List[bool]:
+        def check_instance_error_done_callback(idx: int, instance_id: str, fut):
+            ret = fut.result()[0]
+            if not isinstance(ret, (ray.exceptions.RayActorError, KeyError)):
+                logger.info("[_check_instance_error] instance {} is alive".format(instance_id))
+                results[idx] = False
+            else:
+                logger.info("[_check_instance_error] instance {} is dead".format(instance_id))
+                results[idx] = True
+
+        results = [None, None]
+        tasks = []
+        for idx, instance_id in enumerate(migrate_instance_pairs):
+            task = asyncio.gather(self.instances[instance_id].is_ready.remote(), return_exceptions=True)
+            task.add_done_callback(partial(check_instance_error_done_callback, idx, instance_id))
+            tasks.append(task)
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+        return results
+
+    async def _get_request_instance(self) -> None:
+        def get_request_instance_done_callback(instance_id: str, fut):
+            ret = fut.result()[0]
+            if not isinstance(ret, ray.exceptions.RayActorError):
+                instance_requests.append(ret)
+                instance_ids.append(instance_id)
+            else:
+                logger.info("[_get_request_instance] instance {} is dead".format(instance_id))
+                self.scale_down(instance_id)
+
+        instance_requests = []
+        instance_ids = []
+        tasks = []
+        for instance_id, instance_actor_handle in self.instances.items():
+            task = asyncio.gather(instance_actor_handle.get_instance_info.remote(), return_exceptions=True)
+            task.add_done_callback(partial(get_request_instance_done_callback, instance_id))
+            tasks.append(task)
+        await asyncio.gather(*tasks, return_exceptions=True)
+        logger.debug("[_get_request_instance] instance_ids: {}".format(instance_ids))
+        logger.debug("[_get_request_instance] instance_requests: {}".format(instance_requests))
+        for (instance_id, requests) in zip(instance_ids, instance_requests):
+            for request_id in requests:
+                self.request_instance[request_id] = instance_id
+
+    async def _clear_request_instance_loop(self, interval: float):
+        await self._get_request_instance()
+        # Clear the request_instance at a certain interval to prevent memory leaking.
+        while True:
+            await asyncio.sleep(interval)
+            self.request_instance = {}
+
     def _init_instance_info_csv(self, engine_manager_args: EngineManagerArgs) -> None:
         # pylint: disable=consider-using-with
         self.instance_info_file = open(engine_manager_args.log_filename + '_instance.csv', 'w', encoding='utf-8')
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index d656ed82..3f96559a 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -47,6 +47,7 @@ def __init__(self,
                  *args,
                  **kwargs) -> None:
         try:
+            logger.info("Llumlet backend type: {}".format(backend_type))
             self.instance_id = instance_id
             self.instance_name = get_instance_name(instance_id)
             self.backend_engine: BackendInterface = init_backend_engine(self.instance_id,
@@ -66,7 +67,7 @@ def __init__(self,
             asyncio.create_task(self._check_engine_state_loop())
         # pylint: disable=broad-except
         except Exception as e:
-            logger.error("Failed to initialize llumlet: {}".format(e))
+            logger.error("failed to initialize Llumlet: {}".format(e))
             logger.error("exception traceback: {}".format(traceback.format_exc()))
@@ -120,7 +121,7 @@ def from_args(cls,
                 **kwargs)
         # pylint: disable=broad-except
         except Exception as e:
-            logger.error("Failed to initialize llumlet: {}".format(e))
+            logger.error("failed to initialize Llumlet: {}".format(e))
             logger.error("exception traceback: {}".format(traceback.format_exc()))

         return llumlet
@@ -129,7 +130,7 @@ async def _check_engine_state_loop(self):
         while True:
             await asyncio.sleep(CHECK_ENGINE_STATE_INTERVAL)
             if self.backend_engine.state == EngineState.CRASHED:
-                logger.warning("llumlet ({}) detected backend engine crashed. Stopping...".format(self.instance_id))
+                logger.error("Llumlet ({}) detected backend engine crashed. Stopping...".format(self.instance_id))
                 # pylint: disable=protected-access
                 self.backend_engine._stop_event.set()
                 await asyncio.sleep(0)

From 8a7feeb6b24ffbd77eb19e726a0c63c8e0a85939 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 26 Dec 2024 05:57:32 +0000
Subject: [PATCH 38/92] Fix lint

---
 llumnix/entrypoints/vllm/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llumnix/entrypoints/vllm/client.py b/llumnix/entrypoints/vllm/client.py
index 68ffa9d9..cbe2ffb6 100644
--- a/llumnix/entrypoints/vllm/client.py
+++ b/llumnix/entrypoints/vllm/client.py
@@ -42,7 +42,7 @@ async def generate(self,
                        **kwargs) -> AsyncStream:
         if sampling_params.n > 1 or sampling_params.use_beam_search:
             raise ValueError("Unsupported feature: multiple sequence decoding")
-        
+

         logger.info("[generate] entrypoints received request {}".format(request_id))

         results_generator = AsyncStream(request_id)

From 85275c22844ee3d29eac5ee4abf62c7b78c654a6 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 26 Dec 2024 06:33:43 +0000
Subject: [PATCH 39/92] Move MANAGER_NAME to utils

---
 llumnix/entrypoints/setup.py | 6 +++---
 llumnix/llm_engine_manager.py | 8 ++++----
 llumnix/utils.py | 1 +
 tests/unit_test/entrypoints/test_utils.py | 4 ++--
 tests/unit_test/entrypoints/vllm/api_server_manager.py | 5 ++---
 .../entrypoints/vllm/api_server_manager_service.py | 6 +++---
 .../unit_test/global_scheduler/test_llm_engine_manager.py | 8 ++++----
 7 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py
index 2232a019..8e17f373 100644
--- a/llumnix/entrypoints/setup.py
+++ b/llumnix/entrypoints/setup.py
@@ -20,10 +20,10 @@
 import socket
 import ray

-from llumnix.llm_engine_manager import LLMEngineManager, MANAGER_ACTOR_NAME
+from llumnix.llm_engine_manager import LLMEngineManager
 from llumnix.llumlet.llumlet import Llumlet
 from llumnix.logger import init_logger
-from llumnix.utils import random_uuid
+from llumnix.utils import random_uuid, MANAGER_NAME
 from llumnix.arg_utils import EngineManagerArgs
 from llumnix.queue.queue_type import QueueType
 from llumnix.server_info import ServerInfo, RequestTimestamps
@@ -147,7 +147,7 @@ def init_manager(engine_manager_args: EngineManagerArgs) -> LLMEngineManager:
         manager = LLMEngineManager.from_args(engine_manager_args, None)
         logger.info("Init LLMEngineManager on current node.")
     except ValueError:
-        manager = ray.get_actor(MANAGER_ACTOR_NAME, namespace='llumnix')
+        manager = ray.get_actor(MANAGER_NAME, namespace='llumnix')
         logger.info("Get existing LLMEngineManager.")
     return manager

diff --git a/llumnix/llm_engine_manager.py b/llumnix/llm_engine_manager.py
index f3fb0548..b01c86ea 100644
--- a/llumnix/llm_engine_manager.py
+++ b/llumnix/llm_engine_manager.py
@@ -36,13 +36,13 @@
                            clear_gloo_backend_state,
                            remove_placement_group,
                            get_instance_name,
-                           INSTANCE_NAME_PREFIX)
+                           INSTANCE_NAME_PREFIX,
+                           MANAGER_NAME)
 from llumnix.queue.queue_type import QueueType
 from llumnix.backends.utils import initialize_placement_group

 logger = init_logger(__name__)

-MANAGER_ACTOR_NAME = 'manager'
 CLEAR_REQUEST_INSTANCE_INTERVAL = 3600
 NO_INSTANCE_RETRY_INTERVAL = 1.0
 WAIT_ALL_MIGRATIONS_DONE_INTERVAL = 1.0
@@ -59,7 +59,7 @@ def __init__(self,
                  log_requests: bool = True,
                  profiling_database: ProfilingDatabase = None) -> None:
         os.chdir(work_dir)
-        self.actor_name = MANAGER_ACTOR_NAME
+        self.actor_name = MANAGER_NAME
         self.engine_manager_args = engine_manager_args
         self.profiling_database = profiling_database
@@ -422,7 +422,7 @@ def from_args(cls,
         global_scheduler_config = engine_manager_args.create_global_scheduler_configs()
         manager_class = ray.remote(num_cpus=0,
                                    max_restarts=-1,
-                                   name=MANAGER_ACTOR_NAME,
+                                   name=MANAGER_NAME,
                                    namespace='llumnix',
                                    lifetime="detached"
                                    )(cls)
diff --git a/llumnix/utils.py b/llumnix/utils.py
index 301fb565..932c49d7 100644
--- a/llumnix/utils.py
+++ b/llumnix/utils.py
@@ -14,6 +14,7 @@
 import uuid
 import ray

+MANAGER_NAME = "manager"
 PLACEMENT_GROUP_NAME_PREFIX = "pg_"
 SERVER_NAME_PREFIX = "server_"
 INSTANCE_NAME_PREFIX = "instance_"
diff --git a/tests/unit_test/entrypoints/test_utils.py b/tests/unit_test/entrypoints/test_utils.py
index 9705cb57..1cfe7b04 100644
--- a/tests/unit_test/entrypoints/test_utils.py
+++ b/tests/unit_test/entrypoints/test_utils.py
@@ -21,8 +21,8 @@
                                       init_manager,
                                       retry_manager_method_sync,
                                       retry_manager_method_async)
-from llumnix.llm_engine_manager import MANAGER_ACTOR_NAME
 from llumnix.queue.utils import init_request_output_queue_server
+from llumnix.utils import MANAGER_NAME

 # pylint: disable=unused-import
 from tests.conftest import ray_env
@@ -39,7 +39,7 @@ def test_init_manager(ray_env):
     engine_manager_args = EngineManagerArgs()
     manager = init_manager(engine_manager_args)
     assert manager is not None
-    manager_actor_handle = ray.get_actor(MANAGER_ACTOR_NAME, namespace='llumnix')
+    manager_actor_handle = ray.get_actor(MANAGER_NAME, namespace='llumnix')
     assert manager_actor_handle is not None
     assert manager == manager_actor_handle

diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager.py b/tests/unit_test/entrypoints/vllm/api_server_manager.py
index bafbd599..da149d25 100644
--- a/tests/unit_test/entrypoints/vllm/api_server_manager.py
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager.py
@@ -22,14 +22,13 @@
 import llumnix.llm_engine_manager
 from llumnix.arg_utils import EngineManagerArgs
 from llumnix.server_info import ServerInfo, RequestTimestamps
-from llumnix.utils import random_uuid
+from llumnix.utils import random_uuid, MANAGER_NAME
 from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType
 from llumnix.entrypoints.setup import LlumnixEntrypointsContext
 from llumnix.entrypoints.vllm.client import LlumnixClientVLLM

 app = llumnix.entrypoints.vllm.api_server.app
 manager = None
-MANAGER_ACTOR_NAME = llumnix.llm_engine_manager.MANAGER_ACTOR_NAME


 @ray.remote(num_cpus=0)
@@ -54,7 +53,7 @@ def testing_stats(self):

 def init_manager(request_output_queue_type: QueueType):
-    manager = MockLLMEngineManager.options(name=MANAGER_ACTOR_NAME,
+    manager = MockLLMEngineManager.options(name=MANAGER_NAME,
                                            namespace='llumnix').remote(request_output_queue_type)
     return manager

diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
index 10f802a0..44eb459d 100644
--- a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
@@ -28,10 +28,10 @@
 from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType
 from llumnix.entrypoints.setup import LlumnixEntrypointsContext
 from llumnix.entrypoints.vllm.client import LlumnixClientVLLM
+from llumnix.utils import MANAGER_NAME

 app = llumnix.entrypoints.vllm.api_server.app
 engine_manager = None
-MANAGER_ACTOR_NAME = llumnix.llm_engine_manager.MANAGER_ACTOR_NAME
 ENTRYPOINTS_ACTOR_NAME = "entrypoints"
@@ -69,7 +69,7 @@ def __init__(self, host: str, port: int, request_output_queue_type: QueueType):
         ip = '127.0.0.1'
         port = 1234
         global engine_manager
-        engine_manager = ray.get_actor(MANAGER_ACTOR_NAME, namespace="llumnix")
+        engine_manager = ray.get_actor(MANAGER_NAME, namespace="llumnix")
         request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type)
         ray_queue_server = None
         if request_output_queue_type == QueueType.RAYQUEUE:
@@ -92,7 +92,7 @@ def run(self):
                     timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE)

 def init_manager_service(request_output_queue_type: QueueType, args: 'Namespace'):
-    engine_manager = MockLLMEngineManagerService.options(name=MANAGER_ACTOR_NAME,
+    engine_manager = MockLLMEngineManagerService.options(name=MANAGER_NAME,
                                                          namespace='llumnix').remote(request_output_queue_type, args)
     return engine_manager

diff --git a/tests/unit_test/global_scheduler/test_llm_engine_manager.py b/tests/unit_test/global_scheduler/test_llm_engine_manager.py
index 57de44ff..a1714f1e 100644
--- a/tests/unit_test/global_scheduler/test_llm_engine_manager.py
+++ b/tests/unit_test/global_scheduler/test_llm_engine_manager.py
@@ -19,9 +19,9 @@

 from vllm import EngineArgs

-from llumnix.utils import random_uuid, get_instance_name
+from llumnix.utils import random_uuid, get_instance_name, MANAGER_NAME
 from llumnix.arg_utils import EngineManagerArgs
-from llumnix.llm_engine_manager import LLMEngineManager, MANAGER_ACTOR_NAME
+from llumnix.llm_engine_manager import LLMEngineManager
 from llumnix.instance_info import InstanceInfo
 from llumnix.server_info import ServerInfo
 from llumnix.queue.queue_type import QueueType
@@ -109,7 +109,7 @@ def init_manager():
         engine_manager_args.log_instance_info = False
         manager = LLMEngineManager.from_args(engine_manager_args, None)
     except ValueError:
-        manager = ray.get_actor(MANAGER_ACTOR_NAME, namespace='llumnix')
+        manager = ray.get_actor(MANAGER_NAME, namespace='llumnix')
     ray.get(manager.is_ready.remote())
     return manager
@@ -143,7 +143,7 @@ def llumlet():

 def test_init_manager(ray_env, manager):
     assert manager is not None
-    manager_actor_handle = ray.get_actor(MANAGER_ACTOR_NAME, namespace='llumnix')
+    manager_actor_handle = ray.get_actor(MANAGER_NAME, namespace='llumnix')
     assert manager_actor_handle is not None
     assert manager == manager_actor_handle

From b219120b5294c09fe69e6c1e12e20f22a1f56b55 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 2 Jan 2025 07:43:16 +0000
Subject: [PATCH 40/92] Refactor deployment and actor construction for supporting global deployment & Rename manager and entrypoints

---
 demo/manager_service_demo.py | 24 +--
 docs/Quickstart.md | 2 +-
 examlpes/offline_inference.py | 13 +-
 llumnix/__init__.py | 8 +-
 llumnix/arg_utils.py | 41 ++--
 llumnix/backends/backend_interface.py | 1 -
 llumnix/backends/utils.py | 70 +------
 llumnix/backends/vllm/utils.py | 8 +-
 llumnix/config/default.py | 2 +
 llumnix/entrypoints/bladellm/api_server.py | 38 ++--
 llumnix/entrypoints/bladellm/client.py | 4 +-
 llumnix/entrypoints/bladellm/utils.py | 26 +--
 llumnix/entrypoints/setup.py | 193 +++++++-----------
 llumnix/entrypoints/utils.py | 94 +++++++++
 llumnix/entrypoints/vllm/api_server.py | 43 ++--
 llumnix/entrypoints/vllm/api_server_actor.py | 0
 llumnix/entrypoints/vllm/arg_utils.py | 22 +-
 llumnix/entrypoints/vllm/client.py | 26 +--
 llumnix/entrypoints/vllm/serve.py | 35 ++++
 llumnix/global_scheduler/migration_policy.py | 2 +-
 llumnix/global_scheduler/scaling_scheduler.py | 2 +-
 llumnix/llumlet/llumlet.py | 5 +-
 llumnix/llumlet/request.py | 1 +
 llumnix/{llm_engine_manager.py => manager.py} | 164 ++++++++-------
 llumnix/utils.py | 61 ++++++
 .../backends/vllm/test_llm_engine.py | 2 +-
 .../unit_test/backends/vllm/test_migration.py | 3 +-
 .../backends/vllm/test_migration_backend.py | 7 +-
 .../unit_test/backends/vllm/test_simulator.py | 2 +-
 tests/unit_test/backends/vllm/test_worker.py | 8 +-
 tests/unit_test/entrypoints/test_utils.py | 21 +-
 .../entrypoints/vllm/api_server_manager.py | 25 +--
 .../vllm/api_server_manager_service.py | 27 +--
 .../global_scheduler/test_global_scheduler.py | 2 +-
 ..._llm_engine_manager.py => test_manager.py} | 22 +-
 .../llumlet/test_engine_step_exception.py | 2 +-
 36 files changed, 565 insertions(+), 441 deletions(-)
 create mode 100644 llumnix/entrypoints/utils.py
 create mode 100644 llumnix/entrypoints/vllm/api_server_actor.py
 create mode 100644 llumnix/entrypoints/vllm/serve.py
 rename llumnix/{llm_engine_manager.py => manager.py} (85%)
 rename tests/unit_test/global_scheduler/{test_llm_engine_manager.py => test_manager.py} (94%)

diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py
index f0420bf2..d4495d8b 100644
--- a/demo/manager_service_demo.py
+++ b/demo/manager_service_demo.py
@@ -179,9 +179,9 @@ def get_curr_deployment() -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIS
     return curr_pgs, curr_servers, curr_instances


-class LLMEngineManager:
+class Manager:
     def __init__(self):
-        print("create LLMEngineManager")
+        print("create Manager")
         self.host = "localhost"
         self.port = 8000
         self.last_pending_pg: PlacementGroup = None
@@ -192,7 +192,7 @@ def __init__(self):
         asyncio.create_task(self._auto_scale_up_loop())
         asyncio.create_task(self._auto_scale_down_loop())
         asyncio.create_task(self._check_deployment_states_loop())
-        print("LLMEngineManager created")
+        print("Manager created")

     async def _auto_scale_down_loop(self) -> None:
         def instance_ready_callback(instance_id: str, fut):
@@ -325,24 +325,24 @@ def _scale_down(self, instance_id: str) -> None:

     @classmethod
     def from_args(cls):
-        engine_manager_class = ray.remote(num_cpus=1,
-                                          max_restarts=-1,
-                                          name="manager",
-                                          namespace="llumnix",
-                                          lifetime="detached")(cls)
-        engine_manager = engine_manager_class.remote()
-        return engine_manager
+        manager_class = ray.remote(num_cpus=1,
+                                   max_restarts=-1,
+                                   name="manager",
+                                   namespace="llumnix",
+                                   lifetime="detached")(cls)
+        manager = manager_class.remote()
+        return manager


 if __name__ == "__main__":
     ray.init()

-    # magic actor
+    # magic actor to avoid fast api server actor initialization error
     request_output_queue = RayQueue(actor_options={
                                         "namespace": "llumnix",
                                         "name": "magic_queue"
                                     })
-    manager = LLMEngineManager.from_args()
+    manager = Manager.from_args()

     while True:
         time.sleep(100)

diff --git a/docs/Quickstart.md b/docs/Quickstart.md
index 4fcd605f..6081c537 100644
--- a/docs/Quickstart.md
+++ b/docs/Quickstart.md
@@ -62,7 +62,7 @@ export HEAD_NODE=1

 During the execution of serving deployment, Llumnix will:
 - Initiate the Ray cluster for distributed execution.
-- Start Llumnix actor components, including LLMEngineManager, Llumlet, among others.
+- Start Llumnix actor components, including Manager, Llumlet, among others.
 - Launch the vLLM engine instances.

 Following these steps, Llumnix acts as the request scheduling layer situated behind the multiple frontend API servers and above the multiple backend vLLM engine instances. This positioning allows Llumnix to significantly enhance serving performance through its dynamic, fine-grained, and KV-cache-aware request scheduling and rescheduling across instances.

diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py
index edca0a40..2d2e67e3 100644
--- a/examlpes/offline_inference.py
+++ b/examlpes/offline_inference.py
@@ -5,8 +5,9 @@
 import ray

 from llumnix import launch_ray_cluster, connect_to_ray_cluster, init_manager
-from llumnix import (SamplingParams, ServerInfo, EngineManagerArgs, LLMEngineManager, Llumlet,
-                     EngineArgs, QueueType, BackendType)
+from llumnix import (ManagerArgs, EngineArgs, Manager,
+                     Llumlet, ServerInfo, QueueType, BackendType,
+                     SamplingParams)
 from llumnix.utils import random_uuid
 from llumnix.queue.ray_queue_server import RayQueueServer

@@ -33,21 +34,19 @@
 connect_to_ray_cluster(port=ray_cluster_port)

 # Set manager args and engine args.
-manager_args = EngineManagerArgs()
+manager_args = ManagerArgs()
 engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True,
                          trust_remote_code=True, max_model_len=370)

 # Create a manager. If the manager is created first, and then the llumlets are created, manager.scale_up
 # need to be called to add the newly created llumlets to the management of the manager.
-manager: LLMEngineManager = init_manager(manager_args)
+manager: Manager = init_manager(manager_args)
 ray.get(manager.is_ready.remote())

 # Create llumlets.
 instance_ids: List[str] = None
 llumlets: List[Llumlet] = None
-instance_ids, llumlets = ray.get(manager.init_llumlets.remote(
-    engine_args, QueueType("rayqueue"), BackendType.VLLM, 1,
-))
+instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM))

 ray.get(manager.scale_up.remote(instance_ids, llumlets))

diff --git a/llumnix/__init__.py b/llumnix/__init__.py
index 3e6e04b4..fba69575 100644
--- a/llumnix/__init__.py
+++ b/llumnix/__init__.py
@@ -15,8 +15,8 @@
 from llumnix.entrypoints.setup import (launch_ray_cluster,
                                        connect_to_ray_cluster,
                                        init_manager)
-from llumnix.arg_utils import EngineManagerArgs
-from llumnix.llm_engine_manager import LLMEngineManager
+from llumnix.arg_utils import ManagerArgs
+from llumnix.manager import Manager
 from llumnix.llumlet.llumlet import Llumlet
 from llumnix.queue.queue_type import QueueType
 from llumnix.backends.backend_interface import BackendType
@@ -28,8 +28,8 @@
     "launch_ray_cluster",
     "connect_to_ray_cluster",
     "init_manager",
-    "EngineManagerArgs",
-    "LLMEngineManager",
+    "ManagerArgs",
+    "Manager",
     "Llumlet",
     "QueueType",
     "BackendType",
diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
index d9138407..d4ee51b0 100644
--- a/llumnix/arg_utils.py
+++ b/llumnix/arg_utils.py
@@ -21,6 +21,8 @@
 from llumnix.internal_config import GlobalSchedulerConfig, MigrationConfig
 from llumnix.config import LlumnixConfig, get_llumnix_config
 from llumnix.config.default import _C
+from llumnix.backends.backend_interface import BackendType
+from llumnix.entrypoints.utils import DeploymentMode


 class LlumnixArgumentParser(argparse.ArgumentParser):
@@ -44,7 +46,12 @@ def add_argument(self, *args, **kwargs):

 # All the default values of llumnix arguments are set in default.py. So all the arguments here are set to None.
 @dataclass
-class LlumnixEntrypointsArgs:
+class EntrypointsArgs:
+    host: str = None
+    port: int = None
+    ssl_keyfile: str = None
+    ssl_certfile: str = None
+    log_level: str = None
     launch_ray_cluster: bool = None
     ray_cluster_port: int = None
     request_output_queue_type: str = None
@@ -59,16 +66,16 @@ def __post_init__(self):
             setattr(self, attr.name, getattr(_C.SERVER, attr.name.upper()))

     @classmethod
-    def from_llumnix_config(cls, cfg: LlumnixConfig = get_llumnix_config()) -> 'LlumnixEntrypointsArgs':
+    def from_llumnix_config(cls, cfg: LlumnixConfig = get_llumnix_config()) -> 'EntrypointsArgs':
         # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         # Set the attributes from the parsed arguments.
         # The defalut values of attributes are defined in default.py.
-        llumnix_entrypoints_args = cls(**{attr: getattr(cfg.SERVER, attr.upper()) for attr in attrs})
-        return llumnix_entrypoints_args
+        entrypoints_args = cls(**{attr: getattr(cfg.SERVER, attr.upper()) for attr in attrs})
+        return entrypoints_args

     @classmethod
-    def check_args(cls, args: 'LlumnixEntrypointsArgs', parser: argparse.ArgumentParser):
+    def check_args(cls, args: 'EntrypointsArgs', parser: argparse.ArgumentParser):
         # pylint: disable=protected-access
         for action in parser._optionals._actions:
             if hasattr(action, 'choices') and action.choices is not None and hasattr(args, action.dest):
@@ -78,7 +85,7 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument('--launch-ray-cluster',
                             action='store_true',
-                            help='if launch ray cluster in api server')
+                            help='if launch ray cluster in server')
         parser.add_argument("--ray-cluster-port",
                             type=int,
                             help='ray cluster port')
@@ -98,10 +105,11 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser.add_argument("--config-file",
                             type=str,
                             help="path to config file")
+
         return parser

 @dataclass
-class EngineManagerArgs:
+class ManagerArgs:
     initial_instances: int = None

     load_metric: str = None
@@ -152,7 +160,7 @@ def __post_init__(self):
             if getattr(self, attr.name) is None:
                 setattr(self, attr.name, getattr(_C.MANAGER, attr.name.upper()))

-    def create_global_scheduler_configs(
+    def create_global_scheduler_config(
         self,
     ) -> Tuple[GlobalSchedulerConfig]:

@@ -168,7 +176,7 @@ def create_global_scheduler_configs(
             self.scale_up_threshold,
             self.scale_down_threshold,
             self.enable_pd_disagg,
-            self.migration_backend,)
+            self.migration_backend)
         return global_scheduler_config

     def create_migration_config(self) -> MigrationConfig:
@@ -185,16 +193,16 @@ def create_migration_config(self) -> MigrationConfig:
         return migration_config

     @classmethod
-    def from_llumnix_config(cls, cfg: LlumnixConfig = get_llumnix_config()) -> 'EngineManagerArgs':
+    def from_llumnix_config(cls, cfg: LlumnixConfig = get_llumnix_config()) -> 'ManagerArgs':
         # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         # Set the attributes from the parsed arguments.
         # The defalut values of attributes are defined in default.py.
- engine_manager_args = cls(**{attr: getattr(cfg.MANAGER, attr.upper()) for attr in attrs}) - return engine_manager_args + manager_args = cls(**{attr: getattr(cfg.MANAGER, attr.upper()) for attr in attrs}) + return manager_args @classmethod - def check_args(cls, args: 'EngineManagerArgs', parser: argparse.ArgumentParser): + def check_args(cls, args: 'ManagerArgs', parser: argparse.ArgumentParser): # pylint: disable=protected-access for action in parser._optionals._actions: if hasattr(action, 'choices') and action.choices is not None and hasattr(args, action.dest): @@ -331,10 +339,17 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.add_argument('--max-stages', type=int, help='drop migration if the number of stages > max_stages') + parser.add_argument('--enable-pd-disagg', action='store_true', help='enable prefill decoding disaggregation') parser.add_argument('--num-dispatch-instances', type=int, help='number of available instances for dispatch') + return parser + +@dataclass +class DeploymentArgs: + deployment_mode: DeploymentMode = None + backend_type: BackendType = None diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py index 5e34c01f..c1fc3a84 100644 --- a/llumnix/backends/backend_interface.py +++ b/llumnix/backends/backend_interface.py @@ -35,7 +35,6 @@ def is_sim_backend(status: "BackendType") -> bool: BackendType.SIM_VLLM, ] -# TODO(KuilongCui): separate backend interface into two parts: DispatchBackendInterface and MigrationBackendInterface class BackendInterface(ABC): # Methods for inference @abstractmethod diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py index e9584448..f7d4d3bb 100644 --- a/llumnix/backends/utils.py +++ b/llumnix/backends/utils.py @@ -11,12 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Dict, List +from typing import Dict, List import asyncio import time import ray -from ray.util.placement_group import PlacementGroup from llumnix.backends.backend_interface import BackendInterface, BackendType from llumnix.queue.queue_type import QueueType @@ -24,7 +23,7 @@ from llumnix.queue.utils import init_request_output_queue_client from llumnix.server_info import ServerInfo from llumnix.logger import init_logger -from llumnix.utils import get_placement_group_name, get_instance_name +from llumnix.utils import get_instance_name logger = init_logger(__name__) @@ -79,61 +78,10 @@ def init_backend_engine(instance_id: str, request_output_queue_type: QueueType, raise ValueError(f'Unsupported backend: {backend_type}') return backend_engine -def initialize_placement_group( - instance_id: str, - num_cpus: int = 1, - num_gpus: int = 1, - detached: bool = False -) -> Tuple[str, Optional[PlacementGroup]]: - """Initialize the distributed cluster probably with Ray. - - Args: - world_size: The number of workers in Llumlet. - - Returns: - A tuple of (`distributed_init_method`, `placement_group`). The - `distributed_init_method` is the address for initializing the - distributed backend. `placement_group` includes the specification - of the resources for each distributed worker. - """ - if ray is None: - raise ImportError( - "Ray is not installed. 
Please install Ray to use distributed " - "serving.") - - lifetime = "detached" if detached else None - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - gpu_bundles = 0 - for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) - if bundle_gpus > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") - if bundle_gpus: - gpu_bundles += 1 - if num_gpus > gpu_bundles: - raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") - else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) - if num_gpus > num_gpus_in_cluster: - raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") - # Create a new placement group - # bundle_0: Llumlet + AsyncPutQueueActor + ProxyActor, bundle_1: Workers - placement_group_specs = ([{"CPU": num_cpus}] + [{"GPU": 1}] * num_gpus) - current_placement_group = ray.util.placement_group( - placement_group_specs, "STRICT_PACK", name=get_placement_group_name(instance_id), lifetime=lifetime) - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. - ray.get(current_placement_group.ready(), timeout=1800) - - return current_placement_group +def get_engine_world_size(engine_args, backend_type: BackendType): + if backend_type in [BackendType.VLLM, BackendType.SIM_VLLM]: + engine_config = engine_args.create_engine_config() + world_size = engine_config.parallel_config.world_size + else: # BLADE_LLM + world_size = engine_args.tensor_parallel_size * engine_args.pipeline_parallel_size + return world_size diff --git a/llumnix/backends/vllm/utils.py b/llumnix/backends/vllm/utils.py index 6f113e2e..80c63e6d 100644 --- a/llumnix/backends/vllm/utils.py +++ b/llumnix/backends/vllm/utils.py @@ -23,7 +23,7 @@ _modify_greedy_probs_inplace, _beam_search_sample from llumnix.logger import init_logger -from llumnix.arg_utils import EngineManagerArgs +from llumnix.arg_utils import ManagerArgs logger = init_logger(__name__) @@ -41,15 +41,15 @@ def detect_unsupported_feature(engine_args: EngineArgs) -> None: if unsupported_feature: raise ValueError(f'Unsupported feature: Llumnix does not support "{unsupported_feature}" currently.') -def check_engine_args(engine_args: AsyncEngineArgs, engine_manager_args: EngineManagerArgs) -> None: +def check_engine_args(engine_args: AsyncEngineArgs, manager_args: ManagerArgs) -> None: assert engine_args.engine_use_ray and engine_args.worker_use_ray, \ ("In Llumnix, engine and worker must be ray actor.") - migration_config = engine_manager_args.create_migration_config() + migration_config = manager_args.create_migration_config() engine_config = engine_args.create_engine_config() parallel_config = engine_config.parallel_config if parallel_config.world_size > 1 and migration_config.migration_backend == 'nccl': logger.warning("Llumnix does not support TP or PP when the migration backend is nccl, change migration backend to gloo.") - engine_manager_args.migration_backend = 'gloo' + manager_args.migration_backend = 'gloo' detect_unsupported_feature(engine_args) def _get_dtype_size(dtype: torch.dtype) -> int: diff --git a/llumnix/config/default.py b/llumnix/config/default.py index a1f48043..31d56fe0 
100644 --- a/llumnix/config/default.py +++ b/llumnix/config/default.py @@ -32,6 +32,8 @@ _C.SERVER.SSL_KEYFILE = None # Path to SSL certificate file for secure connections _C.SERVER.SSL_CERTFILE = None +# Log level for the server +_C.SERVER.LOG_LEVEL = "debug" # Queue type for request output queue _C.SERVER.REQUEST_OUTPUT_QUEUE_TYPE = "rayqueue" # Port number for the request output queue diff --git a/llumnix/entrypoints/bladellm/api_server.py b/llumnix/entrypoints/bladellm/api_server.py index 836b8757..b32dd78c 100644 --- a/llumnix/entrypoints/bladellm/api_server.py +++ b/llumnix/entrypoints/bladellm/api_server.py @@ -14,35 +14,39 @@ import asyncio from blade_llm.service.args import ServingArgs -from llumnix.config import get_llumnix_config, LlumnixConfig +# TODO(s5u13b): Refine multiple import codes. +from llumnix.config import get_llumnix_config from llumnix.backends.backend_interface import BackendType -from llumnix.arg_utils import LlumnixEntrypointsArgs, EngineManagerArgs, LlumnixArgumentParser -from llumnix.entrypoints.setup import setup_ray_cluster, setup_llumnix, is_gpu_available +from llumnix.arg_utils import (EntrypointsArgs, ManagerArgs, LlumnixArgumentParser, + DeploymentArgs) +from llumnix.entrypoints.setup import setup_ray_cluster, setup_llumnix from llumnix.entrypoints.bladellm.client import LlumnixClientBladeLLM -from llumnix.entrypoints.setup import LlumnixEntrypointsContext from llumnix.entrypoints.bladellm.utils import get_args +from llumnix.entrypoints.utils import EntrypointsContext, DeploymentMode, is_gpu_available + def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractEventLoop): # generate llumnix_parser for checking parameters with choices - llumnix_parser: LlumnixArgumentParser = LlumnixArgumentParser() - llumnix_parser = LlumnixEntrypointsArgs.add_cli_args(llumnix_parser) - llumnix_parser = EngineManagerArgs.add_cli_args(llumnix_parser) - llumnix_config: LlumnixConfig = get_llumnix_config(bladellm_args.llumnix_config) - _, engine_manager_args, engine_args = get_args(llumnix_config, llumnix_parser, bladellm_args) + # TODO(s5u13b): Add add_cli_args function. 
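+    # Rough flow (illustrative): EntrypointsArgs/ManagerArgs are filled from the Llumnix
+    # yaml config, while engine_args are the ServingArgs that BladeLLM already parsed.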
+    llumnix_parser = LlumnixArgumentParser()
+    llumnix_parser = EntrypointsArgs.add_cli_args(llumnix_parser)
+    llumnix_parser = ManagerArgs.add_cli_args(llumnix_parser)
+    llumnix_config = get_llumnix_config(bladellm_args.llumnix_config)
+    entrypoints_args, manager_args, engine_args = get_args(llumnix_config, llumnix_parser, bladellm_args)
+
+    deployment_args = DeploymentArgs(deployment_mode=DeploymentMode.LOCAL, backend_type=BackendType.BLADELLM)
 
-    setup_ray_cluster(llumnix_config)
+    setup_ray_cluster(entrypoints_args)
 
-    llm_client = None
+    llumnix_client = None
     # if gpu is not available, it means that this node is head pod without any llumnix components
     if is_gpu_available():
-        world_size = engine_args.tensor_parallel_size * engine_args.pipeline_parallel_size
         instance_ids = None
         if engine_args.enable_disagg:
             instance_ids = [engine_args.disagg_options.inst_id]
-        llumnix_context: LlumnixEntrypointsContext = \
-            setup_llumnix(engine_manager_args, engine_args, llumnix_config, BackendType.BLADELLM,
-                          world_size, instance_ids=instance_ids)
-        llm_client = LlumnixClientBladeLLM(bladellm_args, llumnix_context, loop)
+        llumnix_context: EntrypointsContext = \
+            setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args, instance_ids=instance_ids)
+        llumnix_client = LlumnixClientBladeLLM(bladellm_args, llumnix_context, loop)
 
-    return llm_client
+    return llumnix_client
diff --git a/llumnix/entrypoints/bladellm/client.py b/llumnix/entrypoints/bladellm/client.py
index 729b3874..f95dae96 100644
--- a/llumnix/entrypoints/bladellm/client.py
+++ b/llumnix/entrypoints/bladellm/client.py
@@ -28,7 +28,7 @@
 from blade_llm.service.communications.response import error_resp
 
 from llumnix.server_info import RequestTimestamps
-from llumnix.entrypoints.setup import LlumnixEntrypointsContext
+from llumnix.entrypoints.setup import EntrypointsContext
 from llumnix.logger import init_logger
 
 logger = init_logger(__name__)
@@ -39,7 +39,7 @@
 
 class LlumnixClientBladeLLM(MultiProcessingLLMClient):
-    def __init__(self, args: ServingArgs, llumnix_context: LlumnixEntrypointsContext, loop: asyncio.AbstractEventLoop):
+    def __init__(self, args: ServingArgs, llumnix_context: EntrypointsContext, loop: asyncio.AbstractEventLoop):
         super().__init__(args, -1)
         self.entrypoint_id2llumnix_id = {}
         self.llumnix_id2entrypoint_id = {}
diff --git a/llumnix/entrypoints/bladellm/utils.py b/llumnix/entrypoints/bladellm/utils.py
index 2a6359f8..3fa94cd6 100644
--- a/llumnix/entrypoints/bladellm/utils.py
+++ b/llumnix/entrypoints/bladellm/utils.py
@@ -15,7 +15,7 @@
 from loguru import logger
 from blade_llm.service.args import ServingArgs
 
-from llumnix.arg_utils import LlumnixEntrypointsArgs, EngineManagerArgs
+from llumnix.arg_utils import EntrypointsArgs, ManagerArgs
 
 def detect_unsupported_feature(engine_args: ServingArgs) -> None:
     unsupported_feature = None
@@ -31,24 +31,24 @@ def detect_unsupported_feature(engine_args: ServingArgs) -> None:
     if unsupported_feature:
         raise ValueError(f'Llumnix does not support "{unsupported_feature}" for bladeLLM currently.')
 
-def check_engine_args(engine_args: ServingArgs, engine_manager_args: EngineManagerArgs) -> None:
-    migration_config = engine_manager_args.create_migration_config()
+def check_engine_args(engine_args: ServingArgs, manager_args: ManagerArgs) -> None:
+    migration_config = manager_args.create_migration_config()
     if (engine_args.tensor_parallel_size > 1 or engine_args.pipeline_parallel_size > 1) and \
         migration_config.migration_backend == 'nccl':
         logger.warning("Llumnix does not support TP or PP when 
the migration backend is nccl, \ change migration backend to gloo.") - engine_manager_args.migration_backend = 'gloo' + manager_args.migration_backend = 'gloo' detect_unsupported_feature(engine_args) def get_args(llumnix_cfg, llumnix_parser, engine_args): - llumnix_entrypoints_args = LlumnixEntrypointsArgs.from_llumnix_config(llumnix_cfg) - LlumnixEntrypointsArgs.check_args(llumnix_entrypoints_args, llumnix_parser) - engine_manager_args = EngineManagerArgs.from_llumnix_config(llumnix_cfg) - EngineManagerArgs.check_args(engine_manager_args, llumnix_parser) - check_engine_args(engine_args, engine_manager_args) - - logger.info("llumnix_entrypoints_args: {}", llumnix_entrypoints_args) - logger.info("engine_manager_args: {}", engine_manager_args) + entrypoints_args = EntrypointsArgs.from_llumnix_config(llumnix_cfg) + EntrypointsArgs.check_args(entrypoints_args, llumnix_parser) + manager_args = ManagerArgs.from_llumnix_config(llumnix_cfg) + ManagerArgs.check_args(manager_args, llumnix_parser) + check_engine_args(engine_args, manager_args) + + logger.info("entrypoints_args: {}", entrypoints_args) + logger.info("manager_args: {}", manager_args) logger.info("engine_args: {}", engine_args) - return llumnix_entrypoints_args, engine_manager_args, engine_args + return entrypoints_args, manager_args, engine_args diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py index 8e17f373..bdd236d0 100644 --- a/llumnix/entrypoints/setup.py +++ b/llumnix/entrypoints/setup.py @@ -15,49 +15,27 @@ import sys import os import time -from typing import Dict -import asyncio -import socket +from typing import Dict, Optional import ray -from llumnix.llm_engine_manager import LLMEngineManager +from llumnix.manager import Manager from llumnix.llumlet.llumlet import Llumlet from llumnix.logger import init_logger from llumnix.utils import random_uuid, MANAGER_NAME -from llumnix.arg_utils import EngineManagerArgs +from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs from llumnix.queue.queue_type import QueueType -from llumnix.server_info import ServerInfo, RequestTimestamps +from llumnix.server_info import ServerInfo from llumnix.queue.utils import init_request_output_queue_server -from llumnix.queue.queue_server_base import QueueServerBase +from llumnix.entrypoints.utils import (EntrypointsContext, get_ip_address, + retry_manager_method_sync) +from llumnix.entrypoints.utils import DeploymentMode +from llumnix.backends.backend_interface import BackendType + +MAX_RAY_RESTARTS = 5 +RAY_RESTART_INTERVALS = 10 logger = init_logger(__name__) -MAX_RESTARTS = 30 -RESTART_INTERVALS = 1 -MAX_TASK_RETRIES = 300 -RETRIES_INTERVALS = 0.1 - - -class LlumnixEntrypointsContext: - def __init__(self, - manager: LLMEngineManager, - instances: Dict[str, Llumlet], - request_output_queue: QueueServerBase, - server_info: ServerInfo, - log_requests: bool, - log_request_timestamps: bool): - self.manager = manager - self.instances = instances - self.request_output_queue = request_output_queue - self.server_info = server_info - self.log_requests = log_requests - self.log_request_timestamps = log_request_timestamps - - -def get_ip_address(): - hostname = socket.gethostname() - ip_address = socket.gethostbyname(hostname) - return ip_address def launch_ray_cluster(port: int) -> subprocess.CompletedProcess: head_node_ip = os.getenv('HEAD_NODE_IP') @@ -82,86 +60,59 @@ def launch_ray_cluster(port: int) -> subprocess.CompletedProcess: sys.exit(1) else: ray_start_command = f"ray start --address={head_node_ip}:{port} 
--node-ip-address={node_ip_address}"
-        for attempt in range(MAX_RESTARTS):
+        for attempt in range(MAX_RAY_RESTARTS):
             try:
                 # wait about 2 mins by default
                 result = subprocess.run(['ray', 'start', f'--address={head_node_ip}:{port}'],
                                         check=True, text=True, capture_output=True)
                 break
             except subprocess.CalledProcessError as e:
-                if attempt < MAX_RESTARTS:
+                if attempt < MAX_RAY_RESTARTS:
                     logger.warning("execute '{}' repeatedly until the head node starts".format(ray_start_command))
-                    time.sleep(RESTART_INTERVALS)
+                    time.sleep(RAY_RESTART_INTERVALS)
                 else:
                     logger.error("'{}' failed after {} attempts with: \n{}".format(ray_start_command, attempt, e.stderr))
                     sys.exit(1)
     logger.info("'{}' succeeded with: \n{}".format(ray_start_command, result.stdout))
     return result
 
-def connect_to_ray_cluster(port: int, namespace="llumnix") -> None:
-    head_node_ip = os.getenv('HEAD_NODE_IP')
-    ray.init(address=f"{head_node_ip}:{port}", ignore_reinit_error=True, namespace=namespace)
-
-def setup_ray_cluster(cfg):
-    if cfg.SERVER.LAUNCH_RAY_CLUSTER:
-        launch_ray_cluster(cfg.SERVER.RAY_CLUSTER_PORT)
-    connect_to_ray_cluster(port=cfg.SERVER.RAY_CLUSTER_PORT)
-
-def is_gpu_available() -> bool:
-    try:
-        subprocess.check_output(['nvidia-smi'])
-        return True
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        return False
-
-def retry_manager_method_sync(ray_call, method_name, *args, **kwargs):
-    for attempt in range(MAX_TASK_RETRIES):
-        try:
-            ret = ray.get(ray_call(*args, **kwargs))
-            break
-        except ray.exceptions.RayActorError:
-            if attempt < MAX_TASK_RETRIES - 1:
-                logger.warning("manager is unavailable, sleep {}s, and retry {} again".format(RETRIES_INTERVALS, method_name))
-                time.sleep(RETRIES_INTERVALS)
-            else:
-                logger.error("manager is still unavailable after {} times retries".format(MAX_TASK_RETRIES))
-                raise
-    return ret
-
-async def retry_manager_method_async(ray_call, method_name, *args, **kwargs):
-    for attempt in range(MAX_TASK_RETRIES):
-        try:
-            ret = await ray_call(*args, **kwargs)
-            break
-        except ray.exceptions.RayActorError:
-            if attempt < MAX_TASK_RETRIES - 1:
-                logger.warning("manager is unavailable, sleep {}s, and retry {} again".format(RETRIES_INTERVALS, method_name))
-                await asyncio.sleep(RETRIES_INTERVALS)
-            else:
-                logger.error("manager is still unavailable after {} times retries".format(MAX_TASK_RETRIES))
-                raise
-    return ret
-
-def init_manager(engine_manager_args: EngineManagerArgs) -> LLMEngineManager:
+def connect_to_ray_cluster(head_node_ip: str = None, port: int = None, namespace="llumnix") -> None:
+    if head_node_ip is not None and port is not None:
+        ray.init(address=f"{head_node_ip}:{port}", ignore_reinit_error=True, namespace=namespace)
+    else:
+        ray.init(ignore_reinit_error=True, namespace=namespace)
+
+def setup_ray_cluster(entrypoints_args) -> None:
+    if entrypoints_args.launch_ray_cluster:
+        launch_ray_cluster(entrypoints_args.ray_cluster_port)
+    connect_to_ray_cluster(head_node_ip=os.getenv('HEAD_NODE_IP'), port=entrypoints_args.ray_cluster_port, namespace="llumnix")
+
+def init_manager(manager_args: ManagerArgs,
+                 entrypoints_args: EntrypointsArgs = None,
+                 engine_args = None,
+                 deployment_args: DeploymentArgs = None,
+                 ) -> Manager:
     # Only one instance creates the manager actor, the other instances get the existing manager actor through ray.
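    # (Manager.from_args creates a detached, named actor; ray raises ValueError if the
    # name is already taken, in which case the existing actor handle is fetched below.)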
try: - manager = LLMEngineManager.from_args(engine_manager_args, None) - logger.info("Init LLMEngineManager on current node.") + manager = Manager.from_args(manager_args, entrypoints_args, engine_args, deployment_args) + logger.info("Init Manager on current node.") except ValueError: manager = ray.get_actor(MANAGER_NAME, namespace='llumnix') - logger.info("Get existing LLMEngineManager.") + logger.info("Get existing Manager.") return manager -def init_llumnix_components(engine_manager_args: EngineManagerArgs, +def init_llumnix_components(manager_args: ManagerArgs, engine_args, request_output_queue_type: QueueType, ip: str, request_output_queue_port: str, + backend_type: BackendType, *args, **kwargs ): - manager = init_manager(engine_manager_args) + manager = init_manager(manager_args) + instance_ids, llumlets = retry_manager_method_sync( - manager.init_llumlets.remote, 'init_llumlets', engine_args, request_output_queue_type, *args, **kwargs) + manager.init_llumlets.remote, 'init_llumlets', engine_args, request_output_queue_type, backend_type, *args, **kwargs) available_instance_ids = [] dead_instance_ids = [] @@ -186,52 +137,54 @@ def init_llumnix_components(engine_manager_args: EngineManagerArgs, return manager, available_instance_ids, available_llumlets, request_output_queue -def setup_llumnix(engine_manager_args, engine_args, cfg, *args, **kwargs): +def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args, + *args, **kwargs) -> EntrypointsContext: ip = get_ip_address() + request_output_queue_type = entrypoints_args.request_output_queue_type + request_output_queue_port = entrypoints_args.request_output_queue_port + backend_type = deployment_args.backend_type + manager, instance_ids, llumlets, request_output_queue = \ - init_llumnix_components(engine_manager_args, + init_llumnix_components(manager_args, engine_args, - cfg.SERVER.REQUEST_OUTPUT_QUEUE_TYPE, + request_output_queue_type, ip, - cfg.SERVER.REQUEST_OUTPUT_QUEUE_PORT, + request_output_queue_port, + backend_type, *args, **kwargs) + server_id = random_uuid() server_info = ServerInfo(server_id, - cfg.SERVER.REQUEST_OUTPUT_QUEUE_TYPE, + request_output_queue_type, request_output_queue, ip, - cfg.SERVER.REQUEST_OUTPUT_QUEUE_PORT) + request_output_queue_port) + instances: Dict[str, Llumlet] = {} for idx, ins_id in enumerate(instance_ids): instances[ins_id] = llumlets[idx] - log_requests = not cfg.SERVER.DISABLE_LOG_REQUESTS_SERVER - log_request_timestamps = cfg.SERVER.LOG_REQUEST_TIMESTAMPS + log_requests = not manager_args.disable_log_requests_manager + log_request_timestamps = entrypoints_args.log_request_timestamps logger.info("log_requests: {}, log_request_timestamps: {}".format(log_requests, log_request_timestamps)) - llumnix_entrypoints_context = LlumnixEntrypointsContext(manager, - instances, - request_output_queue, - server_info, - log_requests, - log_request_timestamps) - - return llumnix_entrypoints_context - -def init_per_token_latency_breakdown_dict() -> Dict[str, int]: - per_token_latency_breakdown_dict = { - 'step_latency_engine': [], - 'process_model_outputs_latency': [], - 'step_postprocess_latency': [], - 'across_async_put_queue_thread_latency': [], - 'across_async_put_queue_actor_latency': [], - 'queue_rpc_latency': [], - 'background_process_get_queue_latency': [], - 'generate_benchmark_return_output_latency': [] - } - return per_token_latency_breakdown_dict - -def record_per_token_latency_breakdown(per_token_latency_breakdown_dict: Dict[str, int], request_timestamps: RequestTimestamps): - 
for key in per_token_latency_breakdown_dict.keys(): - per_token_latency_breakdown_dict[key].append(getattr(request_timestamps, key)) + entrypoints_context = EntrypointsContext(manager, + instances, + request_output_queue, + server_info, + deployment_args.deployment_mode, + log_requests, + log_request_timestamps) + + return entrypoints_context + +def _setup_llumnix_global(manager_args, entrypoints_args, engine_args, deployment_args) -> None: + _ = init_manager(manager_args, entrypoints_args, engine_args, deployment_args) + +def setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args, + *args, **kwargs) -> Optional[EntrypointsContext]: + if deployment_args.deployment_mode == DeploymentMode.LOCAL: + return _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args, *args, **kwargs) + + return _setup_llumnix_global(manager_args, entrypoints_args, engine_args, deployment_args) diff --git a/llumnix/entrypoints/utils.py b/llumnix/entrypoints/utils.py new file mode 100644 index 00000000..c638b2cd --- /dev/null +++ b/llumnix/entrypoints/utils.py @@ -0,0 +1,94 @@ +import socket +from enum import Enum +from typing import Dict +import subprocess +import asyncio +import time +import ray + +from llumnix.logger import init_logger + +MAX_TASK_RETRIES = 300 +RETRIES_INTERVALS = 0.1 + +logger = init_logger(__name__) + + +class DeploymentMode(str, Enum): + LOCAL = "LOCAL" + GLOBAL = "GLOBAL" + +# Use "" type hint to avoid circular import. +class EntrypointsContext: + def __init__(self, + manager: "Manager", + instances: Dict[str, "Llumlet"], + request_output_queue: "QueueServerBase", + server_info: "ServerInfo", + deployment_mode: str, + log_requests: bool, + log_request_timestamps: bool): + self.manager = manager + self.instances = instances + self.request_output_queue = request_output_queue + self.server_info = server_info + self.deployment_mode = deployment_mode + self.log_requests = log_requests + self.log_request_timestamps = log_request_timestamps + +def get_ip_address(): + hostname = socket.gethostname() + ip_address = socket.gethostbyname(hostname) + return ip_address + +def is_gpu_available() -> bool: + try: + subprocess.check_output(['nvidia-smi']) + return True + except (subprocess.CalledProcessError, FileNotFoundError): + return False + +def retry_manager_method_sync(ray_call, method_name, *args, **kwargs): + for attempt in range(MAX_TASK_RETRIES): + try: + ret = ray.get(ray_call(*args, **kwargs)) + break + except ray.exceptions.RayActorError: + if attempt < MAX_TASK_RETRIES - 1: + logger.warning("manager is unavailable, sleep {}s, and retry {} again".format(RETRIES_INTERVALS, method_name)) + time.sleep(RETRIES_INTERVALS) + else: + logger.error("manager is still unavailable after {} times retries".format(MAX_TASK_RETRIES)) + raise + return ret + +async def retry_manager_method_async(ray_call, method_name, *args, **kwargs): + for attempt in range(MAX_TASK_RETRIES): + try: + ret = await ray_call(*args, **kwargs) + break + except ray.exceptions.RayActorError: + if attempt < MAX_TASK_RETRIES - 1: + logger.warning("manager is unavailable, sleep {}s, and retry {} again".format(RETRIES_INTERVALS, method_name)) + await asyncio.sleep(RETRIES_INTERVALS) + else: + logger.error("manager is still unavailable after {} times retries".format(MAX_TASK_RETRIES)) + raise + return ret + +def init_per_token_latency_breakdown_dict() -> Dict[str, int]: + per_token_latency_breakdown_dict = { + 'step_latency_engine': [], + 'process_model_outputs_latency': [], + 
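+        # Each value is a list: record_per_token_latency_breakdown() below appends one
+        # sample per output token.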
'step_postprocess_latency': [], + 'across_async_put_queue_thread_latency': [], + 'across_async_put_queue_actor_latency': [], + 'queue_rpc_latency': [], + 'background_process_get_queue_latency': [], + 'generate_benchmark_return_output_latency': [] + } + return per_token_latency_breakdown_dict + +def record_per_token_latency_breakdown(per_token_latency_breakdown_dict: Dict[str, int], request_timestamps: "RequestTimestamps"): + for key in per_token_latency_breakdown_dict.keys(): + per_token_latency_breakdown_dict[key].append(getattr(request_timestamps, key)) diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py index 894b4d06..20ceb239 100644 --- a/llumnix/entrypoints/vllm/api_server.py +++ b/llumnix/entrypoints/vllm/api_server.py @@ -22,19 +22,16 @@ from vllm.sampling_params import SamplingParams -from llumnix.arg_utils import LlumnixArgumentParser -from llumnix.entrypoints.setup import (setup_ray_cluster, - setup_llumnix, - is_gpu_available, - init_per_token_latency_breakdown_dict, - record_per_token_latency_breakdown) -from llumnix.entrypoints.vllm.arg_utils import (add_cli_args, - get_args) +from llumnix.arg_utils import LlumnixArgumentParser, DeploymentArgs +from llumnix.entrypoints.setup import setup_ray_cluster, setup_llumnix +from llumnix.entrypoints.utils import init_per_token_latency_breakdown_dict, record_per_token_latency_breakdown +from llumnix.entrypoints.vllm.arg_utils import add_cli_args, get_args from llumnix.entrypoints.vllm.client import LlumnixClientVLLM from llumnix.logger import init_logger from llumnix.utils import random_uuid -from llumnix.config import get_llumnix_config, LlumnixConfig +from llumnix.config import get_llumnix_config from llumnix.backends.backend_interface import BackendType +from llumnix.entrypoints.utils import DeploymentMode, is_gpu_available # Code file with __main__ should set the logger name to inherit the llumnix logger configuration. logger = init_logger("llumnix.entrypoints.vllm.api_server") @@ -47,6 +44,7 @@ # pylint: disable=unused-argument @asynccontextmanager async def lifespan(fastapi_app: FastAPI): + # TODO(s5u13b): Do not run request output queue in event loop of api server. asyncio.create_task(llumnix_client.request_output_queue.run_server_loop()) asyncio.create_task(llumnix_client.get_request_outputs_loop()) yield @@ -179,27 +177,28 @@ async def is_ready() -> bool: parser.add_argument("--port", type=int) parser.add_argument("--ssl-keyfile", type=str) parser.add_argument("--ssl-certfile", type=str) + parser.add_argument("--log-level", type=str) cli_args = add_cli_args(parser) - cfg: LlumnixConfig = get_llumnix_config(cli_args.config_file, cli_args) - _, engine_manager_args, engine_args = get_args(cfg, parser, cli_args) + cfg = get_llumnix_config(cli_args.config_file, cli_args) + entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args) + + deployment_args = DeploymentArgs(deployment_mode=DeploymentMode.LOCAL, backend_type=BackendType.VLLM) # Launch or connect to the ray cluster for multi-node serving. 
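    # (launch_ray_cluster() reads HEAD_NODE_IP from the environment; worker nodes keep
    # retrying `ray start` until the head node is reachable.)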
- setup_ray_cluster(cfg) + setup_ray_cluster(entrypoints_args) # if gpu is not available, it means that this node is head pod without any llumnix components if is_gpu_available(): - engine_config = engine_args.create_engine_config() - parallel_config = engine_config.parallel_config - llumnix_entrypoints_context = setup_llumnix(engine_manager_args, engine_args, cfg, BackendType.VLLM, parallel_config.world_size) - llumnix_client = LlumnixClientVLLM(llumnix_entrypoints_context) + entrypoints_context = setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args) + llumnix_client = LlumnixClientVLLM(entrypoints_context) # Start the api server after all the components of llumnix are ready. - logger.info("Start api server on '{}:{}'.".format(cfg.SERVER.HOST, cfg.SERVER.PORT)) + logger.info("Start api server on '{}:{}'.".format(entrypoints_args.host, entrypoints_args.port)) uvicorn.run(app, - host=cfg.SERVER.HOST, - port=cfg.SERVER.PORT, - log_level="debug", + host=entrypoints_args.host, + port=entrypoints_args.port, + log_level="info", timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=cfg.SERVER.SSL_KEYFILE, - ssl_certfile=cfg.SERVER.SSL_CERTFILE) + ssl_keyfile=entrypoints_args.ssl_keyfile, + ssl_certfile=entrypoints_args.ssl_certfile) diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py new file mode 100644 index 00000000..e69de29b diff --git a/llumnix/entrypoints/vllm/arg_utils.py b/llumnix/entrypoints/vllm/arg_utils.py index bb7daacd..6329b227 100644 --- a/llumnix/entrypoints/vllm/arg_utils.py +++ b/llumnix/entrypoints/vllm/arg_utils.py @@ -1,7 +1,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from llumnix.backends.vllm.utils import check_engine_args -from llumnix.arg_utils import LlumnixEntrypointsArgs, EngineManagerArgs +from llumnix.arg_utils import EntrypointsArgs, ManagerArgs from llumnix.logger import init_logger logger = init_logger(__name__) @@ -9,23 +9,23 @@ def add_cli_args(parser): parser.set_namespace("llumnix") - parser = LlumnixEntrypointsArgs.add_cli_args(parser) - parser = EngineManagerArgs.add_cli_args(parser) + parser = EntrypointsArgs.add_cli_args(parser) + parser = ManagerArgs.add_cli_args(parser) parser.set_namespace("vllm") parser = AsyncEngineArgs.add_cli_args(parser) cli_args = parser.parse_args() return cli_args def get_args(cfg, parser, cli_args): - llumnix_entrypoints_args = LlumnixEntrypointsArgs.from_llumnix_config(cfg) - LlumnixEntrypointsArgs.check_args(llumnix_entrypoints_args, parser) - engine_manager_args = EngineManagerArgs.from_llumnix_config(cfg) - EngineManagerArgs.check_args(engine_manager_args, parser) + entrypoints_args = EntrypointsArgs.from_llumnix_config(cfg) + EntrypointsArgs.check_args(entrypoints_args, parser) + manager_args = ManagerArgs.from_llumnix_config(cfg) + ManagerArgs.check_args(manager_args, parser) engine_args = AsyncEngineArgs.from_cli_args(cli_args) - check_engine_args(engine_args, engine_manager_args) + check_engine_args(engine_args, manager_args) - logger.info("llumnix_entrypoints_args: {}".format(llumnix_entrypoints_args)) - logger.info("engine_manager_args: {}".format(engine_manager_args)) + logger.info("entrypoints_args: {}".format(entrypoints_args)) + logger.info("manager_args: {}".format(manager_args)) logger.info("engine_args: {}".format(engine_args)) - return llumnix_entrypoints_args, engine_manager_args, engine_args + return entrypoints_args, manager_args, engine_args diff --git a/llumnix/entrypoints/vllm/client.py 
b/llumnix/entrypoints/vllm/client.py index cbe2ffb6..d13ae208 100644 --- a/llumnix/entrypoints/vllm/client.py +++ b/llumnix/entrypoints/vllm/client.py @@ -7,10 +7,11 @@ from vllm import SamplingParams from llumnix.logger import init_logger -from llumnix.entrypoints.setup import LlumnixEntrypointsContext +from llumnix.entrypoints.setup import EntrypointsContext from llumnix.server_info import RequestTimestamps from llumnix.queue.queue_server_base import QueueServerBase from llumnix.server_info import ServerInfo +from llumnix.entrypoints.utils import DeploymentMode logger = init_logger(__name__) @@ -19,20 +20,21 @@ class LlumnixClientVLLM: def __init__(self, - llumnix_entrypoints_context: LlumnixEntrypointsContext): - self.manager: LLMEngineManager = llumnix_entrypoints_context.manager - self.instances: Dict[str, Llumlet] = llumnix_entrypoints_context.instances - self.request_output_queue: QueueServerBase = llumnix_entrypoints_context.request_output_queue - self.server_info: ServerInfo = llumnix_entrypoints_context.server_info - self.log_requests: bool = llumnix_entrypoints_context.log_requests - self.log_request_timestamps: bool = llumnix_entrypoints_context.log_request_timestamps + entrypoints_context: EntrypointsContext): + self.manager: Manager = entrypoints_context.manager + self.instances: Dict[str, Llumlet] = entrypoints_context.instances + self.request_output_queue: QueueServerBase = entrypoints_context.request_output_queue + self.server_info: ServerInfo = entrypoints_context.server_info + self.log_requests = entrypoints_context.log_requests + self.log_request_timestamps = entrypoints_context.log_request_timestamps + self.deployment_mode: DeploymentMode = entrypoints_context.deployment_mode self.request_streams: Dict[str, AsyncStream] = {} self.instance_num_requests: Dict[str, int] = {} for ins_id in self.instances.keys(): self.instance_num_requests[ins_id] = 0 - self.num_finished_requests: int = 0 - self.manager_available: bool = True + self.num_finished_requests = 0 + self.manager_available = True async def generate(self, prompt: str, @@ -87,10 +89,10 @@ async def _generate_by_instance(self, instance_id = min(self.instance_num_requests, key=self.instance_num_requests.get) self.instance_num_requests[instance_id] += 1 await self.instances[instance_id].generate.remote(request_id, server_info, prompt, sampling_params, *args, **kwargs) - logger.warning("LLMEngineManager is unavailable temporarily, dispatch request {} to instance {}".format( + logger.warning("Manager is unavailable temporarily, dispatch request {} to instance {}".format( request_id, instance_id)) else: - logger.warning("LLMEngineManager is unavailable temporarily, but there is no instance behind this api server, " + logger.warning("Manager is unavailable temporarily, but there is no instance behind this api server, " "sleep {}s, waiting for manager available".format(WAIT_MANAGER_INTERVAL)) await asyncio.sleep(WAIT_MANAGER_INTERVAL) return await asyncio.create_task(self.generate(prompt, sampling_params, request_id, *args, **kwargs)) diff --git a/llumnix/entrypoints/vllm/serve.py b/llumnix/entrypoints/vllm/serve.py new file mode 100644 index 00000000..2e2d90d9 --- /dev/null +++ b/llumnix/entrypoints/vllm/serve.py @@ -0,0 +1,35 @@ + +from ray.util.queue import Queue as RayQueue + +from llumnix.entrypoints.vllm.arg_utils import add_cli_args, get_args +from llumnix.entrypoints.setup import connect_to_ray_cluster +from llumnix.config import get_llumnix_config +from llumnix.arg_utils import LlumnixArgumentParser +from 
llumnix.entrypoints.utils import DeploymentMode
+from llumnix.entrypoints.setup import setup_llumnix
+from llumnix.arg_utils import DeploymentArgs
+from llumnix.backends.backend_interface import BackendType
+
+
+if __name__ == "__main__":
+    parser: LlumnixArgumentParser = LlumnixArgumentParser()
+
+    parser.add_argument("--host", type=str)
+    parser.add_argument("--port", type=int)
+    parser.add_argument("--ssl-keyfile", type=str)
+    parser.add_argument("--ssl-certfile", type=str)
+    parser.add_argument("--log-level", type=str)
+
+    cli_args = add_cli_args(parser)
+    cfg = get_llumnix_config(cli_args.config_file, cli_args)
+    entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args)
+
+    deployment_args = DeploymentArgs(deployment_mode=DeploymentMode.GLOBAL, backend_type=BackendType.VLLM)
+
+    # Assume that there is an existing ray cluster when using centralized deployment.
+    connect_to_ray_cluster()
+
+    # magic actor to avoid fast api server actor initialization error
+    request_output_queue = RayQueue(actor_options={"namespace": "llumnix",
+                                                   "name": "magic_ray_queue"})
+
+    engine_config = engine_args.create_engine_config()
+    parallel_config = engine_config.parallel_config
+    entrypoints_context = setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args)
diff --git a/llumnix/global_scheduler/migration_policy.py b/llumnix/global_scheduler/migration_policy.py
index c917cce7..eafe5cf3 100644
--- a/llumnix/global_scheduler/migration_policy.py
+++ b/llumnix/global_scheduler/migration_policy.py
@@ -22,10 +22,10 @@
 logger = init_logger(__name__)
 
+
 class PairMigrationConstraints(str, Enum):
     """Target of Migration."""
     NO_CONSTRAINTS = "NO_CONSTRAINTS"
-    # Enable the prefill-decoding disaggregration.
     DECODING_2_DECODING = "DECODING_2_DECODING"
     PREFILL_2_DECODING = "PREFILL_2_DECODING"
diff --git a/llumnix/global_scheduler/scaling_scheduler.py b/llumnix/global_scheduler/scaling_scheduler.py
index 7607d88a..3c862f8a 100644
--- a/llumnix/global_scheduler/scaling_scheduler.py
+++ b/llumnix/global_scheduler/scaling_scheduler.py
@@ -21,9 +21,9 @@
 logger = init_logger(__name__)
 
+
 class InstanceType(str, Enum):
     NO_CONSTRAINTS = "NO_CONSTRAINTS"
-    # Specific to Prefill-Decoding disaggregation.
PREFILL = "PREFILL" DECODE = "DECODE" diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 3f96559a..4845d956 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -23,7 +23,7 @@ from llumnix.logger import init_logger from llumnix.instance_info import InstanceInfo from llumnix.backends.backend_interface import BackendInterface, BackendType, EngineState -from llumnix.backends.utils import init_backend_engine +from llumnix.backends.utils import init_backend_engine, get_engine_world_size from llumnix.llumlet.migration_coordinator import MigrationCoordinator, MigrationStatus from llumnix.llumlet.local_migration_scheduler import LocalMigrationScheduler from llumnix.server_info import ServerInfo @@ -75,7 +75,6 @@ def from_args(cls, request_output_queue_type: QueueType, instance_id: str, backend_type: BackendType, - world_size: int, migration_config: MigrationConfig, placement_group: PlacementGroup, *args, @@ -85,6 +84,8 @@ def from_args(cls, f'unimplemented backend {backend_type}' num_gpus = 0 if backend_type == backend_type.BLADELLM: + engine_args = kwargs["engine_args"] + world_size = get_engine_world_size(engine_args, backend_type) num_gpus = world_size instance_name = get_instance_name(instance_id) if backend_type in [backend_type.VLLM, backend_type.BLADELLM]: diff --git a/llumnix/llumlet/request.py b/llumnix/llumlet/request.py index d6c7dac5..15bb3c85 100644 --- a/llumnix/llumlet/request.py +++ b/llumnix/llumlet/request.py @@ -16,6 +16,7 @@ from llumnix.server_info import ServerInfo + class RequestInferenceType(str, Enum): PREFILL = "prefill" DECODE = "decode" diff --git a/llumnix/llm_engine_manager.py b/llumnix/manager.py similarity index 85% rename from llumnix/llm_engine_manager.py rename to llumnix/manager.py index b01c86ea..9b101eae 100644 --- a/llumnix/llm_engine_manager.py +++ b/llumnix/manager.py @@ -27,70 +27,83 @@ from llumnix.global_scheduler.migration_scheduler import PairMigrationConstraints from llumnix.global_scheduler.migration_filter import CustomFilter from llumnix.instance_info import InstanceInfo -from llumnix.internal_config import GlobalSchedulerConfig -from llumnix.arg_utils import EngineManagerArgs -from llumnix.backends.profiling import ProfilingDatabase +from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs from llumnix.server_info import ServerInfo from llumnix.backends.backend_interface import BackendType -from llumnix.utils import (random_uuid, - clear_gloo_backend_state, - remove_placement_group, - get_instance_name, - INSTANCE_NAME_PREFIX, - MANAGER_NAME) +from llumnix.utils import (random_uuid, clear_gloo_backend_state, remove_placement_group, + get_instance_name, INSTANCE_NAME_PREFIX, MANAGER_NAME) +from llumnix.entrypoints.utils import DeploymentMode +from llumnix.utils import initialize_placement_group +from llumnix.backends.utils import get_engine_world_size from llumnix.queue.queue_type import QueueType -from llumnix.backends.utils import initialize_placement_group logger = init_logger(__name__) CLEAR_REQUEST_INSTANCE_INTERVAL = 3600 NO_INSTANCE_RETRY_INTERVAL = 1.0 WAIT_ALL_MIGRATIONS_DONE_INTERVAL = 1.0 +WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS = 1.0 +AUTO_DEPLOYMENT_INTERVAL = 1.0 # TODO(s5u13b): Fix the logger when manager failover. # TODO(s5u13b): Handle exception of ray operations. +# TODO(s5u13b): Update the documents of global deployment. +# TODO(s5u13b): Change add_done_callback method. 
-class LLMEngineManager: +class Manager: def __init__(self, - engine_manager_args: EngineManagerArgs, - global_scheduler_config: GlobalSchedulerConfig, + manager_args: ManagerArgs, work_dir: str, - log_requests: bool = True, - profiling_database: ProfilingDatabase = None) -> None: + entrypoints_args: EntrypointsArgs = None, + engine_args = None, + deployment_args: DeploymentArgs = None + ) -> None: os.chdir(work_dir) self.actor_name = MANAGER_NAME - self.engine_manager_args = engine_manager_args - self.profiling_database = profiling_database + self.manager_args = manager_args + # engine_args and entrypoints_args are used in global deployment. + self.entrypoints_args = entrypoints_args + self.engine_args = engine_args + self.deployment_args = deployment_args + + assert deployment_args is None or (deployment_args is not None and entrypoints_args is not None and engine_args is not None) + + # deployment args + if deployment_args is not None: + self.deployment_mode: DeploymentMode = deployment_args.deployment_mode + self.backend_type: BackendType = deployment_args.backend_type + self.max_instances = manager_args.max_instances + self.min_instances = manager_args.min_instances + + # scheduling args + self.enable_migration = manager_args.enable_migration + self.enable_scaling = manager_args.enable_scaling + self.enable_pd_disagg = manager_args.enable_pd_disagg + self.polling_interval = manager_args.polling_interval + self.pair_migration_frequency = manager_args.pair_migration_frequency + self.scaling_interval = manager_args.scaling_interval + + global_scheduler_config = manager_args.create_global_scheduler_config() + self.global_scheduler = GlobalScheduler(global_scheduler_config) - self.log_requests = log_requests + # log args + self.log_requests = not manager_args.disable_log_requests_manager + self.log_instance_info = manager_args.log_instance_info + if self.log_instance_info: + self._init_instance_info_csv(manager_args) + self.instance_last_logged_empty = {} + # instance states self.num_instances = 0 - self.enable_migration = engine_manager_args.enable_migration - self.enable_scaling = engine_manager_args.enable_scaling - self.max_instances = engine_manager_args.max_instances - self.min_instances = engine_manager_args.min_instances - - self.enable_pd_disagg = global_scheduler_config.enable_pd_disagg - self.instances: Dict[str, Llumlet] = {} self.instance_migrating: Dict[str, bool] = {} self.pending_rebuild_migration_instances = 0 - self.global_scheduler = GlobalScheduler(global_scheduler_config) - - self.polling_interval = engine_manager_args.polling_interval - asyncio.create_task(self._update_instance_info_loop(self.polling_interval)) - - # args - self.pair_migration_frequency = engine_manager_args.pair_migration_frequency - self.scaling_interval = engine_manager_args.scaling_interval # request states self.request_instance: Dict[str, str] = {} - self.clear_request_intance_interval = CLEAR_REQUEST_INSTANCE_INTERVAL - asyncio.create_task(self._clear_request_instance_loop(self.clear_request_intance_interval)) - # migrate states + # migration states self.num_instance_info_updates = 0 self.migrating = False @@ -99,15 +112,14 @@ def __init__(self, self.scale_down_time = -1 self.scaling_up = False self.scaling_down = False - self.last_check_scale_time = time.time() + 100 - - self.log_instance_info = engine_manager_args.log_instance_info - if self.log_instance_info: - self._init_instance_info_csv(engine_manager_args) - self.instance_last_logged_empty = {} + self.last_check_scale_time = time.time() + 
# tasks # When manager starts, it automatically connects to all existing instances. + # TODO(s5u13b): Check if this is a sync call. asyncio.run_coroutine_threadsafe(self._connect_to_instances(), asyncio.get_event_loop()) + asyncio.create_task(self._update_instance_info_loop(self.polling_interval)) + asyncio.create_task(self._clear_request_instance_loop(CLEAR_REQUEST_INSTANCE_INTERVAL)) async def generate(self, request_id: str, server_info: ServerInfo, *args, **kwargs,) -> None: while self.num_instances == 0: @@ -278,7 +290,7 @@ async def run_task(alive_instances: List[str], task_name: str, *args, **kwargs): dead_instances.add(instance_name) if len(dead_instances) > 0: self.scale_down(dead_instances, rebuild_migrate_backend=False) - if self.engine_manager_args.migration_backend == 'gloo': + if self.manager_args.migration_backend == 'gloo': clear_gloo_backend_state() return dead_instances @@ -286,7 +298,7 @@ async def run_task(alive_instances: List[str], task_name: str, *args, **kwargs): pending_task = self.pending_rebuild_migration_instances group_name = None - if self.engine_manager_args.migration_backend == 'gloo': + if self.manager_args.migration_backend == 'gloo': clear_gloo_backend_state() while len(alive_instances) > 0 and self.pending_rebuild_migration_instances > 0: @@ -313,7 +325,7 @@ async def run_task(alive_instances: List[str], task_name: str, *args, **kwargs): dst_filter=lambda instance_info: instance_info.instance_id in alive_instances) logger.info("[rebuild_migrate_backend] rebuild {} migration backend done, group_name: {}, alive instance ({}): {}" - .format(self.engine_manager_args.migration_backend, group_name, len(alive_instances), alive_instances)) + .format(self.manager_args.migration_backend, group_name, len(alive_instances), alive_instances)) # Restore migrate config self.enable_migration = origin_config @@ -341,7 +353,7 @@ def scale_up(self, instance_id: Union[str, Iterable[str]], llumlet_actor_handles # a coroutine is already handling the changes in the number of instances in the cluster and it will account for the changes # caused by this scale-up (see rebuild_migrate_backend for details). Therefore, we simply return in this case. Specifically, # for RPC, the Ray actor handle is used for the migration cache, so there is no need to rebuild the group. 
- if self.enable_migration and self.engine_manager_args.migration_backend in ['gloo', 'nccl'] \ + if self.enable_migration and self.manager_args.migration_backend in ['gloo', 'nccl'] \ and indeed_update and no_pending_instance: asyncio.create_task(self._rebuild_migrate_backend()) @@ -377,10 +389,10 @@ def scale_down(self, instance_id: Union[str, Iterable[str]], rebuild_migrate_bac self.global_scheduler.scale_down(instance_ids) self.num_instances = len(self.instances) - if self.enable_migration and self.engine_manager_args.migration_backend in ['gloo', 'nccl']: + if self.enable_migration and self.manager_args.migration_backend in ['gloo', 'nccl']: if len(self.instances) == 0: self.pending_rebuild_migration_instances = 0 - if self.engine_manager_args.migration_backend == 'gloo': + if self.manager_args.migration_backend == 'gloo': clear_gloo_backend_state() elif indeed_update and no_pending_instance and rebuild_migrate_backend: asyncio.create_task(self._rebuild_migrate_backend()) @@ -417,20 +429,21 @@ def connect_to_instances_done_callback(instance_id: str, instance_actor_handle: @classmethod def from_args(cls, - engine_manager_args: EngineManagerArgs, - profiling_database: ProfilingDatabase=None) -> "LLMEngineManager": - global_scheduler_config = engine_manager_args.create_global_scheduler_configs() + manager_args: ManagerArgs, + entrypoints_args: EntrypointsArgs = None, + engine_args = None, + deployment_args: DeploymentArgs = None, + ) -> "Manager": manager_class = ray.remote(num_cpus=0, max_restarts=-1, name=MANAGER_NAME, namespace='llumnix', - lifetime="detached" - )(cls) - manager = manager_class.remote(engine_manager_args, - global_scheduler_config, + lifetime="detached")(cls) + manager = manager_class.remote(manager_args, os.getcwd(), - log_requests=not engine_manager_args.disable_log_requests_manager, - profiling_database=profiling_database) + entrypoints_args, + engine_args, + deployment_args) return manager @@ -438,19 +451,22 @@ def init_llumlets(self, engine_args, request_output_queue_type: QueueType, backend_type: BackendType, - world_size: int, *args, **kwargs) -> Tuple[List[str], List[Llumlet]]: - engine_manager_args = self.engine_manager_args + manager_args = self.manager_args + world_size = get_engine_world_size(engine_args, backend_type) + instance_ids: List[str] = [] llumlets: List[Llumlet] = [] + # for pd disaggregation if 'instance_ids' in kwargs and kwargs['instance_ids'][0]: instance_ids = kwargs['instance_ids'] - for _ in range(engine_manager_args.initial_instances): + for _ in range(manager_args.initial_instances): instance_id = random_uuid() - if not engine_manager_args.profiling_result_file_path: + if not manager_args.profiling_result_file_path: # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor, num_gpus=world_size, for Workers placement_group = initialize_placement_group(instance_id, num_cpus=3, num_gpus=world_size, detached=True) + # TODO(s5u13b): Refine the order of arguments. llumlet = Llumlet.from_args( request_output_queue_type, instance_id, @@ -465,21 +481,19 @@ def init_llumlets(self, **kwargs ) else: - assert backend_type == backend_type.VLLM, f'unimplemented backend SIM_{backend_type}' + assert backend_type == backend_type.VLLM, 'Only support the simulator backend for vLLM.' 
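+                # Simulator instances run without GPUs, so the placement group below
+                # reserves CPU bundles only (num_gpus=0).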
                # num_cpus=1, for Llumlet + AsyncPutQueueActor
                placement_group = initialize_placement_group(instance_id, num_cpus=2, num_gpus=0, detached=True)
                llumlet = Llumlet.from_args(
-                    request_output_queue_type,
-                    instance_id,
-                    BackendType.SIM_VLLM,
-                    world_size,
-                    engine_manager_args.create_migration_config(),
-                    placement_group,
-                    engine_args,
-                    engine_manager_args.profiling_result_file_path,
-                    *args,
-                    **kwargs
-                )
+                    request_output_queue_type,
+                    instance_id,
+                    BackendType.SIM_VLLM,
+                    manager_args.create_migration_config(),
+                    placement_group,
+                    engine_args,
+                    manager_args.profiling_result_file_path,
+                    *args,
+                    **kwargs)
             instance_ids.append(instance_id)
             llumlets.append(llumlet)
@@ -542,9 +556,9 @@ async def _clear_request_instance_loop(self, interval: float):
             await asyncio.sleep(interval)
             self.request_instance = {}
 
-    def _init_instance_info_csv(self, engine_manager_args: EngineManagerArgs) -> None:
+    def _init_instance_info_csv(self, manager_args: ManagerArgs) -> None:
         # pylint: disable=consider-using-with
-        self.instance_info_file = open(engine_manager_args.log_filename + '_instance.csv', 'w', encoding='utf-8')
+        self.instance_info_file = open(manager_args.log_filename + '_instance.csv', 'w', encoding='utf-8')
         self.instance_info_csv = csv.writer(self.instance_info_file)
         self.instance_info_csv.writerow([
             'timestamp',
diff --git a/llumnix/utils.py b/llumnix/utils.py
index 932c49d7..fa15124e 100644
--- a/llumnix/utils.py
+++ b/llumnix/utils.py
@@ -13,6 +13,7 @@
 import uuid
 
 import ray
+from ray.util.placement_group import PlacementGroup
 
 MANAGER_NAME = "manager"
 PLACEMENT_GROUP_NAME_PREFIX = "pg_"
@@ -20,6 +21,66 @@
 INSTANCE_NAME_PREFIX = "instance_"
 
 
+def initialize_placement_group(
+    instance_id: str,
+    num_cpus: int = 1,
+    num_gpus: int = 1,
+    detached: bool = False
+) -> PlacementGroup:
+    """Initialize the placement group for the instance, probably with Ray.
+
+    Args:
+        instance_id: The instance id of the instance scheduled to the placement group.
+        num_cpus: The number of cpus in placement group.
+        num_gpus: The number of gpus in placement group.
+        detached: Whether the lifetime of the placement group is detached.
+
+    Returns:
+        `placement_group`. `placement_group` includes the specification
+        of the resources for each distributed worker.
+    """
+    if ray is None:
+        raise ImportError(
+            "Ray is not installed. Please install Ray to use distributed "
+            "serving.")
+
+    lifetime = "detached" if detached else None
+    # Create placement group for worker processes
+    current_placement_group = ray.util.get_current_placement_group()
+    if current_placement_group:
+        # We are in a placement group
+        bundles = current_placement_group.bundle_specs
+        # Verify that we can use the placement group.
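+        # Each worker consumes one GPU bundle; a bundle may hold at most a single GPU,
+        # and CPU-only bundles are not counted towards gpu_bundles.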
+ gpu_bundles = 0 + for bundle in bundles: + bundle_gpus = bundle.get("GPU", 0) + if bundle_gpus > 1: + raise ValueError( + "Placement group bundle cannot have more than 1 GPU.") + if bundle_gpus: + gpu_bundles += 1 + if num_gpus > gpu_bundles: + raise ValueError( + "The number of required GPUs exceeds the total number of " + "available GPUs in the placement group.") + else: + num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) + if num_gpus > num_gpus_in_cluster: + raise ValueError( + "The number of required GPUs exceeds the total number of " + "available GPUs in the cluster.") + # Create a new placement group + # bundle_0: Llumlet + AsyncPutQueueActor + ProxyActor, bundle_1: Workers + placement_group_specs = ([{"CPU": num_cpus}] + [{"GPU": 1}] * num_gpus) + current_placement_group = ray.util.placement_group( + placement_group_specs, "STRICT_PACK", name=get_placement_group_name(instance_id), lifetime=lifetime) + # Wait until PG is ready - this will block until all + # requested resources are available, and will timeout + # if they cannot be provisioned. + ray.get(current_placement_group.ready(), timeout=1800) + + return current_placement_group + def random_uuid() -> str: return str(uuid.uuid4().hex) diff --git a/tests/unit_test/backends/vllm/test_llm_engine.py b/tests/unit_test/backends/vllm/test_llm_engine.py index 9b01c8af..b4fe46b6 100644 --- a/tests/unit_test/backends/vllm/test_llm_engine.py +++ b/tests/unit_test/backends/vllm/test_llm_engine.py @@ -28,7 +28,7 @@ from llumnix.backends.vllm.sequence import LlumnixRequest from llumnix.queue.queue_type import QueueType from llumnix.server_info import ServerInfo -from llumnix.backends.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group from tests.conftest import ray_env from .utils import create_dummy_prompt, initialize_scheduler diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index a36f56cc..5d6a01c0 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -28,7 +28,7 @@ from llumnix.internal_config import MigrationConfig from llumnix.llumlet.request import RequestInferenceType, RequestStatus from llumnix.queue.queue_type import QueueType -from llumnix.backends.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group from tests.unit_test.queue.utils import request_output_queue_server # pylint: disable=unused-import @@ -51,7 +51,6 @@ def init_llumlet(request_output_queue_type, instance_id, migration_config, engin request_output_queue_type, instance_id, BackendType.VLLM, - 1, migration_config, placement_group, engine_args,) diff --git a/tests/unit_test/backends/vllm/test_migration_backend.py b/tests/unit_test/backends/vllm/test_migration_backend.py index 8c2cb4fa..843c2a23 100644 --- a/tests/unit_test/backends/vllm/test_migration_backend.py +++ b/tests/unit_test/backends/vllm/test_migration_backend.py @@ -19,9 +19,8 @@ from vllm.engine.arg_utils import EngineArgs from llumnix.backends.vllm.worker import MigrationWorker -from llumnix.arg_utils import EngineManagerArgs -from llumnix.utils import random_uuid -from llumnix.backends.utils import initialize_placement_group +from llumnix.arg_utils import ManagerArgs +from llumnix.utils import random_uuid, initialize_placement_group # pylint: disable=unused-import from tests.conftest import ray_env @@ -41,7 +40,7 @@ def get_gpu_cache(self): @pytest.mark.parametrize("backend", ['rayrpc', 
'gloo', 'nccl']) def test_migrate_cache(ray_env, backend): engine_config = EngineArgs(model='facebook/opt-125m', max_model_len=8, enforce_eager=True).create_engine_config() - migraiton_config = EngineManagerArgs(migration_buffer_blocks=3, migration_num_layers=5).create_migration_config() + migraiton_config = ManagerArgs(migration_buffer_blocks=3, migration_num_layers=5).create_migration_config() migraiton_config.migration_backend = backend worker0 = create_worker(rank=0, local_rank=0, engine_config=engine_config, diff --git a/tests/unit_test/backends/vllm/test_simulator.py b/tests/unit_test/backends/vllm/test_simulator.py index 417be632..1167ffdf 100644 --- a/tests/unit_test/backends/vllm/test_simulator.py +++ b/tests/unit_test/backends/vllm/test_simulator.py @@ -12,7 +12,7 @@ from llumnix.backends.profiling import LatencyMemData from llumnix.internal_config import MigrationConfig from llumnix.queue.queue_type import QueueType -from llumnix.backends.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group # pylint: disable=unused-import from tests.conftest import ray_env diff --git a/tests/unit_test/backends/vllm/test_worker.py b/tests/unit_test/backends/vllm/test_worker.py index fae20162..ef5f15f0 100644 --- a/tests/unit_test/backends/vllm/test_worker.py +++ b/tests/unit_test/backends/vllm/test_worker.py @@ -21,9 +21,9 @@ from vllm.config import EngineConfig from vllm.executor.ray_gpu_executor import RayWorkerWrapper -from llumnix.arg_utils import EngineManagerArgs +from llumnix.arg_utils import ManagerArgs from llumnix.utils import random_uuid -from llumnix.backends.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group # pylint: disable=unused-import from tests.conftest import ray_env @@ -60,7 +60,7 @@ def create_worker(rank: int, local_rank: int, engine_config: EngineConfig, @pytest.mark.parametrize("backend", ['rayrpc', 'gloo', 'nccl']) def test_reserve_memory_for_migration(ray_env, backend): engine_config = EngineArgs(model='facebook/opt-125m', max_model_len=8, enforce_eager=True).create_engine_config() - migration_config = EngineManagerArgs(migration_buffer_blocks=1).create_migration_config() + migration_config = ManagerArgs(migration_buffer_blocks=1).create_migration_config() migration_config.migration_backend = backend worker = create_worker(rank=0, local_rank=0, engine_config=engine_config) ray.get(worker.execute_method.remote('init_device')) @@ -81,7 +81,7 @@ def test_reserve_memory_for_migration(ray_env, backend): @pytest.mark.parametrize("backend", ['rayrpc', 'gloo', 'nccl']) def test_rebuild_migration_backend(ray_env, backend): engine_config = EngineArgs(model='facebook/opt-125m', max_model_len=8, enforce_eager=True).create_engine_config() - migration_config = EngineManagerArgs(migration_buffer_blocks=1).create_migration_config() + migration_config = ManagerArgs(migration_buffer_blocks=1).create_migration_config() migration_config.migration_backend = backend worker0 = create_worker(rank=0, local_rank=0, engine_config=engine_config) diff --git a/tests/unit_test/entrypoints/test_utils.py b/tests/unit_test/entrypoints/test_utils.py index 1cfe7b04..ab694266 100644 --- a/tests/unit_test/entrypoints/test_utils.py +++ b/tests/unit_test/entrypoints/test_utils.py @@ -15,12 +15,9 @@ import pytest import ray -from llumnix.arg_utils import EngineManagerArgs -from llumnix.entrypoints.setup import (get_ip_address, - launch_ray_cluster, - init_manager, - retry_manager_method_sync, - retry_manager_method_async) +from 
llumnix.arg_utils import ManagerArgs +from llumnix.entrypoints.setup import launch_ray_cluster, init_manager +from llumnix.entrypoints.utils import get_ip_address, retry_manager_method_sync, retry_manager_method_async from llumnix.queue.utils import init_request_output_queue_server from llumnix.utils import MANAGER_NAME @@ -36,8 +33,8 @@ def test_launch_ray_cluster(): assert result.returncode == 0 def test_init_manager(ray_env): - engine_manager_args = EngineManagerArgs() - manager = init_manager(engine_manager_args) + manager_args = ManagerArgs() + manager = init_manager(manager_args) assert manager is not None manager_actor_handle = ray.get_actor(MANAGER_NAME, namespace='llumnix') assert manager_actor_handle is not None @@ -50,14 +47,14 @@ def test_init_zmq(ray_env): assert request_output_queue is not None def test_retry_manager_method_sync(ray_env): - engine_manager_args = EngineManagerArgs() - manager = init_manager(engine_manager_args) + manager_args = ManagerArgs() + manager = init_manager(manager_args) ret = retry_manager_method_sync(manager.is_ready.remote, 'is_ready') assert ret is True @pytest.mark.asyncio async def test_retry_manager_method_async(ray_env): - engine_manager_args = EngineManagerArgs() - manager = init_manager(engine_manager_args) + manager_args = ManagerArgs() + manager = init_manager(manager_args) ret = await retry_manager_method_async(manager.is_ready.remote, 'is_ready') assert ret is True diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager.py b/tests/unit_test/entrypoints/vllm/api_server_manager.py index da149d25..4a184049 100644 --- a/tests/unit_test/entrypoints/vllm/api_server_manager.py +++ b/tests/unit_test/entrypoints/vllm/api_server_manager.py @@ -19,12 +19,12 @@ from vllm.outputs import CompletionOutput, RequestOutput import llumnix.entrypoints.vllm.api_server -import llumnix.llm_engine_manager -from llumnix.arg_utils import EngineManagerArgs +import llumnix.manager +from llumnix.arg_utils import ManagerArgs from llumnix.server_info import ServerInfo, RequestTimestamps from llumnix.utils import random_uuid, MANAGER_NAME from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType -from llumnix.entrypoints.setup import LlumnixEntrypointsContext +from llumnix.entrypoints.setup import EntrypointsContext from llumnix.entrypoints.vllm.client import LlumnixClientVLLM app = llumnix.entrypoints.vllm.api_server.app @@ -32,7 +32,7 @@ @ray.remote(num_cpus=0) -class MockLLMEngineManager: +class MockManager: def __init__(self, request_output_queue_type: QueueType): self._num_generates = 0 self._num_aborts = 0 @@ -53,7 +53,7 @@ def testing_stats(self): def init_manager(request_output_queue_type: QueueType): - manager = MockLLMEngineManager.options(name=MANAGER_NAME, + manager = MockManager.options(name=MANAGER_NAME, namespace='llumnix').remote(request_output_queue_type) return manager @@ -68,7 +68,7 @@ def stats() -> Response: parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--request-output-queue-type", type=str, choices=["zmq", "rayqueue"]) - parser = EngineManagerArgs.add_cli_args(parser) + parser = ManagerArgs.add_cli_args(parser) args = parser.parse_args() request_output_queue_type = QueueType(args.request_output_queue_type) @@ -80,12 +80,13 @@ def stats() -> Response: if request_output_queue_type == QueueType.RAYQUEUE: ray_queue_server = request_output_queue server_info = ServerInfo(random_uuid(), 
request_output_queue_type, ray_queue_server, ip, port) - llumnix_context = LlumnixEntrypointsContext(manager, - {'0': None}, - request_output_queue, - server_info, - None, - None) + llumnix_context = EntrypointsContext(manager, + {'0': None}, + request_output_queue, + server_info, + None, + None, + None) llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(llumnix_context) uvicorn.run( diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py index 44eb459d..a4b1e4c3 100644 --- a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py +++ b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py @@ -21,22 +21,22 @@ from vllm.outputs import CompletionOutput, RequestOutput import llumnix.entrypoints.vllm.api_server -import llumnix.llm_engine_manager -from llumnix.arg_utils import EngineManagerArgs +import llumnix.manager +from llumnix.arg_utils import ManagerArgs from llumnix.server_info import ServerInfo, RequestTimestamps from llumnix.utils import random_uuid from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType -from llumnix.entrypoints.setup import LlumnixEntrypointsContext +from llumnix.entrypoints.setup import EntrypointsContext from llumnix.entrypoints.vllm.client import LlumnixClientVLLM from llumnix.utils import MANAGER_NAME app = llumnix.entrypoints.vllm.api_server.app -engine_manager = None +manager = None ENTRYPOINTS_ACTOR_NAME = "entrypoints" @ray.remote(num_cpus=0, lifetime="detached") -class MockLLMEngineManagerService: +class MockManagerService: def __init__(self, request_output_queue_type: QueueType, args: 'Namespace'): self._num_generates = 0 self._num_aborts = 0 @@ -68,18 +68,19 @@ def __init__(self, host: str, port: int, request_output_queue_type: QueueType): self.port = port ip = '127.0.0.1' port = 1234 - global engine_manager - engine_manager = ray.get_actor(MANAGER_NAME, namespace="llumnix") + global manager + manager = ray.get_actor(MANAGER_NAME, namespace="llumnix") request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type) ray_queue_server = None if request_output_queue_type == QueueType.RAYQUEUE: ray_queue_server = request_output_queue server_info = ServerInfo(random_uuid(), request_output_queue_type, ray_queue_server, ip, port) - llumnix_context = LlumnixEntrypointsContext(engine_manager, + llumnix_context = EntrypointsContext(manager, {'0': None}, request_output_queue, server_info, None, + None, None) llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(llumnix_context) @@ -92,14 +93,14 @@ def run(self): timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE) def init_manager_service(request_output_queue_type: QueueType, args: 'Namespace'): - engine_manager = MockLLMEngineManagerService.options(name=MANAGER_NAME, + manager = MockManagerService.options(name=MANAGER_NAME, namespace='llumnix').remote(request_output_queue_type, args) - return engine_manager + return manager @app.get("/stats") def stats() -> Response: """Get the statistics of the engine.""" - return JSONResponse(ray.get(engine_manager.testing_stats.remote())) + return JSONResponse(ray.get(manager.testing_stats.remote())) if __name__ == "__main__": @@ -107,7 +108,7 @@ def stats() -> Response: parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--request-output-queue-type", type=str, 
choices=["zmq", "rayqueue"]) - parser = EngineManagerArgs.add_cli_args(parser) + parser = ManagerArgs.add_cli_args(parser) args = parser.parse_args() # magic actor, without this actor, FastAPIServer cannot initialize correctly. @@ -115,6 +116,6 @@ def stats() -> Response: request_output_queue = RayQueue() request_output_queue_type = QueueType(args.request_output_queue_type) - engine_manager = init_manager_service(request_output_queue_type, args) + manager = init_manager_service(request_output_queue_type, args) time.sleep(2) diff --git a/tests/unit_test/global_scheduler/test_global_scheduler.py b/tests/unit_test/global_scheduler/test_global_scheduler.py index 7079c96f..9a30b6c9 100644 --- a/tests/unit_test/global_scheduler/test_global_scheduler.py +++ b/tests/unit_test/global_scheduler/test_global_scheduler.py @@ -19,7 +19,7 @@ from llumnix.instance_info import InstanceInfo from llumnix.utils import random_uuid -from .test_llm_engine_manager import get_instance_info_migrate_in, get_instance_info_migrate_out +from .test_manager import get_instance_info_migrate_in, get_instance_info_migrate_out def init_global_scheduler(): diff --git a/tests/unit_test/global_scheduler/test_llm_engine_manager.py b/tests/unit_test/global_scheduler/test_manager.py similarity index 94% rename from tests/unit_test/global_scheduler/test_llm_engine_manager.py rename to tests/unit_test/global_scheduler/test_manager.py index a1714f1e..5ddbe6d8 100644 --- a/tests/unit_test/global_scheduler/test_llm_engine_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -20,8 +20,8 @@ from vllm import EngineArgs from llumnix.utils import random_uuid, get_instance_name, MANAGER_NAME -from llumnix.arg_utils import EngineManagerArgs -from llumnix.llm_engine_manager import LLMEngineManager +from llumnix.arg_utils import ManagerArgs +from llumnix.manager import Manager from llumnix.instance_info import InstanceInfo from llumnix.server_info import ServerInfo from llumnix.queue.queue_type import QueueType @@ -105,9 +105,9 @@ def _get_lantecy_mem(self, *args, **kwargs): def init_manager(): try: - engine_manager_args = EngineManagerArgs(migration_backend="rayrpc", enable_migration=True) - engine_manager_args.log_instance_info = False - manager = LLMEngineManager.from_args(engine_manager_args, None) + manager_args = ManagerArgs(migration_backend="rayrpc", enable_migration=True) + manager_args.log_instance_info = False + manager = Manager.from_args(manager_args) except ValueError: manager = ray.get_actor(MANAGER_NAME, namespace='llumnix') ray.get(manager.is_ready.remote()) @@ -153,10 +153,10 @@ def test_init_llumlet(ray_env, llumlet): def test_init_llumlets(ray_env, manager): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM, 1)) + instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM)) num_instances = ray.get(manager.scale_up.remote(instance_ids, llumlets)) - engine_manager_args = EngineManagerArgs() - assert num_instances == engine_manager_args.initial_instances + manager_args = ManagerArgs() + assert num_instances == manager_args.initial_instances def test_init_llumlets_sim(ray_env, manager): manager.profiling_result_file_path="//" @@ -164,10 +164,10 @@ def test_init_llumlets_sim(ray_env, manager): import llumnix.backends.vllm.simulator llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = 
EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM, 1)) + instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM)) num_instances = ray.get(manager.scale_up.remote(instance_ids, llumlets)) - engine_manager_args = EngineManagerArgs() - assert num_instances == engine_manager_args.initial_instances + manager_args = ManagerArgs() + assert num_instances == manager_args.initial_instances def test_scale_up_and_down(ray_env, manager): initial_instances = 4 diff --git a/tests/unit_test/llumlet/test_engine_step_exception.py b/tests/unit_test/llumlet/test_engine_step_exception.py index b5ea1749..ccb45f1e 100644 --- a/tests/unit_test/llumlet/test_engine_step_exception.py +++ b/tests/unit_test/llumlet/test_engine_step_exception.py @@ -25,7 +25,7 @@ from llumnix.llumlet.llumlet import Llumlet from llumnix.internal_config import MigrationConfig from llumnix.queue.queue_type import QueueType -from llumnix.backends.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group # pylint: disable=unused-import from tests.conftest import ray_env From d89dfb87641923761a5d37e3d379950cae51c82b Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 2 Jan 2025 09:43:15 +0000 Subject: [PATCH 41/92] Refine actor construction args --- examlpes/offline_inference.py | 2 +- llumnix/backends/backend_interface.py | 8 +- llumnix/backends/bladellm/llm_engine.py | 42 +++++----- llumnix/backends/profiling.py | 5 +- llumnix/backends/utils.py | 43 ++++++++--- llumnix/backends/vllm/llm_engine.py | 25 +++--- llumnix/backends/vllm/simulator.py | 16 ++-- llumnix/config/default.py | 2 +- llumnix/entrypoints/bladellm/api_server.py | 8 +- llumnix/entrypoints/setup.py | 33 ++++---- llumnix/llumlet/llumlet.py | 76 ++++++++----------- llumnix/manager.py | 30 ++++---- .../backends/vllm/test_llm_engine.py | 17 +++-- .../unit_test/backends/vllm/test_migration.py | 12 +-- .../global_scheduler/test_manager.py | 6 +- 15 files changed, 162 insertions(+), 163 deletions(-) diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py index 2d2e67e3..46e2c8eb 100644 --- a/examlpes/offline_inference.py +++ b/examlpes/offline_inference.py @@ -46,7 +46,7 @@ # Create llumlets. 
instance_ids: List[str] = None llumlets: List[Llumlet] = None -instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM)) +instance_ids, llumlets = ray.get(manager.init_llumlets.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) ray.get(manager.scale_up.remote(instance_ids, llumlets)) diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py index c1fc3a84..257c2189 100644 --- a/llumnix/backends/backend_interface.py +++ b/llumnix/backends/backend_interface.py @@ -18,22 +18,18 @@ from llumnix.llumlet.request import LlumnixRequest, RequestStatus from llumnix.server_info import ServerInfo + class EngineState(str, Enum): INIT = "INIT" CRASHED = "CRASHED" RUNNING = "RUNNING" STOPPED = "STOPPED" + class BackendType(str, Enum): VLLM = "VLLM" - SIM_VLLM = "SIM_VLLM" BLADELLM = "BLADELLM" - @staticmethod - def is_sim_backend(status: "BackendType") -> bool: - return status in [ - BackendType.SIM_VLLM, - ] class BackendInterface(ABC): # Methods for inference diff --git a/llumnix/backends/bladellm/llm_engine.py b/llumnix/backends/bladellm/llm_engine.py index 33988d77..79bd9abc 100644 --- a/llumnix/backends/bladellm/llm_engine.py +++ b/llumnix/backends/bladellm/llm_engine.py @@ -40,7 +40,7 @@ from llumnix.queue.queue_type import QueueType class AsyncBackQueueWrapper(APIWrapper): - def __init__(self, placement_group, instance_id, output_queue_type) -> None: + def __init__(self, placement_group, instance_id, request_output_queue_type) -> None: super().__init__(args=None, resp_queue=None) scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, @@ -54,7 +54,7 @@ def __init__(self, placement_group, instance_id, output_queue_type) -> None: self.async_put_queue_actor = ray.remote( num_cpus=1, scheduling_strategy=scheduling_strategy - )(AsyncPutQueueActor).remote(instance_id, output_queue_type) + )(AsyncPutQueueActor).remote(instance_id, request_output_queue_type) self.put_queue_loop_thread.start() self.request_server_map = {} @@ -113,9 +113,9 @@ class AsyncLLMEngineLlumnixMixin: # pylint: disable=unused-argument def __init__(self, instance_id: str, - output_queue_type: QueueType, + placement_group: PlacementGroup, + request_output_queue_type: QueueType, migration_config: MigrationConfig, - placement_group: Optional[PlacementGroup], ) -> None: self.instance_id = instance_id @@ -123,7 +123,7 @@ def __init__(self, logger.info("engine ({}) current state {}".format(self.instance_id, self.state)) self.placement_group = placement_group - self.output_queue_type = output_queue_type + self.request_output_queue_type = request_output_queue_type @property def instance_info(self) -> InstanceInfo: @@ -134,7 +134,7 @@ def start(self, loop: asyncio.AbstractEventLoop): self._client = self.init_client_from_engine() self.trans_wrapper: AsyncBackQueueWrapper = AsyncBackQueueWrapper(self.placement_group, self.instance_id, - self.output_queue_type) + self.request_output_queue_type) self._scheduler.llumnix_metrics.engine_init_metrics(self) async def update_callback(self, resp_list, step_requests): @@ -185,49 +185,53 @@ async def drop_request(self, req_id: int): class AsyncLLMEngineLlumnix(AsyncLLMEngineLlumnixMixin, AsyncLLMEngine): def __init__(self, instance_id: str, - output_queue_type: QueueType, + placement_group: PlacementGroup, + request_output_queue_type: QueueType, migration_config: MigrationConfig, - placement_group: Optional[PlacementGroup], *args, **kwargs, ) -> None: 
AsyncLLMEngine.__init__(self, *args, **kwargs) - AsyncLLMEngineLlumnixMixin.__init__(self, instance_id, output_queue_type, migration_config, placement_group) + AsyncLLMEngineLlumnixMixin.__init__(self, instance_id, placement_group, request_output_queue_type, migration_config) class PrefillAsyncLLMEngineLlumnix(AsyncLLMEngineLlumnixMixin, PrefillAsyncLLMEngine): def __init__(self, instance_id: str, - output_queue_type: QueueType, + placement_group: PlacementGroup, + request_output_queue_type: QueueType, migration_config: MigrationConfig, - placement_group: Optional[PlacementGroup], *args, **kwargs, ) -> None: PrefillAsyncLLMEngine.__init__(self, *args, **kwargs) - AsyncLLMEngineLlumnixMixin.__init__(self, instance_id, output_queue_type, migration_config, placement_group) + AsyncLLMEngineLlumnixMixin.__init__(self, instance_id, placement_group, request_output_queue_type, migration_config) class DecodeAsyncLLMEngineLlumnix(AsyncLLMEngineLlumnixMixin, DecodeAsyncLLMEngine): def __init__(self, instance_id: str, - output_queue_type: QueueType, + placement_group: PlacementGroup, + request_output_queue_type: QueueType, migration_config: MigrationConfig, - placement_group: Optional[PlacementGroup], *args, **kwargs, ) -> None: DecodeAsyncLLMEngine.__init__(self, *args, **kwargs) - AsyncLLMEngineLlumnixMixin.__init__(self, instance_id, output_queue_type, migration_config, placement_group) + AsyncLLMEngineLlumnixMixin.__init__(self, instance_id, placement_group, request_output_queue_type, migration_config) class BackendBladeLLM(BackendInterface): def __init__( self, instance_id: str, - output_queue_type: QueueType, + placement_group: PlacementGroup, + request_output_queue_type: QueueType, migration_config: MigrationConfig, - engine_args: ServingArgs, - placement_group: PlacementGroup = None, + engine_args: ServingArgs ) -> None: self.instance_id = instance_id self.engine_args = engine_args engine_cls = self._get_engine_cls() - self.engine = engine_cls(instance_id, output_queue_type, migration_config, placement_group, engine_args) + self.engine = engine_cls(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args) self._loop = asyncio.new_event_loop() self._engine_ready = threading.Event() diff --git a/llumnix/backends/profiling.py b/llumnix/backends/profiling.py index 452bd5cd..cf21fcc4 100644 --- a/llumnix/backends/profiling.py +++ b/llumnix/backends/profiling.py @@ -178,8 +178,7 @@ def model_decode(x, a, b, c): return a * bs + b * tot_seq_len + c def get_latency_mem(backend_type: BackendType, profiling_database: ProfilingDatabase, **backend_args): - assert BackendType.is_sim_backend(backend_type) - if backend_type == BackendType.SIM_VLLM: + if backend_type == BackendType.VLLM: # TODO(ZeldaHuang): support multi-lora, more device, vision language model model_config = backend_args.get("model_config") _ = backend_args.get("cache_config") @@ -196,7 +195,7 @@ def get_latency_mem(backend_type: BackendType, profiling_database: ProfilingData assert sim_parallel_config in profiling_result.para_dict.keys(), "sim parallel config not in database" latency_mem: LatencyMemData = profiling_result.para_dict[sim_parallel_config] return latency_mem - raise ValueError(f'Unsupported backend: {backend_type}') + raise ValueError(f'Unsupported simulator backend: {backend_type}') if __name__ == "__main__": import argparse diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py index f7d4d3bb..501d8a54 100644 --- a/llumnix/backends/utils.py +++ b/llumnix/backends/utils.py @@ 
-16,6 +16,7 @@ import time import ray +from ray.util.placement_group import PlacementGroup from llumnix.backends.backend_interface import BackendInterface, BackendType from llumnix.queue.queue_type import QueueType @@ -24,12 +25,13 @@ from llumnix.server_info import ServerInfo from llumnix.logger import init_logger from llumnix.utils import get_instance_name +from llumnix.internal_config import MigrationConfig logger = init_logger(__name__) class AsyncPutQueueActor: - def __init__(self, instance_id, request_output_queue_type: QueueType): + def __init__(self, instance_id: str, request_output_queue_type: QueueType): self.instance_id = instance_id self.request_output_queue_type = request_output_queue_type self.request_output_queue_client: QueueClientBase = init_request_output_queue_client(request_output_queue_type) @@ -60,26 +62,45 @@ async def put_nowait_to_servers(self, request_ids = [req_output.request_id for req_output in req_outputs] self.engine_actor_handle.abort_request.remote(request_ids) -def init_backend_engine(instance_id: str, request_output_queue_type: QueueType, - backend_type: BackendType, *args, **kwargs) -> BackendInterface: +def init_backend_engine(instance_id: str, + placement_group: PlacementGroup, + request_output_queue_type: QueueType, + migration_config: MigrationConfig, + backend_type: BackendType, + engine_args, + profiling_result_file_path: str = None) -> BackendInterface: if backend_type == BackendType.VLLM: # pylint: disable=import-outside-toplevel - from llumnix.backends.vllm.llm_engine import BackendVLLM - backend_engine = BackendVLLM(instance_id, request_output_queue_type, *args, **kwargs) - elif backend_type == BackendType.SIM_VLLM: - # pylint: disable=import-outside-toplevel - from llumnix.backends.vllm.simulator import BackendSimVLLM - backend_engine = BackendSimVLLM(instance_id, request_output_queue_type, *args, **kwargs) + if profiling_result_file_path is None: + from llumnix.backends.vllm.llm_engine import BackendVLLM + backend_engine = BackendVLLM(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args) + else: + # pylint: disable=import-outside-toplevel + from llumnix.backends.vllm.simulator import BackendSimVLLM + backend_engine = BackendSimVLLM(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args, + profiling_result_file_path) elif backend_type == BackendType.BLADELLM: # pylint: disable=import-outside-toplevel from llumnix.backends.bladellm.llm_engine import BackendBladeLLM - backend_engine = BackendBladeLLM(instance_id, request_output_queue_type, *args, **kwargs) + backend_engine = BackendBladeLLM(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args) else: raise ValueError(f'Unsupported backend: {backend_type}') return backend_engine def get_engine_world_size(engine_args, backend_type: BackendType): - if backend_type in [BackendType.VLLM, BackendType.SIM_VLLM]: + if backend_type == BackendType.VLLM: engine_config = engine_args.create_engine_config() world_size = engine_config.parallel_config.world_size else: # BLADE_LLM diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 921c7f5b..bfcbbb3b 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -50,9 +50,10 @@ class LLMEngineLlumnix(_AsyncLLMEngine): def __init__(self, instance_id: str, + placement_group: PlacementGroup, request_output_queue_type: QueueType, - placement_group: Optional[PlacementGroup], 
- *args, **kwargs) -> None: + *args, + **kwargs) -> None: super().__init__(*args, **kwargs) self.instance_id = instance_id self.step_counter = Counter() @@ -77,12 +78,12 @@ def __init__(self, @classmethod def from_engine_args( cls, - engine_args: EngineArgs, + instance_id: str, + placement_group: PlacementGroup, request_output_queue_type: QueueType, migration_config: MigrationConfig, + engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - instance_id: str = None, - placement_group: Optional[PlacementGroup] = None, latency_mem: Optional[LatencyMemData] = None ) -> "LLMEngineLlumnix": """Creates an LLM engine from the engine arguments.""" @@ -105,8 +106,8 @@ def from_engine_args( # Create the LLM engine. engine = cls( instance_id=instance_id, - request_output_queue_type=request_output_queue_type, placement_group=placement_group, + request_output_queue_type=request_output_queue_type, **engine_config.to_dict(), executor_class=executor_class, log_stats=not engine_args.disable_log_stats, @@ -237,16 +238,16 @@ class BackendVLLM(BackendInterface): def __init__( self, instance_id: str, + placement_group: PlacementGroup, request_output_queue_type: QueueType, migration_config: MigrationConfig, - placement_group: PlacementGroup, engine_args: EngineArgs, ) -> None: - self.engine: LLMEngineLlumnix = LLMEngineLlumnix.from_engine_args(engine_args=engine_args, - request_output_queue_type=request_output_queue_type, - migration_config=migration_config, - instance_id=instance_id, - placement_group=placement_group) + self.engine: LLMEngineLlumnix = LLMEngineLlumnix.from_engine_args(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args) self.engine.scheduler = SchedulerLlumnix(self.engine.scheduler_config, self.engine.cache_config, self.engine.lora_config) self.engine.scheduler.add_update_instance_info_callback(self.engine.update_instance_info) self.engine.output_processor.scheduler = self.engine.scheduler diff --git a/llumnix/backends/vllm/simulator.py b/llumnix/backends/vllm/simulator.py index 85613edb..94ff6850 100644 --- a/llumnix/backends/vllm/simulator.py +++ b/llumnix/backends/vllm/simulator.py @@ -32,20 +32,20 @@ class BackendSimVLLM(BackendVLLM): def __init__( self, instance_id: str, + placement_group: PlacementGroup, request_output_queue_type: QueueType, migration_config: MigrationConfig, - placement_group: PlacementGroup, engine_args: EngineArgs, - profiling_result_file_path: str, + profiling_result_file_path: str ) -> None: # multi-instance args latency_mem = self._get_lantecy_mem(profiling_result_file_path, engine_args) - self.engine: LLMEngineLlumnix = LLMEngineLlumnix.from_engine_args(engine_args=engine_args, - request_output_queue_type=request_output_queue_type, - migration_config=migration_config, - instance_id=instance_id, - placement_group=placement_group, - latency_mem=latency_mem) + self.engine: LLMEngineLlumnix = LLMEngineLlumnix.from_engine_args(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args, + latency_mem) self.engine.scheduler = SchedulerLlumnix(self.engine.scheduler_config, self.engine.cache_config, self.engine.lora_config) self.engine.scheduler.add_update_instance_info_callback(self.engine.update_instance_info) self.engine.output_processor.scheduler = self.engine.scheduler diff --git a/llumnix/config/default.py b/llumnix/config/default.py index 31d56fe0..ba21e42f 100644 --- a/llumnix/config/default.py +++ b/llumnix/config/default.py @@ -64,7 +64,7 @@ # Log filename 
 _C.MANAGER.LOG_FILENAME = "server.log"
 # Profiling result file path
-_C.MANAGER.PROFILING_RESULT_FILE_PATH = ""
+_C.MANAGER.PROFILING_RESULT_FILE_PATH = None
 # Number of instances created at initialization
 _C.MANAGER.INITIAL_INSTANCES = 1
diff --git a/llumnix/entrypoints/bladellm/api_server.py b/llumnix/entrypoints/bladellm/api_server.py
index b32dd78c..d56fe959 100644
--- a/llumnix/entrypoints/bladellm/api_server.py
+++ b/llumnix/entrypoints/bladellm/api_server.py
@@ -14,7 +14,6 @@
 import asyncio
 from blade_llm.service.args import ServingArgs
-# TODO(s5u13b): Refine multiple import codes.
 from llumnix.config import get_llumnix_config
 from llumnix.backends.backend_interface import BackendType
 from llumnix.arg_utils import (EntrypointsArgs, ManagerArgs, LlumnixArgumentParser,
@@ -27,7 +26,6 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractEventLoop):
     # generate llumnix_parser for checking parameters with choices
-    # TODO(s5u13b): Add add_cli_args function.
     llumnix_parser = LlumnixArgumentParser()
     llumnix_parser = EntrypointsArgs.add_cli_args(llumnix_parser)
     llumnix_parser = ManagerArgs.add_cli_args(llumnix_parser)
@@ -41,12 +39,8 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractE
     llumnix_client = None
     # if gpu is not available, it means that this node is head pod without any llumnix components
     if is_gpu_available():
-        instance_ids = None
-        if engine_args.enable_disagg:
-            instance_ids = [engine_args.disagg_options.inst_id]
-
         llumnix_context: EntrypointsContext = \
-            setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args, instance_ids=instance_ids)
+            setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args)
 
         llumnix_client = LlumnixClientBladeLLM(bladellm_args, llumnix_context, loop)
     return llumnix_client
diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py
index bdd236d0..9defe7db 100644
--- a/llumnix/entrypoints/setup.py
+++ b/llumnix/entrypoints/setup.py
@@ -15,7 +15,7 @@
 import sys
 import os
 import time
-from typing import Dict, Optional
+from typing import Dict, Optional, List, Tuple
 import ray
 from llumnix.manager import Manager
@@ -30,6 +30,7 @@
                                     retry_manager_method_sync)
 from llumnix.entrypoints.utils import DeploymentMode
 from llumnix.backends.backend_interface import BackendType
+from llumnix.queue.queue_server_base import QueueServerBase
 MAX_RAY_RESTARTS = 5
 RAY_RESTART_INTERVALS = 10
@@ -93,7 +94,10 @@ def init_manager(manager_args: ManagerArgs,
                  ) -> Manager:
     # Only one instance create the manager actor, the other instances get the existing manager actor through ray.
try: - manager = Manager.from_args(manager_args, entrypoints_args, engine_args, deployment_args) + manager = Manager.from_args(manager_args=manager_args, + entrypoints_args=entrypoints_args, + engine_args=engine_args, + deployment_args=deployment_args) logger.info("Init Manager on current node.") except ValueError: manager = ray.get_actor(MANAGER_NAME, namespace='llumnix') @@ -105,18 +109,15 @@ def init_llumnix_components(manager_args: ManagerArgs, request_output_queue_type: QueueType, ip: str, request_output_queue_port: str, - backend_type: BackendType, - *args, - **kwargs - ): + backend_type: BackendType) -> Tuple[Manager, List[str], List[Llumlet], QueueServerBase]: manager = init_manager(manager_args) instance_ids, llumlets = retry_manager_method_sync( - manager.init_llumlets.remote, 'init_llumlets', engine_args, request_output_queue_type, backend_type, *args, **kwargs) + manager.init_llumlets.remote, 'init_llumlets', request_output_queue_type, backend_type, engine_args) - available_instance_ids = [] - dead_instance_ids = [] - available_llumlets = [] + available_instance_ids: List[str] = [] + dead_instance_ids: List[str] = [] + available_llumlets: List[Llumlet] = [] ready_tasks = [llumlet.is_ready.remote() for llumlet in llumlets] for idx, task in enumerate(ready_tasks): try: @@ -137,8 +138,7 @@ def init_llumnix_components(manager_args: ManagerArgs, return manager, available_instance_ids, available_llumlets, request_output_queue -def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args, - *args, **kwargs) -> EntrypointsContext: +def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args) -> EntrypointsContext: ip = get_ip_address() request_output_queue_type = entrypoints_args.request_output_queue_type request_output_queue_port = entrypoints_args.request_output_queue_port @@ -150,9 +150,7 @@ def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment request_output_queue_type, ip, request_output_queue_port, - backend_type, - *args, - **kwargs) + backend_type) server_id = random_uuid() server_info = ServerInfo(server_id, @@ -182,9 +180,8 @@ def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment def _setup_llumnix_global(manager_args, entrypoints_args, engine_args, deployment_args) -> None: _ = init_manager(manager_args, entrypoints_args, engine_args, deployment_args) -def setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args, - *args, **kwargs) -> Optional[EntrypointsContext]: +def setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args) -> Optional[EntrypointsContext]: if deployment_args.deployment_mode == DeploymentMode.LOCAL: - return _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args, *args, **kwargs) + return _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args) return _setup_llumnix_global(manager_args, entrypoints_args, engine_args, deployment_args) diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 4845d956..07db2888 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -17,7 +17,7 @@ import time import ray -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy, NodeAffinitySchedulingStrategy +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.placement_group import PlacementGroup from llumnix.logger import init_logger @@ -40,23 +40,23 @@ class Llumlet: def __init__(self, 
instance_id: str, + placement_group: PlacementGroup, request_output_queue_type: QueueType, - backend_type: BackendType, migration_config: MigrationConfig, - placement_group: PlacementGroup, - *args, - **kwargs) -> None: + backend_type: BackendType, + engine_args, + profiling_result_file_path: str = None) -> None: try: logger.info("Llumlet backend type: {}".format(backend_type)) self.instance_id = instance_id self.instance_name = get_instance_name(instance_id) - self.backend_engine: BackendInterface = init_backend_engine(self.instance_id, + self.backend_engine: BackendInterface = init_backend_engine(instance_id, + placement_group, request_output_queue_type, - backend_type, migration_config, - placement_group, - *args, - **kwargs) + backend_type, + engine_args, + profiling_result_file_path) self.migration_coordinator = MigrationCoordinator(self.backend_engine, migration_config.last_stage_max_blocks, migration_config.max_stages) @@ -72,54 +72,40 @@ def __init__(self, @classmethod def from_args(cls, - request_output_queue_type: QueueType, instance_id: str, - backend_type: BackendType, - migration_config: MigrationConfig, placement_group: PlacementGroup, - *args, - **kwargs): + request_output_queue_type: QueueType, + migration_config: MigrationConfig, + backend_type: BackendType, + engine_args, + profiling_result_file_path: str = None): try: - assert backend_type in [backend_type.VLLM, backend_type.SIM_VLLM, backend_type.BLADELLM], \ + assert backend_type in [backend_type.VLLM, backend_type.BLADELLM], \ f'unimplemented backend {backend_type}' num_gpus = 0 if backend_type == backend_type.BLADELLM: - engine_args = kwargs["engine_args"] world_size = get_engine_world_size(engine_args, backend_type) num_gpus = world_size instance_name = get_instance_name(instance_id) - if backend_type in [backend_type.VLLM, backend_type.BLADELLM]: - llumlet_class = ray.remote(num_cpus=1, - num_gpus=num_gpus, - name=instance_name, - namespace='llumnix', - max_concurrency=4, - lifetime="detached")(cls).options( - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_bundle_index=0, - placement_group_capture_child_tasks=True, - ) - ) - else: # backend_type == backend_type.SIM_VLLM: - llumlet_class = ray.remote(num_cpus=1, - num_gpus=num_gpus, - name=instance_name, - namespace='llumnix', - max_concurrency=4, - lifetime="detached")(cls).options( - scheduling_strategy=NodeAffinitySchedulingStrategy( - node_id=ray.get_runtime_context().get_node_id(), - soft=False, - ) + llumlet_class = ray.remote(num_cpus=1, + num_gpus=num_gpus, + name=instance_name, + namespace='llumnix', + max_concurrency=4, + lifetime="detached")(cls).options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_bundle_index=0, + placement_group_capture_child_tasks=True, ) + ) llumlet = llumlet_class.remote(instance_id, + placement_group, request_output_queue_type, - backend_type, migration_config, - placement_group, - *args, - **kwargs) + backend_type, + engine_args, + profiling_result_file_path) # pylint: disable=broad-except except Exception as e: logger.error("failed to initialize Llumlet: {}".format(e)) diff --git a/llumnix/manager.py b/llumnix/manager.py index 9b101eae..898f06c9 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -67,7 +67,8 @@ def __init__(self, self.engine_args = engine_args self.deployment_args = deployment_args - assert deployment_args is None or (deployment_args is not None and entrypoints_args is not None and 
engine_args is not None) + assert deployment_args is None or \ + (deployment_args is not None and entrypoints_args is not None and engine_args is not None) # deployment args if deployment_args is not None: @@ -448,19 +449,16 @@ def from_args(cls, return manager def init_llumlets(self, - engine_args, request_output_queue_type: QueueType, backend_type: BackendType, - *args, - **kwargs) -> Tuple[List[str], List[Llumlet]]: + engine_args, + instance_ids: List[str] = None + ) -> Tuple[List[str], List[Llumlet]]: manager_args = self.manager_args world_size = get_engine_world_size(engine_args, backend_type) instance_ids: List[str] = [] llumlets: List[Llumlet] = [] - # for pd disaggregation - if 'instance_ids' in kwargs and kwargs['instance_ids'][0]: - instance_ids = kwargs['instance_ids'] for _ in range(manager_args.initial_instances): instance_id = random_uuid() if not manager_args.profiling_result_file_path: @@ -484,16 +482,14 @@ def init_llumlets(self, assert backend_type == backend_type.VLLM, 'Only support the simulator backend for vLLM.' # num_cpus=1, for Llumlet + AsyncPutQueueActor placement_group = initialize_placement_group(instance_id, num_cpus=2, num_gpus=0, detached=True) - llumlet = Llumlet.from_args( - request_output_queue_type, - instance_id, - BackendType.SIM_VLLM, - manager_args.create_migration_config(), - placement_group, - engine_args, - manager_args.profiling_result_file_path, - *args, - **kwargs) + llumlet = Llumlet.from_args( + instance_id, + placement_group, + request_output_queue_type, + manager_args.create_migration_config(), + backend_type, + engine_args, + manager_args.profiling_result_file_path) instance_ids.append(instance_id) llumlets.append(llumlet) diff --git a/tests/unit_test/backends/vllm/test_llm_engine.py b/tests/unit_test/backends/vllm/test_llm_engine.py index b4fe46b6..5da6a025 100644 --- a/tests/unit_test/backends/vllm/test_llm_engine.py +++ b/tests/unit_test/backends/vllm/test_llm_engine.py @@ -88,21 +88,26 @@ def test_llm_engine_process_model_outputs(): ret, _ = llm_engine._process_model_outputs(sampler_outputs, scheduled_seq_groups,[], metas) assert len(ret) == 1 -def test_llm_engine_from_engine_args(): +def test_llm_engine_from_engine_args(ray_env): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - llm_engine = MockEngine.from_engine_args(engine_args, request_output_queue_type=QueueType.RAYQUEUE, - instance_id="0", migration_config=None) + placement_group = initialize_placement_group(instance_id="0", num_cpus=3, num_gpus=1, detached=True) + llm_engine = MockEngine.from_engine_args(engine_args=engine_args, request_output_queue_type=QueueType.RAYQUEUE, + instance_id="0", migration_config=None, placement_group=placement_group) assert llm_engine.executor_class == LlumnixRayGPUExecutor +def test_llm_engine_from_engine_args_sim(ray_env): latency_data = LatencyMemData({},{},{}) - llm_engine = MockEngine.from_engine_args(engine_args, request_output_queue_type=QueueType.RAYQUEUE, - instance_id="0", migration_config=None, latency_mem=latency_data) + engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) + placement_group = initialize_placement_group(instance_id="0", num_cpus=2, num_gpus=1, detached=True) + llm_engine = MockEngine.from_engine_args(engine_args=engine_args, request_output_queue_type=QueueType.RAYQUEUE, + instance_id="0", migration_config=None, latency_mem=latency_data, + placement_group=placement_group) assert llm_engine.executor_class == SimGPUExecutor def test_llm_engine_add_requset(ray_env): engine_args = 
EngineArgs(model="facebook/opt-125m", worker_use_ray=True) placement_group = initialize_placement_group(instance_id="0", num_cpus=3, num_gpus=1, detached=True) - llm_engine = LLMEngineLlumnix.from_engine_args(engine_args, + llm_engine = LLMEngineLlumnix.from_engine_args(engine_args=engine_args, request_output_queue_type=QueueType.RAYQUEUE, instance_id="0", placement_group=placement_group, diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 5d6a01c0..0251d7a9 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -48,12 +48,12 @@ def init_llumlet(request_output_queue_type, instance_id, migration_config, engine_args): placement_group = initialize_placement_group(instance_id=instance_id, num_cpus=3, num_gpus=1, detached=True) llumlet = Llumlet.from_args( - request_output_queue_type, - instance_id, - BackendType.VLLM, - migration_config, - placement_group, - engine_args,) + instance_id=instance_id, + placement_group=placement_group, + request_output_queue_type=request_output_queue_type, + migration_config=migration_config, + backend_type=BackendType.VLLM, + engine_args=engine_args) return llumlet class MockBackendVLLM(BackendVLLM): diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 5ddbe6d8..d6a766b8 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -107,7 +107,7 @@ def init_manager(): try: manager_args = ManagerArgs(migration_backend="rayrpc", enable_migration=True) manager_args.log_instance_info = False - manager = Manager.from_args(manager_args) + manager = Manager.from_args(manager_args=manager_args) except ValueError: manager = ray.get_actor(MANAGER_NAME, namespace='llumnix') ray.get(manager.is_ready.remote()) @@ -153,7 +153,7 @@ def test_init_llumlet(ray_env, llumlet): def test_init_llumlets(ray_env, manager): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM)) + instance_ids, llumlets = ray.get(manager.init_llumlets.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) num_instances = ray.get(manager.scale_up.remote(instance_ids, llumlets)) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances @@ -164,7 +164,7 @@ def test_init_llumlets_sim(ray_env, manager): import llumnix.backends.vllm.simulator llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - instance_ids, llumlets = ray.get(manager.init_llumlets.remote(engine_args, QueueType("rayqueue"), BackendType.VLLM)) + instance_ids, llumlets = ray.get(manager.init_llumlets.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) num_instances = ray.get(manager.scale_up.remote(instance_ids, llumlets)) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances From 54a76809aff8a8457eba826e41afafd8f45a93ea Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 2 Jan 2025 10:40:09 +0000 Subject: [PATCH 42/92] Fix _connect_to_instances --- llumnix/manager.py | 6 +++--- llumnix/utils.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index 898f06c9..eab0e322 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -31,7 +31,8 
@@ from llumnix.server_info import ServerInfo from llumnix.backends.backend_interface import BackendType from llumnix.utils import (random_uuid, clear_gloo_backend_state, remove_placement_group, - get_instance_name, INSTANCE_NAME_PREFIX, MANAGER_NAME) + get_instance_name, INSTANCE_NAME_PREFIX, MANAGER_NAME, + run_async_func_sync) from llumnix.entrypoints.utils import DeploymentMode from llumnix.utils import initialize_placement_group from llumnix.backends.utils import get_engine_world_size @@ -117,8 +118,7 @@ def __init__(self, # tasks # When manager starts, it automatically connects to all existing instances. - # TODO(s5u13b): Check if this is a sync call. - asyncio.run_coroutine_threadsafe(self._connect_to_instances(), asyncio.get_event_loop()) + run_async_func_sync(self._connect_to_instances()) asyncio.create_task(self._update_instance_info_loop(self.polling_interval)) asyncio.create_task(self._clear_request_instance_loop(CLEAR_REQUEST_INSTANCE_INTERVAL)) diff --git a/llumnix/utils.py b/llumnix/utils.py index fa15124e..3ad9331c 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -12,6 +12,8 @@ # limitations under the License. import uuid +import asyncio +import threading import ray from ray.util.placement_group import PlacementGroup @@ -123,3 +125,14 @@ def remove_placement_group(instance_id: str) -> bool: except Exception: return False return True + +def run_async_func_sync(func): + def run_task(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + future = loop.create_task(func) + loop.run_until_complete(future) + loop.close() + thread = threading.Thread(target=run_task) + thread.start() + thread.join() From 221281d85e12cdd03c9c54408057b3fdd8acfa9f Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 2 Jan 2025 10:53:06 +0000 Subject: [PATCH 43/92] Minors --- llumnix/manager.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index eab0e322..606fd4bb 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -49,7 +49,6 @@ # TODO(s5u13b): Fix the logger when manager failover. # TODO(s5u13b): Handle exception of ray operations. # TODO(s5u13b): Update the documents of global deployment. -# TODO(s5u13b): Change add_done_callback method. class Manager: @@ -191,7 +190,7 @@ def update_instance_info_done_callback(instance_id: str, fut): tasks = [] instance_infos = [] for instance_id, instance in self.instances.items(): - # Use asyncio.gather to wrap ray remote call to add done callback. + # Use asyncio.gather to wrap ray remote call to add done callback, asyncio.create_task will get error. task = asyncio.gather(instance.get_instance_info.remote(), return_exceptions=True) task.add_done_callback(partial(update_instance_info_done_callback, instance_id)) tasks.append(task) @@ -259,7 +258,6 @@ def migrate_done_callback_wrapper(migrate_instance_pair: Tuple[str, str], fut) - self.instance_migrating[migrate_out_instance_id] = True self.instance_migrating[migrate_in_instance_id] = True migrate_in_instance_name = get_instance_name(migrate_in_instance_id) - # Use asyncio.gather to wrap ray remote call to add done callback. 
task = asyncio.gather(self.instances[migrate_out_instance_id].migrate_out.remote(migrate_in_instance_name), return_exceptions=True) task.add_done_callback(partial(migrate_done_callback_wrapper, migrate_instance_pair)) From 5c2c27673427813dc1180dd9073887c89df7853b Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 6 Jan 2025 02:41:46 +0000 Subject: [PATCH 44/92] Pass lint, unit, e2e, offline test --- demo/manager_service_demo.py | 11 +- examlpes/offline_inference.py | 12 +- llumnix/backends/vllm/executor.py | 1 + llumnix/backends/vllm/llm_engine.py | 4 +- llumnix/backends/vllm/worker.py | 2 +- llumnix/entrypoints/setup.py | 63 +++-- llumnix/entrypoints/utils.py | 2 - llumnix/entrypoints/vllm/api_server.py | 2 +- llumnix/entrypoints/vllm/api_server_actor.py | 80 ++++++ llumnix/entrypoints/vllm/client.py | 2 - llumnix/entrypoints/vllm/serve.py | 13 +- llumnix/llumlet/llumlet.py | 17 +- llumnix/manager.py | 243 +++++++++++++++--- llumnix/utils.py | 33 +++ .../unit_test/backends/vllm/test_simulator.py | 6 +- tests/unit_test/entrypoints/test_utils.py | 4 +- .../entrypoints/vllm/api_server_manager.py | 7 +- .../vllm/api_server_manager_service.py | 19 +- .../global_scheduler/test_manager.py | 76 +++--- 19 files changed, 431 insertions(+), 166 deletions(-) diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py index d4495d8b..7213e41d 100644 --- a/demo/manager_service_demo.py +++ b/demo/manager_service_demo.py @@ -76,17 +76,10 @@ def kill_instance(instance_id: str = None) -> bool: try: ray.kill(instance) print(f"kill instance {instance_id}") - return True # pylint: disable=broad-except except Exception: return False - -def actor_exists(actor_name: str) -> bool: - try: - ray.get_actor(actor_name, namespace="llumnix") - return True - except ValueError: - return False + return True class FastAPIServer: @@ -279,7 +272,7 @@ async def wait_instance_ready(instance_id: str): try: await new_instance.ready.remote() print(f"instance {instance_id} ready, scale up") - new_server.run.remote() + await new_server.run.remote() self._scale_up(instance_id, placement_group, new_server, new_instance) except ray.exceptions.RayActorError: print(f"instance {instance_id} died, abort scale up") diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py index 46e2c8eb..c4c38047 100644 --- a/examlpes/offline_inference.py +++ b/examlpes/offline_inference.py @@ -38,17 +38,17 @@ engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True, trust_remote_code=True, max_model_len=370) -# Create a manager. If the manager is created first, and then the llumlets are created, manager.scale_up -# need to be called to add the newly created llumlets to the management of the manager. +# Create a manager. If the manager is created first, and then the instances are created, manager.scale_up +# need to be called to add the newly created instances to the management of the manager. manager: Manager = init_manager(manager_args) ray.get(manager.is_ready.remote()) -# Create llumlets. +# Create instances. instance_ids: List[str] = None -llumlets: List[Llumlet] = None -instance_ids, llumlets = ray.get(manager.init_llumlets.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) +instances: List[Llumlet] = None +instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) -ray.get(manager.scale_up.remote(instance_ids, llumlets)) +ray.get(manager.scale_up.remote(instance_ids, instances)) # Create llumlets. 
instance_ids: List[str] = None diff --git a/llumnix/backends/vllm/executor.py b/llumnix/backends/vllm/executor.py index 6cb333da..7feeefcb 100644 --- a/llumnix/backends/vllm/executor.py +++ b/llumnix/backends/vllm/executor.py @@ -36,6 +36,7 @@ logger = init_logger(__name__) + class LlumnixRayGPUExecutor(RayGPUExecutorAsync): migration_config: MigrationConfig = None diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index bfcbbb3b..5c07ed19 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -83,8 +83,8 @@ def from_engine_args( request_output_queue_type: QueueType, migration_config: MigrationConfig, engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - latency_mem: Optional[LatencyMemData] = None + latency_mem: Optional[LatencyMemData] = None, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT ) -> "LLMEngineLlumnix": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. diff --git a/llumnix/backends/vllm/worker.py b/llumnix/backends/vllm/worker.py index 8479d12f..fd7dcca6 100644 --- a/llumnix/backends/vllm/worker.py +++ b/llumnix/backends/vllm/worker.py @@ -65,7 +65,7 @@ def reserve_memory_for_migration(self, migration_config: MigrationConfig, model_ if cache_config.gpu_memory_utilization <= 0: raise ValueError("Nccl migration backend take {:.4f} gpu memory, which is greater than gpu_memory_utilization {:.4f}. " - "try to increase gpu-memory-utilization or reduce migration-cache-blocks." + "try to increase gpu-memory-utilization or reduce migration-buffer-blocks." .format(migration_memory_ratio, cache_config.gpu_memory_utilization)) logger.info("nccl migration backend take {:.4f} gpu memory, left gpu_memory_utilization {:.4f} for kv cache." 
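The hunks in this patch repeatedly exercise the refactored construction flow: the manager is created first, then instances are created through it with the queue type and backend type leading and the engine args last. A minimal end-to-end sketch of that flow, not part of the patch itself, assuming a running Ray cluster and the rayqueue output queue; the ManagerArgs and EngineArgs values are illustrative and mirror examlpes/offline_inference.py and tests/unit_test/global_scheduler/test_manager.py:

    # Sketch only: mirrors the refactored init_instances/scale_up call order.
    import ray
    from vllm import EngineArgs

    from llumnix.arg_utils import ManagerArgs
    from llumnix.backends.backend_interface import BackendType
    from llumnix.entrypoints.setup import init_manager
    from llumnix.queue.queue_type import QueueType

    manager_args = ManagerArgs(migration_backend="rayrpc", enable_migration=True)
    engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)

    # Create (or connect to) the manager actor, then wait until it is ready.
    manager = init_manager(manager_args)
    ray.get(manager.is_ready.remote())

    # Queue type and backend type come first; engine args come last.
    instance_ids, instances = ray.get(
        manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
    ray.get(manager.scale_up.remote(instance_ids, instances))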
diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py
index 9defe7db..7da6be66 100644
--- a/llumnix/entrypoints/setup.py
+++ b/llumnix/entrypoints/setup.py
@@ -21,13 +21,12 @@
 from llumnix.manager import Manager
 from llumnix.llumlet.llumlet import Llumlet
 from llumnix.logger import init_logger
-from llumnix.utils import random_uuid, MANAGER_NAME
+from llumnix.utils import random_uuid, get_manager_name
 from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs
 from llumnix.queue.queue_type import QueueType
 from llumnix.server_info import ServerInfo
 from llumnix.queue.utils import init_request_output_queue_server
-from llumnix.entrypoints.utils import (EntrypointsContext, get_ip_address,
-                                       retry_manager_method_sync)
+from llumnix.entrypoints.utils import EntrypointsContext, get_ip_address, retry_manager_method_sync
 from llumnix.entrypoints.utils import DeploymentMode
 from llumnix.backends.backend_interface import BackendType
 from llumnix.queue.queue_server_base import QueueServerBase
@@ -100,82 +99,78 @@ def init_manager(manager_args: ManagerArgs,
                               deployment_args=deployment_args)
         logger.info("Init Manager on current node.")
     except ValueError:
-        manager = ray.get_actor(MANAGER_NAME, namespace='llumnix')
+        manager = ray.get_actor(get_manager_name(), namespace='llumnix')
         logger.info("Get existing Manager.")
     return manager

 def init_llumnix_components(manager_args: ManagerArgs,
                             engine_args,
                             request_output_queue_type: QueueType,
-                            ip: str,
                             request_output_queue_port: str,
                             backend_type: BackendType) -> Tuple[Manager, List[str], List[Llumlet], QueueServerBase]:
     manager = init_manager(manager_args)

-    instance_ids, llumlets = retry_manager_method_sync(
-        manager.init_llumlets.remote, 'init_llumlets', request_output_queue_type, backend_type, engine_args)
+    instance_ids, instances = retry_manager_method_sync(
+        manager.init_instances.remote, 'init_instances', request_output_queue_type, backend_type, engine_args)

     available_instance_ids: List[str] = []
     dead_instance_ids: List[str] = []
-    available_llumlets: List[Llumlet] = []
-    ready_tasks = [llumlet.is_ready.remote() for llumlet in llumlets]
+    available_instances: List[Llumlet] = []
+    ready_tasks = [instance.is_ready.remote() for instance in instances]
     for idx, task in enumerate(ready_tasks):
         try:
             ray.get(task)
             available_instance_ids.append(instance_ids[idx])
-            available_llumlets.append(llumlets[idx])
+            available_instances.append(instances[idx])
         except ray.exceptions.RayActorError:
             dead_instance_ids.append(instance_ids[idx])
     if len(dead_instance_ids) > 0:
         retry_manager_method_sync(manager.scale_down.remote, 'scale_down', dead_instance_ids)
     if len(available_instance_ids) > 0:
         retry_manager_method_sync(manager.scale_up.remote, 'scale_up',
-                                  available_instance_ids, available_llumlets)
+                                  available_instance_ids, available_instances)
         logger.info("Init Llumnix components done, {} instances are ready, instance_ids: {}."
                     .format(len(available_instance_ids), available_instance_ids))

+    ip = get_ip_address()
     request_output_queue = init_request_output_queue_server(ip, request_output_queue_port, request_output_queue_type)

-    return manager, available_instance_ids, available_llumlets, request_output_queue
-
-def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args) -> EntrypointsContext:
-    ip = get_ip_address()
-    request_output_queue_type = entrypoints_args.request_output_queue_type
-    request_output_queue_port = entrypoints_args.request_output_queue_port
-    backend_type = deployment_args.backend_type
+    return manager, available_instance_ids, available_instances, request_output_queue

-    manager, instance_ids, llumlets, request_output_queue = \
-        init_llumnix_components(manager_args,
-                                engine_args,
-                                request_output_queue_type,
-                                ip,
-                                request_output_queue_port,
-                                backend_type)
+def setup_entrypoints_context(entrypoints_args, manager, instance_ids, instances, request_output_queue) -> EntrypointsContext:
+    instances_dict: Dict[str, Llumlet] = {}
+    for idx, ins_id in enumerate(instance_ids):
+        instances_dict[ins_id] = instances[idx]

     server_id = random_uuid()
+    ip = get_ip_address()
     server_info = ServerInfo(server_id,
-                             request_output_queue_type,
+                             entrypoints_args.request_output_queue_type,
                              request_output_queue,
                              ip,
-                             request_output_queue_port)
-
-    instances: Dict[str, Llumlet] = {}
-    for idx, ins_id in enumerate(instance_ids):
-        instances[ins_id] = llumlets[idx]
+                             entrypoints_args.request_output_queue_port)

-    log_requests = not manager_args.disable_log_requests_manager
+    log_requests = not entrypoints_args.disable_log_requests_server
     log_request_timestamps = entrypoints_args.log_request_timestamps
     logger.info("log_requests: {}, log_request_timestamps: {}".format(log_requests, log_request_timestamps))

     entrypoints_context = EntrypointsContext(manager,
-                                             instances,
+                                             instances_dict,
                                              request_output_queue,
                                              server_info,
-                                             deployment_args.deployment_mode,
                                              log_requests,
                                              log_request_timestamps)
     return entrypoints_context

+def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args) -> EntrypointsContext:
+    manager, instance_ids, instances, request_output_queue = \
+        init_llumnix_components(manager_args,
+                                engine_args,
+                                entrypoints_args.request_output_queue_type,
+                                entrypoints_args.request_output_queue_port,
+                                deployment_args.backend_type)
+
+    return setup_entrypoints_context(entrypoints_args, manager, instance_ids, instances, request_output_queue)

 def _setup_llumnix_global(manager_args, entrypoints_args, engine_args, deployment_args) -> None:
     _ = init_manager(manager_args, entrypoints_args, engine_args, deployment_args)
diff --git a/llumnix/entrypoints/utils.py b/llumnix/entrypoints/utils.py
index c638b2cd..691a050f 100644
--- a/llumnix/entrypoints/utils.py
+++ b/llumnix/entrypoints/utils.py
@@ -25,14 +25,12 @@ def __init__(self,
                  instances: Dict[str, "Llumlet"],
                  request_output_queue: "QueueServerBase",
                  server_info: "ServerInfo",
-                 deployment_mode: str,
                  log_requests: bool,
                  log_request_timestamps: bool):
         self.manager = manager
         self.instances = instances
         self.request_output_queue = request_output_queue
         self.server_info = server_info
-        self.deployment_mode = deployment_mode
         self.log_requests = log_requests
         self.log_request_timestamps = log_request_timestamps
diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py
index 20ceb239..a0ccd26d 100644
--- a/llumnix/entrypoints/vllm/api_server.py
+++ b/llumnix/entrypoints/vllm/api_server.py
@@ -198,7 +198,7 @@ async def is_ready() -> bool:
     uvicorn.run(app,
                 host=entrypoints_args.host,
                 port=entrypoints_args.port,
-                log_level="info",
+                log_level=entrypoints_args.log_level,
                 timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
                 ssl_keyfile=entrypoints_args.ssl_keyfile,
                 ssl_certfile=entrypoints_args.ssl_certfile)
diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py
index e69de29b..d92a5856 100644
--- a/llumnix/entrypoints/vllm/api_server_actor.py
+++ b/llumnix/entrypoints/vllm/api_server_actor.py
@@ -0,0 +1,80 @@
+import threading
+
+from ray.util.placement_group import PlacementGroup
+
+from llumnix.arg_utils import EntrypointsArgs
+from llumnix.entrypoints.utils import EntrypointsContext, get_ip_address
+from llumnix.llumlet.llumlet import Llumlet
+from llumnix.utils import get_server_name
+
+
+class FastAPIServer:
+    def __init__(self, entrypoints_args: EntrypointsArgs):
+        self.entrypoints_args = entrypoints_args
+        self.request_output_queue_port = self.entrypoints_args.request_output_queue_port
+        self.request_output_queue_type = self.entrypoints_args.request_output_queue_type
+        ip = get_ip_address()
+        self.request_output_queue = init_request_output_queue_server(
+            ip, self.request_output_queue_port, self.request_output_queue_type)
+
+    def setup_entrypoints_context(self,
+                                  manager: "ray.actor.ActorHandle",
+                                  instance_id: str,
+                                  instance: Llumlet):
+        # avoid circular import
+        # pylint: disable=import-outside-toplevel
+        from llumnix.entrypoints.setup import setup_entrypoints_context
+        self.entrypoints_context = setup_entrypoints_context(
+            self.entrypoints_args, manager, [instance_id], [instance], self.request_output_queue)
+
+    def _run_uvicorn_server(self,
+                            entrypoints_args: EntrypointsArgs,
+                            entrypoints_context: EntrypointsContext):
+        # pylint: disable=import-outside-toplevel
+        import llumnix.entrypoints.vllm.api_server
+        from llumnix.entrypoints.vllm.client import LlumnixClientVLLM
+        llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(entrypoints_context)
+        app = llumnix.entrypoints.vllm.api_server.app
+
+        logger.info("Start api server on '{}:{}'.".format(entrypoints_args.host, entrypoints_args.port))
+        uvicorn.run(app,
+                    host=entrypoints_args.host,
+                    port=entrypoints_args.port,
+                    log_level=entrypoints_args.log_level,
+                    timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE,
+                    ssl_keyfile=entrypoints_args.ssl_keyfile,
+                    ssl_certfile=entrypoints_args.ssl_certfile)
+
+    def run(self):
+        self.run_uvicorn_server_thread = threading.Thread(
+            target=self._run_uvicorn_server, args=(self.entrypoints_args, self.entrypoints_context),
+            daemon=True, name="run_uvicorn_server"
+        )
+        self.run_uvicorn_server_thread.start()
+
+    @classmethod
+    def from_args(cls,
+                  instance_id: str,
+                  placement_group: PlacementGroup,
+                  entrypoints_args: EntrypointsArgs):
+        try:
+            fastapi_server_class = ray.remote(num_cpus=1,
+                                              name=get_server_name(instance_id),
+                                              namespace="llumnix",
+                                              lifetime="detached")(cls).options(
+                                                  scheduling_strategy=PlacementGroupSchedulingStrategy(
+                                                      placement_group=placement_group,
+                                                      placement_group_bundle_index=0,
+                                                      placement_group_capture_child_tasks=True
+                                                  )
+                                              )
+            fastapi_server = fastapi_server_class.remote(entrypoints_args)
+        # pylint: disable=broad-except
+        except Exception as e:
+            logger.error("failed to initialize FastAPIServer: {}".format(e))
+            logger.error("exception traceback: {}".format(traceback.format_exc()))
+
+        return fastapi_server
+
+    def is_ready(self) -> bool:
+        return True
diff --git a/llumnix/entrypoints/vllm/client.py b/llumnix/entrypoints/vllm/client.py
index d13ae208..faff7973 100644
--- a/llumnix/entrypoints/vllm/client.py
+++ b/llumnix/entrypoints/vllm/client.py
@@ -11,7 +11,6 @@
 from llumnix.server_info import RequestTimestamps
 from llumnix.queue.queue_server_base import QueueServerBase
 from llumnix.server_info import ServerInfo
-from llumnix.entrypoints.utils import DeploymentMode

 logger = init_logger(__name__)

@@ -27,7 +26,6 @@ def __init__(self,
         self.server_info: ServerInfo = entrypoints_context.server_info
         self.log_requests = entrypoints_context.log_requests
         self.log_request_timestamps = entrypoints_context.log_request_timestamps
-        self.deployment_mode: DeploymentMode = entrypoints_context.deployment_mode
         self.request_streams: Dict[str, AsyncStream] = {}
         self.instance_num_requests: Dict[str, int] = {}
diff --git a/llumnix/entrypoints/vllm/serve.py b/llumnix/entrypoints/vllm/serve.py
index 2e2d90d9..f0d7e7ef 100644
--- a/llumnix/entrypoints/vllm/serve.py
+++ b/llumnix/entrypoints/vllm/serve.py
@@ -1,11 +1,14 @@
+import time
 from ray.util.queue import Queue as RayQueue

 from llumnix.entrypoints.vllm.arg_utils import add_cli_args, get_args
 from llumnix.entrypoints.setup import connect_to_ray_cluster
 from llumnix.config import get_llumnix_config
-from llumnix.arg_utils import LlumnixArgumentParser
+from llumnix.arg_utils import LlumnixArgumentParser, DeploymentArgs
 from llumnix.entrypoints.utils import DeploymentMode
+from llumnix.backends.backend_interface import BackendType
+from llumnix.entrypoints.setup import setup_llumnix

 if __name__ == "__main__":
@@ -30,6 +33,8 @@
     request_output_queue = RayQueue(actor_options={"namespace": "llumnix",
                                                    "name": "magic_ray_queue"})

-    engine_config = engine_args.create_engine_config()
-    parallel_config = engine_config.parallel_config
-    entrypoints_context = setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args)
+    setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args)
+
+    # keep the process alive to get the terminal output.
+    while True:
+        time.sleep(100.0)
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index 07db2888..1706794e 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -49,7 +49,7 @@ def __init__(self,
         try:
             logger.info("Llumlet backend type: {}".format(backend_type))
             self.instance_id = instance_id
-            self.instance_name = get_instance_name(instance_id)
+            self.actor_name = get_instance_name(instance_id)
             self.backend_engine: BackendInterface = init_backend_engine(instance_id,
                                                                         placement_group,
                                                                         request_output_queue_type,
@@ -87,16 +87,17 @@ def from_args(cls,
         world_size = get_engine_world_size(engine_args, backend_type)
         num_gpus = world_size
         instance_name = get_instance_name(instance_id)
+        # TODO(s5u13b): Check the max_concurrency.
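+        # (Added note on the TODO above; this describes general Ray actor
+        # semantics, not Llumnix-specific behavior.) max_concurrency bounds how
+        # many method invocations of this actor may run concurrently. It must
+        # stay large enough that control calls such as is_ready() and
+        # migrate_out() are not starved behind long-running engine calls; the
+        # value 4 below is inherited rather than tuned.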
         llumlet_class = ray.remote(num_cpus=1,
-                                   num_gpus=num_gpus,
-                                   name=instance_name,
-                                   namespace='llumnix',
-                                   max_concurrency=4,
-                                   lifetime="detached")(cls).options(
+                                  num_gpus=num_gpus,
+                                  name=instance_name,
+                                  namespace='llumnix',
+                                  max_concurrency=4,
+                                  lifetime="detached")(cls).options(
                                       scheduling_strategy=PlacementGroupSchedulingStrategy(
                                           placement_group=placement_group,
                                           placement_group_bundle_index=0,
-                                          placement_group_capture_child_tasks=True,
+                                          placement_group_capture_child_tasks=True
                                       )
                                   )
         llumlet = llumlet_class.remote(instance_id,
@@ -121,7 +122,7 @@ async def _check_engine_state_loop(self):
             # pylint: disable=protected-access
             self.backend_engine._stop_event.set()
             await asyncio.sleep(0)
-        self_actor = ray.get_actor(self.instance_name)
+        self_actor = ray.get_actor(self.actor_name)
         ray.kill(self_actor)

     async def migrate_out(self, dst_instance_name: str) -> List[str]:
diff --git a/llumnix/manager.py b/llumnix/manager.py
index 606fd4bb..2c130007 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -20,6 +20,8 @@
 import traceback
 from functools import partial
 import ray
+from ray.util.state import list_placement_groups, list_actors
+from ray.util.placement_group import PlacementGroup

 from llumnix.llumlet.llumlet import Llumlet
 from llumnix.logger import init_logger
@@ -31,20 +33,24 @@
 from llumnix.server_info import ServerInfo
 from llumnix.backends.backend_interface import BackendType
 from llumnix.utils import (random_uuid, clear_gloo_backend_state, remove_placement_group,
-                           get_instance_name, INSTANCE_NAME_PREFIX, MANAGER_NAME,
-                           run_async_func_sync)
+                           get_instance_name, get_manager_name, INSTANCE_NAME_PREFIX,
+                           SERVER_NAME_PREFIX, get_placement_group_name, run_async_func_sync,
+                           kill_server, kill_instance)
 from llumnix.entrypoints.utils import DeploymentMode
 from llumnix.utils import initialize_placement_group
 from llumnix.backends.utils import get_engine_world_size
 from llumnix.queue.queue_type import QueueType
+from llumnix.entrypoints.vllm.api_server_actor import FastAPIServer

 logger = init_logger(__name__)

-CLEAR_REQUEST_INSTANCE_INTERVAL = 3600
-NO_INSTANCE_RETRY_INTERVAL = 1.0
-WAIT_ALL_MIGRATIONS_DONE_INTERVAL = 1.0
-WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS = 1.0
-AUTO_DEPLOYMENT_INTERVAL = 1.0
+CLEAR_REQUEST_INSTANCE_INTERVAL = 600.0
+NO_INSTANCE_RETRY_INTERVAL = 0.1
+WAIT_ALL_MIGRATIONS_DONE_INTERVAL = 0.1
+AUTO_SCALE_UP_INTERVAL = 1.0
+WAIT_PLACEMENT_GROUP_TIMEOUT = 1.0
+CHECK_DEPLOYMENT_STATES_INTERVAL = 60.0
+WATCH_DEPLOYMENT_INTERVAL = 10.0

 # TODO(s5u13b): Fix the logger when manager failover.
 # TODO(s5u13b): Handle exception of ray operations.
@@ -60,16 +66,13 @@ def __init__(self,
                  deployment_args: DeploymentArgs = None
                  ) -> None:
         os.chdir(work_dir)
-        self.actor_name = MANAGER_NAME
+        self.actor_name = get_manager_name()
         self.manager_args = manager_args

         # engine_args and entrypoints_args are used in global deployment.
         self.entrypoints_args = entrypoints_args
         self.engine_args = engine_args
         self.deployment_args = deployment_args
-        assert deployment_args is None or \
-            (deployment_args is not None and entrypoints_args is not None and engine_args is not None)
-
         # deployment args
         if deployment_args is not None:
             self.deployment_mode: DeploymentMode = deployment_args.deployment_mode
@@ -121,6 +124,12 @@ def __init__(self,
         asyncio.create_task(self._update_instance_info_loop(self.polling_interval))
         asyncio.create_task(self._clear_request_instance_loop(CLEAR_REQUEST_INSTANCE_INTERVAL))

+        if hasattr(self, "deployment_mode") and self.deployment_mode == DeploymentMode.GLOBAL:
+            assert self.entrypoints_args is not None and self.engine_args is not None
+            self.last_timeout_instance_id = None
+            asyncio.create_task(self._auto_scale_up_loop(AUTO_SCALE_UP_INTERVAL))
+            asyncio.create_task(self._check_deployment_states_loop(CHECK_DEPLOYMENT_STATES_INTERVAL))
+
     async def generate(self, request_id: str, server_info: ServerInfo, *args, **kwargs,) -> None:
         while self.num_instances == 0:
             logger.warning("[generate] no instance available temporarily, sleep {}s, "
@@ -268,12 +277,50 @@ def migrate_done_callback_wrapper(migrate_instance_pair: Tuple[str, str], fut):
             logger.error("[_migrate] unexpected exception occurs: {}".format(e))
             logger.error("[_migrate] exception traceback: {}".format(traceback.format_exc()))

-    async def _rebuild_migrate_backend(self) -> None:
+    async def _auto_scale_up_loop(self, interval: float) -> None:
+        while True:
+            try:
+                new_pg = None
+                if self.last_timeout_instance_id is not None:
+                    last_timeout_pg_name = get_placement_group_name(self.last_timeout_instance_id)
+                    last_timeout_pg_states = list_placement_groups(filters=[("name", "=", last_timeout_pg_name)])
+                    if len(last_timeout_pg_states) > 0:
+                        new_instance_id = self.last_timeout_instance_id
+                        # pending or created (without server and instance)
+                        new_pg = ray.util.get_placement_group(last_timeout_pg_name)
+                    # reset
+                    self.last_timeout_instance_id = None
+                pending_pg_states = list_placement_groups(filters=[("state", "=", "PENDING")])
+                for pending_pg_state in pending_pg_states:
+                    instance_id = pending_pg_state["name"].split("_")[-1]
+                    if new_pg is not None and instance_id == new_instance_id:
+                        continue
+                    self.scale_down(instance_id)
+                if new_pg is None:
+                    new_instance_id = random_uuid()
+                    new_pg = self._init_placement_group(new_instance_id, self.engine_args, self.backend_type, contain_server=True)
+                try:
+                    await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT)
+                except asyncio.TimeoutError:
+                    logger.info("[_auto_scale_up_loop] waiting for the new placement group to be ready timed out")
+                    # After timeout, the new instance might be pending, created (without server and instance) or killed.
+                    self.last_timeout_instance_id = new_instance_id
+                    await asyncio.sleep(interval)
+                    continue
+                self._init_server_and_instance(new_instance_id, new_pg)
+                logger.info("[_auto_scale_up_loop] deploy server and instance to new placement group done")
+            # pylint: disable=broad-except
+            except Exception as e:
+                logger.error("[_auto_scale_up_loop] unexpected exception occurs: {}".format(e))
+                logger.error("[_auto_scale_up_loop] exception traceback: {}".format(traceback.format_exc()))
+
+    # TODO(KuilongCui): Add comments for this function.
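+    # (Added commentary for the TODO above, summarizing the body that follows.)
+    # Rebuild flow: wait until all in-flight migrations drain, temporarily
+    # disable new migrations, re-create the gloo/nccl group over the instances
+    # that are still alive (scaling down any found dead along the way), then
+    # restore the original enable_migration flag.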
+    async def _rebuild_migration_backend(self) -> None:
         # Wait for all instances to finish migration
         while any(self.instance_migrating.values()):
             await asyncio.sleep(WAIT_ALL_MIGRATIONS_DONE_INTERVAL)

-        # During rebuilding migration backend, disable migrate
+        # During rebuilding migration backend, disable migration.
         origin_config = self.enable_migration
         self.enable_migration = False

@@ -288,7 +335,7 @@ async def run_task(alive_instances: List[str], task_name: str, *args, **kwargs):
                 if isinstance(ret, ray.exceptions.RayActorError):
                     dead_instances.add(instance_name)
             if len(dead_instances) > 0:
-                self.scale_down(dead_instances, rebuild_migrate_backend=False)
+                self.scale_down(dead_instances, rebuild_migration_backend=False)
                 if self.manager_args.migration_backend == 'gloo':
                     clear_gloo_backend_state()
             return dead_instances
@@ -323,16 +370,20 @@ async def run_task(alive_instances: List[str], task_name: str, *args, **kwargs):
                 src_filter=lambda instance_info: instance_info.instance_id in alive_instances,
                 dst_filter=lambda instance_info: instance_info.instance_id in alive_instances)

-        logger.info("[rebuild_migrate_backend] rebuild {} migration backend done, group_name: {}, alive instance ({}): {}"
+        logger.info("[rebuild_migration_backend] rebuild {} migration backend done, group_name: {}, alive instance ({}): {}"
                     .format(self.manager_args.migration_backend, group_name, len(alive_instances), alive_instances))

         # Restore migrate config
         self.enable_migration = origin_config

-    def scale_up(self, instance_id: Union[str, Iterable[str]], llumlet_actor_handles: List["ray.actor.ActorHandle"]) -> None:
+    def scale_up(self,
+                 instance_id: Union[str, Iterable[str]],
+                 instance_actor_handle: Union["ray.actor.ActorHandle", List["ray.actor.ActorHandle"]]) -> None:
         if isinstance(instance_id, str):
             instance_id = [instance_id,]
+            instance_actor_handle = [instance_actor_handle,]
         instance_ids = list(instance_id)
+        instance_actor_handles = list(instance_actor_handle)

         indeed_update = False
         no_pending_instance = (self.pending_rebuild_migration_instances == 0)

@@ -340,7 +391,7 @@
         for idx, ins_id in enumerate(instance_ids):
             if ins_id not in self.instances:
                 indeed_update = True
-                self.instances[ins_id] = llumlet_actor_handles[idx]
+                self.instances[ins_id] = instance_actor_handles[idx]
                 self.instance_migrating[ins_id] = False
                 if self.log_instance_info:
                     self.instance_last_logged_empty[ins_id] = False
@@ -350,15 +401,15 @@
         # When scaling up, we need to rebuild the migration backend. But if initially self.pending_rebuild_migration_instances != 0,
         # a coroutine is already handling the changes in the number of instances in the cluster and it will account for the changes
-        # caused by this scale-up (see rebuild_migrate_backend for details). Therefore, we simply return in this case. Specifically,
-        # for RPC, the Ray actor handle is used for the migration cache, so there is no need to rebuild the group.
+        # caused by this scale-up (see rebuild_migration_backend for details). Therefore, we simply return in this case.
+        # Specifically, for RayRPC migration backend, the Ray actor handle is used for the migration cache, so there is no need to rebuild the group.
         if self.enable_migration and self.manager_args.migration_backend in ['gloo', 'nccl'] \
             and indeed_update and no_pending_instance:
-            asyncio.create_task(self._rebuild_migrate_backend())
+            asyncio.create_task(self._rebuild_migration_backend())

         return self.num_instances

-    def scale_down(self, instance_id: Union[str, Iterable[str]], rebuild_migrate_backend: bool = True) -> None:
+    def scale_down(self, instance_id: Union[str, Iterable[str]], rebuild_migration_backend: bool = True) -> None:
         if isinstance(instance_id, str):
             instance_id = [instance_id,]
         instance_ids = list(instance_id)

@@ -367,6 +418,7 @@
         no_pending_instance = self.pending_rebuild_migration_instances == 0

         for ins_id in instance_ids:
+            self._clear_instance_ray_resources(ins_id)
             if ins_id in self.instances:
                 indeed_update = True
                 if ins_id in self.instances:
@@ -377,8 +429,6 @@
                     del self.instance_migrating[ins_id]
                 else:
                     logger.warning("[scale_down] instance {} is not in self.instance_migrating".format(ins_id))
-            if not remove_placement_group(ins_id):
-                logger.warning("[scale_down] failed to remove placement group of instance {}".format(ins_id))
             if self.log_instance_info:
                 if ins_id in self.instance_last_logged_empty:
                     del self.instance_last_logged_empty[ins_id]
@@ -393,11 +443,19 @@
                 self.pending_rebuild_migration_instances = 0
                 if self.manager_args.migration_backend == 'gloo':
                     clear_gloo_backend_state()
-        elif indeed_update and no_pending_instance and rebuild_migrate_backend:
-            asyncio.create_task(self._rebuild_migrate_backend())
+        elif indeed_update and no_pending_instance and rebuild_migration_backend:
+            asyncio.create_task(self._rebuild_migration_backend())

         return self.num_instances

+    def _clear_instance_ray_resources(self, instance_id: str):
+        if not remove_placement_group(instance_id):
+            logger.warning("[clear_instance_ray_resources] failed to remove placement group {}".format(instance_id))
+        if not kill_server(instance_id):
+            logger.warning("[clear_instance_ray_resources] failed to kill server {}".format(instance_id))
+        if not kill_instance(instance_id):
+            logger.warning("[clear_instance_ray_resources] failed to kill instance {}".format(instance_id))
+
     async def _connect_to_instances(self):
         def connect_to_instances_done_callback(instance_id: str, instance_actor_handle: "ray.actor.ActorHandle", fut):
             ret = fut.result()[0]
@@ -435,8 +493,8 @@ def from_args(cls,
                   ) -> "Manager":
         manager_class = ray.remote(num_cpus=0,
                                    max_restarts=-1,
-                                   name=MANAGER_NAME,
-                                   namespace='llumnix',
+                                   name=get_manager_name(),
+                                   namespace="llumnix",
                                    lifetime="detached")(cls)
         manager = manager_class.remote(manager_args,
                                        os.getcwd(),
@@ -446,18 +504,56 @@
         return manager

-    def init_llumlets(self,
-                      request_output_queue_type: QueueType,
-                      backend_type: BackendType,
-                      engine_args,
-                      instance_ids: List[str] = None
+    def _init_placement_group(self,
+                              instance_id: str,
+                              engine_args,
+                              backend_type: BackendType,
+                              contain_server: bool = False) -> PlacementGroup:
+        if not self.manager_args.profiling_result_file_path:
+            # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor
+            # num_gpus=world_size, for world_size Workers
+            world_size = get_engine_world_size(engine_args, backend_type)
+            placement_group = initialize_placement_group(instance_id, num_cpus=3+int(contain_server), num_gpus=world_size, detached=True)
+        else:
+            assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM."
+            # num_cpus=2, for Llumlet + AsyncPutQueueActor
+            placement_group = initialize_placement_group(instance_id, num_cpus=2+int(contain_server), num_gpus=0, detached=True)
+
+        return placement_group
+
+    def _init_server(self,
+                     instance_id: str,
+                     placement_group: PlacementGroup,
+                     entrypoints_args: EntrypointsArgs) -> FastAPIServer:
+        fastapi_server = FastAPIServer.from_args(instance_id, placement_group, entrypoints_args)
+        return fastapi_server
+
+    def _init_instance(self,
+                       instance_id: str,
+                       placement_group: PlacementGroup,
+                       request_output_queue_type: QueueType,
+                       backend_type: BackendType,
+                       engine_args
+                       ) -> Tuple[str, Llumlet]:
+        instance = Llumlet.from_args(
+                        instance_id,
+                        placement_group,
+                        request_output_queue_type,
+                        self.manager_args.create_migration_config(),
+                        backend_type,
+                        engine_args,
+                        self.manager_args.profiling_result_file_path)
+
+        return instance
+
+    def init_instances(self,
+                       request_output_queue_type: QueueType,
+                       backend_type: BackendType,
+                       engine_args
                       ) -> Tuple[List[str], List[Llumlet]]:
-        manager_args = self.manager_args
-        world_size = get_engine_world_size(engine_args, backend_type)
         instance_ids: List[str] = []
-        llumlets: List[Llumlet] = []
-        for _ in range(manager_args.initial_instances):
+        instances: List[Llumlet] = []
+        for _ in range(self.manager_args.initial_instances):
             instance_id = random_uuid()
             if not manager_args.profiling_result_file_path:
                 # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor, num_gpus=world_size, for Workers
@@ -489,13 +585,59 @@
                               engine_args,
                               manager_args.profiling_result_file_path)
             instance_ids.append(instance_id)
-            llumlets.append(llumlet)
+            instances.append(instance)

-        return instance_ids, llumlets
+        return instance_ids, instances
+
+    def _init_server_and_instance(self,
+                                  instance_id: str,
+                                  placement_group: PlacementGroup):
+        async def done_scale_up():
+            try:
+                manager = ray.get_actor(get_manager_name(), namespace="llumnix")
+                await server.is_ready.remote()
+                await server.setup_entrypoints_context.remote(manager, instance_id, instance)
+                await instance.is_ready.remote()
+                await server.run.remote()
+                self.scale_up(instance_id, instance)
+            # pylint: disable=broad-except
+            except Exception as e:
+                logger.error("[_init_server_and_instance] unexpected exception occurs: {}".format(e))
+                logger.error("[_init_server_and_instance] exception traceback: {}".format(traceback.format_exc()))
+                self._clear_instance_ray_resources(instance_id)
+
+        request_output_queue_type = self.entrypoints_args.request_output_queue_type
+        instance = self._init_instance(instance_id, placement_group, request_output_queue_type, self.backend_type, self.engine_args)
+        server = self._init_server(instance_id, placement_group, self.entrypoints_args)
+        asyncio.create_task(done_scale_up())
+
+    async def _check_deployment_states_loop(self, interval: float) -> None:
+        async def watch_deployment(instance_id: str):
+            logger.warning("[_check_deployment_states_loop] watch instance {} deployment".format(instance_id))
+            await asyncio.sleep(WATCH_DEPLOYMENT_INTERVAL)
+            curr_pgs, curr_servers, curr_instances = self.get_curr_deployment()
+            if instance_id in curr_pgs and (instance_id not in curr_servers or instance_id not in curr_instances):
+                logger.warning("[_check_deployment_states_loop] instance {} deployment states incorrect".format(instance_id))
+                self.scale_down(instance_id)
+
+        while True:
+            try:
+                curr_pgs, curr_servers, curr_instances = self.get_curr_deployment()
+                assert len(curr_pgs) >= max(len(curr_servers), len(curr_instances))
+                tasks = []
+                for instance_id in curr_pgs:
+                    if instance_id not in curr_servers or instance_id not in curr_instances:
+                        tasks.append(asyncio.create_task(watch_deployment(instance_id)))
+                await asyncio.gather(*tasks, return_exceptions=True)
+                await asyncio.sleep(interval)
+            # pylint: disable=broad-except
+            except Exception as e:
+                logger.error("[_check_deployment_states_loop] unexpected exception occurs: {}".format(e))
+                logger.error("[_check_deployment_states_loop] exception traceback: {}".format(traceback.format_exc()))

     async def is_ready(self) -> bool:
         """Called by api server, return true when all the instances have been successfully created."""
-        tasks = [llumlet.is_ready.remote() for llumlet in self.instances.values()]
+        tasks = [instance.is_ready.remote() for instance in self.instances.values()]
         is_ready_list = await asyncio.gather(*tasks)
         return all(is_ready_list)

@@ -519,6 +661,27 @@ def check_instance_error_done_callback(idx: int, instance_id: str, fut):

         return results

+    def get_curr_deployment(self) -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIServer], Dict[str, Llumlet]]:
+        curr_pgs: Dict[str, PlacementGroup] = {}
+        curr_servers: Dict[str, FastAPIServer] = {}
+        curr_instances: Dict[str, Llumlet] = {}
+
+        created_pg_states = list_placement_groups(filters=[("state", "=", "CREATED")])
+        for created_pg_state in created_pg_states:
+            instance_id = created_pg_state["name"].split("_")[-1]
+            curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"])
+
+        alive_actor_states = list_actors(filters=[("state", "=", "ALIVE")])
+        for alive_actor_state in alive_actor_states:
+            if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX):
+                instance_id = alive_actor_state["name"].split("_")[-1]
+                curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix")
+            elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX):
+                instance_id = alive_actor_state["name"].split("_")[-1]
+                curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix")
+
+        return curr_pgs, curr_servers, curr_instances
+
     async def _get_request_instance(self) -> None:
         def get_request_instance_done_callback(instance_id: str, fut):
             ret = fut.result()[0]
diff --git a/llumnix/utils.py b/llumnix/utils.py
index 3ad9331c..f0390283 100644
--- a/llumnix/utils.py
+++ b/llumnix/utils.py
@@ -108,9 +108,15 @@
         # gloo_queue may not have been created yet; just ignore this error.
         pass

+def get_manager_name() -> str:
+    return MANAGER_NAME
+
 def get_placement_group_name(instance_id: str) -> str:
     return f"{PLACEMENT_GROUP_NAME_PREFIX}{instance_id}"

+def get_server_name(instance_id: str) -> str:
+    return f"{SERVER_NAME_PREFIX}{instance_id}"
+
 def get_instance_name(instance_id: str) -> str:
     return f"{INSTANCE_NAME_PREFIX}{instance_id}"

@@ -121,6 +127,33 @@ def remove_placement_group(instance_id: str) -> bool:
         return False
         # asynchronous api
         ray.util.remove_placement_group(placement_group)
+        logger.info("remove placement group {}".format(instance_id))
+    # pylint: disable=broad-except
+    except Exception:
+        return False
+    return True
+
+def kill_server(instance_id: str) -> bool:
+    try:
+        server = ray.get_actor(get_server_name(instance_id), namespace="llumnix")
+    except ValueError:
+        return False
+    try:
+        ray.kill(server)
+        logger.info("kill server {}".format(instance_id))
+    # pylint: disable=broad-except
+    except Exception:
+        return False
+    return True
+
+def kill_instance(instance_id: str) -> bool:
+    try:
+        instance = ray.get_actor(get_instance_name(instance_id), namespace="llumnix")
+    except ValueError:
+        return False
+    try:
+        ray.kill(instance)
+        logger.info("kill instance {}".format(instance_id))
     # pylint: disable=broad-except
     except Exception:
         return False
     return True
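The kill helpers above share one pattern: resolve a named, detached actor and force-kill it, treating "not found" as a soft failure. A minimal self-contained sketch of that pattern, with an illustrative function name (not part of llumnix):

import ray

def kill_named_actor(name: str, namespace: str = "llumnix") -> bool:
    # A missing actor is not an error: it may already be dead or never created.
    try:
        actor = ray.get_actor(name, namespace=namespace)
    except ValueError:
        return False
    # ray.kill() forcibly terminates the actor without running shutdown hooks.
    ray.kill(actor)
    return True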
diff --git a/tests/unit_test/backends/vllm/test_simulator.py b/tests/unit_test/backends/vllm/test_simulator.py
index 1167ffdf..4d11f1a4 100644
--- a/tests/unit_test/backends/vllm/test_simulator.py
+++ b/tests/unit_test/backends/vllm/test_simulator.py
@@ -82,9 +82,9 @@ class DummyActor:
         def __init__(self):
             pass
     dummy_actor = ray.remote(num_cpus=1,
-                             name="instance_0",
-                             namespace='llumnix',
-                             max_concurrency=4)(DummyActor)
+                            name="instance_0",
+                            namespace='llumnix',
+                            max_concurrency=4)(DummyActor)
     dummy_actor = dummy_actor.remote()
     placement_group = initialize_placement_group("0", num_cpus=2, num_gpus=0, detached=True)
     sim_backend = MockBackendSim(instance_id="0",
diff --git a/tests/unit_test/entrypoints/test_utils.py b/tests/unit_test/entrypoints/test_utils.py
index ab694266..413906ee 100644
--- a/tests/unit_test/entrypoints/test_utils.py
+++ b/tests/unit_test/entrypoints/test_utils.py
@@ -19,7 +19,7 @@
 from llumnix.entrypoints.setup import launch_ray_cluster, init_manager
 from llumnix.entrypoints.utils import get_ip_address, retry_manager_method_sync, retry_manager_method_async
 from llumnix.queue.utils import init_request_output_queue_server
-from llumnix.utils import MANAGER_NAME
+from llumnix.utils import get_manager_name

 # pylint: disable=unused-import
 from tests.conftest import ray_env
@@ -36,7 +36,7 @@ def test_init_manager(ray_env):
     manager_args = ManagerArgs()
     manager = init_manager(manager_args)
     assert manager is not None
-    manager_actor_handle = ray.get_actor(MANAGER_NAME, namespace='llumnix')
+    manager_actor_handle = ray.get_actor(get_manager_name(), namespace='llumnix')
    assert manager_actor_handle is not None
    assert manager == manager_actor_handle
diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager.py b/tests/unit_test/entrypoints/vllm/api_server_manager.py
index 4a184049..daff55e2 100644
--- a/tests/unit_test/entrypoints/vllm/api_server_manager.py
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager.py
@@ -22,7 +22,7 @@
 import llumnix.manager
 from llumnix.arg_utils import ManagerArgs
 from llumnix.server_info import ServerInfo, RequestTimestamps
-from llumnix.utils import random_uuid, MANAGER_NAME
+from llumnix.utils import random_uuid, get_manager_name
 from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType
 from llumnix.entrypoints.setup import EntrypointsContext
 from llumnix.entrypoints.vllm.client import LlumnixClientVLLM
@@ -53,8 +53,8 @@ def testing_stats(self):

 def init_manager(request_output_queue_type: QueueType):
-    manager = MockManager.options(name=MANAGER_NAME,
-                                  namespace='llumnix').remote(request_output_queue_type)
+    manager = MockManager.options(name=get_manager_name(),
+                                  namespace='llumnix').remote(request_output_queue_type)
     return manager

 @app.get("/stats")
@@ -85,7 +85,6 @@ def stats() -> Response:
                                              request_output_queue,
                                              server_info,
                                              None,
-                                             None,
                                              None)
     llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(llumnix_context)
diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
index a4b1e4c3..7d8a99ec 100644
--- a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
@@ -28,7 +28,7 @@
 from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType
 from llumnix.entrypoints.setup import EntrypointsContext
 from llumnix.entrypoints.vllm.client import LlumnixClientVLLM
-from llumnix.utils import MANAGER_NAME
+from llumnix.utils import get_manager_name

 app = llumnix.entrypoints.vllm.api_server.app
 manager = None
@@ -69,19 +69,18 @@ def __init__(self, host: str, port: int, request_output_queue_type: QueueType):
         ip = '127.0.0.1'
         port = 1234
         global manager
-        manager = ray.get_actor(MANAGER_NAME, namespace="llumnix")
+        manager = ray.get_actor(get_manager_name(), namespace="llumnix")
         request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type)
         ray_queue_server = None
         if request_output_queue_type == QueueType.RAYQUEUE:
             ray_queue_server = request_output_queue
         server_info = ServerInfo(random_uuid(), request_output_queue_type, ray_queue_server, ip, port)
         llumnix_context = EntrypointsContext(manager,
-                                             {'0': None},
-                                             request_output_queue,
-                                             server_info,
-                                             None,
-                                             None,
-                                             None)
+                                             {'0': None},
+                                             request_output_queue,
+                                             server_info,
+                                             None,
+                                             None)
         llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(llumnix_context)

     def run(self):
@@ -93,8 +92,8 @@ def run(self):
             timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE)

 def init_manager_service(request_output_queue_type: QueueType, args: 'Namespace'):
-    manager = MockManagerService.options(name=MANAGER_NAME,
-                                         namespace='llumnix').remote(request_output_queue_type, args)
+    manager = MockManagerService.options(name=get_manager_name(),
+                                         namespace='llumnix').remote(request_output_queue_type, args)
     return manager

 @app.get("/stats")
diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py
index d6a766b8..64a0208e 100644
--- a/tests/unit_test/global_scheduler/test_manager.py
+++ b/tests/unit_test/global_scheduler/test_manager.py
@@ -19,7 +19,7 @@

 from vllm import EngineArgs

-from llumnix.utils import random_uuid, get_instance_name, MANAGER_NAME
+from llumnix.utils import random_uuid, get_instance_name, get_manager_name
 from llumnix.arg_utils import ManagerArgs
 from llumnix.manager import Manager
 from llumnix.instance_info import InstanceInfo
@@ -109,22 +109,22 @@ def init_manager():
         manager_args.log_instance_info = False
         manager = Manager.from_args(manager_args=manager_args)
     except ValueError:
-        manager = ray.get_actor(MANAGER_NAME, namespace='llumnix')
+        manager = ray.get_actor(get_manager_name(), namespace='llumnix')
     ray.get(manager.is_ready.remote())
     return manager

-def init_llumlets(initial_instances):
+def init_instances(initial_instances):
     instance_ids = []
-    llumlets = []
+    instances = []
     for _ in range(initial_instances):
         instance_id = random_uuid()
         instance_name = get_instance_name(instance_id)
         llumlet = MockLlumlet.options(name=instance_name,
                                       namespace='llumnix').remote(instance_id)
         instance_ids.append(instance_id)
-        llumlets.append(llumlet)
-    ray.get([llumlet.is_ready.remote() for llumlet in llumlets])
-    return instance_ids, llumlets
+        instances.append(llumlet)
+    ray.get([instance.is_ready.remote() for instance in instances])
+    return instance_ids, instances

 @pytest.fixture
 def manager():
@@ -143,7 +143,7 @@ def llumlet():

 def test_init_manager(ray_env, manager):
     assert manager is not None
-    manager_actor_handle = ray.get_actor(MANAGER_NAME, namespace='llumnix')
+    manager_actor_handle = ray.get_actor(get_manager_name(), namespace='llumnix')
     assert manager_actor_handle is not None
     assert manager == manager_actor_handle

@@ -151,33 +151,33 @@ def test_init_llumlet(ray_env, llumlet):
     assert llumlet is not None
     ray.get(llumlet.is_ready.remote())

-def test_init_llumlets(ray_env, manager):
+def test_init_instances(ray_env, manager):
     engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
-    instance_ids, llumlets = ray.get(manager.init_llumlets.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
-    num_instances = ray.get(manager.scale_up.remote(instance_ids, llumlets))
+    instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
+    num_instances = ray.get(manager.scale_up.remote(instance_ids, instances))
     manager_args = ManagerArgs()
     assert num_instances == manager_args.initial_instances

-def test_init_llumlets_sim(ray_env, manager):
+def test_init_instances_sim(ray_env, manager):
     manager.profiling_result_file_path="//"
     # pylint: disable=import-outside-toplevel
     import llumnix.backends.vllm.simulator
     llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim
     engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
-    instance_ids, llumlets = ray.get(manager.init_llumlets.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
-    num_instances = ray.get(manager.scale_up.remote(instance_ids, llumlets))
+    instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))
+    num_instances = ray.get(manager.scale_up.remote(instance_ids, instances))
     manager_args = ManagerArgs()
     assert num_instances == manager_args.initial_instances

 def test_scale_up_and_down(ray_env, manager):
     initial_instances = 4
-    instance_ids, llumlets = init_llumlets(initial_instances)
-    num_instances = ray.get(manager.scale_up.remote(instance_ids, llumlets))
+    instance_ids, instances = init_instances(initial_instances)
+    num_instances = ray.get(manager.scale_up.remote(instance_ids, instances))
     assert num_instances == initial_instances
-    instance_ids_1, llumlets_1 = init_llumlets(initial_instances)
+    instance_ids_1, instances_1 = init_instances(initial_instances)
     num_instances = ray.get(manager.scale_down.remote(instance_ids_1))
     assert num_instances == initial_instances
-    num_instances = ray.get(manager.scale_up.remote(instance_ids_1, llumlets_1))
+    num_instances = ray.get(manager.scale_up.remote(instance_ids_1, instances_1))
     assert num_instances == initial_instances * 2
     num_instances = ray.get(manager.scale_down.remote(instance_ids))
     assert num_instances == initial_instances
@@ -186,18 +186,18 @@

 def test_connect_to_instances(ray_env):
     initial_instances = 4
-    instance_ids, llumlets = init_llumlets(initial_instances)
-    ray.get([llumlet.is_ready.remote() for llumlet in llumlets])
+    instance_ids, instances = init_instances(initial_instances)
+    ray.get([instance.is_ready.remote() for instance in instances])
     manager = init_manager()
-    instance_ids_1, llumlets_1 = init_llumlets(initial_instances)
-    num_instances = ray.get(manager.scale_up.remote(instance_ids_1, llumlets_1))
+    instance_ids_1, instances_1 = init_instances(initial_instances)
+    num_instances = ray.get(manager.scale_up.remote(instance_ids_1, instances_1))
     assert num_instances == initial_instances * 2
     num_instances = ray.get(manager.scale_down.remote(instance_ids))
     assert num_instances == initial_instances

 def test_generate_and_abort(ray_env, manager, llumlet):
     instance_id = ray.get(llumlet.get_instance_id.remote())
-    ray.get(manager.scale_up.remote(instance_id, [llumlet]))
+    ray.get(manager.scale_up.remote(instance_id, llumlet))
     request_id = random_uuid()
     num_requests = ray.get(llumlet.get_num_requests.remote())
     assert num_requests == 0
@@ -216,8 +216,8 @@
     assert num_requests == 0

 def test_get_request_instance(ray_env):
-    _, llumlets = init_llumlets(2)
-    llumlet, llumlet_1 = llumlets[0], llumlets[1]
+    _, instances = init_instances(2)
+    llumlet, llumlet_1 = instances[0], instances[1]
     manager = init_manager()
     request_id = random_uuid()
     request_id_1 = random_uuid()
@@ -252,37 +252,37 @@ def get_instance_info_migrate_out(instance_id):
     return instance_info

 def test_update_instance_info_loop_and_migrate(ray_env, manager):
-    num_llumlets = 5
-    instance_ids, llumlets = init_llumlets(num_llumlets)
+    num_instances = 5
+    instance_ids, instances = init_instances(num_instances)

-    for i in range(num_llumlets):
+    for i in range(num_instances):
         for _ in range(2*(i+1)):
-            ray.get(llumlets[i].generate.remote(random_uuid(), None, math.inf, None, None))
+            ray.get(instances[i].generate.remote(random_uuid(), None, math.inf, None, None))

     instance_info = InstanceInfo()
     instance_info.instance_type = InstanceType.NO_CONSTRAINTS

-    for i in range(num_llumlets):
+    for i in range(num_instances):
         instance_info.instance_id = instance_ids[i]
         instance_info.num_available_gpu_blocks = 40 - i * 10
         instance_info.num_running_requests = i
         instance_info.num_blocks_first_waiting_request = i
-        ray.get(llumlets[i].set_instance_info.remote(instance_info))
+        ray.get(instances[i].set_instance_info.remote(instance_info))

-    for i in range(num_llumlets):
-        num_migrate_out = ray.get(llumlets[i].get_num_migrate_out.remote())
+    for i in range(num_instances):
+        num_migrate_out = ray.get(instances[i].get_num_migrate_out.remote())
         assert num_migrate_out == 0

-    ray.get(manager.scale_up.remote(instance_ids, llumlets))
+    ray.get(manager.scale_up.remote(instance_ids, instances))
     time.sleep(2)

-    for i in range(num_llumlets):
-        num_migrate_out = ray.get(llumlets[i].get_num_migrate_out.remote())
-        num_migrate_in = ray.get(llumlets[i].get_num_migrate_in.remote())
+    for i in range(num_instances):
+        num_migrate_out = ray.get(instances[i].get_num_migrate_out.remote())
+        num_migrate_in = ray.get(instances[i].get_num_migrate_in.remote())
         if i == 0:
             assert num_migrate_in > 1 and num_migrate_out == 0
-        elif i == num_llumlets - 1:
+        elif i == num_instances - 1:
             assert num_migrate_in == 0 and num_migrate_out > 1
         else:
             assert num_migrate_in == 0 and num_migrate_out == 0
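The deployment-states loop added in this patch follows a supervise-and-reconcile pattern that is easier to see in isolation. A condensed, self-contained sketch (function names and callbacks are illustrative, not llumnix APIs):

import asyncio

CHECK_INTERVAL = 60.0   # mirrors CHECK_DEPLOYMENT_STATES_INTERVAL
WATCH_INTERVAL = 10.0   # mirrors WATCH_DEPLOYMENT_INTERVAL

async def reconcile_loop(get_state, scale_down):
    # get_state() returns (pgs, servers, instances): dicts keyed by instance id.
    async def watch(instance_id: str):
        # Grace period: a partially deployed instance may still come up.
        await asyncio.sleep(WATCH_INTERVAL)
        pgs, servers, instances = get_state()
        if instance_id in pgs and (instance_id not in servers or instance_id not in instances):
            scale_down(instance_id)

    while True:
        pgs, servers, instances = get_state()
        suspects = [i for i in pgs if i not in servers or i not in instances]
        await asyncio.gather(*(watch(i) for i in suspects), return_exceptions=True)
        await asyncio.sleep(CHECK_INTERVAL)
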
From a4358e6b01c8ddec9e94865a0a65a953bbab1e86 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Mon, 6 Jan 2025 03:41:42 +0000
Subject: [PATCH 45/92] Refine api server unit test

---
 llumnix/manager.py                                      |  6 +++---
 .../unit_test/entrypoints/vllm/api_server_manager.py    |  5 +----
 .../entrypoints/vllm/api_server_manager_service.py      | 11 ++++++-----
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/llumnix/manager.py b/llumnix/manager.py
index 2c130007..48167ca1 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -508,16 +508,16 @@ def _init_placement_group(self,
                               instance_id: str,
                               engine_args,
                               backend_type: BackendType,
-                              contain_server: bool = False) -> PlacementGroup:
+                              init_server: bool = False) -> PlacementGroup:
         if not self.manager_args.profiling_result_file_path:
             # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor
             # num_gpus=world_size, for world_size Workers
             world_size = get_engine_world_size(engine_args, backend_type)
-            placement_group = initialize_placement_group(instance_id, num_cpus=3+int(contain_server), num_gpus=world_size, detached=True)
+            placement_group = initialize_placement_group(instance_id, num_cpus=3+int(init_server), num_gpus=world_size, detached=True)
         else:
             assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM."
             # num_cpus=2, for Llumlet + AsyncPutQueueActor
-            placement_group = initialize_placement_group(instance_id, num_cpus=2+int(contain_server), num_gpus=0, detached=True)
+            placement_group = initialize_placement_group(instance_id, num_cpus=2+int(init_server), num_gpus=0, detached=True)

         return placement_group

diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager.py b/tests/unit_test/entrypoints/vllm/api_server_manager.py
index daff55e2..22eda14a 100644
--- a/tests/unit_test/entrypoints/vllm/api_server_manager.py
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager.py
@@ -76,10 +76,7 @@ def stats() -> Response:
     ip = '127.0.0.1'
     port = 1234
     request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type)
-    ray_queue_server = None
-    if request_output_queue_type == QueueType.RAYQUEUE:
-        ray_queue_server = request_output_queue
-    server_info = ServerInfo(random_uuid(), request_output_queue_type, ray_queue_server, ip, port)
+    server_info = ServerInfo(random_uuid(), request_output_queue_type, request_output_queue, ip, port)
     llumnix_context = EntrypointsContext(manager,
                                          {'0': None},
                                          request_output_queue,
diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
index 7d8a99ec..22703406 100644
--- a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py
@@ -13,6 +13,7 @@
 import argparse
 import time
+import threading
 import uvicorn
 import ray
 from ray.util.queue import Queue as RayQueue
@@ -28,8 +29,9 @@
 from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType
 from llumnix.entrypoints.setup import EntrypointsContext
 from llumnix.entrypoints.vllm.client import LlumnixClientVLLM
-from llumnix.utils import get_manager_name
+from llumnix.utils import get_manager_name, get_server_name

+# for stats api
 app = llumnix.entrypoints.vllm.api_server.app
 manager = None
 ENTRYPOINTS_ACTOR_NAME = "entrypoints"
@@ -68,13 +70,11 @@ def __init__(self, host: str, port: int, request_output_queue_type: QueueType):
         self.port = port
         ip = '127.0.0.1'
         port = 1234
+        # for app manager
         global manager
         manager = ray.get_actor(get_manager_name(), namespace="llumnix")
         request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type)
-        ray_queue_server = None
-        if request_output_queue_type == QueueType.RAYQUEUE:
-            ray_queue_server = request_output_queue
-        server_info = ServerInfo(random_uuid(), request_output_queue_type, ray_queue_server, ip, port)
+        server_info = ServerInfo(random_uuid(), request_output_queue_type, request_output_queue, ip, port)
         llumnix_context = EntrypointsContext(manager,
                                              {'0': None},
                                              request_output_queue,
@@ -117,4 +117,5 @@ def stats() -> Response:
     request_output_queue_type = QueueType(args.request_output_queue_type)
     manager = init_manager_service(request_output_queue_type, args)

+    # wait initialization done
     time.sleep(2)

From e39fc900f0456dd3dc07f9b1773958434fc6860f Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Mon, 6 Jan 2025 07:18:23 +0000
Subject: [PATCH 46/92] Refine api server unit test

---
 llumnix/entrypoints/setup.py                    |   4 +-
 llumnix/entrypoints/vllm/api_server_actor.py    |   3 +-
 llumnix/manager.py                              |   6 +-
 .../unit_test/entrypoints/vllm/api_server.py    |  14 ++
 .../entrypoints/vllm/api_server_manager.py      |  74 +++++-----
 .../vllm/api_server_manager_service.py          | 127 +++++++-----------
 .../entrypoints/vllm/test_api_server.py         |   2 +-
 7 files changed, 108 insertions(+), 122 deletions(-)
 create mode 100644 tests/unit_test/entrypoints/vllm/api_server.py

diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py
index 7da6be66..135e20c6 100644
--- a/llumnix/entrypoints/setup.py
+++ b/llumnix/entrypoints/setup.py
@@ -145,7 +145,7 @@ def setup_entrypoints_context(entrypoints_args, manager, instance_ids, instances
     server_id = random_uuid()
     ip = get_ip_address()
     server_info = ServerInfo(server_id,
-                             entrypoints_args.request_output_queue_type,
+                             QueueType(entrypoints_args.request_output_queue_type),
                              request_output_queue,
                              ip,
                              entrypoints_args.request_output_queue_port)
@@ -166,7 +166,7 @@ def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment
     manager, instance_ids, instances, request_output_queue = \
         init_llumnix_components(manager_args,
                                 engine_args,
-                                entrypoints_args.request_output_queue_type,
+                                QueueType(entrypoints_args.request_output_queue_type),
                                 entrypoints_args.request_output_queue_port,
                                 deployment_args.backend_type)

diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py
index d92a5856..4f158182 100644
--- a/llumnix/entrypoints/vllm/api_server_actor.py
+++ b/llumnix/entrypoints/vllm/api_server_actor.py
@@ -6,13 +6,14 @@
 from llumnix.entrypoints.utils import EntrypointsContext, get_ip_address
 from llumnix.llumlet.llumlet import Llumlet
 from llumnix.utils import get_server_name
+from llumnix.queue.utils import init_request_output_queue_server, QueueType


 class FastAPIServer:
     def __init__(self, entrypoints_args: EntrypointsArgs):
         self.entrypoints_args = entrypoints_args
         self.request_output_queue_port = self.entrypoints_args.request_output_queue_port
-        self.request_output_queue_type = self.entrypoints_args.request_output_queue_type
+        self.request_output_queue_type = QueueType(self.entrypoints_args.request_output_queue_type)
         ip = get_ip_address()
         self.request_output_queue = init_request_output_queue_server(
             ip, self.request_output_queue_port, self.request_output_queue_type)
diff --git a/llumnix/manager.py b/llumnix/manager.py
index 48167ca1..dc4bf3eb 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -298,7 +298,7 @@ async def _auto_scale_up_loop(self, interval: float) -> None:
                     self.scale_down(instance_id)
                 if new_pg is None:
                     new_instance_id = random_uuid()
-                    new_pg = self._init_placement_group(new_instance_id, self.engine_args, self.backend_type, contain_server=True)
+                    new_pg = self._init_placement_group(new_instance_id, self.engine_args, self.backend_type, init_server=True)
                 try:
                     await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT)
                 except asyncio.TimeoutError:
@@ -491,7 +491,7 @@ def from_args(cls,
                   engine_args = None,
                   deployment_args: DeploymentArgs = None,
                   ) -> "Manager":
-        manager_class = ray.remote(num_cpus=0,
+        manager_class = ray.remote(num_cpus=1,
                                    max_restarts=-1,
                                    name=get_manager_name(),
                                    namespace="llumnix",
@@ -606,7 +606,7 @@
                 logger.error("[_init_server_and_instance] exception traceback: {}".format(traceback.format_exc()))
                 self._clear_instance_ray_resources(instance_id)

-        request_output_queue_type = self.entrypoints_args.request_output_queue_type
+        request_output_queue_type = QueueType(self.entrypoints_args.request_output_queue_type)
         instance = self._init_instance(instance_id, placement_group, request_output_queue_type, self.backend_type, self.engine_args)
         server = self._init_server(instance_id, placement_group, self.entrypoints_args)
         asyncio.create_task(done_scale_up())
diff --git a/tests/unit_test/entrypoints/vllm/api_server.py b/tests/unit_test/entrypoints/vllm/api_server.py
new file mode 100644
index 00000000..a72f4fd6
--- /dev/null
+++ b/tests/unit_test/entrypoints/vllm/api_server.py
@@ -0,0 +1,14 @@
+from fastapi.responses import JSONResponse, Response
+import ray
+
+import llumnix.entrypoints.vllm.api_server
+
+manager = None
+llumnix_client = llumnix.entrypoints.vllm.api_server.llumnix_client
+app = llumnix.entrypoints.vllm.api_server.app
+
+
+@app.get("/stats")
+def stats() -> Response:
+    """Get the statistics of the engine."""
+    return JSONResponse(ray.get(manager.testing_stats.remote()))
diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager.py b/tests/unit_test/entrypoints/vllm/api_server_manager.py
index 22eda14a..a2d039d4 100644
--- a/tests/unit_test/entrypoints/vllm/api_server_manager.py
+++ b/tests/unit_test/entrypoints/vllm/api_server_manager.py
@@ -14,24 +14,20 @@
 import argparse
 import uvicorn
 import ray
-from fastapi.responses import JSONResponse, Response

 from vllm.outputs import CompletionOutput, RequestOutput

 import llumnix.entrypoints.vllm.api_server
 import llumnix.manager
-from llumnix.arg_utils import ManagerArgs
 from llumnix.server_info import ServerInfo, RequestTimestamps
 from llumnix.utils import random_uuid, get_manager_name
 from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType
 from llumnix.entrypoints.setup import EntrypointsContext
 from llumnix.entrypoints.vllm.client import LlumnixClientVLLM

-app = llumnix.entrypoints.vllm.api_server.app
-manager = None
+import tests.unit_test.entrypoints.vllm.api_server


-@ray.remote(num_cpus=0)
 class MockManager:
     def __init__(self, request_output_queue_type: QueueType):
         self._num_generates = 0
@@ -51,16 +47,40 @@ async def abort(self, request_id):
     def testing_stats(self):
         return {"num_aborted_requests": self._num_aborts}

+    @classmethod
+    def from_args(cls, request_output_queue_type: QueueType):
+        manager_class = ray.remote(num_cpus=1,
+                                   name=get_manager_name(),
+                                   namespace='llumnix',
+                                   lifetime='detached')(cls)
+        manager = manager_class.remote(request_output_queue_type)
+        return manager
+
+def setup_entrypoints_context(request_output_queue_type: QueueType):
+    manager = ray.get_actor(get_manager_name(), namespace="llumnix")
+    tests.unit_test.entrypoints.vllm.api_server.manager = manager
+    ip = '127.0.0.1'
+    port = 1234
+    request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type)
+    server_info = ServerInfo(random_uuid(), request_output_queue_type, request_output_queue, ip, port)
+    entrypoints_context = EntrypointsContext(manager,
+                                             {'0': None},
+                                             request_output_queue,
+                                             server_info,
+                                             None,
+                                             None)
+    return entrypoints_context
+
+def run_uvicorn_server(host: str, port: int, entrypoints_context: EntrypointsContext):
+    llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(entrypoints_context)
+    app = tests.unit_test.entrypoints.vllm.api_server.app

-def init_manager(request_output_queue_type: QueueType):
-    manager = MockManager.options(name=get_manager_name(),
-                                  namespace='llumnix').remote(request_output_queue_type)
-    return manager
-
-@app.get("/stats")
-def stats() -> Response:
-    """Get the statistics of the engine."""
-    return JSONResponse(ray.get(manager.testing_stats.remote()))
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+        log_level="debug",
+        timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE)


 if __name__ == "__main__":
@@ -68,26 +88,10 @@
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--request-output-queue-type", type=str, choices=["zmq", "rayqueue"])
-    parser = ManagerArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    entrypoints_args = parser.parse_args()

-    request_output_queue_type = QueueType(args.request_output_queue_type)
-    manager = init_manager(request_output_queue_type)
-    ip = '127.0.0.1'
-    port = 1234
-    request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type)
-    server_info = ServerInfo(random_uuid(), request_output_queue_type, request_output_queue, ip, port)
-    llumnix_context = EntrypointsContext(manager,
-                                         {'0': None},
-                                         request_output_queue,
-                                         server_info,
-                                         None,
-                                         None)
-    llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(llumnix_context)
+    request_output_queue_type = QueueType(entrypoints_args.request_output_queue_type)
+    manager = MockManager.from_args(request_output_queue_type)
+    entrypoints_context = setup_entrypoints_context(request_output_queue_type)

-    uvicorn.run(
-        app,
-        host=args.host,
-        port=args.port,
-        log_level="debug",
-        timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE)
+    run_uvicorn_server(entrypoints_args.host, entrypoints_args.port, entrypoints_context)
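Both server actors run uvicorn off the main thread so the actor's own event loop stays free for Ray calls. A minimal standalone sketch of that pattern (the module-level app is illustrative, not the test harness above):

import threading

import uvicorn
from fastapi import FastAPI

app = FastAPI()

def run_in_background(host: str = "127.0.0.1", port: int = 8000) -> threading.Thread:
    # Daemon thread: the server dies together with the owning process/actor.
    thread = threading.Thread(
        target=uvicorn.run, args=(app,),
        kwargs={"host": host, "port": port, "log_level": "debug"},
        daemon=True, name="run_uvicorn_server",
    )
    thread.start()
    return thread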
import random_uuid -from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType -from llumnix.entrypoints.setup import EntrypointsContext -from llumnix.entrypoints.vllm.client import LlumnixClientVLLM -from llumnix.utils import get_manager_name, get_server_name - -# for stats api -app = llumnix.entrypoints.vllm.api_server.app -manager = None -ENTRYPOINTS_ACTOR_NAME = "entrypoints" +from llumnix.queue.utils import init_request_output_queue_client, QueueType +from llumnix.utils import get_manager_name -@ray.remote(num_cpus=0, lifetime="detached") -class MockManagerService: - def __init__(self, request_output_queue_type: QueueType, args: 'Namespace'): - self._num_generates = 0 - self._num_aborts = 0 - self.request_output_queue = init_request_output_queue_client(request_output_queue_type) - self.init_api_server(args.host, args.port, request_output_queue_type) - self.api_server.run.remote() +from tests.unit_test.entrypoints.vllm.api_server_manager import (MockManager, setup_entrypoints_context, + run_uvicorn_server) - def init_api_server(self, host: str, port: int, request_output_queue_type: QueueType): - self.api_server = FastAPIServer.options(name=ENTRYPOINTS_ACTOR_NAME, - namespace='llumnix').remote(args.host, args.port, request_output_queue_type) +ENTRYPOINTS_ACTOR_NAME = "entrypoints" - async def generate(self, request_id, server_info, *args, **kwargs): - self._num_generates += 1 - completion_output = CompletionOutput(0, "", [], 0.0, None) - request_output = RequestOutput(request_id, "", [], None, [completion_output], finished=True) - request_output.request_timestamps = RequestTimestamps() - await self.request_output_queue.put_nowait([request_output], server_info) - async def abort(self, request_id): - self._num_aborts += 1 +class MockManagerService(MockManager): + def __init__(self, entrypoints_args): + self._num_generates = 0 + self._num_aborts = 0 + self.request_output_queue = init_request_output_queue_client( + QueueType(entrypoints_args.request_output_queue_type)) + self.server = self.init_api_server(entrypoints_args) + ray.get(self.server.setup_entrypoints_context.remote()) + ray.get(self.server.run.remote()) + + def init_server(self, entrypoints_args): + server = FastAPIServer.options(name=ENTRYPOINTS_ACTOR_NAME, + namespace='llumnix').remote(entrypoints_args) + return server + + # pylint: disable=arguments-renamed + @classmethod + def from_args(cls, entrypoints_args): + manager_class = ray.remote(num_cpus=1, + name=get_manager_name(), + namespace='llumnix', + lifetime='detached')(cls) + manager = manager_class.remote(entrypoints_args) + return manager - def testing_stats(self): - return {"num_aborted_requests": self._num_aborts} @ray.remote(num_cpus=1, lifetime="detached") class FastAPIServer: - def __init__(self, host: str, port: int, request_output_queue_type: QueueType): - self.host = host - self.port = port - ip = '127.0.0.1' - port = 1234 - # for app manager - global manager - manager = ray.get_actor(get_manager_name(), namespace="llumnix") - request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type) - server_info = ServerInfo(random_uuid(), request_output_queue_type, request_output_queue, ip, port) - llumnix_context = EntrypointsContext(manager, - {'0': None}, - request_output_queue, - server_info, - None, - None) - llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(llumnix_context) + def __init__(self, entrypoints_args): + self.host = entrypoints_args.host + self.port = 
entrypoints_args.port
+        self.request_output_queue_type = QueueType(entrypoints_args.request_output_queue_type)
 
-    def run(self):
-        uvicorn.run(
-            app,
-            host=self.host,
-            port=self.port,
-            log_level="debug",
-            timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE)
+    def setup_entrypoints_context(self):
+        self.entrypoints_context = setup_entrypoints_context(self.request_output_queue_type)
 
-def init_manager_service(request_output_queue_type: QueueType, args: 'Namespace'):
-    manager = MockManagerService.options(name=get_manager_name(),
-                                         namespace='llumnix').remote(request_output_queue_type, args)
-    return manager
+    def _run_uvicorn_server(self):
+        run_uvicorn_server(self.host, self.port, self.entrypoints_context)
 
-@app.get("/stats")
-def stats() -> Response:
-    """Get the statistics of the engine."""
-    return JSONResponse(ray.get(manager.testing_stats.remote()))
+    def run(self):
+        self.run_uvicorn_server_thread = threading.Thread(
+            target=self._run_uvicorn_server, args=(),
+            daemon=True, name="run_uvicorn_server"
+        )
+        self.run_uvicorn_server_thread.start()
 
 
 if __name__ == "__main__":
@@ -107,15 +78,11 @@ def stats() -> Response:
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--request-output-queue-type", type=str, choices=["zmq", "rayqueue"])
-    parser = ManagerArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    entrypoints_args = parser.parse_args()
 
     # magic actor, without this actor, FastAPIServer cannot initialize correctly.
-    # If this actor is placed globally, pylint will hangs if testing api_server_manager and api_server_service concurrently (--jobs > 1).
+    # If this actor is placed globally,
+    # pylint will hang if testing api_server_manager and api_server_service concurrently (--jobs > 1).
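+    # Note: the queue object itself is never used below; per the comment above it
+    # only has to exist before FastAPIServer is created, so it is constructed here
+    # inside __main__ rather than at module scope.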
request_output_queue = RayQueue() - request_output_queue_type = QueueType(args.request_output_queue_type) - manager = init_manager_service(request_output_queue_type, args) - - # wait initialization done - time.sleep(2) + manager = MockManagerService.from_args(entrypoints_args) diff --git a/tests/unit_test/entrypoints/vllm/test_api_server.py b/tests/unit_test/entrypoints/vllm/test_api_server.py index 35100d42..3a8f9367 100644 --- a/tests/unit_test/entrypoints/vllm/test_api_server.py +++ b/tests/unit_test/entrypoints/vllm/test_api_server.py @@ -45,7 +45,7 @@ def _query_server_generate(prompt: str) -> dict: def _query_server_generate_benchmark(prompt: str) -> dict: return _query_server(prompt, interface='generate_benchmark') -@pytest.fixture(params=[("zmq", "manager_service"), ("rayqueue", "manager_service"), ("zmq", "manager"), ("rayqueue", "manager")]) +@pytest.fixture(params=[("zmq", "manager"), ("rayqueue", "manager"), ("zmq", "manager_service"), ("rayqueue", "manager_service")]) def api_server(request): request_output_queue_type = QueueType(request.param[0]) print(f"{request.param[0]}-{request.param[1]}") From 027325a8319ab08badf5cb7cd9fcc186d30ae9db Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 6 Jan 2025 07:59:46 +0000 Subject: [PATCH 47/92] Fix api server unit test --- .../unit_test/entrypoints/vllm/api_server_manager_service.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py index db8ac807..52834c1e 100644 --- a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py +++ b/tests/unit_test/entrypoints/vllm/api_server_manager_service.py @@ -32,7 +32,7 @@ def __init__(self, entrypoints_args): self._num_aborts = 0 self.request_output_queue = init_request_output_queue_client( QueueType(entrypoints_args.request_output_queue_type)) - self.server = self.init_api_server(entrypoints_args) + self.server = self.init_server(entrypoints_args) ray.get(self.server.setup_entrypoints_context.remote()) ray.get(self.server.run.remote()) @@ -86,3 +86,6 @@ def run(self): request_output_queue = RayQueue() manager = MockManagerService.from_args(entrypoints_args) + + while True: + time.sleep(100.0) From af1e148761695f0d179c26b4ce7cb77d1e2facfb Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 6 Jan 2025 08:15:21 +0000 Subject: [PATCH 48/92] Done api server unit test --- tests/unit_test/entrypoints/vllm/api.py | 14 +++ .../unit_test/entrypoints/vllm/api_server.py | 99 +++++++++++++++++-- ...manager_service.py => api_server_actor.py} | 8 +- .../entrypoints/vllm/api_server_manager.py | 97 ------------------ .../entrypoints/vllm/test_api_server.py | 8 +- 5 files changed, 113 insertions(+), 113 deletions(-) create mode 100644 tests/unit_test/entrypoints/vllm/api.py rename tests/unit_test/entrypoints/vllm/{api_server_manager_service.py => api_server_actor.py} (91%) delete mode 100644 tests/unit_test/entrypoints/vllm/api_server_manager.py diff --git a/tests/unit_test/entrypoints/vllm/api.py b/tests/unit_test/entrypoints/vllm/api.py new file mode 100644 index 00000000..a72f4fd6 --- /dev/null +++ b/tests/unit_test/entrypoints/vllm/api.py @@ -0,0 +1,14 @@ +from fastapi.responses import JSONResponse, Response +import ray + +import llumnix.entrypoints.vllm.api_server + +manager = None +llumnix_client = llumnix.entrypoints.vllm.api_server.llumnix_client +app = llumnix.entrypoints.vllm.api_server.app + + +@app.get("/stats") +def stats() -> Response: + """Get the 
statistics of the engine.""" + return JSONResponse(ray.get(manager.testing_stats.remote())) diff --git a/tests/unit_test/entrypoints/vllm/api_server.py b/tests/unit_test/entrypoints/vllm/api_server.py index a72f4fd6..78e6294a 100644 --- a/tests/unit_test/entrypoints/vllm/api_server.py +++ b/tests/unit_test/entrypoints/vllm/api_server.py @@ -1,14 +1,97 @@ -from fastapi.responses import JSONResponse, Response +# Copyright (c) 2024, Alibaba Group; +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import uvicorn import ray +from vllm.outputs import CompletionOutput, RequestOutput + import llumnix.entrypoints.vllm.api_server +import llumnix.manager +from llumnix.server_info import ServerInfo, RequestTimestamps +from llumnix.utils import random_uuid, get_manager_name +from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType +from llumnix.entrypoints.setup import EntrypointsContext +from llumnix.entrypoints.vllm.client import LlumnixClientVLLM + +import tests.unit_test.entrypoints.vllm.api + + +class MockManager: + def __init__(self, request_output_queue_type: QueueType): + self._num_generates = 0 + self._num_aborts = 0 + self.request_output_queue = init_request_output_queue_client(request_output_queue_type) + + async def generate(self, request_id, server_info, *args, **kwargs): + self._num_generates += 1 + completion_output = CompletionOutput(0, "", [], 0.0, None) + request_output = RequestOutput(request_id, "", [], None, [completion_output], finished=True) + request_output.request_timestamps = RequestTimestamps() + await self.request_output_queue.put_nowait([request_output], server_info) + + async def abort(self, request_id): + self._num_aborts += 1 + + def testing_stats(self): + return {"num_aborted_requests": self._num_aborts} + + @classmethod + def from_args(cls, request_output_queue_type: QueueType): + manager_class = ray.remote(num_cpus=1, + name=get_manager_name(), + namespace='llumnix', + lifetime='detached')(cls) + manager = manager_class.remote(request_output_queue_type) + return manager + +def setup_entrypoints_context(request_output_queue_type: QueueType): + manager = ray.get_actor(get_manager_name(), namespace="llumnix") + tests.unit_test.entrypoints.vllm.api.manager = manager + ip = '127.0.0.1' + port = 1234 + request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type) + server_info = ServerInfo(random_uuid(), request_output_queue_type, request_output_queue, ip, port) + entrypoints_context = EntrypointsContext(manager, + {'0': None}, + request_output_queue, + server_info, + None, + None) + return entrypoints_context + +def run_uvicorn_server(host: str, port: int, entrypoints_context: EntrypointsContext): + llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(entrypoints_context) + app = tests.unit_test.entrypoints.vllm.api.app + + uvicorn.run( + app, + host=host, + port=port, + log_level="debug", + 
timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE) + -manager = None -llumnix_client = llumnix.entrypoints.vllm.api_server.llumnix_client -app = llumnix.entrypoints.vllm.api_server.app +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--request-output-queue-type", type=str, choices=["zmq", "rayqueue"]) + entrypoints_args = parser.parse_args() + request_output_queue_type = QueueType(entrypoints_args.request_output_queue_type) + manager = MockManager.from_args(request_output_queue_type) + entrypoints_context = setup_entrypoints_context(request_output_queue_type) -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(ray.get(manager.testing_stats.remote())) + run_uvicorn_server(entrypoints_args.host, entrypoints_args.port, entrypoints_context) diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py b/tests/unit_test/entrypoints/vllm/api_server_actor.py similarity index 91% rename from tests/unit_test/entrypoints/vllm/api_server_manager_service.py rename to tests/unit_test/entrypoints/vllm/api_server_actor.py index 52834c1e..c857a747 100644 --- a/tests/unit_test/entrypoints/vllm/api_server_manager_service.py +++ b/tests/unit_test/entrypoints/vllm/api_server_actor.py @@ -20,13 +20,13 @@ from llumnix.queue.utils import init_request_output_queue_client, QueueType from llumnix.utils import get_manager_name -from tests.unit_test.entrypoints.vllm.api_server_manager import (MockManager, setup_entrypoints_context, - run_uvicorn_server) +from tests.unit_test.entrypoints.vllm.api_server import (MockManager, setup_entrypoints_context, + run_uvicorn_server) ENTRYPOINTS_ACTOR_NAME = "entrypoints" -class MockManagerService(MockManager): +class MockManagerServer(MockManager): def __init__(self, entrypoints_args): self._num_generates = 0 self._num_aborts = 0 @@ -85,7 +85,7 @@ def run(self): # pylint will hangs if testing api_server_manager and api_server_service concurrently (--jobs > 1). request_output_queue = RayQueue() - manager = MockManagerService.from_args(entrypoints_args) + manager = MockManagerServer.from_args(entrypoints_args) while True: time.sleep(100.0) diff --git a/tests/unit_test/entrypoints/vllm/api_server_manager.py b/tests/unit_test/entrypoints/vllm/api_server_manager.py deleted file mode 100644 index a2d039d4..00000000 --- a/tests/unit_test/entrypoints/vllm/api_server_manager.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2024, Alibaba Group; -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import uvicorn -import ray - -from vllm.outputs import CompletionOutput, RequestOutput - -import llumnix.entrypoints.vllm.api_server -import llumnix.manager -from llumnix.server_info import ServerInfo, RequestTimestamps -from llumnix.utils import random_uuid, get_manager_name -from llumnix.queue.utils import init_request_output_queue_server, init_request_output_queue_client, QueueType -from llumnix.entrypoints.setup import EntrypointsContext -from llumnix.entrypoints.vllm.client import LlumnixClientVLLM - -import tests.unit_test.entrypoints.vllm.api_server - - -class MockManager: - def __init__(self, request_output_queue_type: QueueType): - self._num_generates = 0 - self._num_aborts = 0 - self.request_output_queue = init_request_output_queue_client(request_output_queue_type) - - async def generate(self, request_id, server_info, *args, **kwargs): - self._num_generates += 1 - completion_output = CompletionOutput(0, "", [], 0.0, None) - request_output = RequestOutput(request_id, "", [], None, [completion_output], finished=True) - request_output.request_timestamps = RequestTimestamps() - await self.request_output_queue.put_nowait([request_output], server_info) - - async def abort(self, request_id): - self._num_aborts += 1 - - def testing_stats(self): - return {"num_aborted_requests": self._num_aborts} - - @classmethod - def from_args(cls, request_output_queue_type: QueueType): - manager_class = ray.remote(num_cpus=1, - name=get_manager_name(), - namespace='llumnix', - lifetime='detached')(cls) - manager = manager_class.remote(request_output_queue_type) - return manager - -def setup_entrypoints_context(request_output_queue_type: QueueType): - manager = ray.get_actor(get_manager_name(), namespace="llumnix") - tests.unit_test.entrypoints.vllm.api_server.manager = manager - ip = '127.0.0.1' - port = 1234 - request_output_queue = init_request_output_queue_server(ip, port, request_output_queue_type) - server_info = ServerInfo(random_uuid(), request_output_queue_type, request_output_queue, ip, port) - entrypoints_context = EntrypointsContext(manager, - {'0': None}, - request_output_queue, - server_info, - None, - None) - return entrypoints_context - -def run_uvicorn_server(host: str, port: int, entrypoints_context: EntrypointsContext): - llumnix.entrypoints.vllm.api_server.llumnix_client = LlumnixClientVLLM(entrypoints_context) - app = tests.unit_test.entrypoints.vllm.api_server.app - - uvicorn.run( - app, - host=host, - port=port, - log_level="debug", - timeout_keep_alive=llumnix.entrypoints.vllm.api_server.TIMEOUT_KEEP_ALIVE) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--request-output-queue-type", type=str, choices=["zmq", "rayqueue"]) - entrypoints_args = parser.parse_args() - - request_output_queue_type = QueueType(entrypoints_args.request_output_queue_type) - manager = MockManager.from_args(request_output_queue_type) - entrypoints_context = setup_entrypoints_context(request_output_queue_type) - - run_uvicorn_server(entrypoints_args.host, entrypoints_args.port, entrypoints_context) diff --git a/tests/unit_test/entrypoints/vllm/test_api_server.py b/tests/unit_test/entrypoints/vllm/test_api_server.py index 3a8f9367..103ea2ca 100644 --- a/tests/unit_test/entrypoints/vllm/test_api_server.py +++ b/tests/unit_test/entrypoints/vllm/test_api_server.py @@ -45,16 +45,16 @@ def _query_server_generate(prompt: str) -> dict: 
def _query_server_generate_benchmark(prompt: str) -> dict: return _query_server(prompt, interface='generate_benchmark') -@pytest.fixture(params=[("zmq", "manager"), ("rayqueue", "manager"), ("zmq", "manager_service"), ("rayqueue", "manager_service")]) +@pytest.fixture(params=[("zmq", "api_server"), ("rayqueue", "api_server"), ("zmq", "api_server_actor"), ("rayqueue", "api_server_actor")]) def api_server(request): request_output_queue_type = QueueType(request.param[0]) print(f"{request.param[0]}-{request.param[1]}") - if request.param[1] == "manager": + if request.param[1] == "api_server": script_path = Path(__file__).parent.joinpath( - "api_server_manager.py").absolute() + "api_server.py").absolute() else: script_path = Path(__file__).parent.joinpath( - "api_server_manager_service.py").absolute() + "api_server_actor.py").absolute() commands = [ sys.executable, "-u", From 641cfedb0c6194244e6cafd7e2d2acf725fa26c0 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 6 Jan 2025 08:17:29 +0000 Subject: [PATCH 49/92] Remove demo dir --- demo/actor_demo.py | 24 --- demo/client.py | 28 --- demo/manager_service_demo.py | 341 ----------------------------------- demo/placement_group_demo.py | 125 ------------- demo/serve_demo.py | 102 ----------- demo/serve_demo1.py | 139 -------------- demo/serve_demo2.py | 45 ----- 7 files changed, 804 deletions(-) delete mode 100644 demo/actor_demo.py delete mode 100644 demo/client.py delete mode 100644 demo/manager_service_demo.py delete mode 100644 demo/placement_group_demo.py delete mode 100644 demo/serve_demo.py delete mode 100644 demo/serve_demo1.py delete mode 100644 demo/serve_demo2.py diff --git a/demo/actor_demo.py b/demo/actor_demo.py deleted file mode 100644 index e182d66a..00000000 --- a/demo/actor_demo.py +++ /dev/null @@ -1,24 +0,0 @@ -import ray - -from manager_service_demo import (initialize_placement_group, - Llumlet, - get_instance_name) - -from llumnix.utils import random_uuid - - -def test_get_died_actor(): - placement_group = initialize_placement_group() - instance_id = random_uuid() - llumlet = Llumlet.from_args(instance_id, placement_group) - ray.get(llumlet.ready.remote()) - ray.get_actor(get_instance_name(instance_id), namespace="llumnix") - ray.kill(llumlet) - try: - ray.get_actor(get_instance_name(instance_id)) - print("Get died actor successfully") - except ValueError: - print("Get died actor failed") - -if __name__ == "__main__": - test_get_died_actor() diff --git a/demo/client.py b/demo/client.py deleted file mode 100644 index 8d9f3c07..00000000 --- a/demo/client.py +++ /dev/null @@ -1,28 +0,0 @@ -import argparse -import requests - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default='localhost') - parser.add_argument("--port", type=int, default=8000) - args = parser.parse_args() - - ip_address = f"{args.host}:{args.port}" - api_list = [ - "is_ready", - "generate", - "generate_stream", - "health", - ] - for api in api_list: - try: - url = f"http://{ip_address}/{api}" - if api in ["is_ready", "health"]: - response = requests.get(url) - else: - response = requests.post(url) - response.raise_for_status() - print(f"api: {api}, response: {response}, response.text: {response.text}") - except requests.RequestException as e: - print(f"Request failed: {e}") diff --git a/demo/manager_service_demo.py b/demo/manager_service_demo.py deleted file mode 100644 index 7213e41d..00000000 --- a/demo/manager_service_demo.py +++ /dev/null @@ -1,341 +0,0 @@ -import asyncio -import time -import 
traceback -import threading -from typing import Dict, Tuple -from functools import partial -import uvicorn -from fastapi import FastAPI -import ray -from ray.util.placement_group import PlacementGroup -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from ray.util.queue import Queue as RayQueue -from ray.util.state import (list_actors, - list_placement_groups) - - -from llumnix.utils import random_uuid - -PLACEMENT_GROUP_NAME_PREFIX = "pg_" -SERVER_NAME_PREFIX = "server_" -INSTANCE_NAME_PREFIX = "instance_" -WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS = 1.0 -AUTO_DEPLOYMENT_INTERVAL_SECONDS = 1.0 -CHECK_DEPLOYMENT_CORRECTNESS_INTERVAL_SECONDS = 5.0 - -app = FastAPI() - -def get_placement_group_name(instance_id: str) -> str: - return f"{PLACEMENT_GROUP_NAME_PREFIX}{instance_id}" -def get_server_name(instance_id: str) -> str: - return f"{SERVER_NAME_PREFIX}{instance_id}" - -def get_instance_name(instance_id: str) -> str: - return f"{INSTANCE_NAME_PREFIX}{instance_id}" - -def initialize_placement_group(instance_id: str = None, lifetime: str = None) -> PlacementGroup: - placement_group_specs = ([{"CPU": 1}, {"CPU": 1, "GPU": 4}]) - if instance_id is None: - instance_id = random_uuid() - placement_group_name = get_placement_group_name(instance_id) - placement_group = ray.util.placement_group( - placement_group_specs, "STRICT_PACK", lifetime=lifetime, name=placement_group_name) - return placement_group - -def remove_placement_group(instance_id: str = None) -> bool: - placement_group = ray.util.get_placement_group(get_placement_group_name(instance_id)) - if not placement_group: - return False - try: - # asynchronous - ray.util.remove_placement_group(placement_group) - print(f"remove placement group {instance_id}") - # pylint: disable=broad-except - except Exception: - return False - return True - -def kill_server(instance_id: str = None) -> bool: - try: - server = ray.get_actor(get_server_name(instance_id), namespace="llumnix") - except ValueError: - return False - try: - ray.kill(server) - print(f"kill server {instance_id}") - # pylint: disable=broad-except - except Exception: - return False - return True - -def kill_instance(instance_id: str = None) -> bool: - try: - instance = ray.get_actor(get_instance_name(instance_id), namespace="llumnix") - except ValueError: - return False - try: - ray.kill(instance) - print(f"kill instance {instance_id}") - # pylint: disable=broad-except - except Exception: - return False - return True - - -class FastAPIServer: - def __init__(self, instance_id: str, host: str, port: int): - self.host = host - self.port = port - self.server_name = get_server_name(instance_id) - print("FastAPIServer created") - self.run_loop_thread = threading.Thread( - target=self._run_loop, args=(), daemon=True, name="run_loop" - ) - - def _run_loop(self): - uvicorn.run(app, host=self.host, port=self.port) - - def run(self): - self.run_loop_thread.start() - - @classmethod - def from_args(cls, - instance_id: str, - host: str, - port: int, - placement_group: PlacementGroup, - lifetime: str = None): - server_name = get_server_name(instance_id) - fastapi_server_class = ray.remote(num_cpus=1, - name=server_name, - namespace="llumnix", - lifetime=lifetime)(cls).options( - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_bundle_index=0, - ) - ) - fastapi_server = fastapi_server_class.remote(instance_id, host, port) - return fastapi_server - - def ready(self) -> bool: - return True - - -class Llumlet: - def 
__init__(self, instance_id: str): - self.instance_name = get_instance_name(instance_id) - print("Llumlet created") - - @classmethod - def from_args(cls, - instance_id: str, - placement_group: PlacementGroup, - lifetime: str = None): - instance_name = get_instance_name(instance_id) - llumlet_class = ray.remote(num_cpus=1, - num_gpus=4, - name=instance_name, - namespace="llumnix", - lifetime=lifetime)(cls).options( - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_bundle_index=1, - ) - ) - llumlet = llumlet_class.remote(instance_id) - return llumlet - - def ready(self) -> bool: - return True - - -def get_curr_deployment() -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIServer], Dict[str, Llumlet]]: - curr_pgs: Dict[str, PlacementGroup] = {} - curr_servers: Dict[str, PlacementGroup] = {} - curr_instances: Dict[str, Llumlet] = {} - - created_pg_states = list_placement_groups(filters=[("state", "=", "CREATED")]) - for created_pg_state in created_pg_states: - instance_id = created_pg_state["name"].split("_")[-1] - curr_pgs[instance_id] = ray.util.get_placement_group(created_pg_state["name"]) - - alive_actor_states = list_actors(filters=[("state", "=", "ALIVE")]) - for alive_actor_state in alive_actor_states: - if alive_actor_state["name"].startswith(SERVER_NAME_PREFIX): - instance_id = alive_actor_state["name"].split("_")[-1] - curr_servers[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") - elif alive_actor_state["name"].startswith(INSTANCE_NAME_PREFIX): - instance_id = alive_actor_state["name"].split("_")[-1] - curr_instances[instance_id] = ray.get_actor(alive_actor_state["name"], namespace="llumnix") - - return curr_pgs, curr_servers, curr_instances - - -class Manager: - def __init__(self): - print("create Manager") - self.host = "localhost" - self.port = 8000 - self.last_pending_pg: PlacementGroup = None - self.pgs: Dict[str, PlacementGroup] = {} - self.servers: Dict[str, FastAPIServer] = {} - self.instances: Dict[str, Llumlet] = {} - self._connect_to_existing_deployment() - asyncio.create_task(self._auto_scale_up_loop()) - asyncio.create_task(self._auto_scale_down_loop()) - asyncio.create_task(self._check_deployment_states_loop()) - print("Manager created") - - async def _auto_scale_down_loop(self) -> None: - def instance_ready_callback(instance_id: str, fut): - ret = fut.result()[0] - if isinstance(ret, ray.exceptions.RayActorError): - print(f"server/instance {instance_id} died, scale down") - self._scale_down(instance_id) - - while True: - try: - tasks = [] - for instance_id, instance in self.instances.items(): - task = asyncio.gather(instance.ready.remote(), return_exceptions=True) - task.add_done_callback(partial(instance_ready_callback, instance_id)) - tasks.append(task) - await asyncio.gather(*tasks, return_exceptions=True) - - tasks = [] - for instance_id, server in self.servers.items(): - task = asyncio.gather(server.ready.remote(), return_exceptions=True) - task.add_done_callback(partial(instance_ready_callback, instance_id)) - tasks.append(task) - await asyncio.gather(*tasks, return_exceptions=True) - - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) - # pylint: disable=broad-except - except Exception as e: - print("unexpected exception occurs: {}".format(e)) - print("exception traceback: {}".format(traceback.format_exc())) - - async def _auto_scale_up_loop(self) -> None: - while True: - try: - pending_pg_states = list_placement_groups(filters=[("state", "=", "PENDING")]) - 
print(f"pending_pg_states: {pending_pg_states}") - for pending_pg_state in pending_pg_states: - instance_id = pending_pg_state["name"].split("_")[-1] - self._scale_down(instance_id) - instance_id = random_uuid() - new_pg = initialize_placement_group(instance_id, lifetime="detached") - try: - await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT_SECONDS) - except asyncio.TimeoutError: - print("Get new placement group ready timeout") - ray.util.remove_placement_group(new_pg) - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) - continue - print("Get new placement group ready done") - self._initialize_server_and_instance(instance_id, new_pg) - print("Deploy server and instance to new placement group done") - # pylint: disable=broad-except - except Exception as e: - print("unexpected exception occurs: {}".format(e)) - print("exception traceback: {}".format(traceback.format_exc())) - - async def _check_deployment_states_loop(self) -> None: - async def detect_correctness_task(instance_id: str): - print(f"detect instance {instance_id}") - await asyncio.sleep(CHECK_DEPLOYMENT_CORRECTNESS_INTERVAL_SECONDS) - curr_pgs, curr_servers, curr_instances = get_curr_deployment() - if instance_id in curr_pgs and (instance_id not in curr_servers or instance_id not in curr_instances): - self._scale_down(instance_id) - - while True: - try: - curr_pgs, curr_servers, curr_instances = get_curr_deployment() - - assert len(curr_pgs) >= max(len(curr_servers), len(curr_instances)) - - tasks = [] - for instance_id in curr_pgs: - if instance_id not in curr_servers or instance_id not in curr_instances: - tasks.append(asyncio.create_task(detect_correctness_task(instance_id))) - await asyncio.gather(*tasks, return_exceptions=True) - - await asyncio.sleep(AUTO_DEPLOYMENT_INTERVAL_SECONDS) - # pylint: disable=broad-except - except Exception as e: - print("unexpected exception occurs: {}".format(e)) - print("exception traceback: {}".format(traceback.format_exc())) - - def _initialize_server_and_instance(self, instance_id: str, placement_group: PlacementGroup): - async def wait_instance_ready(instance_id: str): - try: - await new_instance.ready.remote() - print(f"instance {instance_id} ready, scale up") - await new_server.run.remote() - self._scale_up(instance_id, placement_group, new_server, new_instance) - except ray.exceptions.RayActorError: - print(f"instance {instance_id} died, abort scale up") - self._scale_down(instance_id) - - new_server = FastAPIServer.from_args(instance_id, self.host, self.port, placement_group, lifetime="detached") - new_instance = Llumlet.from_args(instance_id, placement_group, lifetime="detached") - asyncio.create_task(wait_instance_ready(instance_id)) - - def _connect_to_existing_deployment(self): - self.pgs, self.servers, self.instances = get_curr_deployment() - correct_instance_id_set = set(self.pgs.keys()).intersection(self.servers.keys(), self.instances.keys()) - print(f"connect to instances: {correct_instance_id_set}") - for instance_id in correct_instance_id_set: - self._scale_up(instance_id, self.pgs[instance_id], self.servers[instance_id], self.instances[instance_id]) - - def _scale_up(self, - instance_id: str, - placement_group: PlacementGroup, - server: FastAPIServer, - instance: Llumlet) -> None: - print(f"add placement group {instance_id}") - self.pgs[instance_id] = placement_group - print(f"add server {instance_id}") - self.servers[instance_id] = server - print(f"add instance {instance_id}") - self.instances[instance_id] = instance - - def _scale_down(self, 
instance_id: str) -> None: - kill_server(instance_id) - kill_instance(instance_id) - remove_placement_group(instance_id) - if instance_id in self.pgs: - print(f"pop placement group {instance_id}") - # Don't use del here. - self.pgs.pop(instance_id) - if instance_id in self.servers: - print(f"pop server {instance_id}") - self.servers.pop(instance_id) - if instance_id in self.instances: - print(f"pop instance {instance_id}") - self.instances.pop(instance_id) - - @classmethod - def from_args(cls): - manager_class = ray.remote(num_cpus=1, - max_restarts=-1, - name="manager", - namespace="llumnix", - lifetime="detached")(cls) - manager = manager_class.remote() - return manager - - -if __name__ == "__main__": - ray.init() - - # magic actor to avoid fast api server actor initialization error - request_output_queue = RayQueue(actor_options={ - "namespace": "llumnix", - "name": "magic_queue" - }) - manager = Manager.from_args() - - while True: - time.sleep(100) diff --git a/demo/placement_group_demo.py b/demo/placement_group_demo.py deleted file mode 100644 index 7fdd28b9..00000000 --- a/demo/placement_group_demo.py +++ /dev/null @@ -1,125 +0,0 @@ -import time -import asyncio -import ray -from ray.util import placement_group_table -from ray.util.state import (list_actors, - list_placement_groups) - -from manager_service_demo import (initialize_placement_group, - Llumlet) - - -def test_actor_if_pg_died(life_time_pg, lifetime_llumlet): - print(f"### placement group lifetime: {life_time_pg}, llumlet lifetime: {lifetime_llumlet}") - print("### create placement group and llumlet") - placement_group = initialize_placement_group(lifetime=life_time_pg) - _ = Llumlet.from_args("0", placement_group, lifetime=lifetime_llumlet) - print(f"placement group state: {placement_group_table(placement_group)}") - print(f"llumlet state: {list_actors()}") - print("### sleep 1s") - time.sleep(5) - print(f"llumlet state: {list_actors()}") - print("### remove placement group") - ray.util.remove_placement_group(placement_group) - print(f"placement group state: {placement_group_table(placement_group)}") - print(f"llumlet state: {list_actors()}") - -def test_pg_if_actor_died(life_time_pg, lifetime_llumlet): - print(f"### placement group lifetime: {life_time_pg}, llumlet lifetime: {lifetime_llumlet}") - print("### create placement group and llumlet") - placement_group = initialize_placement_group(lifetime=life_time_pg) - llumlet = Llumlet.from_args("0", placement_group, lifetime=lifetime_llumlet) - print(f"placement group state: {placement_group_table(placement_group)}") - print(f"llumlet state: {list_actors()}") - print("### sleep 5s") - time.sleep(5) - print(f"llumlet state: {list_actors()}") - print("### kill llumlet") - ray.kill(llumlet) - print(f"placement group state: {placement_group_table(placement_group)}") - print(f"llumlet state: {list_actors()}") - print("### remove placement group") - ray.util.remove_placement_group(placement_group) - -def test_pending(life_time_pg, lifetime_llumlet): - print(f"### placement group lifetime: {life_time_pg}, llumlet lifetime: {lifetime_llumlet}") - print("### create placement group and llumlet") - placement_group1 = initialize_placement_group(lifetime=life_time_pg) - llumlet1 = Llumlet.from_args("0", placement_group1, lifetime=lifetime_llumlet) - time.sleep(5) - print(f"placement group 1 state: {placement_group_table(placement_group1)}") - print(f"llumlet 1 state: {list_actors()}") - print("### create placement group and llumlet") - placement_group2 = 
initialize_placement_group(lifetime=life_time_pg) - llumlet2 = Llumlet.from_args("1", placement_group2, lifetime=lifetime_llumlet) - time.sleep(5) - print(f"placement group 2 state: {placement_group_table(placement_group2)}") - print(f"llumlet 2 state: {list_actors()}") - print("### kill llumlet") - ray.kill(llumlet1) - time.sleep(5) - print(f"placement group 2 state: {placement_group_table(placement_group2)}") - print(f"llumlet 2 state: {list_actors()}") - print("### remove placement group") - ray.util.remove_placement_group(placement_group1) - time.sleep(5) - print(f"placement group 2 state: {placement_group_table(placement_group2)}") - print(f"llumlet 2 state: {list_actors()}") - ray.util.remove_placement_group(placement_group2) - ray.kill(llumlet2) - -async def test_pg_ready(): - placement_group1 = initialize_placement_group() - try: - await asyncio.wait_for(placement_group1.ready(), timeout=5.0) - print("placement group 1 ready") - except asyncio.TimeoutError: - print("wait placement group 1 timeout") - placement_group2 = initialize_placement_group() - try: - await asyncio.wait_for(placement_group2.ready(), timeout=5.0) - print("placement group 2 ready") - except asyncio.TimeoutError: - print("wait placement group 2 timeout") - ray.util.remove_placement_group(placement_group1) - try: - await asyncio.wait_for(placement_group2.ready(), timeout=5.0) - print("placement group 2 ready") - except asyncio.TimeoutError: - print("wait placement group 2 timeout") - placement_group3 = initialize_placement_group() - ray.util.remove_placement_group(placement_group3) - await placement_group3.ready() - -def test_pg_api(): - placement_group1 = initialize_placement_group() - placement_group2 = initialize_placement_group() - time.sleep(3) - all_pgs = list_placement_groups() - print(f"all placement groups: {all_pgs}") - all_pgs_detail = list_placement_groups(detail=True) - print(f"all placement groups (detail): {all_pgs_detail}") - pending_pgs = list_placement_groups(filters=[("state", "=", "PENDING")]) - print(f"pending placement groups: {pending_pgs}") - created_pgs = list_placement_groups(filters=[("state", "=", "CREATED")]) - print(f"created placement groups: {created_pgs}") - - print(f"placement group 1 state: {placement_group_table(placement_group1)}") - print(f"placement group 2 state: {placement_group_table(placement_group2)}") - -if __name__ == "__main__": - # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet=None) - # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet="detached") - # test_actor_if_pg_died(life_time_pg="detached", lifetime_llumlet=None) - # test_actor_if_pg_died(life_time_pg=None, lifetime_llumlet="detached") - - # test_pg_if_actor_died(life_time_pg=None, lifetime_llumlet=None) - # test_pg_if_actor_died(life_time_pg=None, lifetime_llumlet="detached") - # test_pg_if_actor_died(life_time_pg="detached", lifetime_llumlet=None) - # test_pg_if_actor_died(life_time_pg=None, lifetime_llumlet="detached") - - # test_pending(life_time_pg=None, lifetime_llumlet=None) - - asyncio.run(test_pg_ready()) - - # test_pg_api() diff --git a/demo/serve_demo.py b/demo/serve_demo.py deleted file mode 100644 index c57cd29e..00000000 --- a/demo/serve_demo.py +++ /dev/null @@ -1,102 +0,0 @@ -import asyncio -import time -import threading -import argparse -from contextlib import asynccontextmanager -import uvicorn -import requests -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, Response, StreamingResponse -import ray - -from llumnix.queue.ray_queue_server 
import RayQueueServer - - -# pylint: disable=unused-argument -@asynccontextmanager -async def lifespan(fastapi_app: FastAPI): - asyncio.create_task(request_output_queue.run_server_loop()) - yield - request_output_queue.cleanup() - -app = FastAPI(lifespan=lifespan) -request_output_queue = None - -@app.get("/is_ready") -async def is_ready() -> bool: - return True - -# pylint: disable=unused-argument -@app.post("/generate") -async def generate(request: Request) -> Response: - ret = {"text": ""} - return JSONResponse(ret) - -@app.get("/health") -async def health() -> Response: - """Health check.""" - return Response(status_code=200) - -# pylint: disable=unused-argument -@app.post("/generate_stream") -async def generate_stream(request: Request) -> StreamingResponse: - async def number_generator(): - for i in range(10): - t = time.time() - yield f"Number: {i}, Time: {t}; " - await asyncio.sleep(0.5) - return StreamingResponse(number_generator(), media_type="text/plain") - -class FastAPIServer: - def __init__(self, host: str, port: int): - self.host = host - self.port = port - self.run_loop_thread = threading.Thread( - target=self._run_loop, args=(), daemon=True, name="run_loop" - ) - - def _run_loop(self): - uvicorn.run(app, host=self.host, port=self.port) - - def run(self): - self.run_loop_thread.start() - - @classmethod - def from_args(cls, host: str, port: int): - fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints", namespace="llumnix", lifetime="detached")(cls).options() - fastapi_server = fastapi_server_class.remote(host, port) - - return fastapi_server - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default='localhost') - parser.add_argument("--port", type=int, default=8000) - args = parser.parse_args() - - request_output_queue = RayQueueServer() - - server = FastAPIServer.from_args(args.host, args.port) - server.run.remote() - - time.sleep(5) - - ip_address = f"{args.host}:{args.port}" - api_list = [ - "is_ready", - "generate", - "generate_stream", - "health", - ] - for api in api_list: - try: - url = f"http://{ip_address}/{api}" - if api in ["is_ready", "health"]: - response = requests.get(url) - else: - response = requests.post(url) - response.raise_for_status() - print(f"api: {api}, response: {response}, response.text: {response.text}") - except requests.RequestException as e: - print(f"Request failed: {e}") diff --git a/demo/serve_demo1.py b/demo/serve_demo1.py deleted file mode 100644 index 745d10e4..00000000 --- a/demo/serve_demo1.py +++ /dev/null @@ -1,139 +0,0 @@ -import asyncio -import time -import argparse -from contextlib import asynccontextmanager -import uvicorn -import requests -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, Response, StreamingResponse -import ray - -from llumnix.queue.zmq_server import ZmqServer -from llumnix.queue.zmq_client import ZmqClient -from llumnix.queue.utils import get_open_zmq_ipc_path -from llumnix.utils import random_uuid -from llumnix.server_info import ServerInfo - -from llumnix.queue.ray_queue_server import RayQueueServer - - -# pylint: disable=unused-argument -@asynccontextmanager -async def lifespan(fastapi_app: FastAPI): - # @@@ - # loop = asyncio.get_event_loop() - # loop.create_task(request_output_queue_server.run_server_loop()) - asyncio.create_task(request_output_queue_server.run_server_loop()) - yield - # @@@ - request_output_queue_server.cleanup() - -app = FastAPI(lifespan=lifespan) -# @@@ -request_output_queue = None 
-request_output_queue_server = None - - -@app.get("/is_ready") -async def is_ready() -> bool: - return True - -# pylint: disable=unused-argument -@app.post("/generate") -async def generate(request: Request) -> Response: - ret = {"text": ""} - return JSONResponse(ret) - -@app.get("/health") -async def health() -> Response: - """Health check.""" - return Response(status_code=200) - -# pylint: disable=unused-argument -@app.post("/generate_stream") -async def generate_stream(request: Request) -> StreamingResponse: - async def number_generator(): - for i in range(10): - t = time.time() - yield f"Number: {i}, Time: {t}; " - await asyncio.sleep(0.5) - return StreamingResponse(number_generator(), media_type="text/plain") - - -class FastAPIServer: - def __init__(self, host: str, port: int): - self.host = host - self.port = port - rpc_path = get_open_zmq_ipc_path(self.host, 8002) - global request_output_queue_server - request_output_queue_server = ZmqServer(rpc_path) - # loop = asyncio.get_event_loop() - # loop.create_task(request_output_queue_server.run_server_loop()) - - def run(self): - uvicorn.run(app, host=self.host, port=self.port) - # rpc_path = get_open_zmq_ipc_path(self.host, 8002) - # request_output_queue_server = ZmqServer(rpc_path) - # loop = asyncio.get_event_loop() - # loop.create_task(request_output_queue_server.run_server_loop()) - # config = Config(app=app, loop=loop, host=self.host, port=self.port) - # server = Server(config) - # loop.run_until_complete(server.serve()) - - @classmethod - def from_args(cls, host: str, port: int): - fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints")(cls) - server = fastapi_server_class.remote(host, port) - - return server - -# pylint: disable=redefined-outer-name -async def wait_request_output_queue_server_ready(request_output_queue_client: ZmqClient, - server_info: ServerInfo): - time.sleep(5) - await request_output_queue_client.wait_for_server_rpc(server_info) - # request_output_queue_server.cleanup() - print("Request output queue server is ready.") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default='172.23.75.202') - parser.add_argument("--port", type=int, default=8000) - args = parser.parse_args() - - ray.init(namespace="llumnix") - - request_output_queue = RayQueueServer() - - # rpc_path = get_open_zmq_ipc_path(args.host, 8002) - # request_output_queue_server = ZmqServer(rpc_path) - request_output_queue_client = ZmqClient() - server_id = random_uuid() - server_info = ServerInfo(server_id, 'zmq', None, args.host, 8002) - - fastapi_server = FastAPIServer.from_args(args.host, args.port) - fastapi_server.run.remote() - - time.sleep(5) - - ip_address = f"{args.host}:{args.port}" - api_list = [ - "is_ready", - "generate", - "generate_stream", - "health", - ] - for api in api_list: - try: - url = f"http://{ip_address}/{api}" - if api in ["is_ready", "health"]: - response = requests.get(url) - else: - response = requests.post(url) - response.raise_for_status() - print(f"api: {api}, response: {response}, response.text: {response.text}") - except requests.RequestException as e: - print(f"Request failed: {e}") - - asyncio.run(wait_request_output_queue_server_ready(request_output_queue_client, server_info)) diff --git a/demo/serve_demo2.py b/demo/serve_demo2.py deleted file mode 100644 index 11b152e3..00000000 --- a/demo/serve_demo2.py +++ /dev/null @@ -1,45 +0,0 @@ -import argparse -import time -import ray -from fastapi import FastAPI -import uvicorn - -# @@@ -# from 
llumnix.queue.ray_queue_server import RayQueueServer -from ray.util.queue import Queue as RayQueue - -app = FastAPI() -# @@@ -# request_output_queue = RayQueueServer() -request_output_queue = None - - -class FastAPIServer: - def __init__(self, host: str, port: int): - self.host = host - self.port = port - - def run(self): - uvicorn.run(app, host=self.host, port=self.port) - - @classmethod - def from_args(cls, host: str, port: int): - fastapi_server_class = ray.remote(num_cpus=1, name="entrypoints")(cls) - fastapi_server = fastapi_server_class.remote(host, port) - - return fastapi_server - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default='localhost') - parser.add_argument("--port", type=int, default=8000) - args = parser.parse_args() - - ray.init(namespace="llumnix") - - request_output_queue = RayQueue() - - server = FastAPIServer.from_args(args.host, args.port) - - time.sleep(5) From 77a2c8980f77b2de9b8fe8d358dfbbdb45f4c1db Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 6 Jan 2025 09:10:54 +0000 Subject: [PATCH 50/92] Pass test_init_server_and_instance --- llumnix/entrypoints/vllm/api_server_actor.py | 6 +++ llumnix/manager.py | 11 +++-- .../global_scheduler/test_manager.py | 45 ++++++++++++++++++- 3 files changed, 55 insertions(+), 7 deletions(-) diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py index 4f158182..07afe89c 100644 --- a/llumnix/entrypoints/vllm/api_server_actor.py +++ b/llumnix/entrypoints/vllm/api_server_actor.py @@ -1,12 +1,18 @@ import threading +import traceback +import ray from ray.util.placement_group import PlacementGroup +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from llumnix.arg_utils import EntrypointsArgs from llumnix.entrypoints.utils import EntrypointsContext, get_ip_address from llumnix.llumlet.llumlet import Llumlet from llumnix.utils import get_server_name from llumnix.queue.utils import init_request_output_queue_server, QueueType +from llumnix.logger import init_logger + +logger = init_logger(__name__) class FastAPIServer: diff --git a/llumnix/manager.py b/llumnix/manager.py index dc4bf3eb..c5cd419a 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -35,9 +35,8 @@ from llumnix.utils import (random_uuid, clear_gloo_backend_state, remove_placement_group, get_instance_name, get_manager_name, INSTANCE_NAME_PREFIX, SERVER_NAME_PREFIX, get_placement_group_name, run_async_func_sync, - kill_server, kill_instance) + kill_server, kill_instance, initialize_placement_group) from llumnix.entrypoints.utils import DeploymentMode -from llumnix.utils import initialize_placement_group from llumnix.backends.utils import get_engine_world_size from llumnix.queue.queue_type import QueueType from llumnix.entrypoints.vllm.api_server_actor import FastAPIServer @@ -595,10 +594,10 @@ def _init_server_and_instance(self, async def done_scale_up(): try: manager = ray.get_actor(get_manager_name(), namespace="llumnix") - await server.is_ready() - await server.setup_entrypoints_context(manager, instance_id, instance) - await instance.is_ready() - await server.run() + await server.is_ready.remote() + await server.setup_entrypoints_context.remote(manager, instance_id, instance) + await instance.is_ready.remote() + await server.run.remote() self.scale_up(instance_id, instance) # pylint: disable=broad-except except Exception as e: diff --git a/tests/unit_test/global_scheduler/test_manager.py 
b/tests/unit_test/global_scheduler/test_manager.py index 64a0208e..e1b32a42 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -20,7 +20,7 @@ from vllm import EngineArgs from llumnix.utils import random_uuid, get_instance_name, get_manager_name -from llumnix.arg_utils import ManagerArgs +from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs from llumnix.manager import Manager from llumnix.instance_info import InstanceInfo from llumnix.server_info import ServerInfo @@ -29,6 +29,8 @@ from llumnix.backends.vllm.simulator import BackendSimVLLM from llumnix.backends.backend_interface import BackendType from llumnix.backends.profiling import LatencyMemData +from llumnix.entrypoints.utils import DeploymentMode +from llumnix.utils import get_server_name, get_instance_name # pylint: disable=unused-import from tests.conftest import ray_env @@ -113,6 +115,18 @@ def init_manager(): ray.get(manager.is_ready.remote()) return manager +def init_manager_with_deployment_mode(deployment_mode): + manager_args = ManagerArgs(migration_backend="rayrpc") + entrypoints_args = EntrypointsArgs(host="127.0.0.1", port=8000, request_output_queue_type="rayqueue") + engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) + deployment_args = DeploymentArgs(deployment_mode=deployment_mode, backend_type=BackendType.VLLM) + manager = Manager.from_args(manager_args=manager_args, + entrypoints_args=entrypoints_args, + engine_args=engine_args, + deployment_args=deployment_args) + ray.get(manager.is_ready.remote()) + return manager, manager_args, entrypoints_args, engine_args, deployment_args + def init_instances(initial_instances): instance_ids = [] instances = [] @@ -286,3 +300,32 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): assert num_migrate_in == 0 and num_migrate_out > 1 else: assert num_migrate_in == 0 and num_migrate_out == 0 + +def test_auto_scale_up_loop(ray_env): + pass + +def test_init_server_and_instance(ray_env): + manager, _, _, engine_args, _ = init_manager_with_deployment_mode(DeploymentMode.LOCAL) + instance_id = random_uuid() + placement_group = ray.get(manager._init_placement_group.remote(instance_id, engine_args, BackendType.VLLM, init_server=True)) + assert placement_group is not None + ray.get(manager._init_server_and_instance.remote(instance_id, placement_group)) + # wait for scale up + time.sleep(5.0) + server = ray.get_actor(get_server_name(instance_id), namespace="llumnix") + ray.get(server.is_ready.remote()) + assert server is not None + instance = ray.get_actor(get_instance_name(instance_id), namespace="llumnix") + ray.get(instance.is_ready.remote()) + assert instance is not None + num_instances = ray.get(manager.scale_up.remote(instance_id, instance)) + assert num_instances == 1 + +def test_clear_instance_ray_resources(ray_env): + pass + +def test_check_deployment_states_loop(ray_env): + pass + +def test_get_curr_deployment(ray_env): + pass From ba4a1cae23dda5029b676b2ebc3127edfbd10027 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Mon, 6 Jan 2025 09:36:37 +0000 Subject: [PATCH 51/92] Done test_clear_instance_ray_resources --- .../global_scheduler/test_manager.py | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index e1b32a42..d09e5690 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ 
b/tests/unit_test/global_scheduler/test_manager.py @@ -30,7 +30,7 @@ from llumnix.backends.backend_interface import BackendType from llumnix.backends.profiling import LatencyMemData from llumnix.entrypoints.utils import DeploymentMode -from llumnix.utils import get_server_name, get_instance_name +from llumnix.utils import get_placement_group_name, get_server_name, get_instance_name # pylint: disable=unused-import from tests.conftest import ray_env @@ -155,6 +155,20 @@ def llumlet(): ray.get(llumlet.is_ready.remote()) return llumlet +def is_actor_exists(actor_name): + try: + ray.get_actor(actor_name, namespace='llumnix') + return True + except ValueError: + return False + +def is_placement_group_exists(pg_name): + try: + ray.util.get_placement_group(pg_name) + return True + except ValueError: + return False + def test_init_manager(ray_env, manager): assert manager is not None manager_actor_handle = ray.get_actor(get_manager_name(), namespace='llumnix') @@ -304,25 +318,32 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): def test_auto_scale_up_loop(ray_env): pass -def test_init_server_and_instance(ray_env): +def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): manager, _, _, engine_args, _ = init_manager_with_deployment_mode(DeploymentMode.LOCAL) instance_id = random_uuid() - placement_group = ray.get(manager._init_placement_group.remote(instance_id, engine_args, BackendType.VLLM, init_server=True)) - assert placement_group is not None - ray.get(manager._init_server_and_instance.remote(instance_id, placement_group)) + pg = ray.get(manager._init_placement_group.remote(instance_id, engine_args, BackendType.VLLM, init_server=True)) + pg = ray.util.get_placement_group(get_placement_group_name(instance_id)) + ray.get(pg.ready()) + ray.get(manager._init_server_and_instance.remote(instance_id, pg)) # wait for scale up time.sleep(5.0) server = ray.get_actor(get_server_name(instance_id), namespace="llumnix") ray.get(server.is_ready.remote()) - assert server is not None instance = ray.get_actor(get_instance_name(instance_id), namespace="llumnix") ray.get(instance.is_ready.remote()) - assert instance is not None num_instances = ray.get(manager.scale_up.remote(instance_id, instance)) assert num_instances == 1 -def test_clear_instance_ray_resources(ray_env): - pass + # test clear_instance_ray_resources + ray.get(manager._clear_instance_ray_resources.remote(instance_id)) + # wait for remove and kill + time.sleep(1.0) + pg_exists = is_placement_group_exists(get_placement_group_name(instance_id)) + assert not pg_exists + server_exists = is_actor_exists(get_server_name(instance_id)) + assert not server_exists + instance_exists = is_actor_exists(get_instance_name(instance_id)) + assert not instance_exists def test_check_deployment_states_loop(ray_env): pass From 43886a8633751743050cf0d94d88020ddcdf4965 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 03:39:11 +0000 Subject: [PATCH 52/92] Done test_auto_scale_up_loop_and_get_curr_deployment --- llumnix/arg_utils.py | 6 ++++ llumnix/config/default.py | 2 ++ llumnix/entrypoints/vllm/api_server_actor.py | 1 + llumnix/llumlet/llumlet.py | 3 +- llumnix/manager.py | 34 ++++++++++++------- llumnix/utils.py | 21 ++++++++---- .../global_scheduler/test_manager.py | 30 ++++++++++++---- 7 files changed, 68 insertions(+), 29 deletions(-) diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index d4ee51b0..6f20fa51 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -150,6 +150,8 @@ class ManagerArgs: 
enable_pd_disagg: bool = None
+
+    enable_port_increment: bool = None
+
     def __post_init__(self):
         # Check if all fields default to None
         for field_info in dataclasses.fields(self):
@@ -347,6 +349,10 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
                             type=int,
                             help='number of available instances for dispatch')
 
+        parser.add_argument('--enable-port-increment',
+                            action='store_true',
+                            help='enable port increment when deploying multiple servers')
+
         return parser
 
 @dataclass
diff --git a/llumnix/config/default.py b/llumnix/config/default.py
index ba21e42f..23cc1fb1 100644
--- a/llumnix/config/default.py
+++ b/llumnix/config/default.py
@@ -65,6 +65,8 @@
 _C.MANAGER.LOG_FILENAME = "server.log"
 # Profiling result file path
 _C.MANAGER.PROFILING_RESULT_FILE_PATH = None
+# Enable port increment when deploying multiple servers
+_C.MANAGER.ENABLE_PORT_INCREMENT = False
 
 # Number of instances created at initialization
 _C.MANAGER.INITIAL_INSTANCES = 1
diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py
index 07afe89c..508092b7 100644
--- a/llumnix/entrypoints/vllm/api_server_actor.py
+++ b/llumnix/entrypoints/vllm/api_server_actor.py
@@ -1,5 +1,6 @@
 import threading
 import traceback
+import uvicorn
 import ray
 from ray.util.placement_group import PlacementGroup
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py
index 1706794e..b4d58d05 100644
--- a/llumnix/llumlet/llumlet.py
+++ b/llumnix/llumlet/llumlet.py
@@ -86,11 +86,10 @@ def from_args(cls,
         if backend_type == backend_type.BLADELLM:
             world_size = get_engine_world_size(engine_args, backend_type)
             num_gpus = world_size
-        instance_name = get_instance_name(instance_id)
         # TODO(s5u13b): Check the max_concurrency.
         llumlet_class = ray.remote(num_cpus=1,
                                    num_gpus=num_gpus,
-                                   name=instance_name,
+                                   name=get_instance_name(instance_id),
                                    namespace='llumnix',
                                    max_concurrency=4,
                                    lifetime="detached")(cls).options(
diff --git a/llumnix/manager.py b/llumnix/manager.py
index c5cd419a..ee22b169 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -14,6 +14,7 @@
 import asyncio
 import time
 import csv
+import copy
 import os
 from typing import Dict, List, Tuple, Union, Iterable
 from collections import defaultdict
@@ -47,7 +48,7 @@
 NO_INSTANCE_RETRY_INTERVAL = 0.1
 WAIT_ALL_MIGRATIONS_DONE_INTERVAL = 0.1
 AUTO_SCALE_UP_INTERVAL = 1.0
-WAIT_PLACEMENT_GROUP_TIMEOUT = 1.0
+WAIT_PLACEMENT_GROUP_TIMEOUT = 5.0
 CHECK_DEPLOYMENT_STATES_INTERVAL = 60.0
 WATCH_DEPLOYMENT_INTERVAL = 10.0
@@ -123,6 +124,7 @@ def __init__(self,
         asyncio.create_task(self._update_instance_info_loop(self.polling_interval))
         asyncio.create_task(self._clear_request_instance_loop(CLEAR_REQUEST_INSTANCE_INTERVAL))
 
+        self.port_count = 0
         if hasattr(self, "deployment_mode") and self.deployment_mode == DeploymentMode.GLOBAL:
             assert self.entrypoints_args is not None and self.engine_args is not None
             self.last_timeout_instance_id = None
@@ -281,7 +283,7 @@ async def _auto_scale_up_loop(self, interval: float) -> None:
             try:
                 new_pg = None
                 if self.last_timeout_instance_id is not None:
-                    last_timeout_pg_name = get_placement_group_name(last_timeout_instance_id)
+                    last_timeout_pg_name = get_placement_group_name(self.last_timeout_instance_id)
                     last_timeout_pg_states = list_placement_groups(filters=[("name", "=", last_timeout_pg_name)])
                     if len(last_timeout_pg_states) > 0:
                         new_instance_id = self.last_timeout_instance_id
@@ -297,7 +299,7 @@ async def _auto_scale_up_loop(self, interval: float) -> None:
                         self.scale_down(instance_id)
                 if new_pg 
is None: new_instance_id = random_uuid() - new_pg = self._init_placement_group(new_instance_id, self.engine_args, self.backend_type, init_server=True) + new_pg = self._init_placement_group(new_instance_id, self.engine_args, self.backend_type, init_server=True, block=False) try: await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT) except asyncio.TimeoutError: @@ -307,7 +309,8 @@ async def _auto_scale_up_loop(self, interval: float) -> None: await asyncio.sleep(interval) continue self._init_server_and_instance(new_instance_id, new_pg) - logger.info("[_auto_scale_up_loop] deploy server and instance to new placement group done") + logger.info("[_auto_scale_up_loop] deploy server and instance to new placement group done, " + "instance_id: {}".format(new_instance_id)) # pylint: disable=broad-except except Exception as e: logger.error("[_auto_scale_up_loop] unexpected exception occurs: {}".format(e)) @@ -423,16 +426,16 @@ def scale_down(self, instance_id: Union[str, Iterable[str]], rebuild_migration_b if ins_id in self.instances: del self.instances[ins_id] else: - logger.warning("[scale_down] instance {} is not in self.instances".format(ins_id)) + logger.debug("[scale_down] instance {} is not in self.instances".format(ins_id)) if ins_id in self.instance_migrating: del self.instance_migrating[ins_id] else: - logger.warning("[scale_down] instance {} is not in self.instance_migrating".format(ins_id)) + logger.debug("[scale_down] instance {} is not in self.instance_migrating".format(ins_id)) if self.log_instance_info: if ins_id in self.instance_last_logged_empty: del self.instance_last_logged_empty[ins_id] else: - logger.warning("[scale_down] instance {} is not in self.instance_last_logged_empty".format(ins_id)) + logger.debug("[scale_down] instance {} is not in self.instance_last_logged_empty".format(ins_id)) self.pending_rebuild_migration_instances += 1 self.global_scheduler.scale_down(instance_ids) self.num_instances = len(self.instances) @@ -449,11 +452,11 @@ def scale_down(self, instance_id: Union[str, Iterable[str]], rebuild_migration_b def _clear_instance_ray_resources(self, instance_id: str): if not remove_placement_group(instance_id): - logger.warning("[clear_instance_ray_resources] failed to remove placement group {}".format(instance_id)) + logger.debug("[clear_instance_ray_resources] failed to remove placement group {}".format(instance_id)) if not kill_server(instance_id): - logger.warning("[clear_instance_ray_resources] failed to kill server {}".format(instance_id)) + logger.debug("[clear_instance_ray_resources] failed to kill server {}".format(instance_id)) if not kill_instance(instance_id): - logger.warning("[clear_instance_ray_resources] failed to kill instance {}".format(instance_id)) + logger.debug("[clear_instance_ray_resources] failed to kill instance {}".format(instance_id)) async def _connect_to_instances(self): def connect_to_instances_done_callback(instance_id: str, instance_actor_handle: "ray.actor.ActorHandle", fut): @@ -507,16 +510,17 @@ def _init_placement_group(self, instance_id: str, engine_args, backend_type: BackendType, - init_server: bool = False) -> PlacementGroup: + init_server: bool = False, + block: bool = True) -> PlacementGroup: if not self.manager_args.profiling_result_file_path: # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor # num_gpus=world_size, for world_size Workers world_size = get_engine_world_size(engine_args, backend_type) - placement_group = initialize_placement_group(instance_id, num_cpus=3+int(init_server), 
num_gpus=world_size, detached=True) + placement_group = initialize_placement_group(instance_id, num_cpus=3+int(init_server), num_gpus=world_size, detached=True, block=block) else: assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM." # num_cpus=1, for Llumlet + AsyncPutQueueActor - placement_group = initialize_placement_group(instance_id, num_cpus=2+int(init_server), num_gpus=0, detached=True) + placement_group = initialize_placement_group(instance_id, num_cpus=2+int(init_server), num_gpus=0, detached=True, block=block) return placement_group @@ -524,6 +528,10 @@ def _init_server(self, instance_id: str, placement_group: PlacementGroup, entrypoints_args: EntrypointsArgs) -> FastAPIServer: + entrypoints_args = copy.deepcopy(entrypoints_args) + entrypoints_args.port += self.port_count + entrypoints_args.request_output_queue_port += self.port_count + self.port_count += 1 fastapi_server = FastAPIServer.from_args(instance_id, placement_group, entrypoints_args) return fastapi_server diff --git a/llumnix/utils.py b/llumnix/utils.py index f0390283..e5686dfc 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -17,6 +17,10 @@ import ray from ray.util.placement_group import PlacementGroup +from llumnix.logger import init_logger + +logger = init_logger(__name__) + MANAGER_NAME = "manager" PLACEMENT_GROUP_NAME_PREFIX = "pg_" SERVER_NAME_PREFIX = "server_" @@ -25,9 +29,10 @@ def initialize_placement_group( instance_id: str, - num_cpus: int = 1, - num_gpus: int = 1, - detached: bool = False + num_cpus: int, + num_gpus: int, + detached: bool = False, + block: bool = True ) -> PlacementGroup: """Initialize the distributed cluster probably with Ray. @@ -79,7 +84,8 @@ def initialize_placement_group( # Wait until PG is ready - this will block until all # requested resources are available, and will timeout # if they cannot be provisioned. 
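As an aside on the `block=False` path added above: placement-group creation itself returns immediately, and readiness is then awaited by the caller with a bounded timeout instead of blocking inside the helper. A minimal sketch of that caller-side pattern, assuming a running Ray cluster (the bundle spec and timeout below are illustrative, not the manager's actual values):

```python
import asyncio

import ray


async def create_pg_with_bounded_wait(name: str, timeout_s: float = 5.0):
    """Create a placement group without blocking, then bound the wait for
    readiness, mirroring the auto-scale-up loop's use of asyncio.wait_for."""
    pg = ray.util.placement_group([{"CPU": 1}], "STRICT_PACK", name=name)
    try:
        # pg.ready() returns an ObjectRef that resolves once all requested
        # resources are provisioned; awaiting it keeps the event loop free.
        await asyncio.wait_for(pg.ready(), timeout=timeout_s)
    except asyncio.TimeoutError:
        # Resources are not available yet; the caller can retry later.
        return None
    return pg
```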
- ray.get(current_placement_group.ready(), timeout=1800) + if block: + ray.get(current_placement_group.ready(), timeout=1800) return current_placement_group @@ -123,8 +129,9 @@ def get_instance_name(instance_id: str) -> str: def remove_placement_group(instance_id: str) -> bool: try: placement_group = ray.util.get_placement_group(get_placement_group_name(instance_id)) - if not placement_group: - return False + except ValueError: + return False + try: # asynchronous api ray.util.remove_placement_group(placement_group) logger.info("remove placement group {}".format(instance_id)) @@ -153,7 +160,7 @@ def kill_instance(instance_id: str) -> bool: return False try: ray.kill(instance) - print("kill instance {}".format(instance_id)) + logger.info("kill instance {}".format(instance_id)) # pylint: disable=broad-except except Exception: return False diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index d09e5690..75668de3 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -30,7 +30,8 @@ from llumnix.backends.backend_interface import BackendType from llumnix.backends.profiling import LatencyMemData from llumnix.entrypoints.utils import DeploymentMode -from llumnix.utils import get_placement_group_name, get_server_name, get_instance_name +from llumnix.utils import (get_placement_group_name, get_server_name, get_instance_name, + initialize_placement_group, remove_placement_group, INSTANCE_NAME_PREFIX) # pylint: disable=unused-import from tests.conftest import ray_env @@ -116,7 +117,7 @@ def init_manager(): return manager def init_manager_with_deployment_mode(deployment_mode): - manager_args = ManagerArgs(migration_backend="rayrpc") + manager_args = ManagerArgs(migration_backend="rayrpc", enbale_port_increment=True) entrypoints_args = EntrypointsArgs(host="127.0.0.1", port=8000, request_output_queue_type="rayqueue") engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) deployment_args = DeploymentArgs(deployment_mode=deployment_mode, backend_type=BackendType.VLLM) @@ -315,8 +316,26 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): else: assert num_migrate_in == 0 and num_migrate_out == 0 -def test_auto_scale_up_loop(ray_env): - pass +def test_auto_scale_up_loop_and_get_curr_deployment(ray_env): + manager, _, _, _, _ = init_manager_with_deployment_mode(DeploymentMode.GLOBAL) + time.sleep(30.0) + num_instances = ray.get(manager.scale_up.remote([], [])) + assert num_instances == 4 + curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) + assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 + + actor_names_dict = ray.util.list_named_actors(all_namespaces=True) + instance_actor_names = [actor_name_dict['name'] for actor_name_dict in actor_names_dict + if actor_name_dict['name'].startswith(INSTANCE_NAME_PREFIX)] + instance_ids = [actor_name.split("_")[-1] for actor_name in instance_actor_names] + ray.get(manager._clear_instance_ray_resources.remote(instance_ids[0])) + ray.get(manager._clear_instance_ray_resources.remote(instance_ids[1])) + time.sleep(20.0) + num_instances = ray.get(manager.scale_up.remote([], [])) + assert num_instances == 4 + curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) + assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 + def 
test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): manager, _, _, engine_args, _ = init_manager_with_deployment_mode(DeploymentMode.LOCAL) @@ -347,6 +366,3 @@ def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): def test_check_deployment_states_loop(ray_env): pass - -def test_get_curr_deployment(ray_env): - pass From 30d13b8a68acd21d62a33b7b2f7bcabd355b43a9 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 05:49:57 +0000 Subject: [PATCH 53/92] Refine global deployment test --- llumnix/manager.py | 1 + llumnix/queue/ray_queue_server.py | 5 ++++- tests/unit_test/global_scheduler/test_manager.py | 14 ++++++++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index ee22b169..86af2021 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -55,6 +55,7 @@ # TODO(s5u13b): Fix the logger when manager failover. # TODO(s5u13b): Handle exception of ray operations. # TODO(s5u13b): Update the documents of global deployment. +# TODO(s5u13b): Add exception handling wrapper. class Manager: diff --git a/llumnix/queue/ray_queue_server.py b/llumnix/queue/ray_queue_server.py index 6cff2607..bc6542a2 100644 --- a/llumnix/queue/ray_queue_server.py +++ b/llumnix/queue/ray_queue_server.py @@ -52,4 +52,7 @@ async def run_server_loop(self): pass def cleanup(self): - pass + try: + ray.kill(self.queue) + except Exception as e: + pass diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 75668de3..1b876e58 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -116,9 +116,9 @@ def init_manager(): ray.get(manager.is_ready.remote()) return manager -def init_manager_with_deployment_mode(deployment_mode): +def init_manager_with_deployment_mode(deployment_mode, request_output_queue_type="rayqueue"): manager_args = ManagerArgs(migration_backend="rayrpc", enbale_port_increment=True) - entrypoints_args = EntrypointsArgs(host="127.0.0.1", port=8000, request_output_queue_type="rayqueue") + entrypoints_args = EntrypointsArgs(host="127.0.0.1", port=8000, request_output_queue_type=request_output_queue_type) engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) deployment_args = DeploymentArgs(deployment_mode=deployment_mode, backend_type=BackendType.VLLM) manager = Manager.from_args(manager_args=manager_args, @@ -316,8 +316,9 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): else: assert num_migrate_in == 0 and num_migrate_out == 0 -def test_auto_scale_up_loop_and_get_curr_deployment(ray_env): - manager, _, _, _, _ = init_manager_with_deployment_mode(DeploymentMode.GLOBAL) +@pytest.mark.parametrize("request_output_queue_type", ['rayqueue', 'zmq']) +def test_auto_scale_up_loop_and_get_curr_deployment(ray_env, request_output_queue_type): + manager, _, _, _, _ = init_manager_with_deployment_mode(DeploymentMode.GLOBAL, request_output_queue_type) time.sleep(30.0) num_instances = ray.get(manager.scale_up.remote([], [])) assert num_instances == 4 curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 actor_names_dict = ray.util.list_named_actors(all_namespaces=True) instance_actor_names = [actor_name_dict['name'] for actor_name_dict in actor_names_dict if actor_name_dict['name'].startswith(INSTANCE_NAME_PREFIX)] instance_ids = [actor_name.split("_")[-1] for actor_name in instance_actor_names] ray.get(manager._clear_instance_ray_resources.remote(instance_ids[0])) + time.sleep(5.0) ray.get(manager._clear_instance_ray_resources.remote(instance_ids[1])) - time.sleep(20.0) + #
TODO(s5u13b): Ray queue rpc errors or instance deaths occur sometimes. + time.sleep(30.0) num_instances = ray.get(manager.scale_up.remote([], [])) assert num_instances == 4 curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 - def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): manager, _, _, engine_args, _ = init_manager_with_deployment_mode(DeploymentMode.LOCAL) instance_id = random_uuid() From c6fe8fe9c33ca5afe3827e02f5474f369aad180b Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 06:27:18 +0000 Subject: [PATCH 54/92] Done test_check_deployment_states_loop_and_auto_scale_up_loop --- llumnix/manager.py | 6 +- llumnix/queue/ray_queue_server.py | 1 + .../global_scheduler/test_manager.py | 75 ++++++++++++------- 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index 86af2021..b392a99e 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -49,8 +49,8 @@ WAIT_ALL_MIGRATIONS_DONE_INTERVAL = 0.1 AUTO_SCALE_UP_INTERVAL = 1.0 WAIT_PLACEMENT_GROUP_TIMEOUT = 5.0 -CHECK_DEPLOYMENT_STATES_INTERVAL = 60.0 -WATCH_DEPLOYMENT_INTERVAL = 10.0 +CHECK_DEPLOYMENT_STATES_INTERVAL = 30.0 +WATCH_DEPLOYMENT_INTERVAL = 30.0 # TODO(s5u13b): Fix the logger when manager failover. # TODO(s5u13b): Handle exception of ray operations. @@ -626,7 +626,7 @@ async def watch_deployment(instance_id: str): curr_pgs, curr_servers, curr_instances = self.get_curr_deployment() if instance_id in curr_pgs and (instance_id not in curr_servers or instance_id not in curr_instances): logger.warning("[_check_deployment_states_loop] instance {} deployment states incorrect".format(instance_id)) - self._scale_down(instance_id) + self.scale_down(instance_id) while True: try: diff --git a/llumnix/queue/ray_queue_server.py b/llumnix/queue/ray_queue_server.py index bc6542a2..b8648157 100644 --- a/llumnix/queue/ray_queue_server.py +++ b/llumnix/queue/ray_queue_server.py @@ -54,5 +54,6 @@ async def run_server_loop(self): def cleanup(self): try: ray.kill(self.queue) + # pylint: disable=broad-except, unused-variable except Exception as e: pass diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 1b876e58..70127722 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -31,7 +31,9 @@ from llumnix.backends.profiling import LatencyMemData from llumnix.entrypoints.utils import DeploymentMode from llumnix.utils import (get_placement_group_name, get_server_name, get_instance_name, - initialize_placement_group, remove_placement_group, INSTANCE_NAME_PREFIX) + initialize_placement_group, remove_placement_group, INSTANCE_NAME_PREFIX, + SERVER_NAME_PREFIX, remove_placement_group, kill_server, + kill_instance) # pylint: disable=unused-import from tests.conftest import ray_env @@ -315,30 +317,7 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): assert num_migrate_in == 0 and num_migrate_out > 1 else: assert num_migrate_in == 0 and num_migrate_out == 0 -
assert num_instances == 4 - curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) - assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 - - actor_names_dict = ray.util.list_named_actors(all_namespaces=True) - instance_actor_names = [actor_name_dict['name'] for actor_name_dict in actor_names_dict - if actor_name_dict['name'].startswith(INSTANCE_NAME_PREFIX)] - instance_ids = [actor_name.split("_")[-1] for actor_name in instance_actor_names] - ray.get(manager._clear_instance_ray_resources.remote(instance_ids[0])) - time.sleep(5.0) - ray.get(manager._clear_instance_ray_resources.remote(instance_ids[1])) - # TODO(s5u13b): Ray queue rpc errors or instance deaths occur sometimes. - time.sleep(30.0) - num_instances = ray.get(manager.scale_up.remote([], [])) - assert num_instances == 4 - curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) - assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 - + def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): manager, _, _, engine_args, _ = init_manager_with_deployment_mode(DeploymentMode.LOCAL) @@ -366,5 +345,47 @@ def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): instance_exists = is_actor_exists(get_instance_name(instance_id)) assert not instance_exists +@pytest.mark.parametrize("request_output_queue_type", ['rayqueue', 'zmq']) +def test_auto_scale_up_loop_and_get_curr_deployment(ray_env, request_output_queue_type): + manager, _, _, _, _ = init_manager_with_deployment_mode(DeploymentMode.GLOBAL, request_output_queue_type) + time.sleep(30.0) + num_instances = ray.get(manager.scale_up.remote([], [])) + assert num_instances == 4 + curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) + assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 + + actor_names_dict = ray.util.list_named_actors(all_namespaces=True) + instance_ids = [actor_name_dict['name'].split("_")[-1] for actor_name_dict in actor_names_dict + if actor_name_dict['name'].startswith(INSTANCE_NAME_PREFIX)] + assert len(instance_ids) == 4 + ray.get(manager._clear_instance_ray_resources.remote(instance_ids[0])) + ray.get(manager._clear_instance_ray_resources.remote(instance_ids[1])) + # TODO(s5u13b): Ray queue rpc errors or instance deaths occur sometimes.
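The id-recovery idiom in this test, parsing instance ids out of Ray's named-actor registry, reads more clearly in isolation; a small sketch, assuming actors are named `instance_<instance_id>` as with `INSTANCE_NAME_PREFIX` above:

```python
from typing import List

import ray

INSTANCE_NAME_PREFIX = "instance_"  # assumed to match llumnix.utils


def list_instance_ids() -> List[str]:
    """List the ids of all live instance actors across namespaces,
    relying on the 'instance_<id>' naming convention used above."""
    actors = ray.util.list_named_actors(all_namespaces=True)
    return [actor["name"].split("_")[-1]
            for actor in actors
            if actor["name"].startswith(INSTANCE_NAME_PREFIX)]
```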
+ time.sleep(30.0) + num_instances = ray.get(manager.scale_up.remote([], [])) + assert num_instances == 4 + curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) + assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 + +@pytest.mark.parametrize("request_output_queue_type", ['rayqueue', 'zmq']) +def test_check_deployment_states_loop_and_auto_scale_up_loop(ray_env, request_output_queue_type): + manager, _, _, _, _ = init_manager_with_deployment_mode(DeploymentMode.GLOBAL, request_output_queue_type) + time.sleep(30.0) + num_instances = ray.get(manager.scale_up.remote([], [])) + assert num_instances == 4 + curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) + assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 + + actor_names_dict = ray.util.list_named_actors(all_namespaces=True) + instance_ids = [actor_name_dict['name'].split("_")[-1] for actor_name_dict in actor_names_dict + if actor_name_dict['name'].startswith(INSTANCE_NAME_PREFIX)] + assert len(instance_ids) == 4 + remove_placement_group(instance_ids[0]) + kill_server(instance_ids[1]) + kill_instance(instance_ids[2]) + # Wait for check deployment states, scale down instance and auto scale up. + time.sleep(90.0) + num_instances = ray.get(manager.scale_up.remote([], [])) + assert num_instances == 4 + curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote()) + assert len(curr_pgs) == 4 and len(curr_servers) == 4 and len(curr_instances) == 4 From 5429aad0dfe5798503f21e670ae5795b3da6dc7e Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 06:48:45 +0000 Subject: [PATCH 55/92] Add global deployment mode in bench test --- llumnix/arg_utils.py | 2 +- tests/e2e_test/test_bench.py | 45 ++++++++++++------- tests/e2e_test/utils.py | 35 +++++++++++++++ .../global_scheduler/test_manager.py | 2 +- 4 files changed, 67 insertions(+), 17 deletions(-) diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index 6f20fa51..d692880a 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -150,7 +150,7 @@ class ManagerArgs: enable_pd_disagg: bool = None - enbale_port_increment: bool = None + enable_port_increment: bool = None def __post_init__(self): # Check if all fields default to None diff --git a/tests/e2e_test/test_bench.py b/tests/e2e_test/test_bench.py index 0719a524..5c1a8644 100644 --- a/tests/e2e_test/test_bench.py +++ b/tests/e2e_test/test_bench.py @@ -23,7 +23,8 @@ # pylint: disable=unused-import from tests.conftest import ray_env from .utils import (generate_launch_command, generate_bench_command, to_markdown_table, - wait_for_llumnix_service_ready, shutdown_llumnix_service) + wait_for_llumnix_service_ready, shutdown_llumnix_service, + generate_serve_command) BENCH_TEST_TIMEOUT_MINS = 30 @@ -63,21 +64,34 @@ def get_markdown_data(key: str, head_name: str): @pytest.mark.asyncio @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="at least 1 gpus required for simple benchmark") @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) -async def test_simple_benchmark(ray_env, shutdown_llumnix_service, model): - device_count = torch.cuda.device_count() +@pytest.mark.parametrize("deployment_mode", ['global', 'local']) +async def test_simple_benchmark(ray_env, shutdown_llumnix_service, model, deployment_mode): ip = "127.0.0.1" base_port = 37037 ip_ports = [] - for i in range(device_count): - port = base_port+i - ip_port = f"{ip}:{port}" - ip_ports.append(ip_port) - 
launch_command = generate_launch_command(result_filename=str(base_port+i)+".out", - launch_ray_cluster=False, - ip=ip, - port=port, - model=model) - subprocess.run(launch_command, shell=True, check=True) + if deployment_mode == 'local': + device_count = torch.cuda.device_count() + for i in range(device_count): + port = base_port+i + ip_port = f"{ip}:{port}" + ip_ports.append(ip_port) + launch_command = generate_launch_command(result_filename=str(base_port+i)+".out", + launch_ray_cluster=False, + ip=ip, + port=port, + model=model) + subprocess.run(launch_command, shell=True, check=True) + else: # global + device_count = torch.cuda.device_count() + for i in range(device_count): + port = base_port+i + ip_port = f"{ip}:{port}" + ip_ports.append(ip_port) + serve_command = generate_serve_command(result_filename=str(base_port)+".out", + ip=ip, + port=base_port, + model=model) + subprocess.run(serve_command, shell=True, check=True) wait_for_llumnix_service_ready(ip_ports) @@ -113,7 +127,8 @@ def run_bench_command(command): process.kill() assert False, "bench_test timed out after {} minutes.".format(BENCH_TEST_TIMEOUT_MINS) - with open("performance.txt", "w", encoding="utf-8") as f: - f.write(parse_log_file()) + if deployment_mode == 'local': + with open("performance.txt", "w", encoding="utf-8") as f: + f.write(parse_log_file()) await asyncio.sleep(3) diff --git a/tests/e2e_test/utils.py b/tests/e2e_test/utils.py index 7b454c2c..8321e161 100644 --- a/tests/e2e_test/utils.py +++ b/tests/e2e_test/utils.py @@ -56,6 +56,40 @@ def generate_launch_command(result_filename: str = "", ) return command +def generate_serve_command(result_filename: str = "", + ip: str = "127.0.0.1", + port: int = 37000, + dispatch_policy: str = "load", + migration_backend = "gloo", + model = "facebook/opt-125m", + max_model_len: int = 4096, + log_instance_info: bool = False, + request_migration_policy: str = 'SR', + max_num_batched_tokens: int = 16000): + command = ( + f"RAY_DEDUP_LOGS=0 " + f"nohup python -u -m llumnix.entrypoints.vllm.serve " + f"--host {ip} " + f"--port {port} " + f"{'--log-filename manager ' if log_instance_info else ''}" + f"{'--log-instance-info ' if log_instance_info else ''}" + f"--enable-migration " + f"--model {model} " + f"--engine-use-ray " + f"--worker-use-ray " + f"--max-model-len {max_model_len} " + f"--dispatch-policy {dispatch_policy} " + f"--trust-remote-code " + f"--request-migration-policy {request_migration_policy} " + f"--migration-backend {migration_backend} " + f"--migration-buffer-blocks 32 " + f"--tensor-parallel-size 1 " + f"--request-output-queue-port {1234+port} " + f"--max-num-batched-tokens {max_num_batched_tokens} " + f"{'> instance_'+result_filename if len(result_filename)> 0 else ''} 2>&1 &" + ) + return command + def wait_for_llumnix_service_ready(ip_ports, timeout=120): start_time = time.time() while True: @@ -112,6 +146,7 @@ def generate_bench_command(ip_ports: str, def shutdown_llumnix_service_func(): subprocess.run('pkill -f llumnix.entrypoints.vllm.api_server', shell=True, check=False) subprocess.run('pkill -f benchmark_serving.py', shell=True, check=False) + subprocess.run('pkill -f llumnix.entrypoints.vllm.serve', shell=True, check=False) @pytest.fixture def shutdown_llumnix_service(): diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 70127722..a926f1f0 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -119,7 +119,7 @@ def 
init_manager(): return manager def init_manager_with_deployment_mode(deployment_mode, request_output_queue_type="rayqueue"): - manager_args = ManagerArgs(migration_backend="rayrpc", enbale_port_increment=True) + manager_args = ManagerArgs(migration_backend="rayrpc", enable_port_increment=True) entrypoints_args = EntrypointsArgs(host="127.0.0.1", port=8000, request_output_queue_type=request_output_queue_type) engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) deployment_args = DeploymentArgs(deployment_mode=deployment_mode, backend_type=BackendType.VLLM) From e895c09b4190920adf4588842d60ea85220e219c Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 07:05:18 +0000 Subject: [PATCH 56/92] Fix bench test --- tests/e2e_test/test_bench.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/e2e_test/test_bench.py b/tests/e2e_test/test_bench.py index 5c1a8644..f4a1918c 100644 --- a/tests/e2e_test/test_bench.py +++ b/tests/e2e_test/test_bench.py @@ -91,6 +91,7 @@ async def test_simple_benchmark(ray_env, shutdown_llumnix_service, model, deploy ip=ip, port=base_port, model=model) + subprocess.run('ray start --head', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) subprocess.run(serve_command, shell=True, check=True) wait_for_llumnix_service_ready(ip_ports) From 98e95d4117d9a77b018abe33f1439e619dc53b44 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 07:08:52 +0000 Subject: [PATCH 57/92] Fix lint --- tests/e2e_test/test_bench.py | 1 + tests/unit_test/global_scheduler/test_manager.py | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/e2e_test/test_bench.py b/tests/e2e_test/test_bench.py index f4a1918c..21d35f19 100644 --- a/tests/e2e_test/test_bench.py +++ b/tests/e2e_test/test_bench.py @@ -91,6 +91,7 @@ async def test_simple_benchmark(ray_env, shutdown_llumnix_service, model, deploy ip=ip, port=base_port, model=model) + # pylint: disable=subprocess-run-check subprocess.run('ray start --head', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) subprocess.run(serve_command, shell=True, check=True) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index a926f1f0..975c449a 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -19,7 +19,6 @@ from vllm import EngineArgs -from llumnix.utils import random_uuid, get_instance_name, get_manager_name from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs from llumnix.manager import Manager from llumnix.instance_info import InstanceInfo @@ -31,9 +30,8 @@ from llumnix.backends.profiling import LatencyMemData from llumnix.entrypoints.utils import DeploymentMode from llumnix.utils import (get_placement_group_name, get_server_name, get_instance_name, - initialize_placement_group, remove_placement_group, INSTANCE_NAME_PREFIX, - SERVER_NAME_PREFIX, remove_placement_group, kill_server, - kill_instance) + remove_placement_group, INSTANCE_NAME_PREFIX, kill_server, + kill_instance, random_uuid, get_manager_name) # pylint: disable=unused-import from tests.conftest import ray_env @@ -317,7 +315,7 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): assert num_migrate_in == 0 and num_migrate_out > 1 else: assert num_migrate_in == 0 and num_migrate_out == 0 - + def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): manager, _, _, engine_args, _ = 
init_manager_with_deployment_mode(DeploymentMode.LOCAL) instance_id = random_uuid() From 9121e197a672195dd998174832257d507ab6e6e2 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 07:51:53 +0000 Subject: [PATCH 58/92] Fix port increment --- llumnix/manager.py | 7 ++++--- tests/e2e_test/utils.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index b392a99e..e65edafb 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -530,9 +530,10 @@ def _init_server(self, placement_group: PlacementGroup, entrypoints_args: EntrypointsArgs) -> FastAPIServer: entrypoints_args = copy.deepcopy(entrypoints_args) - entrypoints_args.port += self.port_count - entrypoints_args.request_output_queue_port += self.port_count - self.port_count += 1 + if self.manager_args.enable_port_increment: + entrypoints_args.port += self.port_count + entrypoints_args.request_output_queue_port += self.port_count + self.port_count += 1 fastapi_server = FastAPIServer.from_args(instance_id, placement_group, entrypoints_args) return fastapi_server diff --git a/tests/e2e_test/utils.py b/tests/e2e_test/utils.py index 8321e161..da71f32a 100644 --- a/tests/e2e_test/utils.py +++ b/tests/e2e_test/utils.py @@ -86,6 +86,7 @@ def generate_serve_command(result_filename: str = "", f"--tensor-parallel-size 1 " f"--request-output-queue-port {1234+port} " f"--max-num-batched-tokens {max_num_batched_tokens} " + f"--enable-port-increment " f"{'> instance_'+result_filename if len(result_filename)> 0 else ''} 2>&1 &" ) return command From 0e975515ffd00b955574133509710f721f892d56 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 08:02:29 +0000 Subject: [PATCH 59/92] Update ray requirements --- requirements/requirements_bladellm.txt | 2 +- requirements/requirements_vllm.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/requirements_bladellm.txt b/requirements/requirements_bladellm.txt index 3c66e6c7..ecaa9301 100644 --- a/requirements/requirements_bladellm.txt +++ b/requirements/requirements_bladellm.txt @@ -1,4 +1,4 @@ -ray >= 2.9.0 +ray[default] >= 2.9.0 pyarrow # Required for Ray data. aiohttp pandas diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt index f9fbe6a6..8af54fbd 100644 --- a/requirements/requirements_vllm.txt +++ b/requirements/requirements_vllm.txt @@ -1,5 +1,5 @@ vllm == 0.4.2 -ray >= 2.9.0 +ray[default] >= 2.9.0 pyarrow # Required for Ray data.
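The switch from `ray` to `ray[default]` matters because the Ray state API that the manager calls (e.g. `list_placement_groups` in the auto-scale-up loop) depends on components that ship only with the default extra. A quick sanity check, assuming a running Ray cluster and a placement group named `pg_0`:

```python
import ray
from ray.util.state import list_placement_groups  # needs ray[default]

ray.init(address="auto")
# Same filter syntax the manager uses to look up a timed-out placement group.
print(list_placement_groups(filters=[("name", "=", "pg_0")]))
```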
aiohttp scipy From 2561691910cd66ba7ae11b09f4130f68d168d81d Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 09:04:21 +0000 Subject: [PATCH 60/92] Fix cr comments --- llumnix/entrypoints/vllm/api_server_actor.py | 5 ++--- llumnix/manager.py | 20 +++++++++++------- llumnix/utils.py | 6 +++--- .../backends/vllm/test_llm_engine.py | 8 +++---- .../unit_test/backends/vllm/test_migration.py | 6 +++--- .../backends/vllm/test_migration_backend.py | 6 +++--- .../unit_test/backends/vllm/test_simulator.py | 4 ++-- tests/unit_test/backends/vllm/test_worker.py | 6 +++--- .../global_scheduler/test_manager.py | 3 ++- .../llumlet/test_engine_step_exception.py | 21 ++++++++++++------- 10 files changed, 47 insertions(+), 38 deletions(-) diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py index 508092b7..eba9f106 100644 --- a/llumnix/entrypoints/vllm/api_server_actor.py +++ b/llumnix/entrypoints/vllm/api_server_actor.py @@ -9,7 +9,6 @@ from llumnix.arg_utils import EntrypointsArgs from llumnix.entrypoints.utils import EntrypointsContext, get_ip_address from llumnix.llumlet.llumlet import Llumlet -from llumnix.utils import get_server_name from llumnix.queue.utils import init_request_output_queue_server, QueueType from llumnix.logger import init_logger @@ -62,12 +61,12 @@ def run(self): @classmethod def from_args(cls, - instance_id: str, + server_name: str, placement_group: PlacementGroup, entrypoints_args: EntrypointsArgs): try: fastapi_server_class = ray.remote(num_cpus=1, - name=get_server_name(instance_id), + name=server_name, namespace="llumnix", lifetime="detached")(cls).options( scheduling_strategy=PlacementGroupSchedulingStrategy( diff --git a/llumnix/manager.py b/llumnix/manager.py index e65edafb..68738933 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -36,7 +36,8 @@ from llumnix.utils import (random_uuid, clear_gloo_backend_state, remove_placement_group, get_instance_name, get_manager_name, INSTANCE_NAME_PREFIX, SERVER_NAME_PREFIX, get_placement_group_name, run_async_func_sync, - kill_server, kill_instance, initialize_placement_group) + kill_server, kill_instance, initialize_placement_group, + get_server_name) from llumnix.entrypoints.utils import DeploymentMode from llumnix.backends.utils import get_engine_world_size from llumnix.queue.queue_type import QueueType @@ -300,7 +301,8 @@ async def _auto_scale_up_loop(self, interval: float) -> None: self.scale_down(instance_id) if new_pg is None: new_instance_id = random_uuid() - new_pg = self._init_placement_group(new_instance_id, self.engine_args, self.backend_type, init_server=True, block=False) + new_pg = self._init_placement_group(get_placement_group_name(new_instance_id), self.engine_args, self.backend_type, + init_server=True, block=False) try: await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT) except asyncio.TimeoutError: @@ -508,7 +510,7 @@ def from_args(cls, return manager def _init_placement_group(self, - instance_id: str, + placement_group_name: str, engine_args, backend_type: BackendType, init_server: bool = False, @@ -517,16 +519,18 @@ def _init_placement_group(self, # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor # num_gpus=world_size, for world_size Workers world_size = get_engine_world_size(engine_args, backend_type) - placement_group = initialize_placement_group(instance_id, num_cpus=3+int(init_server), num_gpus=world_size, detached=True, block=block) + placement_group = initialize_placement_group(placement_group_name, + 
num_cpus=3+int(init_server), num_gpus=world_size, detached=True, block=block) else: assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM." # num_cpus=1, for Llumlet + AsyncPutQueueActor - placement_group = initialize_placement_group(instance_id, num_cpus=2+int(init_server), num_gpus=0, detached=True, block=block) + placement_group = initialize_placement_group(placement_group_name, + num_cpus=2+int(init_server), num_gpus=0, detached=True, block=block) return placement_group def _init_server(self, - instance_id: str, + server_name: str, placement_group: PlacementGroup, entrypoints_args: EntrypointsArgs) -> FastAPIServer: entrypoints_args = copy.deepcopy(entrypoints_args) @@ -534,7 +538,7 @@ def _init_server(self, entrypoints_args.port += self.port_count entrypoints_args.request_output_queue_port += self.port_count self.port_count += 1 - fastapi_server = FastAPIServer.from_args(instance_id, placement_group, entrypoints_args) + fastapi_server = FastAPIServer.from_args(server_name, placement_group, entrypoints_args) return fastapi_server def _init_instance(self, diff --git a/llumnix/utils.py b/llumnix/utils.py index e5686dfc..a4d972fa 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -28,7 +28,7 @@ def initialize_placement_group( - instance_id: str, + placement_group_name: str, num_cpus: int, num_gpus: int, detached: bool = False, @@ -37,7 +37,7 @@ """Initialize the distributed cluster probably with Ray. Args: - instance_id: The instance id of the instance scheduled to the placement group. + placement_group_name: The name of the placement group. num_cpus: The number of cpus in placement group. num_cpus: The number of cpus in placement group. detached: Whether the lifetime of the placement group being detached. @@ -80,7 +80,7 @@ # bundle_0: Llumlet + AsyncPutQueueActor + ProxyActor, bundle_1: Workers placement_group_specs = ([{"CPU": num_cpus}] + [{"GPU": 1}] * num_gpus) current_placement_group = ray.util.placement_group( - placement_group_specs, "STRICT_PACK", name=get_placement_group_name(instance_id), lifetime=lifetime) + placement_group_specs, "STRICT_PACK", name=placement_group_name, lifetime=lifetime) # Wait until PG is ready - this will block until all # requested resources are available, and will timeout # if they cannot be provisioned.
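With this rename, call sites hand the helper a fully formed placement-group name rather than an instance id; a representative call, mirroring how the updated tests below invoke it (the id and resource sizes are illustrative):

```python
from llumnix.utils import get_placement_group_name, initialize_placement_group

# The caller derives the name itself now; block=False returns immediately
# and leaves the readiness wait to the caller.
placement_group = initialize_placement_group(get_placement_group_name("0"),
                                             num_cpus=3, num_gpus=1,
                                             detached=True, block=False)
```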
diff --git a/tests/unit_test/backends/vllm/test_llm_engine.py b/tests/unit_test/backends/vllm/test_llm_engine.py index 5da6a025..86d5ff61 100644 --- a/tests/unit_test/backends/vllm/test_llm_engine.py +++ b/tests/unit_test/backends/vllm/test_llm_engine.py @@ -28,7 +28,7 @@ from llumnix.backends.vllm.sequence import LlumnixRequest from llumnix.queue.queue_type import QueueType from llumnix.server_info import ServerInfo -from llumnix.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group, get_placement_group_name from tests.conftest import ray_env from .utils import create_dummy_prompt, initialize_scheduler @@ -90,7 +90,7 @@ def test_llm_engine_process_model_outputs(): def test_llm_engine_from_engine_args(ray_env): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - placement_group = initialize_placement_group(instance_id="0", num_cpus=3, num_gpus=1, detached=True) + placement_group = initialize_placement_group(get_placement_group_name("0"), num_cpus=3, num_gpus=1, detached=True) llm_engine = MockEngine.from_engine_args(engine_args=engine_args, request_output_queue_type=QueueType.RAYQUEUE, instance_id="0", migration_config=None, placement_group=placement_group) assert llm_engine.executor_class == LlumnixRayGPUExecutor @@ -98,7 +98,7 @@ def test_llm_engine_from_engine_args(ray_env): def test_llm_engine_from_engine_args_sim(ray_env): latency_data = LatencyMemData({},{},{}) engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - placement_group = initialize_placement_group(instance_id="0", num_cpus=2, num_gpus=1, detached=True) + placement_group = initialize_placement_group(get_placement_group_name("0"), num_cpus=2, num_gpus=1, detached=True) llm_engine = MockEngine.from_engine_args(engine_args=engine_args, request_output_queue_type=QueueType.RAYQUEUE, instance_id="0", migration_config=None, latency_mem=latency_data, placement_group=placement_group) @@ -106,7 +106,7 @@ def test_llm_engine_from_engine_args_sim(ray_env): def test_llm_engine_add_requset(ray_env): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - placement_group = initialize_placement_group(instance_id="0", num_cpus=3, num_gpus=1, detached=True) + placement_group = initialize_placement_group(get_placement_group_name("0"), num_cpus=3, num_gpus=1, detached=True) llm_engine = LLMEngineLlumnix.from_engine_args(engine_args=engine_args, request_output_queue_type=QueueType.RAYQUEUE, instance_id="0", diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 0251d7a9..5554157a 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -28,7 +28,7 @@ from llumnix.internal_config import MigrationConfig from llumnix.llumlet.request import RequestInferenceType, RequestStatus from llumnix.queue.queue_type import QueueType -from llumnix.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group, get_placement_group_name from tests.unit_test.queue.utils import request_output_queue_server # pylint: disable=unused-import @@ -46,7 +46,7 @@ ] def init_llumlet(request_output_queue_type, instance_id, migration_config, engine_args): - placement_group = initialize_placement_group(instance_id=instance_id, num_cpus=3, num_gpus=1, detached=True) + placement_group = initialize_placement_group(get_placement_group_name(instance_id), num_cpus=3, num_gpus=1, detached=True) llumlet = Llumlet.from_args( 
instance_id=instance_id, placement_group=placement_group, @@ -69,7 +69,7 @@ def __init__(self): class MockLlumletDoNotSchedule(Llumlet): def __init__(self, *args, **kwargs): instance_id = kwargs["instance_id"] - placement_group = initialize_placement_group(instance_id=instance_id, num_cpus=3, num_gpus=1, detached=True) + placement_group = initialize_placement_group(get_placement_group_name(instance_id), num_cpus=3, num_gpus=1, detached=True) kwargs["placement_group"] = placement_group super().__init__(*args, **kwargs) # stop the schedule in engine step loop diff --git a/tests/unit_test/backends/vllm/test_migration_backend.py b/tests/unit_test/backends/vllm/test_migration_backend.py index 843c2a23..f6b1d50d 100644 --- a/tests/unit_test/backends/vllm/test_migration_backend.py +++ b/tests/unit_test/backends/vllm/test_migration_backend.py @@ -20,7 +20,7 @@ from llumnix.backends.vllm.worker import MigrationWorker from llumnix.arg_utils import ManagerArgs -from llumnix.utils import random_uuid, initialize_placement_group +from llumnix.utils import random_uuid, initialize_placement_group, get_placement_group_name # pylint: disable=unused-import from tests.conftest import ray_env @@ -58,7 +58,7 @@ def test_migrate_cache(ray_env, backend): ray.get(worker1.execute_method.remote('initialize_cache', num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=0)) worker0_id = random_uuid() - placement_group0 = initialize_placement_group(instance_id=worker0_id, num_cpus=1, num_gpus=1, detached=True) + placement_group0 = initialize_placement_group(get_placement_group_name(worker0_id), num_cpus=1, num_gpus=1, detached=True) ray.get(worker0.execute_method.remote( 'init_migration', instance_id=worker0_id, @@ -67,7 +67,7 @@ def test_migrate_cache(ray_env, backend): placement_group=placement_group0)) worker1_id = random_uuid() - placement_group1 = initialize_placement_group(instance_id=worker1_id, num_cpus=1, num_gpus=1, detached=True) + placement_group1 = initialize_placement_group(get_placement_group_name(worker1_id), num_cpus=1, num_gpus=1, detached=True) ray.get(worker1.execute_method.remote( 'init_migration', instance_id=worker1_id, diff --git a/tests/unit_test/backends/vllm/test_simulator.py b/tests/unit_test/backends/vllm/test_simulator.py index 4d11f1a4..f71c5a95 100644 --- a/tests/unit_test/backends/vllm/test_simulator.py +++ b/tests/unit_test/backends/vllm/test_simulator.py @@ -12,7 +12,7 @@ from llumnix.backends.profiling import LatencyMemData from llumnix.internal_config import MigrationConfig from llumnix.queue.queue_type import QueueType -from llumnix.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group, get_placement_group_name # pylint: disable=unused-import from tests.conftest import ray_env @@ -86,7 +86,7 @@ def __init__(self): namespace='llumnix', max_concurrency=4)(DummyActor) dummy_actor = dummy_actor.remote() - placement_group = initialize_placement_group("0", num_cpus=2, num_gpus=0, detached=True) + placement_group = initialize_placement_group(get_placement_group_name("0"), num_cpus=2, num_gpus=0, detached=True) sim_backend = MockBackendSim(instance_id="0", request_output_queue_type=request_output_queue_type, migration_config=migration_config, diff --git a/tests/unit_test/backends/vllm/test_worker.py b/tests/unit_test/backends/vllm/test_worker.py index ef5f15f0..15e8e6d6 100644 --- a/tests/unit_test/backends/vllm/test_worker.py +++ b/tests/unit_test/backends/vllm/test_worker.py @@ -23,7 +23,7 @@ from llumnix.arg_utils import ManagerArgs from llumnix.utils 
import random_uuid -from llumnix.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group, get_placement_group_name # pylint: disable=unused-import from tests.conftest import ray_env @@ -86,7 +86,7 @@ def test_rebuild_migration_backend(ray_env, backend): worker0 = create_worker(rank=0, local_rank=0, engine_config=engine_config) worker0_id = random_uuid() - placement_group0 = initialize_placement_group(instance_id=worker0_id, num_cpus=1, num_gpus=1, detached=True) + placement_group0 = initialize_placement_group(get_placement_group_name(worker0_id), num_cpus=1, num_gpus=1, detached=True) ray.get(worker0.execute_method.remote('init_device')) ray.get(worker0.execute_method.remote('initialize_cache', num_gpu_blocks=8, num_cpu_blocks=0)) ray.get(worker0.execute_method.remote( @@ -102,7 +102,7 @@ def test_rebuild_migration_backend(ray_env, backend): worker1 = create_worker(rank=0, local_rank=0, engine_config=engine_config) worker1_id = random_uuid() - placement_group1 = initialize_placement_group(instance_id=worker1_id, num_cpus=1, num_gpus=1, detached=True) + placement_group1 = initialize_placement_group(get_placement_group_name(worker1_id), num_cpus=1, num_gpus=1, detached=True) ray.get(worker1.execute_method.remote('init_device')) ray.get(worker1.execute_method.remote('initialize_cache', num_gpu_blocks=8, num_cpu_blocks=0)) ray.get(worker1.execute_method.remote( diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 975c449a..f4e903c9 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -319,7 +319,8 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): manager, _, _, engine_args, _ = init_manager_with_deployment_mode(DeploymentMode.LOCAL) instance_id = random_uuid() - pg = ray.get(manager._init_placement_group.remote(instance_id, engine_args, BackendType.VLLM, init_server=True)) + pg = ray.get(manager._init_placement_group.remote(get_placement_group_name(instance_id), + engine_args, BackendType.VLLM, init_server=True)) pg = ray.util.get_placement_group(get_placement_group_name(instance_id)) ray.get(pg.ready()) ray.get(manager._init_server_and_instance.remote(instance_id, pg)) diff --git a/tests/unit_test/llumlet/test_engine_step_exception.py b/tests/unit_test/llumlet/test_engine_step_exception.py index ccb45f1e..a3165556 100644 --- a/tests/unit_test/llumlet/test_engine_step_exception.py +++ b/tests/unit_test/llumlet/test_engine_step_exception.py @@ -25,7 +25,7 @@ from llumnix.llumlet.llumlet import Llumlet from llumnix.internal_config import MigrationConfig from llumnix.queue.queue_type import QueueType -from llumnix.utils import initialize_placement_group +from llumnix.utils import initialize_placement_group, get_placement_group_name # pylint: disable=unused-import from tests.conftest import ray_env @@ -34,7 +34,7 @@ class MockLlumlet(Llumlet): def __init__(self, *args, **kwargs) -> None: instance_id = kwargs["instance_id"] - placement_group = initialize_placement_group(instance_id=instance_id, num_cpus=3, num_gpus=1, detached=True) + placement_group = initialize_placement_group(get_placement_group_name(instance_id), num_cpus=3, num_gpus=1, detached=True) kwargs["placement_group"] = placement_group super().__init__(*args, **kwargs) self.origin_step = self.backend_engine.engine.step_async @@ -59,7 +59,11 @@ def 
test_engine_step_exception(ray_env): migration_config = MigrationConfig("SR", "rayrpc", 16, 1, 4, 5, 20) scheduling_strategy = NodeAffinitySchedulingStrategy(node_id=ray.get_runtime_context().get_node_id(), soft=False) - origin_free_memory, _ = torch.cuda.mem_get_info() + device_count = torch.cuda.device_count() + origin_free_memory_list = [] + for device_id in range(device_count): + origin_free_memory, _ = torch.cuda.mem_get_info(device_id) + origin_free_memory_list.append(origin_free_memory) actor_name = "instance_0" llumlet = MockLlumlet.options(name=actor_name, namespace='llumnix', @@ -76,9 +80,6 @@ def test_engine_step_exception(ray_env): all_actor_names = [actor["name"] for actor in all_actors] assert actor_name in all_actor_names - cur_free_memory, _ = torch.cuda.mem_get_info() - assert cur_free_memory < origin_free_memory - ray.get(llumlet.set_error_step.remote(True)) time.sleep(3) @@ -86,5 +87,9 @@ def test_engine_step_exception(ray_env): all_actor_names = [actor["name"] for actor in all_actors] assert actor_name not in all_actor_names - cur_free_memory, _ = torch.cuda.mem_get_info() - assert origin_free_memory == cur_free_memory + cur_free_memory_list = [] + for device_id in range(device_count): + cur_free_memory, _ = torch.cuda.mem_get_info(device_id) + cur_free_memory_list.append(cur_free_memory) + + assert origin_free_memory_list == cur_free_memory_list From 8b4f2578ccd9f831fc850139ed476bf1a5d172d1 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 09:06:49 +0000 Subject: [PATCH 61/92] Fix test_engine_step_exception --- tests/unit_test/llumlet/test_engine_step_exception.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/unit_test/llumlet/test_engine_step_exception.py b/tests/unit_test/llumlet/test_engine_step_exception.py index a3165556..05981111 100644 --- a/tests/unit_test/llumlet/test_engine_step_exception.py +++ b/tests/unit_test/llumlet/test_engine_step_exception.py @@ -80,6 +80,12 @@ def test_engine_step_exception(ray_env): all_actor_names = [actor["name"] for actor in all_actors] assert actor_name in all_actor_names + cur_free_memory_list = [] + for device_id in range(device_count): + cur_free_memory, _ = torch.cuda.mem_get_info(device_id) + cur_free_memory_list.append(cur_free_memory) + assert origin_free_memory_list != cur_free_memory_list + ray.get(llumlet.set_error_step.remote(True)) time.sleep(3) @@ -91,5 +97,4 @@ def test_engine_step_exception(ray_env): for device_id in range(device_count): cur_free_memory, _ = torch.cuda.mem_get_info(device_id) cur_free_memory_list.append(cur_free_memory) - assert origin_free_memory_list == cur_free_memory_list From de82d8ff2d5e2c3bb7278dc8d1a732c56fe34c24 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 09:22:37 +0000 Subject: [PATCH 62/92] Fix _check_deployment_states_loop --- llumnix/manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index 68738933..5cf3f786 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -626,11 +626,12 @@ async def done_scale_up(): async def _check_deployment_states_loop(self, interval: float) -> None: async def watch_deployment(instance_id: str): - logger.warning("[_check_deployment_states_loop] watch instance {} deployment".format(instance_id)) await asyncio.sleep(WATCH_DEPLOYMENT_INTERVAL) curr_pgs, curr_servers, curr_instances = self.get_curr_deployment() if instance_id in curr_pgs and (instance_id not in curr_servers or instance_id not in curr_instances): - 
logger.warning("[_check_deployment_states_loop] instance {} deployment states incorrect".format(instance_id)) + logger.warning("[_check_deployment_states_loop] instance {} deployment states incorrect, " + "states: (pg {}, server {}, instance {})" + .format(instance_id, instance_id in curr_pgs, instance_id in curr_servers, instance_id in curr_instances)) self.scale_down(instance_id) while True: From fdb54850a1a9dc94cf723a934831ba9df64367ea Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 Jan 2025 11:00:07 +0000 Subject: [PATCH 63/92] Fix test_engine_step_exception --- tests/unit_test/backends/vllm/test_migration.py | 5 +---- tests/unit_test/llumlet/test_engine_step_exception.py | 8 +++----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 5554157a..8892c8df 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -16,7 +16,6 @@ from unittest.mock import MagicMock import pytest import ray -from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from vllm import EngineArgs, SamplingParams from vllm.utils import random_uuid @@ -113,15 +112,13 @@ async def test_migration_correctness(ray_env, migration_backend, migration_reque request_output_queue_type = QueueType.RAYQUEUE que, server_info = request_output_queue_server(request_output_queue_type) asyncio.create_task(que.run_server_loop()) - scheduling_strategy = NodeAffinitySchedulingStrategy(node_id=ray.get_runtime_context().get_node_id(), soft=False) llumlet_0 = init_llumlet(request_output_queue_type, "0", migration_config, engine_args) llumlet_1 = init_llumlet(request_output_queue_type, "1", migration_config, engine_args) llumlet_2: Llumlet = MockLlumletDoNotSchedule.options( name='instance_2', - namespace='llumnix', - scheduling_strategy=scheduling_strategy).remote( + namespace='llumnix').remote( instance_id="2", request_output_queue_type=request_output_queue_type, backend_type=BackendType.VLLM, diff --git a/tests/unit_test/llumlet/test_engine_step_exception.py b/tests/unit_test/llumlet/test_engine_step_exception.py index 05981111..a15dc52f 100644 --- a/tests/unit_test/llumlet/test_engine_step_exception.py +++ b/tests/unit_test/llumlet/test_engine_step_exception.py @@ -17,8 +17,6 @@ import torch import pytest -from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy - from vllm.engine.arg_utils import EngineArgs from llumnix.backends.backend_interface import BackendType @@ -57,7 +55,8 @@ async def raise_error_step(): def test_engine_step_exception(ray_env): engine_args = EngineArgs(model="facebook/opt-125m", max_model_len=8, worker_use_ray=True) migration_config = MigrationConfig("SR", "rayrpc", 16, 1, 4, 5, 20) - scheduling_strategy = NodeAffinitySchedulingStrategy(node_id=ray.get_runtime_context().get_node_id(), soft=False) + + time.sleep(5.0) device_count = torch.cuda.device_count() origin_free_memory_list = [] @@ -66,8 +65,7 @@ def test_engine_step_exception(ray_env): origin_free_memory_list.append(origin_free_memory) actor_name = "instance_0" - llumlet = MockLlumlet.options(name=actor_name, namespace='llumnix', - scheduling_strategy=scheduling_strategy).remote( + llumlet = MockLlumlet.options(name=actor_name, namespace='llumnix').remote( instance_id="0", request_output_queue_type=QueueType.RAYQUEUE, backend_type=BackendType.VLLM, From e92b6b8d6d29836868a04fa8d36289b7f66d7220 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Tue, 7 
Jan 2025 11:31:39 +0000 Subject: [PATCH 64/92] Minors --- llumnix/llumlet/migration_coordinator.py | 3 ++- llumnix/manager.py | 2 ++ llumnix/utils.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/llumnix/llumlet/migration_coordinator.py b/llumnix/llumlet/migration_coordinator.py index bc356f48..dfebf828 100644 --- a/llumnix/llumlet/migration_coordinator.py +++ b/llumnix/llumlet/migration_coordinator.py @@ -89,7 +89,8 @@ async def _migrate_out_multistage(self, migrate_out_request: LlumnixRequest) -> "MigrationStatus": """Migrate out requests to a specified instance, return migrated request id. Args: - migrate_in_ray_actor: instance actor name, used to get ray actor handle + migrate_in_ray_actor: instance actor name, used to get ray actor handle. + migrate_out_request: request to migrate out. """ try: stage_count = 0 diff --git a/llumnix/manager.py b/llumnix/manager.py index 5cf3f786..d5bba99d 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -57,6 +57,8 @@ # TODO(s5u13b): Handle exception of ray operations. # TODO(s5u13b): Update the documents of global deployment. # TODO(s5u13b): Add exception handling wrapper. +# TODO(s5u13b): Update Arguments.md. +# TODO(s5u13b): Reorganize constant variables. class Manager: diff --git a/llumnix/utils.py b/llumnix/utils.py index a4d972fa..07b4a003 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -41,6 +41,7 @@ def initialize_placement_group( num_cpus: The number of cpus in placement group. num_cpus: The number of cpus in placement group. detached: Whether the lifetime of the placement group being detached. + block: If True, the function will block until the placement group is ready. Returns: `placement_group`. `placement_group` includes the specification From a738d7c7540ccaf7d1094e46c5bbff02aee37712 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 8 Jan 2025 03:08:34 +0000 Subject: [PATCH 65/92] Refine arguments --- docs/Arguments.md | 131 +++++++++++++++++-------- docs/Quickstart.md | 16 ++- llumnix/arg_utils.py | 57 +++++------ llumnix/config/default.py | 47 +++++---- llumnix/entrypoints/vllm/api_server.py | 2 +- llumnix/manager.py | 18 ++-- 6 files changed, 168 insertions(+), 103 deletions(-) diff --git a/docs/Arguments.md b/docs/Arguments.md index 916755cf..4374f3b2 100644 --- a/docs/Arguments.md +++ b/docs/Arguments.md @@ -6,17 +6,28 @@ Note: since Llumnix is still in alpha stage, the interface and arguments are *su ``` usage: -m llumnix.entrypoints.vllm.api_server [-h] + [--host HOST] + [--port PORT] + [--ssl-keyfile SSL_KEYFILE] + [--ssl-certfile SSL_CERTFILE] + [--log-level {debug,info,warning,error}] + [--launch-ray-cluster] + [--ray-cluster-port RAY_CLUSTER_PORT] + [--request-output-queue-type {rayqueue,zmq}] + [--request-output-queue-port REQUEST_OUTPUT_QUEUE_PORT] + [--disable-log-requests-server] + [--log-request-timestamps] [--config-file CONFIG_FILE] [--initial-instances INITIAL_INSTANCES] [--load-metric {remaining_steps,usage_ratio}] [--polling-interval POLLING_INTERVAL] [--dispatch-policy {balanced,load,queue,rr}] [--enable-migration] + [--enable-defrag] [--pair-migration-frequency PAIR_MIGRATION_FREQUENCY] [--pair-migration-policy {balanced,defrag_constrained,defrag_relaxed}] [--migrate-out-threshold MIGRATE_OUT_THRESHOLD] [--request-migration-policy {LCR,SR,LR,FCW,FCWSR}] - [--enable-defrag ENABLE_DEFRAG] [--enable-scaling] [--min-instances MIN_INSTANCES] [--max-instances MAX_INSTANCES] [--scaling-interval SCALING_INTERVAL] [--scaling-policy {max_load,avg_load}] [--scale-up-threshold SCALE_UP_THRESHOLD] [--scale-down-threshold SCALE_DOWN_THRESHOLD] [--disable-log-requests-manager] [--log-instance-info] [--log-filename LOG_FILENAME]
 [--profiling-result-file-path PROFILING_RESULT_FILE_PATH]
 [--gpu-type GPU_TYPE]
- [--polling-interval POLLING_INTERVAL]
 [--migration-backend {gloo,nccl,rayrpc,grpc,kvtransfer}]
 [--migration-buffer-blocks MIGRATION_BUFFER_BLOCKS]
- [--migration-backend-transfer-type {cuda_ipc,rdma,}]
- [--migration-backend-kvtransfer-naming-url MIGRATION_BACKEND_KVTRANSFER_NAMING_URL]
- [--migration-backend-server-address MIGRATION_BACKEND_SERVER_ADDRESS]
- [--migration-backend-init-timeout MIGRATION_BACKEND_INIT_TIMEOUT]
 [--migration-num-layers MIGRATION_NUM_LAYERS]
- [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
+ [--migration-backend-init-timeout MIGRATION_BACKEND_INIT_TIMEOUT]
+ [--migration-backend-transfer-type {cuda_ipc,rdma,}]
+ [--grpc-migration-backend-server-address GRPC_MIGRATION_BACKEND_SERVER_ADDRESS]
+ [--kvtransfer-migration-backend-naming-url KVTRANSFER_MIGRATION_BACKEND_NAMING_URL]
 [--max-stages MAX_STAGES]
+ [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
 [--enable-pd-disagg]
 [--num-dispatch-instances NUM_DISPATCH_INSTANCES]
- [--log-request-timestamps]
-
+ [--enable-port-increment]
```

+`--host`
+- Hostname of the server.
+- Default: "localhost"
+
+`--port`
+- Port number of the server.
+- Default: 8000
+
+`--ssl-keyfile`
+- Path to SSL key file.
+- Default: None
+
+`--ssl-certfile`
+- Path to SSL certificate file.
+- Default: None
+
+`--log-level`
+- Log level for the server.
+- Possible choices: debug, info, warning, error
+- Default: "info"
+
+`--launch-ray-cluster`
+- Whether to launch a Ray cluster.
+
+`--ray-cluster-port`
+- Ray cluster port.
+- Default: 6379
+
+`--request-output-queue-type`
+- Queue type for request output queue.
+- Possible choices: rayqueue, zmq
+- Default: "rayqueue"
+
+`--request-output-queue-port`
+- Port number for the zmq request output queue.
+- Default: 1234
+
+`--disable-log-requests-server`
+- Disable logging requests in server.
+
+`--log-request-timestamps`
+- Whether to log request timestamps.
+
 `--config-file`
-- Path to config file.
+- Path to config file of arguments.
+- Default: None

 `--initial-instances`
 - Number of instances created at initialization.
@@ -69,6 +122,9 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 `--enable-migration`
 - Enable migrating requests between instances.

+`--enable-defrag`
+- Enable defragmentation through migration based on virtual usage.
+
 `--pair-migration-frequency`
 - Pair migration frequency.
 - Default: 1
@@ -87,10 +143,6 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Possible choices: LCR, SR, LR, FCW, FCWSR
 - Default: "SR"

-`--enable-defrag`
-- Enable defragmentation through migration based on virtual usage.
-- Default: False
-
 `--enable-scaling`
 - Enable auto scaling.

@@ -130,59 +182,56 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 - Default: "server.log"

 `--profiling-result-file-path`
-- Profiling result file path.
-- Default: ""
-
-`--gpu-type`
-- GPU type specified when using simulator.
-- Default: "a10"
+- Profiling result file path when using simulator.
+- Default: None

 `--migration-backend`
 - Communication backend of migration.
 - Possible choices: gloo, rayrpc, nccl, grpc, kvtransfer. [gloo, rayrpc, nccl] are available for vllm and [grpc, kvtransfer] are available for bladellm.
 - Default: "gloo"

-`--migration-backend-transfer-type`
-- Transfer type for migration backend kvTransfer.
-- Possible choices: cuda_ipc, rdma
-- Default: "rdma"
-
-`--migration-backend-server-address`
-- Address of grpc server for migration backend
-- Default: "127.0.0.1:50051"
-
-`--migration-backend-kvtransfer-naming-url`
-- URL of naming server for kvtransfer migration backend
-- Default: "file:/tmp/llumnix/naming/"
-
 `--migration-buffer-blocks`
 - Number of buffer blocks in migration.
 - Default: 512

+`--migration-num-layers`
+- Number of kv-cache layers to transfer in each round during migration.
+- Default: 1
+
 `--migration-backend-init-timeout`
 - Timeout(s) for initializing migration backend.
 - Default: 10.0

+`--migration-backend-transfer-type`
+- Transfer type for migration backend grpc and kvTransfer.
+- Possible choices: cuda_ipc, rdma
+- Default: "rdma"
+
+`--grpc-migration-backend-server-address`
+- Address of grpc server for migration backend.
+- Default: "127.0.0.1:50051"
+
+`--kvtransfer-migration-backend-naming-url`
+- URL of naming server for kvtransfer migration backend.
+- Default: "file:/tmp/llumnix/naming/"

 `--max-stages`
 - Drop migration if the number of stages > max_stages.
 - Default: 3

-`--log-request-timestamps`
-- Enable logging request timestamps.
+`--last-stage-max-blocks`
+- If the number of remaining blocks < last_stage_max_blocks, do last stage migration.
+- Default: 16

 `--enable-pd-disagg`
 - Enable prefill decoding disaggregation.

 `--num-dispatch-instances`
 - Number of available instances for dispatch.
+- Default: math.inf
+
+`--enable-port-increment`
+- Enable port increment when deploying multiple servers.

# Unsupported vLLM feature options

diff --git a/docs/Quickstart.md b/docs/Quickstart.md
index 6081c537..c01e2299 100644
--- a/docs/Quickstart.md
+++ b/docs/Quickstart.md
@@ -34,7 +34,7 @@ After installation, you can follow this guide to use Llumnix for multi-instance

 ## Migrating from Existing Deployments

-Inference engines like vLLM provide an API server user interface, e.g., `python -m vllm.entrypoints.api_server`. To deploy multiple instances, people start multiple such API servers, each corresponding to one instance, on multiple nodes / containers / k8s pods.
+Inference engines like vLLM provide an API server user interface, e.g., `python -m entrypoints.vllm.api_server`. To deploy multiple instances, people start multiple such API servers, each corresponding to one instance, on multiple nodes / containers / k8s pods.

 Llumnix provides a similar user interface to enable seamless integration with such existing multi-instance deployments. You only need two simple steps to migrate from a deployed vLLM service to Llumnix:
@@ -67,6 +67,20 @@ During the execution of serving deployment, Llumnix will:

 Following these steps, Llumnix acts as the request scheduling layer situated behind the multiple frontend API servers and above the multiple backend vLLM engine instances. This positioning allows Llumnix to significantly enhance serving performance through its dynamic, fine-grained, and KV-cache-aware request scheduling and rescheduling across instances.

+## Global Deployment
+
+Llumnix also supports deploying multiple servers and instances at once by running `python -m vllm.entrypoints.serve`, which is named as global deployment.
+
+```
+python -m llumnix.entrypoints.vllm.serve \
+    --config-file $CONFIG_PATH \
+    # vLLM arguments ...
    # Llumnix arguments ...
    ...
```

Global deployment assumes that the user has already launched a Ray cluster. Upon running the serve module, Llumnix will automatically connect to the existing Ray cluster, start the Llumnix components, and deploy multiple servers and instances to the Ray cluster until there are no more available GPUs or CPUs.

## Ray Cluster Notice
When you include the --launch-ray-cluster option in Llumnix's serving deployment command, Llumnix automatically builds a Ray cluster during the execution of serving deployment. This action will overwrite any existing Ray cluster. If this behavior is not desired, simply omit the --launch-ray-cluster option, and Llumnix will initiate its actor components within the current Ray cluster.

diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
index d692880a..198ef2e8 100644
--- a/llumnix/arg_utils.py
+++ b/llumnix/arg_utils.py
@@ -85,17 +85,17 @@ def check_args(cls, args: 'EntrypointsArgs', parser: argparse.ArgumentParser):
 def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
 parser.add_argument('--launch-ray-cluster',
 action='store_true',
- help='if launch ray cluster in server')
+ help='if launch ray cluster')
 parser.add_argument("--ray-cluster-port",
 type=int,
 help='ray cluster port')
 parser.add_argument("--request-output-queue-type",
 type=str,
 choices=['rayqueue', 'zmq'],
- help='request output queue type for request output queue')
+ help='queue type for request output queue')
 parser.add_argument("--request-output-queue-port",
 type=int,
- help='port for zmq')
+ help='port number for the zmq request output queue')
 parser.add_argument('--disable-log-requests-server',
 action='store_true',
 help='disable logging requests in server')
@@ -104,7 +104,7 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
 help='if log request timestamps')
 parser.add_argument("--config-file",
 type=str,
- help="path to config file")
+ help="path to config file of arguments")
 return parser
@@ -116,7 +116,6 @@ class ManagerArgs:
 polling_interval: float = None
 dispatch_policy: str = None

- num_dispatch_instances: int = None
 enable_migration: bool = None
 enable_defrag: bool = None
@@ -133,22 +132,23 @@ class ManagerArgs:
 scale_up_threshold: float = None
 scale_down_threshold: float = None

- log_filename: str = None
 disable_log_requests_manager: bool = None
 log_instance_info: bool = None
+ log_filename: str = None
 profiling_result_file_path: str = None

- migration_backend_kvtransfer_naming_url: str = None
- migration_backend_server_address: str = None
- migration_backend_init_timeout: float = None
 migration_backend: str = None
 migration_buffer_blocks: int = None
- migration_backend_transfer_type: str = None
 migration_num_layers: int = None
+ migration_backend_init_timeout: float = None
+ migration_backend_transfer_type: str = None
+ grpc_migration_backend_server_address: str = None
+ kvtransfer_migration_backend_naming_url: str = None
 last_stage_max_blocks: int = None
 max_stages: int = None

 enable_pd_disagg: bool = None
+ num_dispatch_instances: int = None
 enable_port_increment: bool = None
@@ -240,13 +240,13 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
 '* "queue" dispatch request to the instance with minimum waiting request queue length.\n'
 '* "flood" dispatch request to the instance with maximum requests dispatched.\n'
 '* "rr" dispatch requests with round-robin policy.\n')
- parser.add_argument('--num-available-dispatch-instances',
- type=int,
- help='number of available
instances for dispatching')
 parser.add_argument('--enable-migration',
 action='store_true',
 help='enable migrating requests between instances')
+ parser.add_argument('--enable-defrag',
+ type=bool,
+ help='enable defragmentation through migration based on virtual usage')
 parser.add_argument('--pair-migration-frequency',
 type=int,
 help='pair migration frequency')
@@ -272,9 +272,6 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
 '* "LR" migrate the running request longest.\n'
 '* "FCW" migrate the waiting request first come.\n'
 '* "FCWSR" migrate the waiting request first come and running request shortest.\n')
- parser.add_argument('--enable-defrag',
- type=bool,
- help='enable defragmentation through migration based on virtual usage')
 parser.add_argument('--enable-scaling',
 action='store_true',
@@ -310,37 +307,37 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
 help='log filename')
 parser.add_argument('--profiling-result-file-path',
 type=str,
- help='profiling result file path')
+ help='profiling result file path when using simulator')
 parser.add_argument('--migration-backend',
 type=str,
 choices=['gloo','nccl','rayrpc','grpc','kvtransfer'],
 help='communication backend of migration, [gloo, rayrpc, nccl] are available for vllm \
 and [grpc, kvtransfer] are available for bladellm')
+ parser.add_argument('--migration-buffer-blocks',
+ type=int,
+ help='number of buffer blocks in migration')
+ parser.add_argument('--migration-num-layers',
+ type=int,
+ help='number of kv-cache layers to transfer in each round during migration')
+ parser.add_argument('--migration-backend-init-timeout',
+ type=float,
+ help='timeout(s) for initializing migration backend')
 parser.add_argument('--migration-backend-transfer-type',
 type=str,
 choices=['cuda_ipc','rdma', ''],
 help='transfer type for migration backend grpc and kvTransfer')
- parser.add_argument('--grpc-migration-backend-address',
+ parser.add_argument('--grpc-migration-backend-server-address',
 type=str,
 help='address of grpc server for migration backend')
- parser.add_argument('--migration-backend-kvtransfer-naming-url',
+ parser.add_argument('--kvtransfer-migration-backend-naming-url',
 type=str,
 help='url of naming server for kvtransfer migration backend')
- parser.add_argument('--migration-backend-init-timeout',
- type=float,
- help='timeout(s) for initializing migration backend')
- parser.add_argument('--migration-buffer-blocks',
- type=int,
- help='number of buffer blocks in migration')
- parser.add_argument('--migration-num-layers',
+ parser.add_argument('--max-stages',
 type=int,
- help='number of kv-cache layers to transfer in each round during migration')
+ help='drop migration if the number of stages > max_stages')
 parser.add_argument('--last-stage-max-blocks',
 type=int,
 help='if the number of remaining blocks < last_stage_max_blocks, do last stage migration')
- parser.add_argument('--max-stages',
- type=int,
- help='drop migration if the number of stages > max_stages')
 parser.add_argument('--enable-pd-disagg',
 action='store_true',
diff --git a/llumnix/config/default.py b/llumnix/config/default.py
index 23cc1fb1..ca1fdb68 100644
--- a/llumnix/config/default.py
+++ b/llumnix/config/default.py
@@ -28,21 +28,21 @@
 _C.SERVER.HOST = "localhost"
 # Port number for the server
 _C.SERVER.PORT = 8000
-# Path to SSL key file for secure connections
+# Path to SSL key file
 _C.SERVER.SSL_KEYFILE = None
-# Path to SSL certificate file for secure connections
+# Path to SSL certificate file
_C.SERVER.SSL_CERTFILE = None # Log level for the server -_C.SERVER.LOG_LEVEL = "debug" +_C.SERVER.LOG_LEVEL = "info" # Queue type for request output queue _C.SERVER.REQUEST_OUTPUT_QUEUE_TYPE = "rayqueue" -# Port number for the request output queue +# Port number for the zmq request output queue _C.SERVER.REQUEST_OUTPUT_QUEUE_PORT = 1234 # Disable logging requests in server _C.SERVER.DISABLE_LOG_REQUESTS_SERVER = False # Enable logging request timestamp _C.SERVER.LOG_REQUEST_TIMESTAMPS = False -# Config file of Llumnix arguments +# Path to config file of arguments _C.SERVER.CONFIG_FILE = None # ----------------------------------------------------------------------------- @@ -57,22 +57,21 @@ # MANAGER CONFIGURATION # ----------------------------------------------------------------------------- _C.MANAGER = LC() +# Number of instances created at initialization +_C.MANAGER.INITIAL_INSTANCES = 1 +# Time interval(s) to update instance info and pair migration +_C.MANAGER.POLLING_INTERVAL = 0.05 # Disable logging requests in manager _C.MANAGER.DISABLE_LOG_REQUESTS_MANAGER = False # Enable logging instance info _C.MANAGER.LOG_INSTANCE_INFO = False # Log filename _C.MANAGER.LOG_FILENAME = "server.log" -# Profiling result file path +# Profiling result file path when using simulator _C.MANAGER.PROFILING_RESULT_FILE_PATH = None # Enable port increment when deploying multiple servers _C.MANAGER.ENABLE_PORT_INCREMENT = False -# Number of instances created at initialization -_C.MANAGER.INITIAL_INSTANCES = 1 -# Time interval(s) to update instance info and pair migration -_C.MANAGER.POLLING_INTERVAL = 0.05 - # ----------------------------------------------------------------------------- # DISPATCH CONFIGURATION # ----------------------------------------------------------------------------- @@ -80,14 +79,14 @@ _C.MANAGER.LOAD_METRIC = 'remaining_steps' # Request dispatch policy _C.MANAGER.DISPATCH_POLICY = 'load' -# Number of available dispatch instances. 
math.inf indicates that all instances can be used for dispatching -_C.MANAGER.NUM_DISPATCH_INSTANCES = math.inf # ----------------------------------------------------------------------------- # MIGRATION CONFIGURATION # ----------------------------------------------------------------------------- # Enable migrate requests between instances _C.MANAGER.ENABLE_MIGRATION = False +# Enable defragmentation through migration based on virtual usage +_C.MANAGER.ENABLE_DEFRAG = False # Pair migration frequency _C.MANAGER.PAIR_MIGRATION_FREQUENCY = 1 # Pair migration policy @@ -96,8 +95,6 @@ _C.MANAGER.MIGRATE_OUT_THRESHOLD = 3.0 # Request migration policy _C.MANAGER.REQUEST_MIGRATION_POLICY = 'SR' -# Enable defragmentation through migration based on virtual usage -_C.MANAGER.ENABLE_DEFRAG = False # Drop migration if the number of stages > max_stages _C.MANAGER.MAX_STAGES = 3 # If the number of remain blocks < last_stage_max_blocks, do last stage migration @@ -105,23 +102,23 @@ # Communication backend of migration _C.MANAGER.MIGRATION_BACKEND = "gloo" -# Transfer type for migration backend kvTransfer -_C.MANAGER.MIGRATION_BACKEND_TRANSFER_TYPE = "rdma" -# Address of grpc server for migration backend -_C.MANAGER.MIGRATION_BACKEND_SERVER_ADDRESS = "127.0.0.1:50051" -# URL of naming server for kvtransfer migration backend -_C.MANAGER.MIGRATION_BACKEND_KVTRANSFER_NAMING_URL = "file:/tmp/llumnix/naming/" -# Timeout(s) for initializing migration backend -_C.MANAGER.MIGRATION_BACKEND_INIT_TIMEOUT = 10.0 # Number of cache blocks in migration _C.MANAGER.MIGRATION_BUFFER_BLOCKS = 512 # Number of kv-cache layers to transfer in each round during migration _C.MANAGER.MIGRATION_NUM_LAYERS = 1 +# Timeout(s) for initializing migration backend +_C.MANAGER.MIGRATION_BACKEND_INIT_TIMEOUT = 10.0 +# Transfer type for migration backend kvTransfer +_C.MANAGER.MIGRATION_BACKEND_TRANSFER_TYPE = "rdma" +# Address of grpc server for migration backend +_C.MANAGER.GRPC_MIGRATION_BACKEND_SERVER_ADDRESS = "127.0.0.1:50051" +# URL of naming server for kvtransfer migration backend +_C.MANAGER.KVTRANSFER_MIGRATION_BACKEND_NAMING_URL = "file:/tmp/llumnix/naming/" # ----------------------------------------------------------------------------- # SCALING CONFIGURATION # ----------------------------------------------------------------------------- -# Enable scaling instances based on load +# Enable auto scaling _C.MANAGER.ENABLE_SCALING = False # Minimum number of instances _C.MANAGER.MIN_INSTANCES = 1 @@ -141,3 +138,5 @@ # ----------------------------------------------------------------------------- # Enable prefill decoding disaggregation _C.MANAGER.ENABLE_PD_DISAGG = False +# Number of available instances for dispatch. 
math.inf indicates that all instances can be used for dispatching +_C.MANAGER.NUM_DISPATCH_INSTANCES = math.inf diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py index a0ccd26d..48ba86f0 100644 --- a/llumnix/entrypoints/vllm/api_server.py +++ b/llumnix/entrypoints/vllm/api_server.py @@ -177,7 +177,7 @@ async def is_ready() -> bool: parser.add_argument("--port", type=int) parser.add_argument("--ssl-keyfile", type=str) parser.add_argument("--ssl-certfile", type=str) - parser.add_argument("--log-level", type=str) + parser.add_argument("--log-level", type=str, choices=["debug", "info", "warning", "error"]) cli_args = add_cli_args(parser) cfg = get_llumnix_config(cli_args.config_file, cli_args) diff --git a/llumnix/manager.py b/llumnix/manager.py index d5bba99d..5cf186cd 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -81,16 +81,22 @@ def __init__(self, if deployment_args is not None: self.deployment_mode: DeploymentMode = deployment_args.deployment_mode self.backend_type: BackendType = deployment_args.backend_type - self.max_instances = manager_args.max_instances - self.min_instances = manager_args.min_instances - # scheduling args + # migration args self.enable_migration = manager_args.enable_migration - self.enable_scaling = manager_args.enable_scaling - self.enable_pd_disagg = manager_args.enable_pd_disagg - self.polling_interval = manager_args.polling_interval self.pair_migration_frequency = manager_args.pair_migration_frequency + self.enable_pd_disagg = manager_args.enable_pd_disagg + + # scaling args + self.enable_scaling = manager_args.enable_scaling + self.max_instances = manager_args.max_instances + self.min_instances = manager_args.min_instances self.scaling_interval = manager_args.scaling_interval + self.scaling_policy = manager_args.scaling_policy + self.scale_up_threshold = manager_args.scale_up_threshold + self.scale_down_threshold = manager_args.scale_down_threshold + + self.polling_interval = manager_args.polling_interval global_scheduler_config = manager_args.create_global_scheduler_config() self.global_scheduler = GlobalScheduler(global_scheduler_config) From c8f772c3b76896b060f88694115012d176763821 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 8 Jan 2025 03:48:37 +0000 Subject: [PATCH 66/92] Fix backends unit test --- llumnix/arg_utils.py | 4 ++-- llumnix/internal_config.py | 8 ++++---- llumnix/manager.py | 20 +++---------------- .../unit_test/backends/vllm/test_migration.py | 3 --- 4 files changed, 9 insertions(+), 26 deletions(-) diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index 198ef2e8..e3a8d17c 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -190,8 +190,8 @@ def create_migration_config(self) -> MigrationConfig: self.max_stages, self.migration_backend_init_timeout, self.migration_backend_transfer_type, - self.migration_backend_server_address, - self.migration_backend_kvtransfer_naming_url) + self.grpc_migration_backend_server_address, + self.kvtransfer_migration_backend_naming_url) return migration_config @classmethod diff --git a/llumnix/internal_config.py b/llumnix/internal_config.py index b21d45d7..60c4b593 100644 --- a/llumnix/internal_config.py +++ b/llumnix/internal_config.py @@ -22,8 +22,8 @@ def __init__( max_stages: int, migration_backend_init_timeout: float, migration_backend_transfer_type: str = "", - migration_backend_server_address: str = "", - migration_backend_kvtransfer_naming_url: str = "", + grpc_migration_backend_server_address: str = "", + 
kvtransfer_migration_backend_naming_url: str = "", ) -> None: self.request_migration_policy = request_migration_policy self.migration_backend = migration_backend @@ -33,8 +33,8 @@ def __init__( self.last_stage_max_blocks = last_stage_max_blocks self.max_stages = max_stages self.migration_backend_init_timeout = migration_backend_init_timeout - self.migration_backend_server_address = migration_backend_server_address - self.migration_backend_kvtransfer_naming_url = migration_backend_kvtransfer_naming_url + self.grpc_migration_backend_server_address = grpc_migration_backend_server_address + self.kvtransfer_migration_backend_naming_url = kvtransfer_migration_backend_naming_url class GlobalSchedulerConfig: def __init__( diff --git a/llumnix/manager.py b/llumnix/manager.py index 5cf186cd..006d0758 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -578,26 +578,12 @@ def init_instances(self, instance_id = random_uuid() if not manager_args.profiling_result_file_path: # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor, num_gpus=world_size, for Workers - placement_group = initialize_placement_group(instance_id, num_cpus=3, num_gpus=world_size, detached=True) - # TODO(s5u13b): Refine the order of arguments. - llumlet = Llumlet.from_args( - request_output_queue_type, - instance_id, - backend_type, - world_size, - engine_manager_args.create_migration_config(), - placement_group, - engine_args, - placement_group, - engine_args, - *args, - **kwargs - ) + placement_group = initialize_placement_group(get_placement_group_name(instance_id), num_cpus=3, num_gpus=world_size, detached=True) else: assert backend_type == backend_type.VLLM, 'Only support the simulator backend for vLLM.' # num_cpus=1, for Llumlet + AsyncPutQueueActor - placement_group = initialize_placement_group(instance_id, num_cpus=2, num_gpus=0, detached=True) - llumlet = Llumlet.from_args( + placement_group = initialize_placement_group(get_placement_group_name(instance_id), num_cpus=2, num_gpus=0, detached=True) + instance = Llumlet.from_args( instance_id, placement_group, request_output_queue_type, diff --git a/tests/unit_test/backends/vllm/test_migration.py b/tests/unit_test/backends/vllm/test_migration.py index 8892c8df..4f28d753 100644 --- a/tests/unit_test/backends/vllm/test_migration.py +++ b/tests/unit_test/backends/vllm/test_migration.py @@ -205,9 +205,6 @@ async def test_pd_diaggregation_correctness(ray_env, migration_backend): que, server_info = request_output_queue_server(request_output_queue_type) asyncio.create_task(que.run_server_loop()) - placement_group_0 = initialize_placement_group(instance_id="0", world_size=1, detached=True) - placement_group_1 = initialize_placement_group(instance_id="1", world_size=1, detached=True) - llumlet_0 = init_llumlet(request_output_queue_type, "0", migration_config, engine_args) llumlet_1 = init_llumlet(request_output_queue_type, "1", migration_config, engine_args) From 0d76812d9f8c0fcdd5241c03ceacdca1db0989a1 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 8 Jan 2025 05:29:20 +0000 Subject: [PATCH 67/92] Fix global scheduler unit test --- llumnix/manager.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index 006d0758..a356dc03 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -576,21 +576,8 @@ def init_instances(self, instances: List[Llumlet] = [] for _ in range(self.manager_args.initial_instances): instance_id = random_uuid() - if not manager_args.profiling_result_file_path: - # 
num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor, num_gpus=world_size, for Workers
- placement_group = initialize_placement_group(get_placement_group_name(instance_id), num_cpus=3, num_gpus=world_size, detached=True)
- else:
- assert backend_type == backend_type.VLLM, 'Only support the simulator backend for vLLM.'
- # num_cpus=1, for Llumlet + AsyncPutQueueActor
- placement_group = initialize_placement_group(get_placement_group_name(instance_id), num_cpus=2, num_gpus=0, detached=True)
- instance = Llumlet.from_args(
- instance_id,
- placement_group,
- request_output_queue_type,
- backend_type,
- engine_args,
- manager_args.profiling_result_file_path)
+ placement_group = self._init_placement_group(get_placement_group_name(instance_id), engine_args, backend_type)
+ instance = self._init_instance(instance_id, placement_group, request_output_queue_type, backend_type, engine_args)
 instance_ids.append(instance_id)
 instances.append(instance)

From 7500be45d3688d791296efbcaeb07e9fc10a0a64 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 8 Jan 2025 05:33:21 +0000
Subject: [PATCH 68/92] Fix offline test

---
 examlpes/offline_inference.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py
index c4c38047..9ff807d9 100644
--- a/examlpes/offline_inference.py
+++ b/examlpes/offline_inference.py
@@ -50,15 +50,6 @@
 ray.get(manager.scale_up.remote(instance_ids, instances))

-# Create llumlets.
-instance_ids: List[str] = None
-llumlets: List[Llumlet] = None
-instance_ids, llumlets = ray.get(manager.init_llumlets.remote(
- engine_args, QueueType("rayqueue"), BackendType.VLLM, 1,
-))
-
-ray.get(manager.scale_up.remote(instance_ids, llumlets))
-
 # The requests' outputs will be put to the request_output_queue no matter which instance it's running in.
 server_id = random_uuid()
 request_output_queue = RayQueueServer()

From deb25b99ea362e4d8c38ada49f8eaa391b7d9167 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 8 Jan 2025 06:02:56 +0000
Subject: [PATCH 69/92] Add watch instance deployment time

---
 llumnix/manager.py | 2 +-
 tests/unit_test/global_scheduler/test_manager.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llumnix/manager.py b/llumnix/manager.py
index a356dc03..616456a1 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -51,7 +51,7 @@
 AUTO_SCALE_UP_INTERVAL = 1.0
 WAIT_PLACEMENT_GROUP_TIMEOUT = 5.0
 CHECK_DEPLOYMENT_STATES_INTERVAL = 30.0
-WATCH_DEPLOYMENT_INTERVAL = 30.0
+WATCH_DEPLOYMENT_INTERVAL = 60.0
diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py
index f4e903c9..989b369c 100644
--- a/tests/unit_test/global_scheduler/test_manager.py
+++ b/tests/unit_test/global_scheduler/test_manager.py
@@ -383,7 +383,7 @@ def test_check_deployment_states_loop_and_auto_scale_up_loop(ray_env, request_ou
 kill_server(instance_ids[1])
 kill_instance(instance_ids[2])
 # Wait for check deployment states, scale down instance and auto scale up.
- time.sleep(90.0)
+ time.sleep(120.0)
 num_instances = ray.get(manager.scale_up.remote([], []))
 assert num_instances == 4
 curr_pgs, curr_servers, curr_instances = ray.get(manager.get_curr_deployment.remote())

From 546d3dfcd45e03d4ed350a3d17e39f1a06c49777 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 8 Jan 2025 06:43:29 +0000
Subject: [PATCH 70/92] Decrease WATCH_DEPLOYMENT_INTERVAL

---
 llumnix/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llumnix/manager.py b/llumnix/manager.py
index 616456a1..2613dc82 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -51,7 +51,7 @@
 AUTO_SCALE_UP_INTERVAL = 1.0
 WAIT_PLACEMENT_GROUP_TIMEOUT = 5.0
 CHECK_DEPLOYMENT_STATES_INTERVAL = 30.0
-WATCH_DEPLOYMENT_INTERVAL = 60.0
+WATCH_DEPLOYMENT_INTERVAL = 40.0

From f0c86aae91bb7ec4bc5fdfbb262b084515d096f6 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Wed, 8 Jan 2025 07:40:48 +0000
Subject: [PATCH 71/92] Change TODOs

---
 llumnix/entrypoints/vllm/api_server.py | 1 -
 llumnix/manager.py | 3 ---
 tests/unit_test/global_scheduler/test_manager.py | 1 -
 3 files changed, 5 deletions(-)

diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py
index 48ba86f0..dcb589c1 100644
--- a/llumnix/entrypoints/vllm/api_server.py
+++ b/llumnix/entrypoints/vllm/api_server.py
@@ -44,7 +44,6 @@
 # pylint: disable=unused-argument
 @asynccontextmanager
 async def lifespan(fastapi_app: FastAPI):
- # TODO(s5u13b): Do not run request output queue in event loop of api server.
 asyncio.create_task(llumnix_client.request_output_queue.run_server_loop())
 asyncio.create_task(llumnix_client.get_request_outputs_loop())
 yield
diff --git a/llumnix/manager.py b/llumnix/manager.py
index 2613dc82..dafffd75 100644
--- a/llumnix/manager.py
+++ b/llumnix/manager.py
@@ -53,11 +53,8 @@
 CHECK_DEPLOYMENT_STATES_INTERVAL = 30.0
 WATCH_DEPLOYMENT_INTERVAL = 40.0

-# TODO(s5u13b): Fix the logger when manager failover.
 # TODO(s5u13b): Handle exception of ray operations.
-# TODO(s5u13b): Update the documents of global deployment.
 # TODO(s5u13b): Add exception handling wrapper.
-# TODO(s5u13b): Update Arguments.md.
 # TODO(s5u13b): Reorganize constant variables.
diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py
index 989b369c..0d3e5f97 100644
--- a/tests/unit_test/global_scheduler/test_manager.py
+++ b/tests/unit_test/global_scheduler/test_manager.py
@@ -359,7 +359,6 @@ def test_auto_scale_up_loop_and_get_curr_deployment(ray_env, request_output_queu
 assert len(instance_ids) == 4
 ray.get(manager._clear_instance_ray_resources.remote(instance_ids[0]))
 ray.get(manager._clear_instance_ray_resources.remote(instance_ids[1]))
- # TODO(s5u13b): Get ray queue rpc error or some instances died sometimes.
time.sleep(30.0) num_instances = ray.get(manager.scale_up.remote([], [])) assert num_instances == 4 From 3f31c5808e67ae31ac30f6fa5e83a06bc80ec6bb Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 8 Jan 2025 08:28:38 +0000 Subject: [PATCH 72/92] Consider scheduling pg state --- llumnix/manager.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index dafffd75..e0d186f8 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -294,11 +294,12 @@ async def _auto_scale_up_loop(self, interval: float) -> None: last_timeout_pg_states = list_placement_groups(filters=[("name", "=", last_timeout_pg_name)]) if len(last_timeout_pg_states) > 0: new_instance_id = self.last_timeout_instance_id - # pending or created(without server and instance) + # pending, created(without server and instance) or rescheduling new_pg = ray.util.get_placement_group(last_timeout_pg_name) # reset self.last_timeout_instance_id = None pending_pg_states = list_placement_groups(filters=[("state", "=", "PENDING")]) + pending_pg_states.extend(list_placement_groups(filters=[("state", "=", "RESCHEDULING")])) for pending_pg_state in pending_pg_states: instance_id = pending_pg_state["name"].split("_")[-1] if new_pg is not None and instance_id == new_instance_id: @@ -312,7 +313,8 @@ async def _auto_scale_up_loop(self, interval: float) -> None: await asyncio.wait_for(new_pg.ready(), WAIT_PLACEMENT_GROUP_TIMEOUT) except asyncio.TimeoutError: logger.info("[_auto_scale_up_loop] waiting for new placement group ready timeout") - # After timeout, the new instance might be pending, created(without server and instance) or killed. + # After timeout, the new placement group might be pending, + # created(without server and instance), rescheduling or killed. self.last_timeout_instance_id = new_instance_id await asyncio.sleep(interval) continue From 207ba5777cd170a73eb213d3195419282898d9e4 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 8 Jan 2025 08:43:45 +0000 Subject: [PATCH 73/92] Minor --- llumnix/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index e0d186f8..7f4187ab 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -314,7 +314,7 @@ async def _auto_scale_up_loop(self, interval: float) -> None: except asyncio.TimeoutError: logger.info("[_auto_scale_up_loop] waiting for new placement group ready timeout") # After timeout, the new placement group might be pending, - # created(without server and instance), rescheduling or killed. + # created(without server and instance), rescheduling. 
self.last_timeout_instance_id = new_instance_id await asyncio.sleep(interval) continue From d3cf1dfde5713fe11f1d389214a80fb6cd575558 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 8 Jan 2025 11:00:00 +0000 Subject: [PATCH 74/92] Fix benchmark --- llumnix/backends/vllm/llm_engine.py | 2 +- llumnix/entrypoints/vllm/client.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py index 5c07ed19..7662f462 100644 --- a/llumnix/backends/vllm/llm_engine.py +++ b/llumnix/backends/vllm/llm_engine.py @@ -210,7 +210,7 @@ def add_request(self, request_id: str, server_info: ServerInfo, expected_steps: if hasattr(server_info, 'request_timestamps'): server_info.request_timestamps.engine_add_request_timestamp = time.time() self.scheduler.waiting[-1] = SequenceGroupLlumnix(request_id, server_info, expected_steps, [seq_group.get_seqs()[0]], - seq_group.sampling_params, seq_group.metrics.arrival_time, seq_group.lora_request, + seq_group.metrics.arrival_time, seq_group.sampling_params, seq_group.lora_request, seq_group.multi_modal_data) def _start_put_queue_loop(self): diff --git a/llumnix/entrypoints/vllm/client.py b/llumnix/entrypoints/vllm/client.py index faff7973..044c241f 100644 --- a/llumnix/entrypoints/vllm/client.py +++ b/llumnix/entrypoints/vllm/client.py @@ -11,6 +11,8 @@ from llumnix.server_info import RequestTimestamps from llumnix.queue.queue_server_base import QueueServerBase from llumnix.server_info import ServerInfo +from llumnix.manager import Manager +from llumnix.llumlet.llumlet import Llumlet logger = init_logger(__name__) From 7728307b6a7304e68cca15252493b75ed67884da Mon Sep 17 00:00:00 2001 From: s5u13b Date: Wed, 8 Jan 2025 11:44:37 +0000 Subject: [PATCH 75/92] Fix error raise --- llumnix/entrypoints/vllm/api_server_actor.py | 1 + llumnix/llumlet/llumlet.py | 2 ++ llumnix/manager.py | 2 -- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py index eba9f106..000cc51e 100644 --- a/llumnix/entrypoints/vllm/api_server_actor.py +++ b/llumnix/entrypoints/vllm/api_server_actor.py @@ -80,6 +80,7 @@ def from_args(cls, except Exception as e: logger.error("failed to initialize FastAPIServer: {}".format(e)) logger.error("exception traceback: {}".format(traceback.format_exc())) + raise return fastapi_server diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index b4d58d05..378e717a 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -69,6 +69,7 @@ def __init__(self, except Exception as e: logger.error("failed to initialize Llumlet: {}".format(e)) logger.error("exception traceback: {}".format(traceback.format_exc())) + raise @classmethod def from_args(cls, @@ -110,6 +111,7 @@ def from_args(cls, except Exception as e: logger.error("failed to initialize Llumlet: {}".format(e)) logger.error("exception traceback: {}".format(traceback.format_exc())) + raise return llumlet diff --git a/llumnix/manager.py b/llumnix/manager.py index 7f4187ab..b5a349ed 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -198,8 +198,6 @@ def update_instance_info_done_callback(instance_id: str, fut): else: logger.info("[_update_instance_info_loop] instance {} is dead".format(instance_id)) self.scale_down(instance_id) - logger.info("[_update_instance_info_loop] dead instances: {}.".format(ret)) - logger.info("[_update_instance_info_loop] dead instances: {}.".format(self.instances)) while True: 
 try:

From 2fa1a5f06799ac254c6277cf32448b1bc45cef9a Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 9 Jan 2025 03:37:15 +0000
Subject: [PATCH 76/92] Minor

---
 llumnix/backends/vllm/llm_engine.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py
index 7662f462..dec38700 100644
--- a/llumnix/backends/vllm/llm_engine.py
+++ b/llumnix/backends/vllm/llm_engine.py
@@ -210,8 +210,10 @@ def add_request(self, request_id: str, server_info: ServerInfo, expected_steps:
 if hasattr(server_info, 'request_timestamps'):
 server_info.request_timestamps.engine_add_request_timestamp = time.time()
 self.scheduler.waiting[-1] = SequenceGroupLlumnix(request_id, server_info, expected_steps, [seq_group.get_seqs()[0]],
- seq_group.metrics.arrival_time, seq_group.sampling_params, seq_group.lora_request,
- seq_group.multi_modal_data)
+ sampling_params=seq_group.sampling_params,
+ arrival_time=seq_group.metrics.arrival_time,
+ lora_request=seq_group.lora_request,
+ multi_modal_data=seq_group.multi_modal_data)

 def _start_put_queue_loop(self):
 while True:

From e86be3eaec8ac57b4d89ab4f11f9d2ccd6ce682e Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 9 Jan 2025 07:46:26 +0000
Subject: [PATCH 77/92] Update Quickstart

---
 docs/Quickstart.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/Quickstart.md b/docs/Quickstart.md
index c01e2299..e0886fcb 100644
--- a/docs/Quickstart.md
+++ b/docs/Quickstart.md
@@ -98,7 +98,8 @@
 HEAD_NODE=1 python -m llumnix.entrypoints.vllm.api_server \
 --model $MODEL_PATH \
 --engine-use-ray \
 --worker-use-ray \
- --max-model-len 4096
+ --max-model-len 4096 \
+ --migration-backend rayrpc
 ```

`CONFIG_PATH` is the path to the configuration file for Llumnix, and we give an example configuration file [here](../configs/base.yml). `MODEL_PATH` defines the location of your model.
`INITIAL_INSTANCES` determines the number of instances to be launched on the current node, From 658ae5502ddcb4ebf99fcbbd81ab073d912d4ec3 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 09:17:06 +0000 Subject: [PATCH 78/92] Add disable-keep-serve-process-alive --- llumnix/arg_utils.py | 1 + llumnix/config/default.py | 2 ++ llumnix/entrypoints/vllm/serve.py | 6 ++++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index e3a8d17c..82932841 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -59,6 +59,7 @@ class EntrypointsArgs: disable_log_requests_server: bool = None log_request_timestamps: bool = None config_file: str = None + disable_keep_serve_process_alive: bool = None def __post_init__(self): for attr in dataclasses.fields(self): diff --git a/llumnix/config/default.py b/llumnix/config/default.py index ca1fdb68..078607b4 100644 --- a/llumnix/config/default.py +++ b/llumnix/config/default.py @@ -44,6 +44,8 @@ _C.SERVER.LOG_REQUEST_TIMESTAMPS = False # Path to config file of arguments _C.SERVER.CONFIG_FILE = None +# Disable keep serve process alive +_C.SERVER.DISABLE_KEEP_SERVE_PROCESS_ALIVE = False # ----------------------------------------------------------------------------- # RAY CONFIGURATION diff --git a/llumnix/entrypoints/vllm/serve.py b/llumnix/entrypoints/vllm/serve.py index f0d7e7ef..3dce2db2 100644 --- a/llumnix/entrypoints/vllm/serve.py +++ b/llumnix/entrypoints/vllm/serve.py @@ -19,6 +19,7 @@ parser.add_argument("--ssl-keyfile", type=str) parser.add_argument("--ssl-certfile", type=str) parser.add_argument("--log-level", type=str) + parser.add_argument('--disable-keep-serve-process-alive', action='store_true') cli_args = add_cli_args(parser) cfg = get_llumnix_config(cli_args.config_file, cli_args) @@ -36,5 +37,6 @@ setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args) # keep the process alive to get the terminal output. - while True: - time.sleep(100.0) + if not entrypoints_args.disable_keep_serve_process_alive: + while True: + time.sleep(100.0) From 33231e5060e30a519fb597cbe834d4da1ae4c8c0 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 09:19:07 +0000 Subject: [PATCH 79/92] Fix typos in readme --- README.md | 2 +- docs/Quickstart.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 09902f69..de06122d 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Llumnix is easy to use with: ## Getting Started -If you are already utilizing vLLM for multi-instance LLM serving deployments, simply replace the vLLM serving deployment command `python -m vllm.entrypoints.api_server ...` for each instance with the command provided below: +If you are already utilizing vLLM for multi-instance LLM serving deployments, simply replace the vLLM serving deployment command `python -m entrypoints.vllm.api_server ...` for each instance with the command provided below: ``` python -m llumnix.entrypoints.vllm.api_server \ --host $HOST \ diff --git a/docs/Quickstart.md b/docs/Quickstart.md index e0886fcb..a5c5d82c 100644 --- a/docs/Quickstart.md +++ b/docs/Quickstart.md @@ -69,7 +69,7 @@ Following these steps, Llumnix acts as the request scheduling layer situated beh ## Global Deployment -Llumnix also supports deploying multiple servers and instances at once by running `python -m vllm.entrypoints.serve`, which is named as global deployment. 
+Llumnix also supports deploying multiple servers and instances at once by running `python -m entrypoints.vllm.serve`, which is named as global deployment. ``` python -m llumnix.entrypoints.vllm.serve \ From 1f22a61f481792f9cc9bf9a031f6c2b2b91e347a Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 09:25:40 +0000 Subject: [PATCH 80/92] Simplify kill --- llumnix/utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/llumnix/utils.py b/llumnix/utils.py index 07b4a003..b19d042c 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -130,9 +130,6 @@ def get_instance_name(instance_id: str) -> str: def remove_placement_group(instance_id: str) -> bool: try: placement_group = ray.util.get_placement_group(get_placement_group_name(instance_id)) - except ValueError: - return False - try: # asynchronous api ray.util.remove_placement_group(placement_group) logger.info("remove placement group {}".format(instance_id)) @@ -144,9 +141,6 @@ def remove_placement_group(instance_id: str) -> bool: def kill_server(instance_id: str) -> bool: try: server = ray.get_actor(get_server_name(instance_id), namespace="llumnix") - except ValueError: - return False - try: ray.kill(server) logger.info("kill server {}".format(instance_id)) # pylint: disable=broad-except @@ -157,9 +151,6 @@ def kill_server(instance_id: str) -> bool: def kill_instance(instance_id: str) -> bool: try: instance = ray.get_actor(get_instance_name(instance_id), namespace="llumnix") - except ValueError: - return False - try: ray.kill(instance) logger.info("kill instance {}".format(instance_id)) # pylint: disable=broad-except From 846eef9d4849a18d58a8891c16d936d8194a97b4 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 09:28:10 +0000 Subject: [PATCH 81/92] Simplify initialize_placement_group --- llumnix/utils.py | 50 ++++++++++++++++-------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/llumnix/utils.py b/llumnix/utils.py index b19d042c..44c18976 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -53,40 +53,22 @@ def initialize_placement_group( "serving.") lifetime = "detached" if detached else None - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - gpu_bundles = 0 - for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) - if bundle_gpus > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") - if bundle_gpus: - gpu_bundles += 1 - if num_gpus > gpu_bundles: - raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") - else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) - if num_gpus > num_gpus_in_cluster: - raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") - # Create a new placement group - # bundle_0: Llumlet + AsyncPutQueueActor + ProxyActor, bundle_1: Workers - placement_group_specs = ([{"CPU": num_cpus}] + [{"GPU": 1}] * num_gpus) - current_placement_group = ray.util.placement_group( - placement_group_specs, "STRICT_PACK", name=placement_group_name, lifetime=lifetime) - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. 
- if block:
- ray.get(current_placement_group.ready(), timeout=1800)
+
+ num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
+ if num_gpus > num_gpus_in_cluster:
+ raise ValueError(
+ "The number of required GPUs exceeds the total number of "
+ "available GPUs in the cluster.")
+ # Create a new placement group
+ # bundle_0: Llumlet + AsyncPutQueueActor + ProxyActor, bundle_1: Workers
+ placement_group_specs = ([{"CPU": num_cpus}] + [{"GPU": 1}] * num_gpus)
+ current_placement_group = ray.util.placement_group(
+ placement_group_specs, "STRICT_PACK", name=placement_group_name, lifetime=lifetime)
+ # Wait until PG is ready - this will block until all
+ # requested resources are available, and will timeout
+ # if they cannot be provisioned.
+ if block:
+ ray.get(current_placement_group.ready(), timeout=1800)

 return current_placement_group

From 050266877dcfae8965a371e6808d427f0c5e7a09 Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 9 Jan 2025 09:29:56 +0000
Subject: [PATCH 82/92] Add one log

---
 llumnix/entrypoints/bladellm/client.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llumnix/entrypoints/bladellm/client.py b/llumnix/entrypoints/bladellm/client.py
index f95dae96..3eadd8fd 100644
--- a/llumnix/entrypoints/bladellm/client.py
+++ b/llumnix/entrypoints/bladellm/client.py
@@ -59,6 +59,7 @@ async def background_process_outputs(self):
 continue
 await self.request_streams[request_id].put(request_output)
 if request_output.is_finished:
+ logger.debug("client recv request output: {}".format(request_output))
 del self.entrypoint_id2llumnix_id[self.llumnix_id2entrypoint_id[request_id]]
 del self.llumnix_id2entrypoint_id[request_id]
 del self.request_streams[request_id]

From f1af4e42af7849e806ccd524aab054d221df68fe Mon Sep 17 00:00:00 2001
From: s5u13b
Date: Thu, 9 Jan 2025 09:38:16 +0000
Subject: [PATCH 83/92] Call scale_up in init_instances

---
 examlpes/offline_inference.py | 5 +----
 llumnix/entrypoints/setup.py | 21 +------------------
 llumnix/manager.py | 2 ++
 .../global_scheduler/test_manager.py | 2 --
 4 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/examlpes/offline_inference.py b/examlpes/offline_inference.py
index 9ff807d9..5148a9e8 100644
--- a/examlpes/offline_inference.py
+++ b/examlpes/offline_inference.py
@@ -38,8 +38,7 @@
 engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True, trust_remote_code=True, max_model_len=370)

-# Create a manager. If the manager is created first, and then the instances are created, manager.scale_up
-# need to be called to add the newly created instances to the management of the manager.
+# Create a manager first, and then create the instances through the manager.
 manager: Manager = init_manager(manager_args)
 ray.get(manager.is_ready.remote())
@@ -48,8 +47,6 @@
 instances: List[Llumlet] = None
 instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args))

-ray.get(manager.scale_up.remote(instance_ids, instances))
-
 # The requests' outputs will be put to the request_output_queue no matter which instance it's running in.
server_id = random_uuid() request_output_queue = RayQueueServer() diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py index 135e20c6..b9134f66 100644 --- a/llumnix/entrypoints/setup.py +++ b/llumnix/entrypoints/setup.py @@ -113,29 +113,10 @@ def init_llumnix_components(manager_args: ManagerArgs, instance_ids, instances = retry_manager_method_sync( manager.init_instances.remote, 'init_instances', request_output_queue_type, backend_type, engine_args) - available_instance_ids: List[str] = [] - dead_instance_ids: List[str] = [] - available_instances: List[Llumlet] = [] - ready_tasks = [instance.is_ready.remote() for instance in instances] - for idx, task in enumerate(ready_tasks): - try: - ray.get(task) - available_instance_ids.append(instance_ids[idx]) - available_instances.append(instances[idx]) - except ray.exceptions.RayActorError: - dead_instance_ids.append(instance_ids[idx]) - if len(dead_instance_ids) > 0: - retry_manager_method_sync(manager.scale_down.remote, 'scale_down', dead_instance_ids) - if len(available_instance_ids) > 0: - retry_manager_method_sync(manager.scale_up.remote, 'scale_up', - available_instance_ids, available_instances) - logger.info("Init Llumnix components done, {} instances are ready, instance_ids: {}." - .format(len(available_instance_ids), available_instance_ids)) - ip = get_ip_address() request_output_queue = init_request_output_queue_server(ip, request_output_queue_port, request_output_queue_type) - return manager, available_instance_ids, available_instances, request_output_queue + return manager, instance_ids, instances, request_output_queue def setup_entrypoints_context(entrypoints_args, manager, instance_ids, instances, request_output_queue) -> EntrypointsContext: instances_dict: Dict[str, Llumlet] = {} diff --git a/llumnix/manager.py b/llumnix/manager.py index b5a349ed..884a8a01 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -578,6 +578,8 @@ def init_instances(self, instance_ids.append(instance_id) instances.append(instance) + self.scale_up(instance_ids, instances) + return instance_ids, instances def _init_server_and_instance(self, diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 0d3e5f97..5c4f74b9 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -183,7 +183,6 @@ def test_init_llumlet(ray_env, llumlet): def test_init_instances(ray_env, manager): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) - num_instances = ray.get(manager.scale_up.remote(instance_ids, instances)) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances @@ -194,7 +193,6 @@ def test_init_instances_sim(ray_env, manager): llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) - num_instances = ray.get(manager.scale_up.remote(instance_ids, instances)) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances From 8d59d293f0379f1573902c2f79d059bc704771f3 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 09:43:20 +0000 Subject: [PATCH 84/92] Simplify FastAPIServerActor run --- 
llumnix/entrypoints/vllm/api_server_actor.py | 8 ++++++-- llumnix/manager.py | 4 +--- tests/unit_test/entrypoints/vllm/api_server_actor.py | 4 ++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py index 000cc51e..652b0e2d 100644 --- a/llumnix/entrypoints/vllm/api_server_actor.py +++ b/llumnix/entrypoints/vllm/api_server_actor.py @@ -24,7 +24,7 @@ def __init__(self, entrypoints_args: EntrypointsArgs): self.request_output_queue = init_request_output_queue_server( ip, self.request_output_queue_port, self.request_output_queue_type) - def setup_entrypoints_context(self, + def _setup_entrypoints_context(self, manager: "ray.actor.ActorHandle", instance_id: str, instance: Llumlet): @@ -52,7 +52,11 @@ def _run_uvicorn_server(self, ssl_keyfile=entrypoints_args.ssl_keyfile, ssl_certfile=entrypoints_args.ssl_certfile) - def run(self): + def run(self, + manager: "ray.actor.ActorHandle", + instance_id: str, + instance: Llumlet): + self._setup_entrypoints_context(manager, instance_id, instance) self.run_uvicorn_server_thread = threading.Thread( target=self._run_uvicorn_server, args=(self.entrypoints_args, self.entrypoints_context), daemon=True, name="run_uvicorn_server" diff --git a/llumnix/manager.py b/llumnix/manager.py index 884a8a01..ed42f168 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -588,10 +588,8 @@ def _init_server_and_instance(self, async def done_scale_up(): try: manager = ray.get_actor(get_manager_name(), namespace="llumnix") - await server.is_ready.remote() - await server.setup_entrypoints_context.remote(manager, instance_id, instance) await instance.is_ready.remote() - await server.run.remote() + await server.run.remote(manager, instance_id, instance) self.scale_up(instance_id, instance) # pylint: disable=broad-except except Exception as e: diff --git a/tests/unit_test/entrypoints/vllm/api_server_actor.py b/tests/unit_test/entrypoints/vllm/api_server_actor.py index c857a747..5e42ff61 100644 --- a/tests/unit_test/entrypoints/vllm/api_server_actor.py +++ b/tests/unit_test/entrypoints/vllm/api_server_actor.py @@ -33,7 +33,6 @@ def __init__(self, entrypoints_args): self.request_output_queue = init_request_output_queue_client( QueueType(entrypoints_args.request_output_queue_type)) self.server = self.init_server(entrypoints_args) - ray.get(self.server.setup_entrypoints_context.remote()) ray.get(self.server.run.remote()) def init_server(self, entrypoints_args): @@ -59,13 +58,14 @@ def __init__(self, entrypoints_args): self.port = entrypoints_args.port self.request_output_queue_type = QueueType(entrypoints_args.request_output_queue_type) - def setup_entrypoints_context(self): + def _setup_entrypoints_context(self): self.entrypoints_context = setup_entrypoints_context(self.request_output_queue_type) def _run_uvicorn_server(self): run_uvicorn_server(self.host, self.port, self.entrypoints_context) def run(self): + self._setup_entrypoints_context() self.run_uvicorn_server_thread = threading.Thread( target=self._run_uvicorn_server, args=(), daemon=True, name="run_uvicorn_server" From 2a4639c601a7238ff83808148f7b31e02e062e19 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 09:44:20 +0000 Subject: [PATCH 85/92] Rename FastAPIServer to FastAPIServerActor --- llumnix/entrypoints/vllm/api_server_actor.py | 2 +- llumnix/manager.py | 8 ++++---- tests/unit_test/entrypoints/vllm/api_server_actor.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/llumnix/entrypoints/vllm/api_server_actor.py b/llumnix/entrypoints/vllm/api_server_actor.py index 652b0e2d..e2bf0fbe 100644 --- a/llumnix/entrypoints/vllm/api_server_actor.py +++ b/llumnix/entrypoints/vllm/api_server_actor.py @@ -15,7 +15,7 @@ logger = init_logger(__name__) -class FastAPIServer: +class FastAPIServerActor: def __init__(self, entrypoints_args: EntrypointsArgs): self.entrypoints_args = entrypoints_args self.request_output_queue_port = self.entrypoints_args.request_output_queue_port diff --git a/llumnix/manager.py b/llumnix/manager.py index ed42f168..a24c3b45 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -41,7 +41,7 @@ from llumnix.entrypoints.utils import DeploymentMode from llumnix.backends.utils import get_engine_world_size from llumnix.queue.queue_type import QueueType -from llumnix.entrypoints.vllm.api_server_actor import FastAPIServer +from llumnix.entrypoints.vllm.api_server_actor import FastAPIServerActor logger = init_logger(__name__) @@ -537,13 +537,13 @@ def _init_placement_group(self, def _init_server(self, server_name: str, placement_group: PlacementGroup, - entrypoints_args: EntrypointsArgs) -> FastAPIServer: + entrypoints_args: EntrypointsArgs) -> FastAPIServerActor: entrypoints_args = copy.deepcopy(entrypoints_args) if self.manager_args.enable_port_increment: entrypoints_args.port += self.port_count entrypoints_args.request_output_queue_port += self.port_count self.port_count += 1 - fastapi_server = FastAPIServer.from_args(server_name, placement_group, entrypoints_args) + fastapi_server = FastAPIServerActor.from_args(server_name, placement_group, entrypoints_args) return fastapi_server def _init_instance(self, @@ -653,7 +653,7 @@ def check_instance_error_done_callback(idx: int, instance_id: str, fut): return results - def get_curr_deployment(self) -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIServer], Dict[str, Llumlet]]: + def get_curr_deployment(self) -> Tuple[Dict[str, PlacementGroup], Dict[str, FastAPIServerActor], Dict[str, Llumlet]]: curr_pgs: Dict[str, PlacementGroup] = {} curr_servers: Dict[str, PlacementGroup] = {} curr_instances: Dict[str, Llumlet] = {} diff --git a/tests/unit_test/entrypoints/vllm/api_server_actor.py b/tests/unit_test/entrypoints/vllm/api_server_actor.py index 5e42ff61..95ae5eef 100644 --- a/tests/unit_test/entrypoints/vllm/api_server_actor.py +++ b/tests/unit_test/entrypoints/vllm/api_server_actor.py @@ -36,7 +36,7 @@ def __init__(self, entrypoints_args): ray.get(self.server.run.remote()) def init_server(self, entrypoints_args): - server = FastAPIServer.options(name=ENTRYPOINTS_ACTOR_NAME, + server = FastAPIServerActor.options(name=ENTRYPOINTS_ACTOR_NAME, namespace='llumnix').remote(entrypoints_args) return server @@ -52,7 +52,7 @@ def from_args(cls, entrypoints_args): @ray.remote(num_cpus=1, lifetime="detached") -class FastAPIServer: +class FastAPIServerActor: def __init__(self, entrypoints_args): self.host = entrypoints_args.host self.port = entrypoints_args.port From ae89c4e31385de1f3c6d589e749c53cdf0601a6c Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 09:48:36 +0000 Subject: [PATCH 86/92] Rename deployment to launch --- docs/Quickstart.md | 6 ++--- llumnix/arg_utils.py | 6 ++--- llumnix/entrypoints/bladellm/api_server.py | 8 +++---- llumnix/entrypoints/setup.py | 24 +++++++++---------- llumnix/entrypoints/utils.py | 2 +- llumnix/entrypoints/vllm/api_server.py | 8 +++---- llumnix/entrypoints/vllm/serve.py | 8 +++---- llumnix/manager.py | 22 ++++++++--------- tests/e2e_test/test_bench.py | 8 
+++---- .../global_scheduler/test_manager.py | 18 +++++++------- 10 files changed, 55 insertions(+), 55 deletions(-) diff --git a/docs/Quickstart.md b/docs/Quickstart.md index a5c5d82c..5798ba3f 100644 --- a/docs/Quickstart.md +++ b/docs/Quickstart.md @@ -67,9 +67,9 @@ During the execution of serving deployment, Llumnix will: Following these steps, Llumnix acts as the request scheduling layer situated behind the multiple frontend API servers and above the multiple backend vLLM engine instances. This positioning allows Llumnix to significantly enhance serving performance through its dynamic, fine-grained, and KV-cache-aware request scheduling and rescheduling across instances. -## Global Deployment +## Centralized Deployment -Llumnix also supports deploying multiple servers and instances at once by running `python -m entrypoints.vllm.serve`, which is named as global deployment. +Llumnix also supports deploying multiple servers and instances at once by running `python -m entrypoints.vllm.serve`, which is referred to as centralized deployment. ``` python -m llumnix.entrypoints.vllm.serve \ @@ -79,7 +79,7 @@ python -m llumnix.entrypoints.vllm.serve \ ... ``` -Global deployment assumes that user has already launch a Ray cluter. Upon running the serve module, Llumnix will automatically connect to the existing Ray cluster, start the Llumnix components, and deploy multiple servers and instances to the Ray cluster until there is no more available gpus or cpus. +Centralized deployment assumes that the user has already launched a Ray cluster. Upon running the serve module, Llumnix will automatically connect to the existing Ray cluster, start the Llumnix components, and deploy multiple servers and instances to the Ray cluster until there are no more available GPUs or CPUs. ## Ray Cluster Notice When you include the --launch-ray-cluster option in Llumnix's serving deployment command, Llumnix automatically builds a Ray cluster during the execution of serving deployment. This action will overwrite any existing Ray cluster. If this behavior is not desired, simply omit the --launch-ray-cluster option, and Llumnix will initiate its actor components within the current Ray cluster.
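As context for the centralized deployment notes above, the sketch below illustrates what attaching to a pre-existing Ray cluster amounts to. It is a hedged, self-contained illustration using Ray's public API, not the project's `connect_to_ray_cluster` helper itself; the `llumnix` namespace value is assumed from the diffs in this series.

```python
import ray

# Attach to the Ray cluster that is assumed to be already running
# (e.g. started with `ray start --head`); "auto" resolves the head node address.
if not ray.is_initialized():
    ray.init(address="auto", ignore_reinit_error=True, namespace="llumnix")

# Centralized deployment keeps placing servers/instances until these resources run out.
print(ray.cluster_resources())
```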
diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index 82932841..c882914a 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -22,7 +22,7 @@ from llumnix.config import LlumnixConfig, get_llumnix_config from llumnix.config.default import _C from llumnix.backends.backend_interface import BackendType -from llumnix.entrypoints.utils import DeploymentMode +from llumnix.entrypoints.utils import LaunchMode class LlumnixArgumentParser(argparse.ArgumentParser): @@ -354,6 +354,6 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: return parser @dataclass -class DeploymentArgs: - deployment_mode: DeploymentMode = None +class LaunchArgs: + launch_mode: LaunchMode = None backend_type: BackendType = None diff --git a/llumnix/entrypoints/bladellm/api_server.py b/llumnix/entrypoints/bladellm/api_server.py index d56fe959..56a563b8 100644 --- a/llumnix/entrypoints/bladellm/api_server.py +++ b/llumnix/entrypoints/bladellm/api_server.py @@ -17,11 +17,11 @@ from llumnix.config import get_llumnix_config from llumnix.backends.backend_interface import BackendType from llumnix.arg_utils import (EntrypointsArgs, ManagerArgs, LlumnixArgumentParser, - DeploymentArgs) + LaunchArgs) from llumnix.entrypoints.setup import setup_ray_cluster, setup_llumnix from llumnix.entrypoints.bladellm.client import LlumnixClientBladeLLM from llumnix.entrypoints.bladellm.utils import get_args -from llumnix.entrypoints.utils import EntrypointsContext, DeploymentMode, is_gpu_available +from llumnix.entrypoints.utils import EntrypointsContext, LaunchMode, is_gpu_available def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractEventLoop): @@ -32,7 +32,7 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractE llumnix_config = get_llumnix_config(bladellm_args.llumnix_config) entrypoints_args, manager_args, engine_args = get_args(llumnix_config, llumnix_parser, bladellm_args) - deployment_args = DeploymentArgs(deployment_mode=DeploymentMode.LOCAL, backend_type=BackendType.VLLM) + launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM) setup_ray_cluster(entrypoints_args) @@ -40,7 +40,7 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractE # if gpu is not available, it means that this node is head pod x any llumnix components if is_gpu_available(): llumnix_context: EntrypointsContext = \ - setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args) + setup_llumnix(manager_args, entrypoints_args, engine_args, launch_args) llumnix_client = LlumnixClientBladeLLM(bladellm_args, llumnix_context, loop) return llumnix_client diff --git a/llumnix/entrypoints/setup.py b/llumnix/entrypoints/setup.py index b9134f66..16f2a5f3 100644 --- a/llumnix/entrypoints/setup.py +++ b/llumnix/entrypoints/setup.py @@ -22,12 +22,12 @@ from llumnix.llumlet.llumlet import Llumlet from llumnix.logger import init_logger from llumnix.utils import random_uuid, get_manager_name -from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs +from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, LaunchArgs from llumnix.queue.queue_type import QueueType from llumnix.server_info import ServerInfo from llumnix.queue.utils import init_request_output_queue_server from llumnix.entrypoints.utils import EntrypointsContext, get_ip_address, retry_manager_method_sync -from llumnix.entrypoints.utils import DeploymentMode +from llumnix.entrypoints.utils import LaunchMode from 
llumnix.backends.backend_interface import BackendType from llumnix.queue.queue_server_base import QueueServerBase @@ -89,14 +89,14 @@ def setup_ray_cluster(entrypoints_args) -> None: def init_manager(manager_args: ManagerArgs, entrypoints_args: EntrypointsArgs = None, engine_args = None, - deployment_args: DeploymentArgs = None, + launch_args: LaunchArgs = None, ) -> Manager: # Only one instance create the manager actor, the other instances get the existing manager actor through ray. try: manager = Manager.from_args(manager_args=manager_args, entrypoints_args=entrypoints_args, engine_args=engine_args, - deployment_args=deployment_args) + launch_args=launch_args) logger.info("Init Manager on current node.") except ValueError: manager = ray.get_actor(get_manager_name(), namespace='llumnix') @@ -143,21 +143,21 @@ def setup_entrypoints_context(entrypoints_args, manager, instance_ids, instances log_request_timestamps) return entrypoints_context -def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args) -> EntrypointsContext: +def _setup_llumnix_local(manager_args, entrypoints_args, engine_args, launch_args) -> EntrypointsContext: manager, instance_ids, instances, request_output_queue = \ init_llumnix_components(manager_args, engine_args, QueueType(entrypoints_args.request_output_queue_type), entrypoints_args.request_output_queue_port, - deployment_args.backend_type) + launch_args.backend_type) return setup_entrypoints_context(entrypoints_args, manager, instance_ids, instances, request_output_queue) -def _setup_llumnix_global(manager_args, entrypoints_args, engine_args, deployment_args) -> None: - _ = init_manager(manager_args, entrypoints_args, engine_args, deployment_args) +def _setup_llumnix_global(manager_args, entrypoints_args, engine_args, launch_args) -> None: + _ = init_manager(manager_args, entrypoints_args, engine_args, launch_args) -def setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args) -> Optional[EntrypointsContext]: - if deployment_args.deployment_mode == DeploymentMode.LOCAL: - return _setup_llumnix_local(manager_args, entrypoints_args, engine_args, deployment_args) +def setup_llumnix(manager_args, entrypoints_args, engine_args, launch_args) -> Optional[EntrypointsContext]: + if launch_args.launch_mode == LaunchMode.LOCAL: + return _setup_llumnix_local(manager_args, entrypoints_args, engine_args, launch_args) - return _setup_llumnix_global(manager_args, entrypoints_args, engine_args, deployment_args) + return _setup_llumnix_global(manager_args, entrypoints_args, engine_args, launch_args) diff --git a/llumnix/entrypoints/utils.py b/llumnix/entrypoints/utils.py index 691a050f..31c3fa28 100644 --- a/llumnix/entrypoints/utils.py +++ b/llumnix/entrypoints/utils.py @@ -14,7 +14,7 @@ logger = init_logger(__name__) -class DeploymentMode(str, Enum): +class LaunchMode(str, Enum): LOCAL = "LOCAL" GLOBAL = "GLOBAL" diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py index dcb589c1..d297dcd9 100644 --- a/llumnix/entrypoints/vllm/api_server.py +++ b/llumnix/entrypoints/vllm/api_server.py @@ -22,7 +22,7 @@ from vllm.sampling_params import SamplingParams -from llumnix.arg_utils import LlumnixArgumentParser, DeploymentArgs +from llumnix.arg_utils import LlumnixArgumentParser, LaunchArgs from llumnix.entrypoints.setup import setup_ray_cluster, setup_llumnix from llumnix.entrypoints.utils import init_per_token_latency_breakdown_dict, record_per_token_latency_breakdown from 
llumnix.entrypoints.vllm.arg_utils import add_cli_args, get_args @@ -31,7 +31,7 @@ from llumnix.utils import random_uuid from llumnix.config import get_llumnix_config from llumnix.backends.backend_interface import BackendType -from llumnix.entrypoints.utils import DeploymentMode, is_gpu_available +from llumnix.entrypoints.utils import LaunchMode, is_gpu_available # Code file with __main__ should set the logger name to inherit the llumnix logger configuration. logger = init_logger("llumnix.entrypoints.vllm.api_server") @@ -182,14 +182,14 @@ async def is_ready() -> bool: cfg = get_llumnix_config(cli_args.config_file, cli_args) entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args) - deployment_args = DeploymentArgs(deployment_mode=DeploymentMode.LOCAL, backend_type=BackendType.VLLM) + launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM) # Launch or connect to the ray cluster for multi-node serving. setup_ray_cluster(entrypoints_args) # if gpu is not available, it means that this node is head pod without any llumnix components if is_gpu_available(): - entrypoints_context = setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args) + entrypoints_context = setup_llumnix(manager_args, entrypoints_args, engine_args, launch_args) llumnix_client = LlumnixClientVLLM(entrypoints_context) # Start the api server after all the components of llumnix are ready. diff --git a/llumnix/entrypoints/vllm/serve.py b/llumnix/entrypoints/vllm/serve.py index 3dce2db2..fc865403 100644 --- a/llumnix/entrypoints/vllm/serve.py +++ b/llumnix/entrypoints/vllm/serve.py @@ -5,8 +5,8 @@ from llumnix.entrypoints.vllm.arg_utils import add_cli_args, get_args from llumnix.entrypoints.setup import connect_to_ray_cluster from llumnix.config import get_llumnix_config -from llumnix.arg_utils import LlumnixArgumentParser, DeploymentArgs -from llumnix.entrypoints.utils import DeploymentMode +from llumnix.arg_utils import LlumnixArgumentParser, LaunchArgs +from llumnix.entrypoints.utils import LaunchMode from llumnix.backends.backend_interface import BackendType from llumnix.entrypoints.setup import setup_llumnix @@ -25,7 +25,7 @@ cfg = get_llumnix_config(cli_args.config_file, cli_args) entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args) - deployment_args = DeploymentArgs(deployment_mode=DeploymentMode.GLOBAL, backend_type=BackendType.VLLM) + launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=BackendType.VLLM) # Assume that there is an existing ray cluster when using centralized deployment. connect_to_ray_cluster() @@ -34,7 +34,7 @@ request_output_queue = RayQueue(actor_options={"namespace": "llumnix", "name": "magic_ray_queue"}) - setup_llumnix(manager_args, entrypoints_args, engine_args, deployment_args) + setup_llumnix(manager_args, entrypoints_args, engine_args, launch_args) # keep the process alive to get the terminal output. 
if not entrypoints_args.disable_keep_serve_process_alive: diff --git a/llumnix/manager.py b/llumnix/manager.py index a24c3b45..496edbac 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -30,7 +30,7 @@ from llumnix.global_scheduler.migration_scheduler import PairMigrationConstraints from llumnix.global_scheduler.migration_filter import CustomFilter from llumnix.instance_info import InstanceInfo -from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs +from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, LaunchArgs from llumnix.server_info import ServerInfo from llumnix.backends.backend_interface import BackendType from llumnix.utils import (random_uuid, clear_gloo_backend_state, remove_placement_group, @@ -38,7 +38,7 @@ SERVER_NAME_PREFIX, get_placement_group_name, run_async_func_sync, kill_server, kill_instance, initialize_placement_group, get_server_name) -from llumnix.entrypoints.utils import DeploymentMode +from llumnix.entrypoints.utils import LaunchMode from llumnix.backends.utils import get_engine_world_size from llumnix.queue.queue_type import QueueType from llumnix.entrypoints.vllm.api_server_actor import FastAPIServerActor @@ -64,7 +64,7 @@ def __init__(self, work_dir: str, entrypoints_args: EntrypointsArgs = None, engine_args = None, - deployment_args: DeploymentArgs = None + launch_args: LaunchArgs = None ) -> None: os.chdir(work_dir) self.actor_name = get_manager_name() @@ -72,12 +72,12 @@ def __init__(self, # engine_args and entrypoints_args are used in global deployment. self.entrypoints_args = entrypoints_args self.engine_args = engine_args - self.deployment_args = deployment_args + self.launch_args = launch_args - # deployment args - if deployment_args is not None: - self.deployment_mode: DeploymentMode = deployment_args.deployment_mode - self.backend_type: BackendType = deployment_args.backend_type + # launch args + if launch_args is not None: + self.launch_mode: LaunchMode = launch_args.launch_mode + self.backend_type: BackendType = launch_args.backend_type # migration args self.enable_migration = manager_args.enable_migration @@ -132,7 +132,7 @@ def __init__(self, asyncio.create_task(self._clear_request_instance_loop(CLEAR_REQUEST_INSTANCE_INTERVAL)) self.port_count = 0 - if hasattr(self, "deployment_mode") and self.deployment_mode == DeploymentMode.GLOBAL: + if hasattr(self, "launch_mode") and self.launch_mode == LaunchMode.GLOBAL: assert self.entrypoints_args is not None and self.engine_args is not None self.last_timeout_instance_id = None asyncio.create_task(self._auto_scale_up_loop(AUTO_SCALE_UP_INTERVAL)) @@ -499,7 +499,7 @@ def from_args(cls, manager_args: ManagerArgs, entrypoints_args: EntrypointsArgs = None, engine_args = None, - deployment_args: DeploymentArgs = None, + launch_args: LaunchArgs = None, ) -> "Manager": manager_class = ray.remote(num_cpus=1, max_restarts=-1, @@ -510,7 +510,7 @@ def from_args(cls, os.getcwd(), entrypoints_args, engine_args, - deployment_args) + launch_args) return manager diff --git a/tests/e2e_test/test_bench.py b/tests/e2e_test/test_bench.py index 21d35f19..5567db1a 100644 --- a/tests/e2e_test/test_bench.py +++ b/tests/e2e_test/test_bench.py @@ -64,12 +64,12 @@ def get_markdown_data(key: str, head_name: str): @pytest.mark.asyncio @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="at least 1 gpus required for simple benchmark") @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B']) -@pytest.mark.parametrize("deployment_mode", ['global', 'local']) -async def 
test_simple_benchmark(ray_env, shutdown_llumnix_service, model, deployment_mode): +@pytest.mark.parametrize("launch_mode", ['global', 'local']) +async def test_simple_benchmark(ray_env, shutdown_llumnix_service, model, launch_mode): ip = "127.0.0.1" base_port = 37037 ip_ports = [] - if deployment_mode == 'local': + if launch_mode == 'local': device_count = torch.cuda.device_count() for i in range(device_count): port = base_port+i @@ -129,7 +129,7 @@ def run_bench_command(command): process.kill() assert False, "bench_test timed out after {} minutes.".format(BENCH_TEST_TIMEOUT_MINS) - if deployment_mode == 'local': + if launch_mode == 'local': with open("performance.txt", "w", encoding="utf-8") as f: f.write(parse_log_file()) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 5c4f74b9..8b00b630 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -19,7 +19,7 @@ from vllm import EngineArgs -from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, DeploymentArgs +from llumnix.arg_utils import ManagerArgs, EntrypointsArgs, LaunchArgs from llumnix.manager import Manager from llumnix.instance_info import InstanceInfo from llumnix.server_info import ServerInfo @@ -28,7 +28,7 @@ from llumnix.backends.vllm.simulator import BackendSimVLLM from llumnix.backends.backend_interface import BackendType from llumnix.backends.profiling import LatencyMemData -from llumnix.entrypoints.utils import DeploymentMode +from llumnix.entrypoints.utils import LaunchMode from llumnix.utils import (get_placement_group_name, get_server_name, get_instance_name, remove_placement_group, INSTANCE_NAME_PREFIX, kill_server, kill_instance, random_uuid, get_manager_name) @@ -116,17 +116,17 @@ def init_manager(): ray.get(manager.is_ready.remote()) return manager -def init_manager_with_deployment_mode(deployment_mode, request_output_queue_type="rayqueue"): +def init_manager_with_launch_mode(launch_mode, request_output_queue_type="rayqueue"): manager_args = ManagerArgs(migration_backend="rayrpc", enable_port_increment=True) entrypoints_args = EntrypointsArgs(host="127.0.0.1", port=8000, request_output_queue_type=request_output_queue_type) engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - deployment_args = DeploymentArgs(deployment_mode=deployment_mode, backend_type=BackendType.VLLM) + launch_args = LaunchArgs(launch_mode=launch_mode, backend_type=BackendType.VLLM) manager = Manager.from_args(manager_args=manager_args, entrypoints_args=entrypoints_args, engine_args=engine_args, - deployment_args=deployment_args) + launch_args=launch_args) ray.get(manager.is_ready.remote()) - return manager, manager_args, entrypoints_args, engine_args, deployment_args + return manager, manager_args, entrypoints_args, engine_args, launch_args def init_instances(initial_instances): instance_ids = [] @@ -315,7 +315,7 @@ def test_update_instance_info_loop_and_migrate(ray_env, manager): assert num_migrate_in == 0 and num_migrate_out == 0 def test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): - manager, _, _, engine_args, _ = init_manager_with_deployment_mode(DeploymentMode.LOCAL) + manager, _, _, engine_args, _ = init_manager_with_launch_mode(LaunchMode.LOCAL) instance_id = random_uuid() pg = ray.get(manager._init_placement_group.remote(get_placement_group_name(instance_id), engine_args, BackendType.VLLM, init_server=True)) @@ -344,7 +344,7 @@ def 
test_init_server_and_instance_and_clear_instance_ray_resources(ray_env): @pytest.mark.parametrize("request_output_queue_type", ['rayqueue', 'zmq']) def test_auto_scale_up_loop_and_get_curr_deployment(ray_env, request_output_queue_type): - manager, _, _, _, _ = init_manager_with_deployment_mode(DeploymentMode.GLOBAL, request_output_queue_type) + manager, _, _, _, _ = init_manager_with_launch_mode(LaunchMode.GLOBAL, request_output_queue_type) time.sleep(30.0) num_instances = ray.get(manager.scale_up.remote([], [])) assert num_instances == 4 @@ -365,7 +365,7 @@ def test_auto_scale_up_loop_and_get_curr_deployment(ray_env, request_output_queu @pytest.mark.parametrize("request_output_queue_type", ['rayqueue', 'zmq']) def test_check_deployment_states_loop_and_auto_scale_up_loop(ray_env, request_output_queue_type): - manager, _, _, _, _ = init_manager_with_deployment_mode(DeploymentMode.GLOBAL, request_output_queue_type) + manager, _, _, _, _ = init_manager_with_launch_mode(LaunchMode.GLOBAL, request_output_queue_type) time.sleep(30.0) num_instances = ray.get(manager.scale_up.remote([], [])) assert num_instances == 4 From 3ef7ca045ca829d1f6b6a96ee111617a71bc451f Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 10:54:24 +0000 Subject: [PATCH 87/92] Support port_offset kv store --- llumnix/manager.py | 14 +++++++++----- llumnix/utils.py | 31 +++++++++++++++++++++++++++++++ test.py | 19 +++++++++++++++++++ 3 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 test.py diff --git a/llumnix/manager.py b/llumnix/manager.py index 496edbac..51ac4cb0 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -37,7 +37,8 @@ get_instance_name, get_manager_name, INSTANCE_NAME_PREFIX, SERVER_NAME_PREFIX, get_placement_group_name, run_async_func_sync, kill_server, kill_instance, initialize_placement_group, - get_server_name) + get_server_name, get_actor_data_from_ray_internal_kv, + put_actor_data_to_ray_internal_kv) from llumnix.entrypoints.utils import LaunchMode from llumnix.backends.utils import get_engine_world_size from llumnix.queue.queue_type import QueueType @@ -131,7 +132,8 @@ def __init__(self, asyncio.create_task(self._update_instance_info_loop(self.polling_interval)) asyncio.create_task(self._clear_request_instance_loop(CLEAR_REQUEST_INSTANCE_INTERVAL)) - self.port_count = 0 + value = get_actor_data_from_ray_internal_kv("manager", "port_offset") + self.port_offset = 0 if value is None else int(value) if hasattr(self, "launch_mode") and self.launch_mode == LaunchMode.GLOBAL: assert self.entrypoints_args is not None and self.engine_args is not None self.last_timeout_instance_id = None @@ -539,10 +541,12 @@ def _init_server(self, placement_group: PlacementGroup, entrypoints_args: EntrypointsArgs) -> FastAPIServerActor: entrypoints_args = copy.deepcopy(entrypoints_args) + # TODO(s5u13b): Temporary workaround for port conflict, will be failed when manager restarts. 
if self.manager_args.enable_port_increment: - entrypoints_args.port += self.port_count - entrypoints_args.request_output_queue_port += self.port_count - self.port_count += 1 + entrypoints_args.port += self.port_offset + entrypoints_args.request_output_queue_port += self.port_offset + self.port_offset += 1 + put_actor_data_to_ray_internal_kv("manager", "port_offset", self.port_offset) fastapi_server = FastAPIServerActor.from_args(server_name, placement_group, entrypoints_args) return fastapi_server diff --git a/llumnix/utils.py b/llumnix/utils.py index 44c18976..23bfd2a5 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -14,8 +14,16 @@ import uuid import asyncio import threading +from typing import Any, Union import ray from ray.util.placement_group import PlacementGroup +import ray.cloudpickle as pickle +from ray.experimental.internal_kv import ( + _internal_kv_del, + _internal_kv_get, + _internal_kv_initialized, + _internal_kv_put, +) from llumnix.logger import init_logger @@ -150,3 +158,26 @@ def run_task(): thread = threading.Thread(target=run_task) thread.start() thread.join() + +def _make_key(actor_name: str, data_name: str): + """Generate a binary key for the given actor name and data. + + Args: + actor_name: The name of the actor + data_name: The data member of the actor + + Returns: + The key to use for storing the value. + """ + return (actor_name.encode("ascii") + b"." + data_name.encode("ascii")) + +def get_actor_data_from_ray_internal_kv(actor_name: str, data_name: str) -> Union[str, None]: + value = None + if _internal_kv_initialized(): + value = _internal_kv_get(_make_key(actor_name, data_name)) + print(f"value: {value}") + return value if value is None else value.decode() + +def put_actor_data_to_ray_internal_kv(actor_name: str, data_name: str, value: Any): + if _internal_kv_initialized(): + _internal_kv_put(_make_key(actor_name, data_name), f"{value}".encode(), overwrite=True) diff --git a/test.py b/test.py new file mode 100644 index 00000000..a06586b0 --- /dev/null +++ b/test.py @@ -0,0 +1,19 @@ +import ray + +from llumnix.utils import get_actor_data_from_ray_internal_kv, put_actor_data_to_ray_internal_kv + + +@ray.remote(num_cpus=1) +class Manager: + def __init__(self): + value = get_actor_data_from_ray_internal_kv("manager", "port_offset") + self.port_offset = 0 if value is None else int(value) + print("[__init__] self.port_offset: {}".format(self.port_offset)) + + def put(self): + self.port_offset = 2 + put_actor_data_to_ray_internal_kv("manager", "port_offset", self.port_offset) + print("[put] self.port_offset: {}".format(self.port_offset)) + +manager = Manager.remote() +ray.get(manager.put.remote()) From 99ee5617c884497b64def3a528b30174bdd895f0 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 10:56:15 +0000 Subject: [PATCH 88/92] Fix lint --- llumnix/utils.py | 2 -- tests/unit_test/global_scheduler/test_manager.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llumnix/utils.py b/llumnix/utils.py index 23bfd2a5..675d0d69 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -17,9 +17,7 @@ from typing import Any, Union import ray from ray.util.placement_group import PlacementGroup -import ray.cloudpickle as pickle from ray.experimental.internal_kv import ( - _internal_kv_del, _internal_kv_get, _internal_kv_initialized, _internal_kv_put, ) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 8b00b630..7140120d 100644 ---
a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -182,7 +182,7 @@ def test_init_llumlet(ray_env, llumlet): def test_init_instances(ray_env, manager): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + _, _ = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances @@ -192,7 +192,7 @@ def test_init_instances_sim(ray_env, manager): import llumnix.backends.vllm.simulator llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - instance_ids, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + _, _ = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances From 5dcefb9a8136d4a943888dd6816a56bd96d2028e Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 10:56:53 +0000 Subject: [PATCH 89/92] Minor --- llumnix/manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llumnix/manager.py b/llumnix/manager.py index 51ac4cb0..69cd3cad 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -541,7 +541,6 @@ def _init_server(self, placement_group: PlacementGroup, entrypoints_args: EntrypointsArgs) -> FastAPIServerActor: entrypoints_args = copy.deepcopy(entrypoints_args) - # TODO(s5u13b): Temporary workaround for port conflict, will be failed when manager restarts. if self.manager_args.enable_port_increment: entrypoints_args.port += self.port_offset entrypoints_args.request_output_queue_port += self.port_offset From a27f9e82c2fa8b02435335ec729362a6f7e6426d Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 11:03:06 +0000 Subject: [PATCH 90/92] Refine key value store function log --- llumnix/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llumnix/utils.py b/llumnix/utils.py index 675d0d69..08769ca8 100644 --- a/llumnix/utils.py +++ b/llumnix/utils.py @@ -173,9 +173,12 @@ def get_actor_data_from_ray_internal_kv(actor_name: str, data_name: str) -> Unio value = None if _internal_kv_initialized(): value = _internal_kv_get(_make_key(actor_name, data_name)) - print(f"value: {value}") - return value if value is None else value.decode() + if value is not None: + value = value.decode() + logger.info("get {}.{} from ray internal key value store, value: {}".format(actor_name, data_name, value)) + return value def put_actor_data_to_ray_internal_kv(actor_name: str, data_name: str, value: Any): if _internal_kv_initialized(): _internal_kv_put(_make_key(actor_name, data_name), f"{value}".encode(), overwrite=True) + logger.debug("put {}.{} to ray internal key value store, value: {}".format(actor_name, data_name, value)) From fb1b841e26afbe45b578fcd5a825611b92de740f Mon Sep 17 00:00:00 2001 From: s5u13b Date: Thu, 9 Jan 2025 11:12:28 +0000 Subject: [PATCH 91/92] Fix manager unit test --- test.py | 19 ------------------- .../global_scheduler/test_manager.py | 6 ++++-- 2 files changed, 4 insertions(+), 21 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index a06586b0..00000000 --- a/test.py +++ /dev/null @@ -1,19 +0,0 @@ -import ray - -from
llumnix.utils import get_actor_data_from_ray_internal_kv, put_actor_data_to_ray_internal_kv - - -@ray.remote(num_cpus=1) -class Manager: - def __init__(self): - value = get_actor_data_from_ray_internal_kv("manager", "port_offset") - self.port_offset = 0 if value is None else int(value) - print("[__init__] self.port_offset: {}".format(self.port_offset)) - - def put(self): - self.port_offset = 2 - put_actor_data_to_ray_internal_kv("manager", "port_offset", self.port_offset) - print("[put] self.port_offset: {}".format(self.port_offset)) - -manager = Manager.remote() -ray.get(manager.put.remote()) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index 7140120d..a2dbcf89 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -182,7 +182,8 @@ def test_init_llumlet(ray_env, llumlet): def test_init_instances(ray_env, manager): engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - _, _ = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + num_instances = len(instances) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances @@ -192,7 +193,8 @@ def test_init_instances_sim(ray_env, manager): import llumnix.backends.vllm.simulator llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - _, _ = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + num_instances = len(instances) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances From 8d8984dba9e97a5f0e7897d446dd8d29d25d5168 Mon Sep 17 00:00:00 2001 From: s5u13b Date: Fri, 10 Jan 2025 02:20:02 +0000 Subject: [PATCH 92/92] Refine simulator mode --- docs/Arguments.md | 4 +++ llumnix/arg_utils.py | 7 +++++ llumnix/backends/backend_interface.py | 5 +++ llumnix/backends/profiling.py | 3 +- llumnix/backends/utils.py | 31 +++++++++---------- llumnix/config/default.py | 2 ++ llumnix/entrypoints/bladellm/api_server.py | 3 +- llumnix/entrypoints/vllm/api_server.py | 3 +- llumnix/entrypoints/vllm/serve.py | 1 + llumnix/llumlet/llumlet.py | 2 +- llumnix/manager.py | 3 +- .../global_scheduler/test_manager.py | 2 +- 12 files changed, 43 insertions(+), 23 deletions(-) diff --git a/docs/Arguments.md b/docs/Arguments.md index 4374f3b2..f5be9210 100644 --- a/docs/Arguments.md +++ b/docs/Arguments.md @@ -38,6 +38,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] [--disable-log-requests-manager] [--log-instance-info] [--log-filename LOG_FILENAME] + [--simulator-mode] [--profiling-result-file-path PROFILING_RESULT_FILE_PATH] [--gpu-type GPU_TYPE] [--migration-backend {gloo,nccl,rayrpc,grpc,kvtransfer}] @@ -181,6 +182,9 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h] - Log filename. - Default: "server.log" +`--simulator-mode` +- Enable simulator mode. + `--profiling-result-file-path` - Profiling result file path when using simulator. 
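To make the new flag pairing concrete, here is a minimal, self-contained sketch of the validation rule that PATCH 92 enforces (the parser wiring below is simplified for illustration and is not the project's `LlumnixArgumentParser`):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--simulator-mode', action='store_true')
parser.add_argument('--profiling-result-file-path', type=str, default=None)

args = parser.parse_args(['--simulator-mode', '--profiling-result-file-path', '/tmp/profiling.pkl'])

# Simulator mode replays a profiling result, so the file path must accompany the flag.
assert not args.simulator_mode or args.profiling_result_file_path is not None
```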
- Default: None diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py index c882914a..c3a7d9ff 100644 --- a/llumnix/arg_utils.py +++ b/llumnix/arg_utils.py @@ -136,6 +136,7 @@ class ManagerArgs: disable_log_requests_manager: bool = None log_instance_info: bool = None log_filename: str = None + simulator_mode: bool = None profiling_result_file_path: str = None migration_backend: str = None @@ -218,6 +219,9 @@ def check_args(cls, args: 'ManagerArgs', parser: argparse.ArgumentParser): ("When using kvTransfer as migration backend, " "do not set --migration-backend-transfer-type as empty.") + assert not args.simulator_mode or args.profiling_result_file_path is not None, \ + "Set profiling_result_file_path args when enable simulator mode" + @staticmethod def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.add_argument('--initial-instances', @@ -309,6 +313,9 @@ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.add_argument('--profiling-result-file-path', type=str, help='profiling result file path when using simulator') + parser.add_argument('--simulator-mode', + action='store_true', + help='enable simulator mode') parser.add_argument('--migration-backend', type=str, choices=['gloo','nccl','rayrpc','grpc','kvtransfer'], diff --git a/llumnix/backends/backend_interface.py b/llumnix/backends/backend_interface.py index 257c2189..d8631b84 100644 --- a/llumnix/backends/backend_interface.py +++ b/llumnix/backends/backend_interface.py @@ -29,6 +29,11 @@ class EngineState(str, Enum): class BackendType(str, Enum): VLLM = "VLLM" BLADELLM = "BLADELLM" + SIM_VLLM = "SIM_VLLM" + + @staticmethod + def is_sim_backend(status: "BackendType") -> bool: + return status in [BackendType.SIM_VLLM] class BackendInterface(ABC): diff --git a/llumnix/backends/profiling.py b/llumnix/backends/profiling.py index cf21fcc4..b79afcc1 100644 --- a/llumnix/backends/profiling.py +++ b/llumnix/backends/profiling.py @@ -178,7 +178,8 @@ def model_decode(x, a, b, c): return a * bs + b * tot_seq_len + c def get_latency_mem(backend_type: BackendType, profiling_database: ProfilingDatabase, **backend_args): - if backend_type == BackendType.VLLM: + assert BackendType.is_sim_backend(backend_type) + if backend_type == BackendType.SIM_VLLM: # TODO(ZeldaHuang): support multi-lora, more device, vision language model model_config = backend_args.get("model_config") _ = backend_args.get("cache_config") diff --git a/llumnix/backends/utils.py b/llumnix/backends/utils.py index 501d8a54..8659c016 100644 --- a/llumnix/backends/utils.py +++ b/llumnix/backends/utils.py @@ -71,22 +71,12 @@ def init_backend_engine(instance_id: str, profiling_result_file_path: str = None) -> BackendInterface: if backend_type == BackendType.VLLM: # pylint: disable=import-outside-toplevel - if profiling_result_file_path is None: - from llumnix.backends.vllm.llm_engine import BackendVLLM - backend_engine = BackendVLLM(instance_id, - placement_group, - request_output_queue_type, - migration_config, - engine_args) - else: - # pylint: disable=import-outside-toplevel - from llumnix.backends.vllm.simulator import BackendSimVLLM - backend_engine = BackendSimVLLM(instance_id, - placement_group, - request_output_queue_type, - migration_config, - engine_args, - profiling_result_file_path) + from llumnix.backends.vllm.llm_engine import BackendVLLM + backend_engine = BackendVLLM(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args) elif backend_type == BackendType.BLADELLM: # 
pylint: disable=import-outside-toplevel from llumnix.backends.bladellm.llm_engine import BackendBladeLLM @@ -95,6 +85,15 @@ def init_backend_engine(instance_id: str, request_output_queue_type, migration_config, engine_args) + elif backend_type == BackendType.SIM_VLLM: + # pylint: disable=import-outside-toplevel + from llumnix.backends.vllm.simulator import BackendSimVLLM + backend_engine = BackendSimVLLM(instance_id, + placement_group, + request_output_queue_type, + migration_config, + engine_args, + profiling_result_file_path) else: raise ValueError(f'Unsupported backend: {backend_type}') return backend_engine diff --git a/llumnix/config/default.py b/llumnix/config/default.py index 078607b4..ec6d060e 100644 --- a/llumnix/config/default.py +++ b/llumnix/config/default.py @@ -69,6 +69,8 @@ _C.MANAGER.LOG_INSTANCE_INFO = False # Log filename _C.MANAGER.LOG_FILENAME = "server.log" +# Enable simulator mode +_C.MANAGER.SIMULATOR_MODE = False # Profiling result file path when using simulator _C.MANAGER.PROFILING_RESULT_FILE_PATH = None # Enable port increment when deploying multiple servers diff --git a/llumnix/entrypoints/bladellm/api_server.py b/llumnix/entrypoints/bladellm/api_server.py index 56a563b8..537798f5 100644 --- a/llumnix/entrypoints/bladellm/api_server.py +++ b/llumnix/entrypoints/bladellm/api_server.py @@ -32,7 +32,8 @@ def setup_llumnix_api_server(bladellm_args: ServingArgs, loop: asyncio.AbstractE llumnix_config = get_llumnix_config(bladellm_args.llumnix_config) entrypoints_args, manager_args, engine_args = get_args(llumnix_config, llumnix_parser, bladellm_args) - launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM) + assert not manager_args.simulator_mode, "Simulator mode is only supported for vLLM." + launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.BLADELLM) setup_ray_cluster(entrypoints_args) diff --git a/llumnix/entrypoints/vllm/api_server.py b/llumnix/entrypoints/vllm/api_server.py index d297dcd9..a1e1b955 100644 --- a/llumnix/entrypoints/vllm/api_server.py +++ b/llumnix/entrypoints/vllm/api_server.py @@ -182,7 +182,8 @@ async def is_ready() -> bool: cfg = get_llumnix_config(cli_args.config_file, cli_args) entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args) - launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=BackendType.VLLM) + backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM + launch_args = LaunchArgs(launch_mode=LaunchMode.LOCAL, backend_type=backend_type) # Launch or connect to the ray cluster for multi-node serving. setup_ray_cluster(entrypoints_args) diff --git a/llumnix/entrypoints/vllm/serve.py b/llumnix/entrypoints/vllm/serve.py index fc865403..a73f1ce9 100644 --- a/llumnix/entrypoints/vllm/serve.py +++ b/llumnix/entrypoints/vllm/serve.py @@ -25,6 +25,7 @@ cfg = get_llumnix_config(cli_args.config_file, cli_args) entrypoints_args, manager_args, engine_args = get_args(cfg, parser, cli_args) + backend_type = BackendType.VLLM if not manager_args.simulator_mode else BackendType.SIM_VLLM - launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=BackendType.VLLM) + launch_args = LaunchArgs(launch_mode=LaunchMode.GLOBAL, backend_type=backend_type) # Assume that there is an existing ray cluster when using centralized deployment.
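The entrypoint changes above reduce to one selection rule; the runnable sketch below restates it with the enum exactly as added in `backend_interface.py` (the `pick_backend_type` helper is hypothetical, for illustration only):

```python
from enum import Enum

class BackendType(str, Enum):
    VLLM = "VLLM"
    BLADELLM = "BLADELLM"
    SIM_VLLM = "SIM_VLLM"

    @staticmethod
    def is_sim_backend(status: "BackendType") -> bool:
        return status in [BackendType.SIM_VLLM]

def pick_backend_type(simulator_mode: bool) -> BackendType:
    # Mirrors the expression used by the vLLM api_server and serve entrypoints.
    return BackendType.VLLM if not simulator_mode else BackendType.SIM_VLLM

assert BackendType.is_sim_backend(pick_backend_type(True))
assert not BackendType.is_sim_backend(pick_backend_type(False))
```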
diff --git a/llumnix/llumlet/llumlet.py b/llumnix/llumlet/llumlet.py index 378e717a..85c14ba1 100644 --- a/llumnix/llumlet/llumlet.py +++ b/llumnix/llumlet/llumlet.py @@ -81,7 +81,7 @@ def from_args(cls, engine_args, profiling_result_file_path: str = None): try: - assert backend_type in [backend_type.VLLM, backend_type.BLADELLM], \ + assert backend_type in [backend_type.VLLM, backend_type.BLADELLM, backend_type.SIM_VLLM], \ f'unimplemented backend {backend_type}' num_gpus = 0 if backend_type == backend_type.BLADELLM: diff --git a/llumnix/manager.py b/llumnix/manager.py index 69cd3cad..8b0d63f4 100644 --- a/llumnix/manager.py +++ b/llumnix/manager.py @@ -522,14 +522,13 @@ def _init_placement_group(self, backend_type: BackendType, init_server: bool = False, block: bool = True) -> PlacementGroup: - if not self.manager_args.profiling_result_file_path: + if not BackendType.is_sim_backend(backend_type): # num_cpus=3, for Llumlet + AsyncPutQueueActor + ProxyActor # num_gpus=world_size, for world_size Workers world_size = get_engine_world_size(engine_args, backend_type) placement_group = initialize_placement_group(placement_group_name, num_cpus=3+int(init_server), num_gpus=world_size, detached=True, block=block) else: - assert backend_type == backend_type.VLLM, "Only support the simulator backend for vLLM." # num_cpus=1, for Llumlet + AsyncPutQueueActor placement_group = initialize_placement_group(placement_group_name, num_cpus=2+int(init_server), num_gpus=0, detached=True, block=block) diff --git a/tests/unit_test/global_scheduler/test_manager.py b/tests/unit_test/global_scheduler/test_manager.py index a2dbcf89..518424a2 100644 --- a/tests/unit_test/global_scheduler/test_manager.py +++ b/tests/unit_test/global_scheduler/test_manager.py @@ -193,7 +193,7 @@ def test_init_instances_sim(ray_env, manager): import llumnix.backends.vllm.simulator llumnix.backends.vllm.simulator.BackendSimVLLM = MockBackendSim engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True) - _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.VLLM, engine_args)) + _, instances = ray.get(manager.init_instances.remote(QueueType("rayqueue"), BackendType.SIM_VLLM, engine_args)) num_instances = len(instances) manager_args = ManagerArgs() assert num_instances == manager_args.initial_instances
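For reference, a minimal usage sketch of the Ray internal key-value mechanism that PATCH 87 builds on, written directly against `ray.experimental.internal_kv` (the `manager.port_offset` key layout is assumed to follow `_make_key`; run inside an initialized Ray session):

```python
import ray
from ray.experimental.internal_kv import (
    _internal_kv_get,
    _internal_kv_initialized,
    _internal_kv_put,
)

ray.init()

# Key layout assumed to match _make_key("manager", "port_offset").
key = b"manager.port_offset"

if _internal_kv_initialized():
    # Persist the offset so a restarted manager resumes from the same port base.
    _internal_kv_put(key, b"2", overwrite=True)
    raw = _internal_kv_get(key)
    port_offset = 0 if raw is None else int(raw.decode())
    assert port_offset == 2

ray.shutdown()
```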