From 88074ea8f68808350b610c1e1b9b79c43c9ae817 Mon Sep 17 00:00:00 2001
From: Ruihang Lai
Date: Tue, 17 Dec 2024 10:28:05 +0800
Subject: [PATCH] [Bench] Add "per-gpu-workload" mode (#3068)

This PR introduces the per-gpu-workload mode to MLC bench. Under this mode,
the specified "num_concurrent_requests" and "request_rate" denote the
workload **per GPU**, which means the overall workload of the entire serving
system during benchmarking will be multiplied by the number of GPUs.

Meanwhile, this PR deprecates the argument `--testset-name` in favor of
`--dataset-path` for the Loogle dataset.
---
 python/mlc_llm/bench/__main__.py          | 36 +++++++++++++----------
 python/mlc_llm/bench/dataset.py           |  5 ++--
 python/mlc_llm/bench/request_processor.py | 13 +++++---
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/python/mlc_llm/bench/__main__.py b/python/mlc_llm/bench/__main__.py
index 3d76438e21..75ccc5bb90 100644
--- a/python/mlc_llm/bench/__main__.py
+++ b/python/mlc_llm/bench/__main__.py
@@ -101,17 +101,18 @@ def run_pipeline(
         args.output_len_std,
     )
     request_records = pipeline(request_records)
-    assert len(request_records) == args.num_requests * args.num_gpus
-    sorted_requests: List[RequestRecord] = [None] * args.num_requests * args.num_gpus
+    num_total_requests = (
+        args.num_requests if not args.per_gpu_workload else args.num_requests * args.num_gpus
+    )
+    assert len(request_records) == num_total_requests
+    sorted_requests: List[RequestRecord] = [None] * num_total_requests
     for request_record in request_records:
         assert request_record.request_id is not None
         assert sorted_requests[request_record.request_id] is None
         sorted_requests[request_record.request_id] = request_record
     request_records = MetricAnalyzer(tokenizer)(request_records)
-    report = generate_metrics_summary(
-        request_records, args.num_requests * args.num_gpus, args.num_gpus
-    )
+    report = generate_metrics_summary(request_records, num_total_requests, args.num_gpus)
     return report, sorted_requests
@@ -221,6 +222,15 @@
         help="The number of requests for warmup. "
         "It is optional when fixing the number of concurrent requests, and is required otherwise.",
     )
+    parser.add_argument(
+        "--per-gpu-workload",
+        default=False,
+        action="store_true",
+        help='When set to True, the specified "num_concurrent_requests"/"request_rate" '
+        "denote the workload **per GPU**, which means that the real values of "
+        '"num_concurrent_requests"/"request_rate" used in the benchmark '
+        'will be multiplied by "num_gpus".',
+    )
     parser.add_argument(
         "--num-concurrent-requests",
         type=_parse_num_concurrent_requests,
@@ -354,13 +364,6 @@
         type=_parse_mlc_engine_config,
         help="The engine config used when launch MLC server.",
     )
-    parser.add_argument(
-        "--output",
-        "-o",
-        type=str,
-        default="mlc_benchmark.csv",
-        help="The path of the output file where to dump the benchmark results.",
-    )
     parser.add_argument(
         "--cuda-profile",
         default=False,
         action="store_true",
@@ -378,13 +381,16 @@
         "--multi-round",
         default=False,
         action="store_true",
-        help="Whether to chat like mulit round conversion with history log each request. "
+        help="Whether to chat like multi-round conversation with history log each request. "
         "Only enabled when benchmarked with fixed concurrent request mode."
        "The --num-concurrent-requests should be provided when enabling this option.",
     )
     parser.add_argument(
-        "--testset-name", type=str, help="The name of the testset. Only used for Loogle dataset"
+        "--output",
+        "-o",
+        type=str,
+        default="mlc_benchmark.csv",
+        help="The path of the output file where to dump the benchmark results.",
     )
 
     main(parser.parse_args())
diff --git a/python/mlc_llm/bench/dataset.py b/python/mlc_llm/bench/dataset.py
index 3cdd26f934..a6e1042f3e 100644
--- a/python/mlc_llm/bench/dataset.py
+++ b/python/mlc_llm/bench/dataset.py
@@ -174,12 +174,11 @@ class LoogleDataset(Dataset):  # pylint: disable=too-few-public-methods
     # pylint: enable=line-too-long
     require_fake_warmup: bool = True
 
-    def __init__(self, tokenizer: AutoTokenizer, testset_name) -> None:
+    def __init__(self, tokenizer: AutoTokenizer, testset_name: str) -> None:
         raw_dataset = load_dataset("bigainlco/LooGLE", testset_name, split="test")
         self.tokenizer = tokenizer
         self.dataset = []
         self.prompt_format = self.task2prompt[testset_name]
-        # self.max_gen = self.task2maxlen[testset_name]
         prompts = []
         generate_lens = []
         questions = []
@@ -806,7 +805,7 @@ def create_dataset(args: argparse.Namespace, tokenizer: AutoTokenizer) -> "Datas
         assert (
             args.apply_chat_template is False
         ), "Loogle dataset does not support applying chat template"
-        return LoogleDataset(tokenizer, args.testset_name)
+        return LoogleDataset(tokenizer, testset_name=args.dataset_path)
     if args.dataset == "react":
         assert (
             args.apply_chat_template is False
diff --git a/python/mlc_llm/bench/request_processor.py b/python/mlc_llm/bench/request_processor.py
index 9c2c798058..70dd9b3bcc 100644
--- a/python/mlc_llm/bench/request_processor.py
+++ b/python/mlc_llm/bench/request_processor.py
@@ -622,22 +622,27 @@ def create_pipelines(
         "Please specify the number of warmup requests via "
         '"--num-warmup-requests" when fixing request rate.'
     )
+    num_total_requests = int(
+        args.num_requests if not args.per_gpu_workload else args.num_requests * args.num_gpus
+    )
     if dataset.require_fake_warmup:
-        num_samples = int(args.num_requests * args.num_gpus)
+        num_samples = num_total_requests
     else:
-        num_samples = int(args.num_requests * args.num_gpus) + args.num_warmup_requests
+        num_samples = num_total_requests + args.num_warmup_requests
     return [
         SequentialProcessor(
             LogMessage(f"Fixing request rate: {request_rate}"),
             SampleRequests(num_samples),
             AttachModelName(args.tokenizer),
-            AttachRequestRateTimestamp(request_rate * args.num_gpus),
+            AttachRequestRateTimestamp(
+                request_rate if not args.per_gpu_workload else request_rate * args.num_gpus
+            ),
             AttachStreamFlag(args.stream),
             AttachSamplingOptions(args.temperature, args.top_p, args.ignore_eos),
             AttachExecutionFeature({"request_rate": float(request_rate)}),
             WarmupAndRun(
                 num_warmup_requests=args.num_warmup_requests,
-                num_benchmark_requests=int(args.num_requests * args.num_gpus),
+                num_benchmark_requests=num_total_requests,
                 pipeline=FixTimestampExecutor(
                     f_create_api_endpoint,
                     args.num_process_workers,
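
A minimal sketch of the scaling rule that the new "--per-gpu-workload" flag applies across the hunks above. The helper name "scale_workload" is hypothetical and does not exist in mlc_llm; it only mirrors the conditional expressions this patch adds: when the flag is set, "num_requests" and "request_rate" are read as per-GPU values and multiplied by "num_gpus", otherwise they are used as-is.

from typing import Tuple


def scale_workload(
    num_requests: int, request_rate: float, num_gpus: int, per_gpu_workload: bool
) -> Tuple[int, float]:
    """Return the total request count and total request rate seen by the serving system."""
    if not per_gpu_workload:
        # Default behavior: the given values already describe the whole system.
        return num_requests, request_rate
    # Per-GPU mode: the given values describe a single GPU, so the totals used
    # by the benchmark are multiplied by the number of GPUs.
    return num_requests * num_gpus, request_rate * num_gpus


# Example: 64 requests at rate 2.0 per GPU on 4 GPUs -> 256 requests at rate 8.0 overall.
assert scale_workload(64, 2.0, 4, per_gpu_workload=True) == (256, 8.0)
assert scale_workload(64, 2.0, 4, per_gpu_workload=False) == (64, 2.0)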