From d8f0d7e580c60f733a7c3fdf7e2cc7c457a24899 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 31 Oct 2024 14:55:24 +0000 Subject: [PATCH 01/21] Update generate_final_report.py --- tools/submission/generate_final_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index ba2c368cd..79d9fe076 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -79,7 +79,7 @@ def main(): df["p#"] = df.apply(lambda x: int(x["host_processors_per_node"]), axis=1) # details url - base_url = f"https://github.com/mlcommons/{args.repository}/tree/main" + base_url = f"https://github.com/{args.repository_owner}/{args.repository}/tree/{args.repository_branch}" df["Details"] = df.apply( lambda x: '=HYPERLINK("{}","details")'.format( "/".join( From 6b1a0f87f46288d7b4b487f89e18f3151422694c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 11:54:48 +0000 Subject: [PATCH 02/21] Fix sdxl (#1911) * Fix typo in fid_score.py, fail_safe for SDXL short runs * [Automated Commit] Format Codebase * Fix typo in fid_score.py, fail_safe for SDXL short runs * Fix dlrmv2 reference implementation | Update run_local.sh --- recommendation/dlrm_v2/pytorch/run_local.sh | 4 +++- text_to_image/coco.py | 24 ++++++++++++--------- text_to_image/tools/fid/fid_score.py | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/recommendation/dlrm_v2/pytorch/run_local.sh b/recommendation/dlrm_v2/pytorch/run_local.sh index 0d054c6c4..3bc8ec667 100755 --- a/recommendation/dlrm_v2/pytorch/run_local.sh +++ b/recommendation/dlrm_v2/pytorch/run_local.sh @@ -2,7 +2,9 @@ source ./run_common.sh -common_opt="--mlperf_conf ../../../mlperf.conf" +#mlperf.conf is now automatically loaded by loadgen +#common_opt="--mlperf_conf ../../../mlperf.conf" + OUTPUT_DIR=`pwd`/output/$name if [ ! -d $OUTPUT_DIR ]; then mkdir -p $OUTPUT_DIR diff --git a/text_to_image/coco.py b/text_to_image/coco.py index cb3956a01..e9499b0e6 100644 --- a/text_to_image/coco.py +++ b/text_to_image/coco.py @@ -176,20 +176,24 @@ def __call__(self, results, ids, expected=None, result_dict=None): def save_images(self, ids, ds): info = [] idx = {} - for i, id in enumerate(self.content_ids): - if id in ids: - idx[id] = i + for i, image_id in enumerate(self.content_ids): + if image_id in ids: + idx[image_id] = i if not os.path.exists("images/"): os.makedirs("images/", exist_ok=True) - for id in ids: - caption = ds.get_caption(id) - generated = Image.fromarray(self.results[idx[id]]) - image_path_tmp = f"images/{self.content_ids[idx[id]]}.png" + for image_id in ids: + if not idx.get(image_id): + print( + f"image id {image_id} is missing in the results. 
Hence not saved.") + continue + caption = ds.get_caption(image_id) + generated = Image.fromarray(self.results[idx[image_id]]) + image_path_tmp = f"images/{self.content_ids[idx[image_id]]}.png" generated.save(image_path_tmp) - info.append((self.content_ids[idx[id]], caption)) + info.append((self.content_ids[idx[image_id]], caption)) with open("images/captions.txt", "w+") as f: - for id, caption in info: - f.write(f"{id} {caption}\n") + for image_id, caption in info: + f.write(f"{image_id} {caption}\n") def start(self): self.results = [] diff --git a/text_to_image/tools/fid/fid_score.py b/text_to_image/tools/fid/fid_score.py index febc12ff5..8e486c8b7 100644 --- a/text_to_image/tools/fid/fid_score.py +++ b/text_to_image/tools/fid/fid_score.py @@ -44,7 +44,7 @@ import pathlib import os import sys -sys.path.insert("..", 0) +sys.path.insert(0, "..") from inception import InceptionV3 # noqa: E402 From a4ba51fb2244f2efc703c341b13411676297e299 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 20:07:07 +0530 Subject: [PATCH 03/21] Fixes for filtering invalid results --- tools/submission/preprocess_submission.py | 28 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 1e26b81ca..9d44b91d6 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -141,6 +141,22 @@ def change_folder_name_in_path(path, old_folder_name, new_folder_name): new_path = os.path.join(*path_parts) return new_path +def clean_model_dir(model_results_dir): + model_measurements_dir = change_folder_name_in_path(model_results_dir, "results", "measurements") + model_compliance_dir = change_folder_name_in_path(model_results_dir, "results", "compliance") + + print(f"rmtree {model_results_dir}") + shutil.rmtree(model_results_dir) + shutil.rmtree(model_measurements_dir) + shutil.rmtree(model_compliance_dir) + sut_results_dir = os.path.dirname(model_results_dir) + if not os.listdir(sut_results_dir): + #clean sut dir + sut = os.path.basename(sut_results_dir) + print(f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") + shutil.rmtree(sut_results_dir) + shutil.rmtree(os.path.dirname(model_measurements_dir)) + shutil.rmtree(os.path.dirname(model_compliance_dir)) def clean_invalid_results(args, log_path, config, system_desc, system_json, model, mlperf_model, division, system_id_json, is_closed_or_network): @@ -176,6 +192,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, except Exception as e: log.warning(e) perf_is_valid = False + compliance_is_valid = False if perf_is_valid: power_path = os.path.join(scenario_path, "performance", "power") has_power = os.path.exists(power_path) @@ -260,9 +277,11 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, # if only accuracy or compliance failed, result is valid # for open if not perf_is_valid: - shutil.rmtree(scenario_path) log.warning( f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} and open divisions. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. 
Removing it...") + shutil.rmtree(scenario_path) + scenario_measurements_path = change_folder_name_in_path(scenario_path, "results", "measurements") + shutil.rmtree(scenario_measurements_path) if not os.path.exists(target_results_path): shutil.copytree( model_results_path, target_results_path) @@ -288,9 +307,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, log.warning(f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} division. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Compliance: {compliance_is_valid}. Moving other scenario results of {model} to open...") else: log.warning(f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} division. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Removing all dependent scenario results...") - shutil.rmtree(model_results_path) - shutil.rmtree(model_measurements_path) - shutil.rmtree(model_compliance_path) + clean_model_dir(model_results_path) else: # delete this result # delete other scenario results too shutil.rmtree(scenario_path) @@ -517,6 +534,9 @@ def main(): infer_scenario_results(args, config) + if not args.nodelete_empty_dirs: + delete_empty_dirs(os.path.join(src_dir)) + return 0 From 451b310ef42a28e015ce5abc4e43ba6033ff8d4a Mon Sep 17 00:00:00 2001 From: arjunsuresh Date: Thu, 7 Nov 2024 14:44:42 +0000 Subject: [PATCH 04/21] [Automated Commit] Format Codebase --- tools/submission/preprocess_submission.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 9d44b91d6..7803cf568 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -141,9 +141,12 @@ def change_folder_name_in_path(path, old_folder_name, new_folder_name): new_path = os.path.join(*path_parts) return new_path + def clean_model_dir(model_results_dir): - model_measurements_dir = change_folder_name_in_path(model_results_dir, "results", "measurements") - model_compliance_dir = change_folder_name_in_path(model_results_dir, "results", "compliance") + model_measurements_dir = change_folder_name_in_path( + model_results_dir, "results", "measurements") + model_compliance_dir = change_folder_name_in_path( + model_results_dir, "results", "compliance") print(f"rmtree {model_results_dir}") shutil.rmtree(model_results_dir) @@ -151,13 +154,15 @@ def clean_model_dir(model_results_dir): shutil.rmtree(model_compliance_dir) sut_results_dir = os.path.dirname(model_results_dir) if not os.listdir(sut_results_dir): - #clean sut dir + # clean sut dir sut = os.path.basename(sut_results_dir) - print(f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") + print( + f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") shutil.rmtree(sut_results_dir) shutil.rmtree(os.path.dirname(model_measurements_dir)) shutil.rmtree(os.path.dirname(model_compliance_dir)) + def clean_invalid_results(args, log_path, config, system_desc, system_json, model, mlperf_model, division, system_id_json, is_closed_or_network): # cleanup invalid results @@ -280,7 +285,8 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, log.warning( f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} and open divisions. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. 
Removing it...") shutil.rmtree(scenario_path) - scenario_measurements_path = change_folder_name_in_path(scenario_path, "results", "measurements") + scenario_measurements_path = change_folder_name_in_path( + scenario_path, "results", "measurements") shutil.rmtree(scenario_measurements_path) if not os.path.exists(target_results_path): shutil.copytree( From 4c109ea8b5b17d0c422d4b8a08a55070142c68ae Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 14:58:42 +0000 Subject: [PATCH 05/21] Update preprocess_submission.py --- tools/submission/preprocess_submission.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 7803cf568..a1678c79d 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -2,10 +2,6 @@ Tool to infer scenario results and cleanup submission tree """ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import argparse import logging import os @@ -156,7 +152,7 @@ def clean_model_dir(model_results_dir): if not os.listdir(sut_results_dir): # clean sut dir sut = os.path.basename(sut_results_dir) - print( + log.info( f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") shutil.rmtree(sut_results_dir) shutil.rmtree(os.path.dirname(model_measurements_dir)) From 40c1fe0c28364b243b5944b3569000611ddf2b7d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 21:20:52 +0530 Subject: [PATCH 06/21] Added an option to pass in sample_ids.txt for SDXL accuracy check --- text_to_image/tools/accuracy_coco.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 2d7c36506..8740ee172 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -51,6 +51,10 @@ def get_args(): required=False, help="path to dump 10 stable diffusion xl compliance images", ) + #Do not use for official MLPerf inference submissions as only the default one is valid + parser.add_argument( + "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')" + ) parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"]) parser.add_argument( "--low_memory", @@ -97,8 +101,9 @@ def main(): os.makedirs(args.compliance_images_path) dump_compliance_images = True compliance_images_idx_list = [] + sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(os.path.dirname(__file__), "sample_ids.txt") with open( - os.path.join(os.path.dirname(__file__), "sample_ids.txt"), "r" + os.path.join(sample_ids_file_path, "r" ) as compliance_id_file: for line in compliance_id_file: idx = int(line.strip()) From 89a2ffe257bc8c4c0d8e81cb5c1fec4e15080b2a Mon Sep 17 00:00:00 2001 From: arjunsuresh Date: Thu, 7 Nov 2024 15:51:36 +0000 Subject: [PATCH 07/21] [Automated Commit] Format Codebase --- text_to_image/tools/accuracy_coco.py | 88 ++++++++++++++-------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 8740ee172..bc3f87d04 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -51,7 +51,8 @@ def get_args(): required=False, help="path to dump 10 stable diffusion xl compliance images", ) - #Do not use for official 
MLPerf inference submissions as only the default one is valid + # Do not use for official MLPerf inference submissions as only the default + # one is valid parser.add_argument( "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')" ) @@ -101,12 +102,13 @@ def main(): os.makedirs(args.compliance_images_path) dump_compliance_images = True compliance_images_idx_list = [] - sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(os.path.dirname(__file__), "sample_ids.txt") + sample_ids_file_path = args.ids_path if args.ids_path else os.path.join( + os.path.dirname(__file__), "sample_ids.txt") with open( os.path.join(sample_ids_file_path, "r" ) as compliance_id_file: for line in compliance_id_file: - idx = int(line.strip()) + idx=int(line.strip()) compliance_images_idx_list.append(idx) # Dump caption.txt with open( @@ -153,28 +155,28 @@ def compute_accuracy( statistics_path, ): # Load torchmetrics modules - clip = CLIPEncoder(device=device) - clip_scores = [] - seen = set() - result_list = [] - result_dict = {} + clip=CLIPEncoder(device=device) + clip_scores=[] + seen=set() + result_list=[] + result_dict={} # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results = json.load(f) + results=json.load(f) for j in tqdm(results): - idx = j["qsl_idx"] + idx=j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) result_list.append(generated_img) - generated_img = Image.fromarray(generated_img) + generated_img=Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -185,16 +187,16 @@ def compute_accuracy( # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device) # Load Ground Truth - caption = df_captions.iloc[idx]["caption"] + caption=df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip.get_clip_score( caption, generated_img).item()) - fid_score = compute_fid(result_list, statistics_path, device) + fid_score=compute_fid(result_list, statistics_path, device) - result_dict["FID_SCORE"] = fid_score - result_dict["CLIP_SCORE"] = np.mean(clip_scores) + result_dict["FID_SCORE"]=fid_score + result_dict["CLIP_SCORE"]=np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: @@ -216,43 +218,43 @@ def compute_accuracy_low_memory( ): if num_workers is None: try: - num_cpus = len(os.sched_getaffinity(0)) + num_cpus=len(os.sched_getaffinity(0)) except AttributeError: # os.sched_getaffinity is not available under Windows, use # os.cpu_count instead (which may not return the *available* number # of CPUs). 
- num_cpus = os.cpu_count() + num_cpus=os.cpu_count() - num_workers = min(num_cpus, 8) if num_cpus is not None else 0 + num_workers=min(num_cpus, 8) if num_cpus is not None else 0 else: - num_workers = num_workers + num_workers=num_workers # Load torchmetrics modules - block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] - inception_model = InceptionV3([block_idx]).to(device) - clip_model = CLIPEncoder(device=device) + block_idx=InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] + inception_model=InceptionV3([block_idx]).to(device) + clip_model=CLIPEncoder(device=device) - clip_scores = [] - seen = set() - result_batch = [] - result_dict = {} - activations = np.empty((0, inception_dims)) + clip_scores=[] + seen=set() + result_batch=[] + result_dict={} + activations=np.empty((0, inception_dims)) # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results = ijson.items(f, "item") + results=ijson.items(f, "item") for j in tqdm(results): - idx = j["qsl_idx"] + idx=j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) - generated_img = Image.fromarray(generated_img) + generated_img=Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -262,7 +264,7 @@ def compute_accuracy_low_memory( f"{idx}.png")) # Load Ground Truth - caption = df_captions.iloc[idx]["caption"] + caption=df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip_model.get_clip_score(caption, generated_img).item() ) @@ -270,7 +272,7 @@ def compute_accuracy_low_memory( result_batch.append(generated_img.convert("RGB")) if len(result_batch) == batch_size: - act = get_activations( + act=get_activations( result_batch, inception_model, batch_size, @@ -278,12 +280,12 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations = np.append(activations, act, axis=0) + activations=np.append(activations, act, axis=0) result_batch.clear() # Remaining data for last batch if len(result_batch) > 0: - act = get_activations( + act=get_activations( result_batch, inception_model, len(result_batch), @@ -291,9 +293,9 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations = np.append(activations, act, axis=0) + activations=np.append(activations, act, axis=0) - m1, s1 = compute_statistics_of_path( + m1, s1=compute_statistics_of_path( statistics_path, inception_model, batch_size, @@ -304,13 +306,13 @@ def compute_accuracy_low_memory( None, ) - m2 = np.mean(activations, axis=0) - s2 = np.cov(activations, rowvar=False) + m2=np.mean(activations, axis=0) + s2=np.cov(activations, rowvar=False) - fid_score = calculate_frechet_distance(m1, s1, m2, s2) + fid_score=calculate_frechet_distance(m1, s1, m2, s2) - result_dict["FID_SCORE"] = fid_score - result_dict["CLIP_SCORE"] = np.mean(clip_scores) + result_dict["FID_SCORE"]=fid_score + result_dict["CLIP_SCORE"]=np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: From 69ffdc0aa783f9127af612a7de57c6329703c1dc Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 20:19:13 +0000 Subject: [PATCH 08/21] Update accuracy_coco.py --- text_to_image/tools/accuracy_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index bc3f87d04..0d0c01560 100644 --- 
a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -105,7 +105,7 @@ def main(): sample_ids_file_path = args.ids_path if args.ids_path else os.path.join( os.path.dirname(__file__), "sample_ids.txt") with open( - os.path.join(sample_ids_file_path, "r" + os.path.join(sample_ids_file_path, "r") ) as compliance_id_file: for line in compliance_id_file: idx=int(line.strip()) From d1d642e06f91e5b8f56088f8d1a4b127a65d962c Mon Sep 17 00:00:00 2001 From: arjunsuresh Date: Thu, 7 Nov 2024 20:19:47 +0000 Subject: [PATCH 09/21] [Automated Commit] Format Codebase --- text_to_image/tools/accuracy_coco.py | 82 ++++++++++++++-------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 0d0c01560..d73325897 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -108,7 +108,7 @@ def main(): os.path.join(sample_ids_file_path, "r") ) as compliance_id_file: for line in compliance_id_file: - idx=int(line.strip()) + idx = int(line.strip()) compliance_images_idx_list.append(idx) # Dump caption.txt with open( @@ -155,28 +155,28 @@ def compute_accuracy( statistics_path, ): # Load torchmetrics modules - clip=CLIPEncoder(device=device) - clip_scores=[] - seen=set() - result_list=[] - result_dict={} + clip = CLIPEncoder(device=device) + clip_scores = [] + seen = set() + result_list = [] + result_dict = {} # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results=json.load(f) + results = json.load(f) for j in tqdm(results): - idx=j["qsl_idx"] + idx = j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) result_list.append(generated_img) - generated_img=Image.fromarray(generated_img) + generated_img = Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -187,16 +187,16 @@ def compute_accuracy( # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device) # Load Ground Truth - caption=df_captions.iloc[idx]["caption"] + caption = df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip.get_clip_score( caption, generated_img).item()) - fid_score=compute_fid(result_list, statistics_path, device) + fid_score = compute_fid(result_list, statistics_path, device) - result_dict["FID_SCORE"]=fid_score - result_dict["CLIP_SCORE"]=np.mean(clip_scores) + result_dict["FID_SCORE"] = fid_score + result_dict["CLIP_SCORE"] = np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: @@ -218,43 +218,43 @@ def compute_accuracy_low_memory( ): if num_workers is None: try: - num_cpus=len(os.sched_getaffinity(0)) + num_cpus = len(os.sched_getaffinity(0)) except AttributeError: # os.sched_getaffinity is not available under Windows, use # os.cpu_count instead (which may not return the *available* number # of CPUs). 
- num_cpus=os.cpu_count() + num_cpus = os.cpu_count() - num_workers=min(num_cpus, 8) if num_cpus is not None else 0 + num_workers = min(num_cpus, 8) if num_cpus is not None else 0 else: - num_workers=num_workers + num_workers = num_workers # Load torchmetrics modules - block_idx=InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] - inception_model=InceptionV3([block_idx]).to(device) - clip_model=CLIPEncoder(device=device) + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] + inception_model = InceptionV3([block_idx]).to(device) + clip_model = CLIPEncoder(device=device) - clip_scores=[] - seen=set() - result_batch=[] - result_dict={} - activations=np.empty((0, inception_dims)) + clip_scores = [] + seen = set() + result_batch = [] + result_dict = {} + activations = np.empty((0, inception_dims)) # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results=ijson.items(f, "item") + results = ijson.items(f, "item") for j in tqdm(results): - idx=j["qsl_idx"] + idx = j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) - generated_img=Image.fromarray(generated_img) + generated_img = Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -264,7 +264,7 @@ def compute_accuracy_low_memory( f"{idx}.png")) # Load Ground Truth - caption=df_captions.iloc[idx]["caption"] + caption = df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip_model.get_clip_score(caption, generated_img).item() ) @@ -272,7 +272,7 @@ def compute_accuracy_low_memory( result_batch.append(generated_img.convert("RGB")) if len(result_batch) == batch_size: - act=get_activations( + act = get_activations( result_batch, inception_model, batch_size, @@ -280,12 +280,12 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations=np.append(activations, act, axis=0) + activations = np.append(activations, act, axis=0) result_batch.clear() # Remaining data for last batch if len(result_batch) > 0: - act=get_activations( + act = get_activations( result_batch, inception_model, len(result_batch), @@ -293,9 +293,9 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations=np.append(activations, act, axis=0) + activations = np.append(activations, act, axis=0) - m1, s1=compute_statistics_of_path( + m1, s1 = compute_statistics_of_path( statistics_path, inception_model, batch_size, @@ -306,13 +306,13 @@ def compute_accuracy_low_memory( None, ) - m2=np.mean(activations, axis=0) - s2=np.cov(activations, rowvar=False) + m2 = np.mean(activations, axis=0) + s2 = np.cov(activations, rowvar=False) - fid_score=calculate_frechet_distance(m1, s1, m2, s2) + fid_score = calculate_frechet_distance(m1, s1, m2, s2) - result_dict["FID_SCORE"]=fid_score - result_dict["CLIP_SCORE"]=np.mean(clip_scores) + result_dict["FID_SCORE"] = fid_score + result_dict["CLIP_SCORE"] = np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: From 8d3b8ab09ac392b5a8656ad07d37fb8d7942595b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 8 Nov 2024 03:44:42 +0530 Subject: [PATCH 10/21] Fix typo --- text_to_image/tools/accuracy_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index d73325897..42ef8efe3 100644 --- 
a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -105,7 +105,7 @@ def main(): sample_ids_file_path = args.ids_path if args.ids_path else os.path.join( os.path.dirname(__file__), "sample_ids.txt") with open( - os.path.join(sample_ids_file_path, "r") + sample_ids_file_path, "r" ) as compliance_id_file: for line in compliance_id_file: idx = int(line.strip()) From b09b1efef4e5225d33618432cf71550ac135f501 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 8 Nov 2024 15:47:17 +0530 Subject: [PATCH 11/21] Not use default for sample_ids.txt --- text_to_image/tools/accuracy_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 42ef8efe3..b5f1be378 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -54,7 +54,7 @@ def get_args(): # Do not use for official MLPerf inference submissions as only the default # one is valid parser.add_argument( - "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')" + "--ids-path", help="Path to 10 caption ids to dump as compliance images" ) parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"]) parser.add_argument( From df5049d4dbec41862fef6dd7edf9fb064a779bd6 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 14 Nov 2024 23:12:35 +0000 Subject: [PATCH 12/21] Update requirements.txt (#1907) Updating the pip packages --- text_to_image/requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/text_to_image/requirements.txt b/text_to_image/requirements.txt index a0a850773..857de950e 100644 --- a/text_to_image/requirements.txt +++ b/text_to_image/requirements.txt @@ -1,8 +1,8 @@ -diffusers==0.21.2 -transformers==4.33.2 -accelerate==0.23.0 -open-clip-torch==2.7.0 -opencv-python==4.8.1.78 +diffusers==0.30.3 +transformers==4.45.2 +accelerate==1.0.1 +open-clip-torch==2.26.1 +opencv-python==4.10.0.84 pycocotools==2.0.7 -torchmetrics[image]==1.2.0 -scipy==1.9.1 +torchmetrics[image]==1.4.3 +scipy==1.10.1 From a7e8c8ad2766e3fb64a31eb42c8cde724f7b055d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 15 Nov 2024 16:46:53 +0530 Subject: [PATCH 13/21] Fix preprocess_sudbmission for a bug --- tools/submission/preprocess_submission.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index a1678c79d..ec3aa1f7a 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -84,7 +84,6 @@ def delete_empty_dirs(src): """ if not os.path.isdir(src): return False - if all([delete_empty_dirs(os.path.join(src, file)) for file in os.listdir(src)]): log.info("Removing empty dir: (%s)", src) @@ -532,13 +531,16 @@ def main(): if not args.nodelete_empty_dirs: delete_empty_dirs(os.path.join(src_dir)) + run_dir = os.getcwd() os.chdir(src_dir) infer_scenario_results(args, config) + os.chdir(run_dir) if not args.nodelete_empty_dirs: delete_empty_dirs(os.path.join(src_dir)) + return 0 From 8915a90ea0fed700afbffbc75908cd2fbf103104 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sat, 16 Nov 2024 22:04:18 +0000 Subject: [PATCH 14/21] Update submission_checker.py | Removed TEST05 --- tools/submission/submission_checker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 5f2e27267..deff9eb8c 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -188,6 +188,7 @@ "sample_index_rng_seed": 198141574272810017, "schedule_rng_seed": 7575108116881280410, }, + # not required for v5.0+ "test05_seeds": { # TODO: Update random seeds "qsl_rng_seed": 2376919268182438552, @@ -2880,7 +2881,7 @@ def check_compliance_dir( compliance_perf_pass = True compliance_perf_dir_pass = True compliance_acc_pass = True - test_list = ["TEST01", "TEST04", "TEST05"] + test_list = ["TEST01", "TEST04"] if model in [ "bert-99", @@ -2899,7 +2900,7 @@ def check_compliance_dir( ]: test_list.remove("TEST04") - if model in [ + if config.version in ["v4.0", "v4.1"] and model not in [ "gptj-99", "gptj-99.9", "llama2-70b-99", @@ -2907,7 +2908,7 @@ def check_compliance_dir( "stable-diffusion-xl", "mixtral-8x7b", ]: - test_list.remove("TEST05") + test_list.append("TEST05") if model in [ "gptj-99", From 941c0c484f4395e9b63611fbddafbde001a9de39 Mon Sep 17 00:00:00 2001 From: zixianwang2022 Date: Sat, 16 Nov 2024 21:18:39 -0600 Subject: [PATCH 15/21] move changes to fork 4 pr --- text_to_image/backend_pytorch.py | 112 +++++++------- text_to_image/main.py | 241 ++++++++++++++++++++++--------- text_to_image/mlperf.conf | 99 +++++++++++++ 3 files changed, 326 insertions(+), 126 deletions(-) create mode 100644 text_to_image/mlperf.conf diff --git a/text_to_image/backend_pytorch.py b/text_to_image/backend_pytorch.py index 36e2b8009..f2af4d75c 100644 --- a/text_to_image/backend_pytorch.py +++ b/text_to_image/backend_pytorch.py @@ -17,9 +17,9 @@ def __init__( model_id="xl", guidance=8, steps=20, - batch_size=1, + batch_size=2, device="cuda", - precision="fp32", + precision="fp16", negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude", ): super(BackendPytorch, self).__init__() @@ -57,39 +57,41 @@ def image_format(self): return "NCHW" def load(self): - if self.model_path is None: - log.warning( - "Model path not provided, running with default hugging face weights\n" - "This may not be valid for official submissions" - ) - self.scheduler = EulerDiscreteScheduler.from_pretrained( - self.model_id, subfolder="scheduler" - ) - self.pipe = StableDiffusionXLPipeline.from_pretrained( - self.model_id, - scheduler=self.scheduler, - safety_checker=None, - add_watermarker=False, - variant="fp16" if (self.dtype == torch.float16) else None, - torch_dtype=self.dtype, - ) + # if self.model_path is None: + # log.warning( + # "Model path not provided, running with default hugging face weights\n" + # "This may not be valid for official submissions" + # ) + self.scheduler = EulerDiscreteScheduler.from_pretrained( + self.model_id, subfolder="scheduler" + ) + self.pipe = StableDiffusionXLPipeline.from_pretrained( + self.model_id, + scheduler=self.scheduler, + safety_checker=None, + add_watermarker=False, + # variant="fp16" if (self.dtype == torch.float16) else None, + variant="fp16" , + torch_dtype=self.dtype, + ) # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True) - else: - self.scheduler = EulerDiscreteScheduler.from_pretrained( - os.path.join(self.model_path, "checkpoint_scheduler"), - subfolder="scheduler", - ) - self.pipe = StableDiffusionXLPipeline.from_pretrained( - os.path.join(self.model_path, "checkpoint_pipe"), - scheduler=self.scheduler, - safety_checker=None, - add_watermarker=False, - torch_dtype=self.dtype, - ) + # 
else: + # self.scheduler = EulerDiscreteScheduler.from_pretrained( + # os.path.join(self.model_path, "checkpoint_scheduler"), + # subfolder="scheduler", + # ) + # self.pipe = StableDiffusionXLPipeline.from_pretrained( + # os.path.join(self.model_path, "checkpoint_pipe"), + # scheduler=self.scheduler, + # safety_checker=None, + # add_watermarker=False, + # variant="fp16" if (self.dtype == torch.float16) else None, + # torch_dtype=self.dtype, + # ) # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True) self.pipe.to(self.device) - # self.pipe.set_progress_bar_config(disable=True) + #self.pipe.set_progress_bar_config(disable=True) self.negative_prompt_tokens = self.pipe.tokenizer( self.convert_prompt(self.negative_prompt, self.pipe.tokenizer), @@ -210,15 +212,13 @@ def encode_tokens( text_input_ids.to(device), output_hidden_states=True ) - # We are only ALWAYS interested in the pooled output of the - # final text encoder + # We are only ALWAYS interested in the pooled output of the final text encoder pooled_prompt_embeds = prompt_embeds[0] if clip_skip is None: prompt_embeds = prompt_embeds.hidden_states[-2] else: # "2" because SDXL always indexes from the penultimate layer. - prompt_embeds = prompt_embeds.hidden_states[-( - clip_skip + 2)] + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] prompt_embeds_list.append(prompt_embeds) @@ -234,8 +234,7 @@ def encode_tokens( and zero_out_negative_prompt ): negative_prompt_embeds = torch.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = torch.zeros_like( - pooled_prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) elif do_classifier_free_guidance and negative_prompt_embeds is None: negative_prompt = negative_prompt or "" negative_prompt_2 = negative_prompt_2 or negative_prompt @@ -262,35 +261,30 @@ def encode_tokens( uncond_input.to(device), output_hidden_states=True, ) - # We are only ALWAYS interested in the pooled output of the - # final text encoder + # We are only ALWAYS interested in the pooled output of the final text encoder negative_pooled_prompt_embeds = negative_prompt_embeds[0] negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = torch.concat( - negative_prompt_embeds_list, dim=-1) + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) if pipe.text_encoder_2 is not None: prompt_embeds = prompt_embeds.to( dtype=pipe.text_encoder_2.dtype, device=device ) else: - prompt_embeds = prompt_embeds.to( - dtype=pipe.unet.dtype, device=device) + prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps - # friendly method + # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view( bs_embed * num_images_per_prompt, seq_len, -1 ) if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per - # prompt, using mps friendly method + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] if pipe.text_encoder_2 is not None: @@ -322,7 +316,7 @@ def encode_tokens( pooled_prompt_embeds, negative_pooled_prompt_embeds, ) - + def prepare_inputs(self, inputs, i): if self.batch_size == 1: 
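            # Fast path: a single-prompt batch is encoded directly; larger
            # batches are encoded one prompt at a time below and the resulting
            # embeddings concatenated.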
return self.encode_tokens( @@ -337,7 +331,7 @@ def prepare_inputs(self, inputs, i): negative_prompt_embeds = [] pooled_prompt_embeds = [] negative_pooled_prompt_embeds = [] - for prompt in inputs[i: min(i + self.batch_size, len(inputs))]: + for prompt in inputs[i:min(i+self.batch_size, len(inputs))]: assert isinstance(prompt, dict) text_input = prompt["input_tokens"] text_input_2 = prompt["input_tokens_2"] @@ -358,26 +352,19 @@ def prepare_inputs(self, inputs, i): pooled_prompt_embeds.append(p_p_e) negative_pooled_prompt_embeds.append(n_p_p_e) + prompt_embeds = torch.cat(prompt_embeds) negative_prompt_embeds = torch.cat(negative_prompt_embeds) pooled_prompt_embeds = torch.cat(pooled_prompt_embeds) - negative_pooled_prompt_embeds = torch.cat( - negative_pooled_prompt_embeds) - return ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) + negative_pooled_prompt_embeds = torch.cat(negative_pooled_prompt_embeds) + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds def predict(self, inputs): images = [] with torch.no_grad(): for i in range(0, len(inputs), self.batch_size): - latents_input = [ - inputs[idx]["latents"] - for idx in range(i, min(i + self.batch_size, len(inputs))) - ] + print (f'self.steps BEFORE pipe: {self.steps}') + latents_input = [inputs[idx]["latents"] for idx in range(i, min(i+self.batch_size, len(inputs)))] latents_input = torch.cat(latents_input).to(self.device) ( prompt_embeds, @@ -392,8 +379,11 @@ def predict(self, inputs): negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, guidance_scale=self.guidance, num_inference_steps=self.steps, + # num_inference_steps=20, output_type="pt", latents=latents_input, ).images + print (f'self.steps AFTER pipe: {self.steps}') images.extend(generated) return images + diff --git a/text_to_image/main.py b/text_to_image/main.py index 6aa7c15e7..7d4da2a0b 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -24,6 +24,8 @@ import dataset import coco +from concurrent.futures import ThreadPoolExecutor, as_completed + logging.basicConfig(level=logging.INFO) log = logging.getLogger("main") @@ -73,34 +75,24 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--dataset", - choices=SUPPORTED_DATASETS.keys(), - help="dataset") - parser.add_argument( - "--dataset-path", - required=True, - help="path to the dataset") + parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") + parser.add_argument("--dataset-path", required=True, help="path to the dataset") parser.add_argument( "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" ) parser.add_argument( "--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + - str(list(SCENARIO_MAP.keys())), + help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( "--max-batchsize", type=int, - default=1, + default=2, help="max batch size in a single inference", ) parser.add_argument("--threads", default=1, type=int, help="threads") - parser.add_argument( - "--accuracy", - action="store_true", - help="enable accuracy pass") + parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") parser.add_argument( "--find-peak-performance", action="store_true", @@ -111,10 +103,11 @@ def get_args(): parser.add_argument("--output", default="output", help="test results") parser.add_argument("--qps", type=int, help="target qps") 
parser.add_argument("--model-path", help="Path to model weights") + parser.add_argument("--gpu-num", type=int, default=4, help="Number of gpus to run inference") parser.add_argument( "--dtype", - default="fp32", + default="fp16", choices=["fp32", "fp16", "bf16"], help="dtype of the model", ) @@ -131,6 +124,10 @@ def get_args(): help="framework to load the latents", ) + # file to use mlperf rules compliant parameters + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" + ) # file for user LoadGen settings such as target QPS parser.add_argument( "--user_conf", @@ -145,20 +142,14 @@ def get_args(): # pass this argument for official submission # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images") # do not modify this argument for official submission - parser.add_argument( - "--ids-path", help="Path to caption ids", default="tools/sample_ids.txt" - ) + parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt") - # below will override mlperf rules compliant settings - don't use for - # official submission + # below will override mlperf rules compliant settings - don't use for official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--debug", action="store_true", help="debug") parser.add_argument( - "--performance-sample-count", - type=int, - help="performance sample count", - default=5000, + "--performance-sample-count", type=int, help="performance sample count", default=5000 ) parser.add_argument( "--max-latency", type=float, help="mlperf max latency in pct tile" @@ -271,9 +262,9 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i: i + bs]) + data, label = self.ds.get_samples(idx[i : i + bs]) self.run_one_item( - Item(query_id[i: i + bs], idx[i: i + bs], data, label) + Item(query_id[i : i + bs], idx[i : i + bs], data, label) ) def finish(self): @@ -288,9 +279,7 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread( - target=self.handle_tasks, args=( - self.tasks,)) + worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -333,13 +322,29 @@ def main(): log.info(args) # find backend - backend = get_backend( - args.backend, - precision=args.dtype, - device=args.device, - model_path=args.model_path, - batch_size=args.max_batchsize, - ) + + # backend = get_backend( + # args.backend, + # precision=args.dtype, + # device=args.device, + # model_path=args.model_path, + # batch_size=args.max_batchsize + # ) + # Zixian: Oct 21: create a list of backends for multi-gpu + + # Zixian: Nov 13: Force batchsize=2 since command line doesn't work + args.max_batchsize = 5 + + backends = [get_backend( + args.backend, + precision=args.dtype, + device=f'cuda:{i}', + model_path=args.model_path, + batch_size=args.max_batchsize + ) + for i in np.arange (args.gpu_num)] + + if args.dtype == "fp16": dtype = torch.float16 elif args.dtype == "bf16": @@ -355,7 +360,9 @@ def main(): count_override = True # load model to backend - model = backend.load() + # model = backend.load() + # Zixian: Oct 21: create a list of models corresponding to each backend + models = [backend.load() for backend in backends] # 
dataset to use dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] @@ -365,28 +372,37 @@ def main(): pre_process=pre_proc, count=count, threads=args.threads, - pipe_tokenizer=model.pipe.tokenizer, - pipe_tokenizer_2=model.pipe.tokenizer_2, + # pipe_tokenizer=model.pipe.tokenizer, + # pipe_tokenizer_2=model.pipe.tokenizer_2, + pipe_tokenizer=models[0].pipe.tokenizer, + pipe_tokenizer_2=models[0].pipe.tokenizer_2, latent_dtype=dtype, latent_device=args.device, latent_framework=args.latent_framework, **kwargs, ) final_results = { - "runtime": model.name(), - "version": model.version(), + # "runtime": model.name(), + # "version": model.version(), + "runtime": models[0].name(), + "version": models[0].version(), "time": int(time.time()), "args": vars(args), "cmdline": str(args), } + mlperf_conf = os.path.abspath(args.mlperf_conf) + if not os.path.exists(mlperf_conf): + log.error("{} not found".format(mlperf_conf)) + sys.exit(1) + user_conf = os.path.abspath(args.user_conf) if not os.path.exists(user_conf): log.error("{} not found".format(user_conf)) sys.exit(1) audit_config = os.path.abspath(args.audit_conf) - + if args.accuracy: ids_path = os.path.abspath(args.ids_path) with open(ids_path) as f: @@ -405,16 +421,47 @@ def main(): # warmup syntetic_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit" latents_pt = torch.rand(ds.latents.shape, dtype=dtype).to(args.device) - warmup_samples = [ - { - "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), - "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), - "latents": latents_pt, - } - for _ in range(args.max_batchsize) - ] - for i in range(5): - _ = backend.predict(warmup_samples) + # warmup_samples = [ + # { + # "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), + # "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), + # "latents": latents_pt, + # } + # for _ in range(args.max_batchsize) + # ] + warmup_samples_gpus = [ + [ + { + "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), + "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), + "latents": latents_pt, + } + for _ in range(int(args.max_batchsize)) + ] + for model in models] * 3 # 3 times warmup samples + + # Zixian: Oct 21: warm up each backend + # for idx, backend in enumerate (backends): + # for i in range(1): + # _ = backend.predict(warmup_samples_gpus[idx]) + + print (f'Start distributed warmup') + with ThreadPoolExecutor(max_workers=len(backends)) as executor: + # Map each runner to its respective sublist + futures = { + executor.submit(backend.predict, queries): backend + for backend, queries in zip(backends, warmup_samples_gpus) + } + + # Optionally process the results + for future in as_completed(futures): + backend = futures[future] + try: + result = future.result() + print(f'Warmup backend {backend} enqueued successfully.') + except Exception as exc: + print(f'Warmup backend {backend} generated an exception: {exc}') + scenario = SCENARIO_MAP[args.scenario] runner_map = { @@ -423,12 +470,54 @@ def main(): lg.TestScenario.Server: QueueRunner, lg.TestScenario.Offline: QueueRunner, } - runner = runner_map[scenario]( - model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize - ) - + + # Zixian: Oct 21: create a list of runner + # runner = runner_map[scenario]( + # model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize + # ) + runners = [runner_map[scenario]( + model, ds, args.threads, post_proc=post_proc, 
max_batchsize=args.max_batchsize + ) + for model in models] + + # def issue_queries(query_samples): + # runner.enqueue(query_samples) def issue_queries(query_samples): - runner.enqueue(query_samples) + print (f'\n\n len (query_samples): {len (query_samples)} \n\n') + + query_samples_len = len (query_samples) + query_samples_seg_len = query_samples_len / len (runners) + splitted_query_samples = [] + + + for idx in range (len (runners)): + log.info (f'\n\n\n') + log.info (f'idx: {idx}') + log.info (f'query_samples_len: {query_samples_len}') + log.info (f'idx: {idx}') + # if idx == len (runners) -1: + # splitted_query_samples.append (query_samples[idx*query_samples_seg_len:]) + # else: + # splitted_query_samples.append (query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len]) + splitted_query_samples.append (query_samples [int(round(query_samples_seg_len * idx)): int(round(query_samples_seg_len * (idx + 1)))]) + + # splitted_query_samples = [query_samples[int(round(avg * i)): int(round(avg * (i + 1)))] for i in range(b)] + + with ThreadPoolExecutor(max_workers=len(runners)) as executor: + # Map each runner to its respective sublist + futures = { + executor.submit(runner.enqueue, queries): runner + for runner, queries in zip(runners, splitted_query_samples) + } + + # Optionally process the results + for future in as_completed(futures): + runner = futures[future] + try: + result = future.result() + print(f'Runner {runner} enqueued successfully.') + except Exception as exc: + print(f'Runner {runner} generated an exception: {exc}') def flush_queries(): pass @@ -441,8 +530,7 @@ def flush_queries(): log_settings.log_output = log_output_settings settings = lg.TestSettings() - # mlperf.conf is automatically loaded by the loadgen - # settings.FromConfig(mlperf_conf, args.model_name, args.scenario) + settings.FromConfig(mlperf_conf, args.model_name, args.scenario) settings.FromConfig(user_conf, args.model_name, args.scenario) if os.path.exists(audit_config): settings.FromConfig(audit_config, args.model_name, args.scenario) @@ -458,6 +546,8 @@ def flush_queries(): settings.min_duration_ms = args.time * MILLI_SEC settings.max_duration_ms = args.time * MILLI_SEC + # Zixian: Nov8: manually setting args.qps to 1 + # args.qps=1.0 if args.qps: qps = float(args.qps) settings.server_target_qps = qps @@ -471,14 +561,15 @@ def flush_queries(): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int( - args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) performance_sample_count = ( args.performance_sample_count if args.performance_sample_count else min(count, 500) ) + + # count = 200 sut = lg.ConstructSUT(issue_queries, flush_queries) qsl = lg.ConstructQSL( count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples @@ -486,19 +577,39 @@ def flush_queries(): log.info("starting {}".format(scenario)) result_dict = {"scenario": str(scenario)} - runner.start_run(result_dict, args.accuracy) + for runner in runners: + runner.start_run(result_dict, args.accuracy) + + # with ThreadPoolExecutor(max_workers=len(runners)) as executor: + # # Map each runner to its respective sublist + # futures = { + # executor.submit(runner.finish(), (result_dict, args.accuracy)): runner + # for runner in runners + # } + lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config) if 
args.accuracy: post_proc.finalize(result_dict, ds, output_dir=args.output) final_results["accuracy_results"] = result_dict - post_proc.save_images(saved_images_ids, ds) + # post_proc.save_images(saved_images_ids, ds) + - runner.finish() lg.DestroyQSL(qsl) lg.DestroySUT(sut) + # for runner in runners: + # runner.finish() + with ThreadPoolExecutor(max_workers=len(runners)) as executor: + # Map each runner to its respective sublist + futures = { + executor.submit(runner.finish()): runner + for runner in runners + } + + + # # write final results # diff --git a/text_to_image/mlperf.conf b/text_to_image/mlperf.conf new file mode 100644 index 000000000..0cea5351e --- /dev/null +++ b/text_to_image/mlperf.conf @@ -0,0 +1,99 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# Set performance_sample_count for each model. +# User can optionally set this to higher values in user.conf. +resnet50.*.performance_sample_count_override = 1024 +ssd-mobilenet.*.performance_sample_count_override = 256 +retinanet.*.performance_sample_count_override = 64 +bert.*.performance_sample_count_override = 10833 +dlrm.*.performance_sample_count_override = 204800 +dlrm-v2.*.performance_sample_count_override = 204800 +rnnt.*.performance_sample_count_override = 2513 +gptj.*.performance_sample_count_override = 13368 +llama2-70b.*.performance_sample_count_override = 24576 +stable-diffusion-xl.*.performance_sample_count_override = 5000 +# set to 0 to let entire sample set to be performance sample +3d-unet.*.performance_sample_count_override = 0 + +# Set seeds. The seeds will be distributed two weeks before the submission. +*.*.qsl_rng_seed = 3066443479025735752 +*.*.sample_index_rng_seed = 10688027786191513374 +*.*.schedule_rng_seed = 14962580496156340209 +# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission. 
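+# TEST05 is not required for v5.0+ submissions (see submission_checker.py); these seeds apply to earlier rounds.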
+*.*.test05_qsl_rng_seed = 16799458546791641818 +*.*.test05_sample_index_rng_seed = 5453809927556429288 +*.*.test05_schedule_rng_seed = 5435552105434836064 + + +*.SingleStream.target_latency_percentile = 90 +*.SingleStream.min_duration = 600000 + +*.MultiStream.target_latency_percentile = 99 +*.MultiStream.samples_per_query = 8 +*.MultiStream.min_duration = 600000 +*.MultiStream.min_query_count = 662 +retinanet.MultiStream.target_latency = 528 + +# 3D-UNet uses equal issue mode because it has non-uniform inputs +3d-unet.*.sample_concatenate_permutation = 1 + +# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario +gptj.*.sample_concatenate_permutation = 1 +llama2-70b.*.sample_concatenate_permutation = 1 +mixtral-8x7b.*.sample_concatenate_permutation = 1 + +*.Server.target_latency = 10 +*.Server.target_latency_percentile = 99 +*.Server.target_duration = 0 +*.Server.min_duration = 600000 +resnet50.Server.target_latency = 15 +retinanet.Server.target_latency = 100 +bert.Server.target_latency = 130 +dlrm.Server.target_latency = 60 +dlrm-v2.Server.target_latency = 60 +rnnt.Server.target_latency = 1000 +gptj.Server.target_latency = 20000 +stable-diffusion-xl.Server.target_latency = 20000 +# Llama2-70b benchmarks measures token latencies +llama2-70b.*.use_token_latencies = 1 +mixtral-8x7b.*.use_token_latencies = 1 +# gptj benchmark infers token latencies +gptj.*.infer_token_latencies = 1 +gptj.*.token_latency_scaling_factor = 69 +# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0 +llama2-70b.Server.target_latency = 0 +llama2-70b.Server.ttft_latency = 2000 +llama2-70b.Server.tpot_latency = 200 + +mixtral-8x7b.Server.target_latency = 0 +mixtral-8x7b.Server.ttft_latency = 2000 +mixtral-8x7b.Server.tpot_latency = 200 + +*.Offline.target_latency_percentile = 90 +*.Offline.min_duration = 600000 + +# In Offline scenario, we always have one query. But LoadGen maps this to +# min_sample_count internally in Offline scenario. If the dataset size is larger +# than 24576 we limit the min_query_count to 24576 and otherwise we use +# the dataset size as the limit + +resnet50.Offline.min_query_count = 24576 +retinanet.Offline.min_query_count = 24576 +dlrm-v2.Offline.min_query_count = 24576 +bert.Offline.min_query_count = 10833 +gptj.Offline.min_query_count = 13368 +rnnt.Offline.min_query_count = 2513 +3d-unet.Offline.min_query_count = 43 +stable-diffusion-xl.Offline.min_query_count = 4000 +llama2-70b.Offline.min_query_count = 24576 +mixtral-8x7b.Offline.min_query_count = 15000 + +# These fields should be defined and overridden by user.conf. 
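+# main.py loads user.conf after this file, so values set there take precedence for these keys.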
+*.SingleStream.target_latency = 10 +*.MultiStream.target_latency = 80 +*.Server.target_qps = 1.0 +*.Offline.target_qps = 1.0 + From 541aee344935994b855feadb0cb34f54b4b12847 Mon Sep 17 00:00:00 2001 From: zixianwang2022 Date: Sat, 16 Nov 2024 21:20:19 -0600 Subject: [PATCH 16/21] cp changes for migraphx backend to fork 4 pr --- text_to_image/StableDiffusionMGX.py | 875 ++++++++++++++++++++++++++++ text_to_image/backend_migraphx.py | 301 ++++++++++ text_to_image/main.py | 151 +++-- 3 files changed, 1263 insertions(+), 64 deletions(-) create mode 100644 text_to_image/StableDiffusionMGX.py create mode 100644 text_to_image/backend_migraphx.py diff --git a/text_to_image/StableDiffusionMGX.py b/text_to_image/StableDiffusionMGX.py new file mode 100644 index 000000000..18186cdc6 --- /dev/null +++ b/text_to_image/StableDiffusionMGX.py @@ -0,0 +1,875 @@ +# The MIT License (MIT) +# +# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the 'Software'), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +from argparse import ArgumentParser +from diffusers import EulerDiscreteScheduler +from transformers import CLIPTokenizer +from PIL import Image + +import migraphx as mgx +from functools import wraps +from tqdm import tqdm +from hip import hip +from collections import namedtuple + +import os +import sys +import torch +import time +import logging +import coco +import dataset + +logging.basicConfig(level=logging.ERROR) +log = logging.getLogger("mgx-base") + +formatter = logging.Formatter("{levelname} - {message}", style="{") +file_handler = logging.FileHandler("mgx.log", mode="a", encoding="utf-8") +file_handler.setLevel("INFO") +file_handler.setFormatter(formatter) +log.addHandler(file_handler) + + + +HipEventPair = namedtuple('HipEventPair', ['start', 'end']) + + +# measurement helper +def measure(fn): + @wraps(fn) + def measure_ms(*args, **kwargs): + start_time = time.perf_counter_ns() + result = fn(*args, **kwargs) + end_time = time.perf_counter_ns() + print( + f"Elapsed time for {fn.__name__}: {(end_time - start_time) * 1e-6:.4f} ms\n" + ) + return result + + return measure_ms + + +def get_args(): + parser = ArgumentParser() + # Model compile + parser.add_argument( + "--pipeline-type", + type=str, + choices=["sdxl", "sdxl-opt", "sdxl-turbo"], + required=True, + help="Specify pipeline type. Options: `sdxl`, `sdxl-opt`, `sdxl-turbo`", + ) + + parser.add_argument( + "--onnx-model-path", + type=str, + default=None, + help= + "Path to onnx model files. 
Use it to override the default models/ path", + ) + + parser.add_argument( + "--compiled-model-path", + type=str, + default=None, + help= + "Path to compiled mxr model files. If not set, it will be saved next to the onnx model.", + ) + + parser.add_argument( + "--use-refiner", + action="store_true", + default=False, + help="Use the refiner model", + ) + + parser.add_argument( + "--refiner-onnx-model-path", + type=str, + default=None, + help= + "Path to onnx model files. Use it to override the default models/ path", + ) + + parser.add_argument( + "--refiner-compiled-model-path", + type=str, + default=None, + help= + "Path to compiled mxr model files. If not set, it will be saved next to the refiner onnx model.", + ) + + parser.add_argument( + "--fp16", + choices=[ + "all", "vae", "clip", "clip2", "unetxl", "refiner_clip2", + "refiner_unetxl" + ], + nargs="+", + help="Quantize models with fp16 precision.", + ) + + parser.add_argument( + "--force-compile", + action="store_true", + default=False, + help="Ignore existing .mxr files and override them", + ) + + parser.add_argument( + "--exhaustive-tune", + action="store_true", + default=False, + help="Perform exhaustive tuning when compiling onnx models", + ) + + # Runtime + parser.add_argument( + "-s", + "--seed", + type=int, + default=42, + help="Random seed", + ) + + parser.add_argument( + "-t", + "--steps", + type=int, + default=20, + help="Number of steps", + ) + + parser.add_argument( + "--refiner-steps", + type=int, + default=20, + help="Number of refiner steps", + ) + + parser.add_argument( + "-p", + "--prompt", + type=str, + # required=True, + help="Prompt", + ) + + parser.add_argument( + "-n", + "--negative-prompt", + type=str, + default="", + help="Negative prompt", + ) + + parser.add_argument( + "--scale", + type=float, + default=5.0, + help="Guidance scale", + ) + + parser.add_argument( + "--refiner-aesthetic-score", + type=float, + default=6.0, + help="aesthetic score for refiner", + ) + + parser.add_argument( + "--refiner-negative-aesthetic-score", + type=float, + default=2.5, + help="negative aesthetic score for refiner", + ) + + parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help="Output name", + ) + + parser.add_argument( + "--verbose", + action="store_true", + default=False, + help="Log during run", + ) + return parser.parse_args() + + +model_shapes = { + "clip": { + "input_ids": [2, 77] + }, + "clip2": { + "input_ids": [2, 77] + }, + "unetxl": { + "sample": [2, 4, 128, 128], + "encoder_hidden_states": [2, 77, 2048], + "text_embeds": [2, 1280], + "time_ids": [2, 6], + "timestep": [1], + }, + "refiner_unetxl": { + "sample": [2, 4, 128, 128], + "encoder_hidden_states": [2, 77, 1280], + "text_embeds": [2, 1280], + "time_ids": [2, 5], + "timestep": [1], + }, + "vae": { + "latent_sample": [1, 4, 128, 128] + }, +} + +model_names = { + "sdxl": { + "clip": "text_encoder", + "clip2": "text_encoder_2", + "unetxl": "unet", + "vae": "vae_decoder", + }, + "sdxl-opt": { + "clip": "clip.opt.mod", + "clip2": "clip2.opt.mod", + "unetxl": "unetxl.opt", + "vae": "vae_decoder", + }, + "sdxl-turbo": { + "clip": "text_encoder", + "clip2": "text_encoder_2", + "unetxl": "unet", + "vae": "vae_decoder", + }, + "refiner": { + "clip2": "clip2.opt.mod", + "unetxl": "unetxl.opt", + }, +} + +default_model_paths = { + "sdxl": "models/sdxl-1.0-base", + "sdxl-opt": "models/sdxl-1.0-base", + "sdxl-turbo": "models/sdxl-turbo", + "refiner": "models/sdxl-1.0-refiner", +} + +mgx_to_torch_dtype_dict = { + "bool_type": torch.bool, + 
"uint8_type": torch.uint8, + "int8_type": torch.int8, + "int16_type": torch.int16, + "int32_type": torch.int32, + "int64_type": torch.int64, + "float_type": torch.float32, + "double_type": torch.float64, + "half_type": torch.float16, +} + +torch_to_mgx_dtype_dict = { + value: key + for (key, value) in mgx_to_torch_dtype_dict.items() +} + + +def tensor_to_arg(tensor): + return mgx.argument_from_pointer( + mgx.shape( + **{ + "type": torch_to_mgx_dtype_dict[tensor.dtype], + "lens": list(tensor.size()), + "strides": list(tensor.stride()) + }), tensor.data_ptr()) + + +def tensors_to_args(tensors): + return {name: tensor_to_arg(tensor) for name, tensor in tensors.items()} + + +def get_output_name(idx): + return f"main:#output_{idx}" + + +def copy_tensor_sync(tensor, data): + tensor.copy_(data.to(tensor.dtype)) + torch.cuda.synchronize() + + +def copy_tensor(tensor, data): + tensor.copy_(data.to(tensor.dtype)) + + +def run_model_sync(model, args): + model.run(args) + mgx.gpu_sync() + + +def run_model_async(model, args, stream): + model.run_async(args, stream, "ihipStream_t") + + +def allocate_torch_tensors(model): + input_shapes = model.get_parameter_shapes() + data_mapping = { + name: torch.zeros(shape.lens()).to( + mgx_to_torch_dtype_dict[shape.type_string()]).to(device="cuda") + for name, shape in input_shapes.items() + } + return data_mapping + + +class StableDiffusionMGX(): + def __init__(self, pipeline_type, onnx_model_path, compiled_model_path, + use_refiner, refiner_onnx_model_path, + refiner_compiled_model_path, fp16, force_compile, + exhaustive_tune, tokenizers=None, scheduler=None): + if not (onnx_model_path or compiled_model_path): + onnx_model_path = default_model_paths[pipeline_type] + + self.use_refiner = use_refiner + if not self.use_refiner and (refiner_onnx_model_path + or refiner_compiled_model_path): + print( + "WARN: Refiner model is provided, but was *not* enabled. Use --use-refiner to enable it." 
+ ) + if self.use_refiner and not (refiner_onnx_model_path + or refiner_compiled_model_path): + refiner_onnx_model_path = default_model_paths["refiner"] + + is_turbo = "turbo" in pipeline_type + model_id = "stabilityai/sdxl-turbo" if is_turbo else "stabilityai/stable-diffusion-xl-base-1.0" + print(f"Using {model_id}") + + if scheduler is None: + print("Creating EulerDiscreteScheduler scheduler") + self.scheduler = EulerDiscreteScheduler.from_pretrained( + model_id, subfolder="scheduler") + else: + self.scheduler = scheduler + + print("Creating CLIPTokenizer tokenizers...") + if tokenizers is None: + tknz_path1 = os.path.join(onnx_model_path, "tokenizer") + tknz_path2 = os.path.join(onnx_model_path, "tokenizer_2") + self.tokenizers = { + "clip": + CLIPTokenizer.from_pretrained(tknz_path1), + "clip2": + CLIPTokenizer.from_pretrained(tknz_path2) + } + else: + self.tokenizers = tokenizers + + if fp16 is None: + fp16 = [] + elif "all" in fp16: + fp16 = [ + "vae", "clip", "clip2", "unetxl", "refiner_clip2", + "refiner_unetxl" + ] + + if "vae" in fp16: + model_names[pipeline_type]["vae"] = "vae_decoder_fp16_fix" + + log.info("Load models...") + self.models = { + "vae": + StableDiffusionMGX.load_mgx_model( + model_names[pipeline_type]["vae"], + model_shapes["vae"], + os.path.join (onnx_model_path, 'vae_decoder_fp16_fix'), + compiled_model_path=compiled_model_path, + use_fp16="vae" in fp16, + force_compile=force_compile, + exhaustive_tune=exhaustive_tune, + offload_copy=False), + "clip": + StableDiffusionMGX.load_mgx_model( + model_names[pipeline_type]["clip"], + model_shapes["clip"], + os.path.join (onnx_model_path, 'text_encoder'), + compiled_model_path=compiled_model_path, + use_fp16="clip" in fp16, + force_compile=force_compile, + exhaustive_tune=exhaustive_tune, + offload_copy=False), + "clip2": + StableDiffusionMGX.load_mgx_model( + model_names[pipeline_type]["clip2"], + model_shapes["clip2"], + os.path.join (onnx_model_path, 'text_encoder_2'), + compiled_model_path=compiled_model_path, + use_fp16="clip2" in fp16, + force_compile=force_compile, + exhaustive_tune=exhaustive_tune, + offload_copy=False), + "unetxl": + StableDiffusionMGX.load_mgx_model( + model_names[pipeline_type]["unetxl"], + model_shapes["unetxl"], + os.path.join (onnx_model_path, 'unet'), + compiled_model_path=compiled_model_path, + use_fp16="unetxl" in fp16, + force_compile=force_compile, + exhaustive_tune=exhaustive_tune, + offload_copy=False) + } + + log.info(f"init: loaded models") + + self.tensors = { + "clip": allocate_torch_tensors(self.models["clip"]), + "clip2": allocate_torch_tensors(self.models["clip2"]), + "unetxl": allocate_torch_tensors(self.models["unetxl"]), + "vae": allocate_torch_tensors(self.models["vae"]), + } + + # log.info(f"init: tensors: {self.tensors}") + + self.model_args = { + "clip": tensors_to_args(self.tensors["clip"]), + "clip2": tensors_to_args(self.tensors["clip2"]), + "unetxl": tensors_to_args(self.tensors["unetxl"]), + "vae": tensors_to_args(self.tensors["vae"]), + } + + # log.info(f"init: self.model_args: {self.model_args}") + + if self.use_refiner: + log.info(f"init: self.use_refiner: {self.use_refiner}") + + # Note: there is no clip for refiner, only clip2 + self.models["refiner_clip2"] = StableDiffusionMGX.load_mgx_model( + model_names["refiner"]["clip2"], + model_shapes["clip2"], + refiner_onnx_model_path, + compiled_model_path=refiner_compiled_model_path, + use_fp16="refiner_clip2" in fp16, + force_compile=force_compile, + exhaustive_tune=exhaustive_tune, + offload_copy=False) + + 
log.info(f"init: load refiner clip2") + + self.models["refiner_unetxl"] = StableDiffusionMGX.load_mgx_model( + model_names["refiner"]["unetxl"], + model_shapes[ + "refiner_unetxl"], # this differ from the original unetxl + refiner_onnx_model_path, + compiled_model_path=refiner_compiled_model_path, + use_fp16="refiner_unetxl" in fp16, + force_compile=force_compile, + exhaustive_tune=exhaustive_tune, + offload_copy=False) + + log.info(f"init: load refiner unet") + + self.tensors["refiner_clip2"] = allocate_torch_tensors( + self.models["refiner_clip2"]) + self.tensors["refiner_unetxl"] = allocate_torch_tensors( + self.models["refiner_unetxl"]) + self.model_args["refiner_clip2"] = tensors_to_args( + self.tensors["refiner_clip2"]) + self.model_args["refiner_unetxl"] = tensors_to_args( + self.tensors["refiner_unetxl"]) + # hipEventCreate return a tuple(error, event) + + log.info(f"init: creating hip events") + + self.events = { + "warmup": + HipEventPair(start=hip.hipEventCreate()[1], + end=hip.hipEventCreate()[1]), + "run": + HipEventPair(start=hip.hipEventCreate()[1], + end=hip.hipEventCreate()[1]), + "clip": + HipEventPair(start=hip.hipEventCreate()[1], + end=hip.hipEventCreate()[1]), + "denoise": + HipEventPair(start=hip.hipEventCreate()[1], + end=hip.hipEventCreate()[1]), + "decode": + HipEventPair(start=hip.hipEventCreate()[1], + end=hip.hipEventCreate()[1]), + } + + # log.info(f"init: self.events: {self.events}") + + self.stream = hip.hipStreamCreate()[1] + + # log.info(f"init: self.stream: {self.stream}") + + def cleanup(self): + for event in self.events.values(): + hip.hipEventDestroy(event.start) + hip.hipEventDestroy(event.end) + hip.hipStreamDestroy(self.stream) + + def profile_start(self, name): + if name in self.events: + hip.hipEventRecord(self.events[name].start, None) + + def profile_end(self, name): + if name in self.events: + hip.hipEventRecord(self.events[name].end, None) + + # @measure + @torch.no_grad() + def run(self, + prompt, + steps=20, + negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude", + seed=42, + scale=5.0, + refiner_steps=20, + refiner_aesthetic_score=6.0, + refiner_negative_aesthetic_score=2.5, + verbose=False, + prompt_tokens=None, + latents_in=None, + device="cuda"): + torch.cuda.synchronize() + self.profile_start("run") + # need to set this for each run + self.scheduler.set_timesteps(steps, device=device) + + if verbose: + print("Tokenizing prompts...") + + if prompt_tokens is not None: + prompt_tokens = prompt_tokens + else: + # log.info(f"[mgx] input prompt: {prompt}") + prompt_tokens = self.tokenize(prompt, negative_prompt) + # log.info(f"[mgx] clip token: {prompt_tokens[0]['input_ids']}") + # log.info(f"[mgx] clip2 token: {prompt_tokens[1]['input_ids']}") + + # raise SystemExit("Checking if tokens match") + + if verbose: + print("Creating text embeddings...") + self.profile_start("clip") + hidden_states, text_embeddings = self.get_embeddings(prompt_tokens) + # log.info(f"[mgx] hidden_states (shape {hidden_states.shape}): {hidden_states}") + # log.info(f"[mgx] text_embeddings (shape {text_embeddings.shape}): {text_embeddings}") + # log.info(f"------DIVIDER--------") + self.profile_end("clip") + sample_size = list(self.tensors["vae"]["latent_sample"].size()) + if verbose: + print( + f"Creating random input data {sample_size} (latents) with {seed = }..." 
+ ) + + height, width = sample_size[2:] + time_id = [height * 8, width * 8, 0, 0, height * 8, width * 8] + time_ids = torch.tensor([time_id, time_id]).to(device=device) + + if latents_in is None: + noise = torch.randn( + sample_size, generator=torch.manual_seed(seed)).to(device=device) + # input h/w crop h/w output h/w + + if verbose: + print("Apply initial noise sigma\n") + + # print(f"noise.device -> {noise.device}") + # print(f"self.scheduler.init_noise_sigma.device -> {self.scheduler.init_noise_sigma.device}") + latents = noise * self.scheduler.init_noise_sigma + else: + + if verbose: + print("Apply initial noise sigma\n") + + # log.info(f"[MGX] input latents provided, no need to generate") + latents = latents_in * self.scheduler.init_noise_sigma + + if verbose: + print("Running denoising loop...") + self.profile_start("denoise") + for step, t in tqdm(enumerate(self.scheduler.timesteps), + total=len(self.scheduler.timesteps), + desc=f"Device {device} Denoising", + ncols=100, + leave=True): + if verbose: + print(f"#{step}/{len(self.scheduler.timesteps)} step") + latents = self.denoise_step(text_embeddings, + hidden_states, + latents, + t, + scale, + time_ids, + model="unetxl", + device=device) + self.profile_end("denoise") + if self.use_refiner and refiner_steps > 0: + hidden_states, text_embeddings = self.get_embeddings( + prompt_tokens, is_refiner=True) + # input h/w crop h/w scores + time_id_pos = time_id[:4] + [refiner_aesthetic_score] + time_id_neg = time_id[:4] + [refiner_negative_aesthetic_score] + time_ids = torch.tensor([time_id_pos, + time_id_neg]).to(device=device) + # need to set this for each run + self.scheduler.set_timesteps(refiner_steps, device=device) + # Add noise to latents using timesteps + latents = self.scheduler.add_noise(latents, noise, + self.scheduler.timesteps[:1]) + if verbose: + print("Running refiner denoising loop...") + for step, t in enumerate(self.scheduler.timesteps): + if verbose: + print(f"#{step}/{len(self.scheduler.timesteps)} step") + latents = self.denoise_step(text_embeddings, + hidden_states, + latents, + t, + scale, + time_ids, + model="refiner_unetxl", + device=device) + if verbose: + print("Scale denoised result...") + latents = 1 / 0.18215 * latents + + self.profile_start("decode") + if verbose: + print("Decode denoised result...") + image = self.decode(latents) + self.profile_end("decode") + + torch.cuda.synchronize() + self.profile_end("run") + # assert image.shape == (1, 3, 1024, 1024), f"Actual shape of image is: {image.shape}" + return image + + def print_summary(self, denoise_steps): + print('WARMUP\t{:>9.2f} ms'.format( + hip.hipEventElapsedTime(self.events['warmup'].start, + self.events['warmup'].end)[1])) + print('CLIP\t{:>9.2f} ms'.format( + hip.hipEventElapsedTime(self.events['clip'].start, + self.events['clip'].end)[1])) + print('UNetx{}\t{:>9.2f} ms'.format( + str(denoise_steps), + hip.hipEventElapsedTime(self.events['denoise'].start, + self.events['denoise'].end)[1])) + print('VAE-Dec\t{:>9.2f} ms'.format( + hip.hipEventElapsedTime(self.events['decode'].start, + self.events['decode'].end)[1])) + print('RUN\t{:>9.2f} ms'.format( + hip.hipEventElapsedTime(self.events['run'].start, + self.events['run'].end)[1])) + + # @measure + @staticmethod + def load_mgx_model(name, + shapes, + onnx_model_path, + compiled_model_path=None, + use_fp16=False, + force_compile=False, + exhaustive_tune=False, + offload_copy=True): + + log.info(f"Zixian: inside load_mgx_model") + print(f"Loading {name} model...") + + if compiled_model_path is None: 
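+            # No separate compiled-model path was given: fall back to the onnx path,
+            # so compiled .mxr files are looked up and cached next to the .onnx files.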
+ compiled_model_path = onnx_model_path + onnx_file = f"{onnx_model_path}/{name}/model.onnx" + mxr_file = f"{compiled_model_path}/{name}/model_{'fp16' if use_fp16 else 'fp32'}_{'gpu' if not offload_copy else 'oc'}.mxr" + log.info(f"Zixian: mxr_file: {mxr_file}") + + if not force_compile and os.path.isfile(mxr_file): + print(f"Found mxr, loading it from {mxr_file}") + model = mgx.load(mxr_file, format="msgpack") + elif os.path.isfile(onnx_file): + print(f"No mxr found at {mxr_file}") + print(f"[IMPORTANT] Parsing from {onnx_file}") + model = mgx.parse_onnx(onnx_file, map_input_dims=shapes) + if use_fp16: + print(f"[IMPORTANT] Unet quantizing to FP16...") + mgx.quantize_fp16(model) + + + model.compile(mgx.get_target("gpu"), + exhaustive_tune=exhaustive_tune, + offload_copy=offload_copy) + print(f"Saving {name} model to {mxr_file}") + os.makedirs(os.path.dirname(mxr_file), exist_ok=True) + mgx.save(model, mxr_file, format="msgpack") + else: + log.info(f"Zixian: no model found") + print( + f"No {name} model found at {onnx_file} or {mxr_file}. Please download it and re-try." + ) + sys.exit(1) + return model + + # @measure + def tokenize(self, prompt, negative_prompt): + def _tokenize(tokenizer): + return self.tokenizers[tokenizer]( + [prompt, negative_prompt], + padding="max_length", + max_length=self.tokenizers[tokenizer].model_max_length, + truncation=True, + return_tensors="pt") + + tokens = _tokenize("clip") + tokens2 = _tokenize("clip2") + return (tokens, tokens2) + + # @measure + def get_embeddings(self, prompt_tokens, is_refiner=False): + def _create_embedding(model, input): + copy_tensor(self.tensors[model]["input_ids"], input.input_ids) + run_model_async(self.models[model], self.model_args[model], + self.stream) + + clip_input, clip2_input = prompt_tokens + clip, clip2 = "clip", ("refiner_" if is_refiner else "") + "clip2" + if not is_refiner: + _create_embedding(clip, clip_input) + _create_embedding(clip2, clip2_input) + + hidden_states = torch.concatenate( + (self.tensors[clip][get_output_name(0)], + self.tensors[clip2][get_output_name(1)]), + axis=2) if not is_refiner else self.tensors[clip2][get_output_name( + 1)] + text_embeds = self.tensors[clip2][get_output_name(0)] + return (hidden_states, text_embeds) + + @staticmethod + def convert_to_rgb_image(image): + image = (image / 2 + 0.5).clamp(0, 1) + image = image.detach().cpu().permute(0, 2, 3, 1).numpy() + images = (image * 255).round().astype("uint8") + return Image.fromarray(images[0]) + + @staticmethod + def save_image(pil_image, filename="output.png"): + pil_image.save(filename) + + # @measure + def denoise_step(self, text_embeddings, hidden_states, latents, t, scale, + time_ids, model, device): + latents_model_input = torch.cat([latents] * 2) + latents_model_input = self.scheduler.scale_model_input( + latents_model_input, t).to(device=device) + timestep = torch.atleast_1d(t.to(device=device)) # convert 0D -> 1D + + copy_tensor(self.tensors[model]["sample"], latents_model_input) + copy_tensor(self.tensors[model]["encoder_hidden_states"], + hidden_states) + copy_tensor(self.tensors[model]["text_embeds"], text_embeddings) + copy_tensor(self.tensors[model]["timestep"], timestep) + copy_tensor(self.tensors[model]["time_ids"], time_ids) + run_model_async(self.models[model], self.model_args[model], + self.stream) + + noise_pred_text, noise_pred_uncond = torch.tensor_split( + self.tensors[model][get_output_name(0)], 2) + + # perform guidance + noise_pred = noise_pred_uncond + scale * (noise_pred_text - + noise_pred_uncond) + + # 
compute the previous noisy sample x_t -> x_t-1 + return self.scheduler.step(noise_pred, t, latents).prev_sample + + # @measure + def decode(self, latents): + copy_tensor(self.tensors["vae"]["latent_sample"], latents) + run_model_async(self.models["vae"], self.model_args["vae"], + self.stream) + return self.tensors["vae"][get_output_name(0)] + + # @measure + def warmup(self, num_runs): + self.profile_start("warmup") + init_fn = lambda x: torch.ones if "clip" in x else torch.randn + for model in self.models.keys(): + for tensor in self.tensors[model].values(): + copy_tensor(tensor, init_fn(model)(tensor.size())) + + for _ in range(num_runs): + for model in self.models.keys(): + run_model_async(self.models[model], self.model_args[model], + self.stream) + self.profile_end("warmup") + + +if __name__ == "__main__": + args = get_args() + + # sd = StableDiffusionMGX(args.pipeline_type, args.onnx_model_path, + # args.compiled_model_path, args.use_refiner, + # args.refiner_onnx_model_path, + # args.refiner_compiled_model_path, args.fp16, + # args.force_compile, args.exhaustive_tune) + + sd = StableDiffusionMGX("sdxl", onnx_model_path=args.onnx_model_path, + compiled_model_path=None, use_refiner=False, + refiner_onnx_model_path=None, + refiner_compiled_model_path=None, fp16=args.fp16, + force_compile=False, exhaustive_tune=True) + print("Warmup") + sd.warmup(5) + print("Run") + + prompt_list = [] + prompt_list.append(["A young man in a white shirt is playing tennis.", "tennis.jpg"]) + # prompt_list.append(["Lorem ipsum dolor sit amet, consectetur adipiscing elit", "woman.jpg"]) + prompt_list.append(["Astronaut crashlanding in Madison Square Garden, cold color palette, muted colors, detailed, 8k", "crash_astro.jpg"]) + # prompt_list.append(["John Cena giving The Rock an Attitude Adjustment off the roof, warm color palette, vivid colors, detailed, 8k", "cena_rock.jpg"]) + + for element in prompt_list: + prompt, img_name = element[0], element[1] + # result = sd.run(prompt, args.negative_prompt, args.steps, args.seed, + # args.scale, args.refiner_steps, + # args.refiner_aesthetic_score, + # args.refiner_negative_aesthetic_score, args.verbose) + + result = sd.run(prompt=prompt, steps=20, seed=args.seed, + scale=5.0, refiner_steps=0, + refiner_aesthetic_score=0.0, + refiner_negative_aesthetic_score=0.0, verbose=False) + + print("Summary") + sd.print_summary(args.steps) + + print("Convert result to rgb image...") + image = StableDiffusionMGX.convert_to_rgb_image(result) + StableDiffusionMGX.save_image(image, img_name) + print(f"Image saved to {img_name}") + + print("Cleanup") + sd.cleanup() \ No newline at end of file diff --git a/text_to_image/backend_migraphx.py b/text_to_image/backend_migraphx.py new file mode 100644 index 000000000..2df2605a8 --- /dev/null +++ b/text_to_image/backend_migraphx.py @@ -0,0 +1,301 @@ +from typing import Optional, List, Union +import migraphx as mgx + +import os +import torch +import logging +import sys +import backend +import time +import random +import json +import re + +from hip import hip +from PIL import Image +from functools import wraps +from collections import namedtuple +from transformers import CLIPTokenizer, CLIPTextModelWithProjection, CLIPProcessor, CLIPFeatureExtractor +from diffusers import StableDiffusionXLPipeline, EulerDiscreteScheduler +from argparse import ArgumentParser +from StableDiffusionMGX import StableDiffusionMGX +from huggingface_hub import hf_hub_download, list_repo_files +import numpy as np + +HipEventPair = namedtuple('HipEventPair', ['start', 
'end']) + +logging.basicConfig(level=logging.ERROR) +log = logging.getLogger("backend-mgx") + + +formatter = logging.Formatter("{levelname} - {message}", style="{") +file_handler = logging.FileHandler("backend_mgx.log", mode="a", encoding="utf-8") +file_handler.setLevel("INFO") +file_handler.setFormatter(formatter) +log.addHandler(file_handler) + +def download_model(repo_id, model_path): + # Zixian: Nov 10: Comment this out because model_path is current dir. + if os.path.exists(model_path): + log.info(f"MGX models already exists at {model_path}") + return + else: + os.makedirs(model_path, exist_ok=True) + + repo_files = list_repo_files(repo_id) + + files_to_download = [ + file for file in repo_files + if not file.endswith(".onnx") and not file.endswith("model_fp32_gpu.mxr") + ] + + for file_name in files_to_download: + local_file_path = os.path.join(model_path, file_name) + local_folder = os.path.dirname(local_file_path) + + # Create directory structure if it does not exist + os.makedirs(local_folder, exist_ok=True) + + # Download the file to the specific path + try: + hf_hub_download(repo_id=repo_id, filename=file_name, cache_dir=local_folder, local_dir=local_folder, local_dir_use_symlinks=False) + # log.info(f"Downloaded {file_name} to {local_file_path}") + except Exception as e: + log.error(f"Failed to download {file_name}: {e}") + + print (f"Zixian_in_the_log: Downloaded {file_name} to {local_file_path}") + +#! Yalu Ouyang [Nov 10 2024] Keep this in case we aren't allowed to modify coco.py +# class Decoder: +# def __init__(self, vocab_path): +# # Load the vocabulary with UTF-8 encoding to support non-ASCII characters +# with open(vocab_path, "r", encoding="utf-8") as f: +# vocab = json.load(f) + +# # Reverse the mapping: token_id -> word +# self.id_to_word = {int(id_): word for word, id_ in vocab.items()} + +# def decode_tokens(self, token_ids): +# # Ensure token_ids is a list, even if a tensor is passed +# if isinstance(token_ids, torch.Tensor): +# token_ids = token_ids.tolist() + +# # Handle both single sequences and batches +# if isinstance(token_ids[0], list): # Batch of sequences +# decoded_texts = [self._decode_sequence(sequence) for sequence in token_ids] +# return decoded_texts +# else: # Single sequence +# return self._decode_sequence(token_ids) + +# def _decode_sequence(self, token_ids): +# # Convert token IDs to words, handling any unknown tokens +# words = [self.id_to_word.get(token_id, "[UNK]") for token_id in token_ids] + +# # Remove special tokens and `` markers +# text = " ".join(words) +# text = re.sub(r"(<\|startoftext\|>|<\|endoftext\|>)", "", text) # Remove special tokens +# text = text.replace("", "").strip() # Remove `` markers and extra whitespace +# return text + +class BackendMIGraphX(backend.Backend): + def __init__( + self, + model_path=None, + model_id="xl", + guidance=5, #! 
To match the defaults of MiGraphX + steps=20, + batch_size=1, + device="cuda", + precision="fp32", + negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude", + ): + super(BackendMIGraphX, self).__init__() + # Zixian: Nov 10: Hard code to set model_path to current dir + # self.model_path = model_path + # self.model_path = os.getcwd() + self.model_path = os.path.join(os.getcwd(), "downloaded_model_folder") + if self.model_path is None: + raise SystemExit("Provide a valid Model Path to correctly run the program, exiting now...") + + self.pipeline_type = None + if model_id == "xl": + self.model_id = "SeaSponge/scc24_mlperf_mgx_exhaustive" + self.pipeline_type = "sdxl" + else: + raise ValueError(f"{model_id} is not a valid model id") + + download_model(self.model_id, self.model_path) + log.info(f"[mgx backend]: Returned from download_model") + + + self.device = device if torch.cuda.is_available() else "cpu" + self.device_num = int(device[-1]) \ + if (device != "cuda" and device != "cpu") else -1 + + # log.error(f"[mgx backend] self.device -> {self.device} | device_num -> {self.device_num}") + + if precision == "fp16": + self.dtype = torch.float16 + elif precision == "bf16": + self.dtype = torch.bfloat16 + else: + self.dtype = torch.float32 + + if torch.cuda.is_available(): + self.local_rank = 0 + self.world_size = 1 + + self.guidance = guidance + self.steps = steps + self.negative_prompt = negative_prompt + self.max_length_neg_prompt = 77 + self.batch_size = batch_size + + self.mgx = None + tknz_path1 = os.path.join(self.model_path, "tokenizer") + tknz_path2 = os.path.join(self.model_path, "tokenizer_2") + # self.scheduler = EulerDiscreteScheduler.from_pretrained(os.path.join(self.model_path, "scheduler")) + self.scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler") + log.info(f"Zixian: Loaded scheduler") + self.pipe = self.Pipe() + # self.pipe.tokenizer = CLIPTokenizer.from_pretrained(tknz_path1) + # self.pipe.tokenizer_2 = CLIPTokenizer.from_pretrained(tknz_path2) + self.pipe.tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="tokenizer") + self.pipe.tokenizer_2 = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="tokenizer_2") + log.info(f"Zixian: Loaded tokenizer & tokenizer2") + # log.info(f"Zixian_in_the_log tknz_path1: {tknz_path1}") + # log.info(f"Zixian_in_the_log tknz_path2: {tknz_path2}") + # self.decoder1 = Decoder(os.path.join(self.model_path, "tokenizer/vocab.json")) + # self.decoder2 = Decoder(os.path.join(self.model_path, "tokenizer_2/vocab.json")) + self.tokenizers = [self.pipe.tokenizer, self.pipe.tokenizer_2] + + class Pipe: + def __init__(self): + self.tokenizer = None + self.tokenizer_2 = None + + def version(self): + return torch.__version__ + + def name(self): + return "pytorch-SUT" + + def image_format(self): + return "NCHW" + + def load(self): + if self.model_path is None: + log.warning( + "Model path not provided, running with default hugging face weights\n" + "This may not be valid for official submissions" + ) + + raise SystemExit("Provide a valid Model Path to correctly run the program, exiting now...") + + else: + if self.device_num != -1: + # log.error(f"Hip set device to -> {self.device_num}") + hip.hipSetDevice(self.device_num) + + # raise SystemExit("Stopping to check") + + # Parameter explanations here: + # onnx_model_path = self.model_path + # path to compiled .mxr can be left as None 
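+            # Leaving compiled_model_path as None means StableDiffusionMGX.load_mgx_model
+            # will cache the compiled .mxr files alongside the .onnx models (see above).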
+ # Don't want to use refiner model + use_refiner = False + # Therefore refiner model path also None + # refiner compiled model path also None + + # set fp16 according to initialization input + fp16 = "all" if self.dtype == torch.float16 else None + # Don't want to force .onnx to .mxr compile + force_compile = False + # Use exhaustive tune when compilling .onnx -> .mxr + exhaustive_tune = True + + tokenizers = {"clip": self.tokenizers[0], "clip2": self.tokenizers[1]} + + self.mgx = StableDiffusionMGX(self.pipeline_type, onnx_model_path=self.model_path, + compiled_model_path=None, use_refiner=use_refiner, + refiner_onnx_model_path=None, + refiner_compiled_model_path=None, fp16=fp16, + force_compile=force_compile, exhaustive_tune=exhaustive_tune, tokenizers=tokenizers, + scheduler=self.scheduler) + + # log.info(f"[backend_migraphx.py]: after initializing self.mgx") + + # self.mgx.warmup(5) + + # log.info(f"[backend_migraphx.py]: after mgx.warmup") + + return self + + def predict(self, inputs): + images = [] + + # Explanation for mgx.run() arguments + # negative_prompt = self.negative_prompt + # steps = self.steps + # scale refers to guidance scale -> scale = self.guidance + # the default SDXLPipeline chooses a random seed everytime, we'll do so manually here + # not using refiner, so refiner_step = 0 + # not using refiner, so aesthetic_score = 0 + # not using refiner, so negative_aesthetic_score = 0 + # defaults to not verbose + verbose = False + #! The main pipeline from loadgen doesn't have text prompt, only tokens + + for i in range(0, len(inputs), self.batch_size): + latents_input = [inputs[idx]["latents"] for idx in range(i, min(i+self.batch_size, len(inputs)))] + latents_input = torch.cat(latents_input).to(self.device) + if self.batch_size == 1: + # prompt_token = inputs[i]["input_tokens"] + # log.info(f"[mgx backend batchsz=1] inputs[i] -> {inputs[i]}") + prompt_in = inputs[i]["caption"] + # log.info(f"[mgx backend] i -> {i} | prompt_in -> {prompt_in}") + seed = random.randint(0, 2**31 - 1) + + # prompt_in = self.decoder1.decode_tokens(prompt_token['input_ids']) + + result = self.mgx.run(prompt=prompt_in, negative_prompt=self.negative_prompt, steps=self.steps, seed=seed, + scale=self.guidance, refiner_steps=0, + refiner_aesthetic_score=0, + refiner_negative_aesthetic_score=0, verbose=verbose, + prompt_tokens=None, device=self.device, latents_in=latents_input) + + # result shape = (3, 1024, 1024) + + # img_name = f"{self.device}_{random.randint(0, 1000)}.jpg" + # image = StableDiffusionMGX.convert_to_rgb_image(result) + # StableDiffusionMGX.save_image(image, img_name) + # log.info(f"[mgx backend batchsz=1] Image saved to {img_name}") + #! 
COCO needs this to be 3-dimensions + + new_res = (result / 2 + 0.5).clamp(0, 1) + + # log.info(f"[mgx backend] type result: {type(result)} | result shape: {result.shape}") + # log.info(f"[mgx backend] type new_res: {type(new_res)} | new_res shape: {new_res.shape}") + # log.info(f"------DIVIDER--------") + images.extend(new_res) + + else: + prompt_list = [] + for prompt in inputs[i:min(i+self.batch_size, len(inputs))]: + assert isinstance(prompt, dict), "prompt (in inputs) isn't a dict" + # prompt_token = prompt["input_tokens"] + prompt_in = inputs[i]["caption"] + + + for prompt in prompt_list: + seed = random.randint(0, 2**31 - 1) + result = self.mgx.run(prompt=prompt, negative_prompt=self.negative_prompt, steps=self.steps, seed=seed, + scale=self.guidance, refiner_steps=0, + refiner_aesthetic_score=0, + refiner_negative_aesthetic_score=0, verbose=verbose, + prompt_tokens=None, device=self.device, latents_in=latents_input) + + new_res = (result / 2 + 0.5).clamp(0, 1) + images.extend(new_res) + + return images \ No newline at end of file diff --git a/text_to_image/main.py b/text_to_image/main.py index 7d4da2a0b..3d6f7d1d0 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -24,6 +24,9 @@ import dataset import coco +# import torchvision.transforms as T +# transform_im = T.ToPILImage() + from concurrent.futures import ThreadPoolExecutor, as_completed logging.basicConfig(level=logging.INFO) @@ -63,6 +66,12 @@ "backend": "pytorch-dist", "model-name": "stable-diffusion-xl", }, + # ? Yalu Ouyang modification: Oct 16 2024 + "stable-diffusion-xl-mgx": { + "dataset": "coco-1024", + "backend": "migraphx", + "model-name": "stable-diffusion-xl", + }, } SCENARIO_MAP = { @@ -75,20 +84,25 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument("--dataset", + default="coco-1024", + choices=SUPPORTED_DATASETS.keys(), help="dataset") + parser.add_argument("--dataset-path", + default="coco2014",help="path to the dataset") parser.add_argument( - "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + "--profile", + default="stable-diffusion-xl-mgx", + choices=SUPPORTED_PROFILES.keys(), help="standard profiles" ) parser.add_argument( "--scenario", - default="SingleStream", + default="Offline", help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( "--max-batchsize", type=int, - default=2, + default=1, help="max batch size in a single inference", ) parser.add_argument("--threads", default=1, type=int, help="threads") @@ -98,12 +112,13 @@ def get_args(): action="store_true", help="enable finding peak performance pass", ) - parser.add_argument("--backend", help="Name of the backend") + parser.add_argument("--backend", default='migraphx', help="Name of the backend") parser.add_argument("--model-name", help="Name of the model") parser.add_argument("--output", default="output", help="test results") parser.add_argument("--qps", type=int, help="target qps") - parser.add_argument("--model-path", help="Path to model weights") - parser.add_argument("--gpu-num", type=int, default=4, help="Number of gpus to run inference") + parser.add_argument("--model-path", + default="/work1/zixian/youyang1/models/sdxl-1.0-base", + help="Path to model weights") parser.add_argument( "--dtype", @@ -138,6 +153,9 @@ def get_args(): parser.add_argument( "--audit_conf", 
default="audit.config", help="config for LoadGen audit settings" ) + parser.add_argument( + "--gpu-num", default=4, type=int, help="number of gpus to use" + ) # arguments to save images # pass this argument for official submission # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images") @@ -149,7 +167,7 @@ def get_args(): parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--debug", action="store_true", help="debug") parser.add_argument( - "--performance-sample-count", type=int, help="performance sample count", default=5000 + "--performance-sample-count", type=int, help="performance sample count", default=1000 ) parser.add_argument( "--max-latency", type=float, help="mlperf max latency in pct tile" @@ -184,6 +202,12 @@ def get_backend(backend, **kwargs): from backend_pytorch import BackendPytorch backend = BackendPytorch(**kwargs) + + # ? Yalu Ouyang Modification: Nov 5 2024 + elif backend == "migraphx": + from backend_migraphx import BackendMIGraphX + + backend = BackendMIGraphX(**kwargs) elif backend == "debug": from backend_debug import BackendDebug @@ -230,15 +254,17 @@ def run_one_item(self, qitem: Item): processed_results = [] try: results = self.model.predict(qitem.inputs) + # log.info("[Line 254] runs fine after results") processed_results = self.post_process( results, qitem.content_id, qitem.inputs, self.result_dict ) + # log.info("[Line 258] runs fine after processed_results") if self.take_accuracy: self.post_process.add_results(processed_results) self.result_timing.append(time.time() - qitem.start) except Exception as ex: # pylint: disable=broad-except src = [self.ds.get_item_loc(i) for i in qitem.content_id] - log.error("thread: failed on contentid=%s, %s", src, ex) + log.error("[Line 262] thread: failed on contentid=%s, %s", src, ex) # since post_process will not run, fake empty responses processed_results = [[]] * len(qitem.query_id) finally: @@ -247,6 +273,7 @@ def run_one_item(self, qitem: Item): for idx, query_id in enumerate(qitem.query_id): response_array = array.array( "B", np.array(processed_results[idx], np.uint8).tobytes() + # "B", np.array(processed_results[idx], np.uint64).tobytes() ) response_array_refs.append(response_array) bi = response_array.buffer_info() @@ -287,13 +314,17 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): def handle_tasks(self, tasks_queue): """Worker thread.""" while True: + # log.info ('getting tasks') qitem = tasks_queue.get() + # log.info ('getten tasks') if qitem is None: # None in the queue indicates the parent want us to exit tasks_queue.task_done() break self.run_one_item(qitem) + # log.info ('going to task_done') tasks_queue.task_done() + # log.info ('tasks done') def enqueue(self, query_samples): idx = [q.index for q in query_samples] @@ -331,19 +362,26 @@ def main(): # batch_size=args.max_batchsize # ) # Zixian: Oct 21: create a list of backends for multi-gpu - - # Zixian: Nov 13: Force batchsize=2 since command line doesn't work - args.max_batchsize = 5 + # backends = [get_backend( + # args.backend, + # precision=args.dtype, + # device=f'cuda:{i}', + # model_path=args.model_path, + # batch_size=args.max_batchsize + # ) + # for i in [0, 1, 2, 3]] backends = [get_backend( args.backend, precision=args.dtype, - device=f'cuda:{i}', + device=f'cuda:{int (i/int (args.gpu_num / 4))}', model_path=args.model_path, batch_size=args.max_batchsize ) for i in np.arange (args.gpu_num)] + log.info(f"Zixian: Returned from 
get_backends") + if args.dtype == "fp16": dtype = torch.float16 @@ -361,8 +399,10 @@ def main(): # load model to backend # model = backend.load() + log.info(f"Zixian: entering backend.load") # Zixian: Oct 21: create a list of models corresponding to each backend models = [backend.load() for backend in backends] + log.info(f"Zixian: loaded models from all backend") # dataset to use dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] @@ -379,6 +419,7 @@ def main(): latent_dtype=dtype, latent_device=args.device, latent_framework=args.latent_framework, + pipe_type=args.backend, **kwargs, ) final_results = { @@ -430,38 +471,21 @@ def main(): # for _ in range(args.max_batchsize) # ] warmup_samples_gpus = [ - [ - { - "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), - "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), - "latents": latents_pt, - } - for _ in range(int(args.max_batchsize)) - ] - for model in models] * 3 # 3 times warmup samples + [ + { + "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), + "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), + "caption": syntetic_str, + "latents": latents_pt + } + for _ in range(int(args.max_batchsize)) + ] + for model in models] # Zixian: Oct 21: warm up each backend - # for idx, backend in enumerate (backends): - # for i in range(1): - # _ = backend.predict(warmup_samples_gpus[idx]) - - print (f'Start distributed warmup') - with ThreadPoolExecutor(max_workers=len(backends)) as executor: - # Map each runner to its respective sublist - futures = { - executor.submit(backend.predict, queries): backend - for backend, queries in zip(backends, warmup_samples_gpus) - } - - # Optionally process the results - for future in as_completed(futures): - backend = futures[future] - try: - result = future.result() - print(f'Warmup backend {backend} enqueued successfully.') - except Exception as exc: - print(f'Warmup backend {backend} generated an exception: {exc}') - + for idx, backend in enumerate (backends): + for i in range(1): + _ = backend.predict(warmup_samples_gpus[idx]) scenario = SCENARIO_MAP[args.scenario] runner_map = { @@ -488,8 +512,6 @@ def issue_queries(query_samples): query_samples_len = len (query_samples) query_samples_seg_len = query_samples_len / len (runners) splitted_query_samples = [] - - for idx in range (len (runners)): log.info (f'\n\n\n') log.info (f'idx: {idx}') @@ -499,9 +521,9 @@ def issue_queries(query_samples): # splitted_query_samples.append (query_samples[idx*query_samples_seg_len:]) # else: # splitted_query_samples.append (query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len]) + splitted_query_samples.append (query_samples [int(round(query_samples_seg_len * idx)): int(round(query_samples_seg_len * (idx + 1)))]) - - # splitted_query_samples = [query_samples[int(round(avg * i)): int(round(avg * (i + 1)))] for i in range(b)] + with ThreadPoolExecutor(max_workers=len(runners)) as executor: # Map each runner to its respective sublist @@ -546,8 +568,6 @@ def flush_queries(): settings.min_duration_ms = args.time * MILLI_SEC settings.max_duration_ms = args.time * MILLI_SEC - # Zixian: Nov8: manually setting args.qps to 1 - # args.qps=1.0 if args.qps: qps = float(args.qps) settings.server_target_qps = qps @@ -568,9 +588,8 @@ def flush_queries(): if args.performance_sample_count else min(count, 500) ) - - # count = 200 sut = lg.ConstructSUT(issue_queries, flush_queries) + #! 
[Yalu Ouyang] count here affects how many items to run (even for accuracy) qsl = lg.ConstructQSL( count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples ) @@ -589,26 +608,30 @@ def flush_queries(): lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config) + + log.info("Loadgen finished tests") if args.accuracy: post_proc.finalize(result_dict, ds, output_dir=args.output) final_results["accuracy_results"] = result_dict - # post_proc.save_images(saved_images_ids, ds) + post_proc.save_images(saved_images_ids, ds) + log.info("After processing accuracy") - lg.DestroyQSL(qsl) - lg.DestroySUT(sut) - - # for runner in runners: - # runner.finish() - with ThreadPoolExecutor(max_workers=len(runners)) as executor: - # Map each runner to its respective sublist - futures = { - executor.submit(runner.finish()): runner - for runner in runners - } + for runner in runners: + runner.finish() + + log.info("After runner.finish()") + # with ThreadPoolExecutor(max_workers=len(runners)) as executor: + # # Map each runner to its respective sublist + # futures = { + # executor.submit(runner.finish()): runner + # for runner in runners + # } + lg.DestroyQSL(qsl) + lg.DestroySUT(sut) # # write final results From cb77eaef78278bdfc70854d5a2a926001de8b6c8 Mon Sep 17 00:00:00 2001 From: zixianwang2022 Date: Sat, 16 Nov 2024 21:23:27 -0600 Subject: [PATCH 17/21] copy changes to fork 4 pr --- text_to_image/coco.py | 84 ++-- text_to_image/main.py | 508 ++-------------------- text_to_image/py_demo_server_lon.py | 568 +++++++++++++++++++++++++ text_to_image/sut_over_network_demo.py | 455 ++++++++++++++++++++ 4 files changed, 1116 insertions(+), 499 deletions(-) create mode 100644 text_to_image/py_demo_server_lon.py create mode 100644 text_to_image/sut_over_network_demo.py diff --git a/text_to_image/coco.py b/text_to_image/coco.py index e9499b0e6..92773ed9c 100644 --- a/text_to_image/coco.py +++ b/text_to_image/coco.py @@ -35,22 +35,27 @@ def __init__( latent_dtype=torch.float32, latent_device="cuda", latent_framework="torch", + pipe_type=None, **kwargs, ): super().__init__() - self.captions_df = pd.read_csv( - f"{data_path}/captions/captions.tsv", sep="\t") + self.captions_df = pd.read_csv(f"{data_path}/captions/captions.tsv", sep="\t") self.image_size = image_size self.preprocessed_dir = os.path.abspath(f"{data_path}/preprocessed/") self.img_dir = os.path.abspath(f"{data_path}/validation/data/") self.name = name + + self.pipe_tokenizer = pipe_tokenizer + self.pipe_tokenizer_2 = pipe_tokenizer_2 # Preprocess prompts self.captions_df["input_tokens"] = self.captions_df["caption"].apply( - lambda x: self.preprocess(x, pipe_tokenizer) + # lambda x: self.preprocess(x, pipe_tokenizer) + lambda x: x ) self.captions_df["input_tokens_2"] = self.captions_df["caption"].apply( - lambda x: self.preprocess(x, pipe_tokenizer_2) + # lambda x: self.preprocess(x, pipe_tokenizer_2) + lambda x: x ) self.latent_dtype = latent_dtype self.latent_device = latent_device if torch.cuda.is_available() else "cpu" @@ -66,6 +71,8 @@ def __init__( .to(latent_dtype) .to(latent_device) ) + + self.pipe_type = pipe_type def preprocess(self, prompt, tokenizer): converted_prompt = self.convert_prompt(prompt, tokenizer) @@ -113,14 +120,34 @@ def convert_prompt(self, prompt, tokenizer): def get_item(self, id): return dict(self.captions_df.loc[id], latents=self.latents) + #! 
Yalu Ouyang [Nov 14 2024] Overrides parent Dataset class, default behavior is same though + def get_samples(self, id_list): + if self.pipe_type == "migraphx": + #! Yalu Ouyang [Nov 14 2024] MGX backend just needs text prompt + data = [ + { + "caption": self.items_inmemory[id]["caption"], + "latents": self.items_inmemory[id]["latents"], + } + for id in id_list + ] + else: + data = [ + { + "input_tokens": self.items_inmemory[id]["input_tokens"], + "input_tokens_2": self.items_inmemory[id]["input_tokens_2"], + "latents": self.items_inmemory[id]["latents"], + } + for id in id_list + ] + images = [self.items_inmemory[id]["file_name"] for id in id_list] + return data, images + def get_item_count(self): return len(self.captions_df) def get_img(self, id): - img = Image.open( - self.img_dir + - "/" + - self.captions_df.loc[id]["file_name"]) + img = Image.open(self.img_dir + "/" + self.captions_df.loc[id]["file_name"]) return self.image_to_tensor(img) def get_imgs(self, id_list): @@ -141,11 +168,7 @@ def get_item_loc(self, id): class PostProcessCoco: def __init__( - self, - device="cpu", - dtype="uint8", - statistics_path=os.path.join( - os.path.dirname(__file__), "tools", "val2014.npz"), + self, device="cpu", dtype="uint8", statistics_path=os.path.join(os.path.dirname(__file__), "tools", "val2014.npz") ): self.results = [] self.good = 0 @@ -167,33 +190,27 @@ def add_results(self, results): def __call__(self, results, ids, expected=None, result_dict=None): self.content_ids.extend(ids) return [ - (t.cpu().permute(1, 2, 0).float().numpy() * 255) - .round() - .astype(self.numpy_dtype) + (t.cpu().permute(1, 2, 0).float().numpy() * 255).round().astype(self.numpy_dtype) for t in results ] - + def save_images(self, ids, ds): info = [] idx = {} - for i, image_id in enumerate(self.content_ids): - if image_id in ids: - idx[image_id] = i + for i, id in enumerate(self.content_ids): + if id in ids: + idx[id] = i if not os.path.exists("images/"): os.makedirs("images/", exist_ok=True) - for image_id in ids: - if not idx.get(image_id): - print( - f"image id {image_id} is missing in the results. 
Hence not saved.") - continue - caption = ds.get_caption(image_id) - generated = Image.fromarray(self.results[idx[image_id]]) - image_path_tmp = f"images/{self.content_ids[idx[image_id]]}.png" + for id in ids: + caption = ds.get_caption(id) + generated = Image.fromarray(self.results[idx[id]]) + image_path_tmp = f"images/{self.content_ids[idx[id]]}.png" generated.save(image_path_tmp) - info.append((self.content_ids[idx[image_id]], caption)) + info.append((self.content_ids[idx[id]], caption)) with open("images/captions.txt", "w+") as f: - for image_id, caption in info: - f.write(f"{image_id} {caption}\n") + for id, caption in info: + f.write(f"{id} {caption}\n") def start(self): self.results = [] @@ -209,10 +226,7 @@ def finalize(self, result_dict, ds=None, output_dir=None): 100 * clip.get_clip_score(caption, generated).item() ) - fid_score = compute_fid( - self.results, - self.statistics_path, - self.device) + fid_score = compute_fid(self.results, self.statistics_path, self.device) result_dict["FID_SCORE"] = fid_score result_dict["CLIP_SCORE"] = np.mean(self.clip_scores) diff --git a/text_to_image/main.py b/text_to_image/main.py index 3d6f7d1d0..6ff1fb63a 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -1,3 +1,4 @@ + """ mlperf inference benchmarking tool """ @@ -21,14 +22,16 @@ import numpy as np import torch +import subprocess +from py_demo_server_lon import main as server_main + import dataset import coco -# import torchvision.transforms as T -# transform_im = T.ToPILImage() - from concurrent.futures import ThreadPoolExecutor, as_completed +# from sut_over_network_demo import main as + logging.basicConfig(level=logging.INFO) log = logging.getLogger("main") @@ -45,6 +48,13 @@ } +SCENARIO_MAP = { + "SingleStream": lg.TestScenario.SingleStream, + "MultiStream": lg.TestScenario.MultiStream, + "Server": lg.TestScenario.Server, + "Offline": lg.TestScenario.Offline, +} + SUPPORTED_PROFILES = { "defaults": { "dataset": "coco-1024", @@ -66,37 +76,24 @@ "backend": "pytorch-dist", "model-name": "stable-diffusion-xl", }, - # ? 
Yalu Ouyang modification: Oct 16 2024 - "stable-diffusion-xl-mgx": { + "stable-diffusion-xl-migraphx": { "dataset": "coco-1024", "backend": "migraphx", "model-name": "stable-diffusion-xl", }, } -SCENARIO_MAP = { - "SingleStream": lg.TestScenario.SingleStream, - "MultiStream": lg.TestScenario.MultiStream, - "Server": lg.TestScenario.Server, - "Offline": lg.TestScenario.Offline, -} - - def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--dataset", - default="coco-1024", - choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", - default="coco2014",help="path to the dataset") + parser.add_argument('--sut-server', default=['http://t004-005:8008', "http://t006-001:8008"], nargs='+', help='A list of server address & port') #'http://t004-006:8008' + parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") + parser.add_argument("--dataset-path", required=True, help="path to the dataset") parser.add_argument( - "--profile", - default="stable-diffusion-xl-mgx", - choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" ) parser.add_argument( "--scenario", - default="Offline", + default="SingleStream", help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( @@ -112,17 +109,15 @@ def get_args(): action="store_true", help="enable finding peak performance pass", ) - parser.add_argument("--backend", default='migraphx', help="Name of the backend") + parser.add_argument("--backend", help="Name of the backend", default="migraphx") parser.add_argument("--model-name", help="Name of the model") parser.add_argument("--output", default="output", help="test results") parser.add_argument("--qps", type=int, help="target qps") - parser.add_argument("--model-path", - default="/work1/zixian/youyang1/models/sdxl-1.0-base", - help="Path to model weights") + parser.add_argument("--model-path", help="Path to model weights") parser.add_argument( "--dtype", - default="fp16", + default="fp32", choices=["fp32", "fp16", "bf16"], help="dtype of the model", ) @@ -153,9 +148,6 @@ def get_args(): parser.add_argument( "--audit_conf", default="audit.config", help="config for LoadGen audit settings" ) - parser.add_argument( - "--gpu-num", default=4, type=int, help="number of gpus to use" - ) # arguments to save images # pass this argument for official submission # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images") @@ -167,7 +159,7 @@ def get_args(): parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--debug", action="store_true", help="debug") parser.add_argument( - "--performance-sample-count", type=int, help="performance sample count", default=1000 + "--performance-sample-count", type=int, help="performance sample count", default=5000 ) parser.add_argument( "--max-latency", type=float, help="mlperf max latency in pct tile" @@ -197,449 +189,37 @@ def get_args(): return args -def get_backend(backend, **kwargs): - if backend == "pytorch": - from backend_pytorch import BackendPytorch - - backend = BackendPytorch(**kwargs) - - # ? 
Yalu Ouyang Modification: Nov 5 2024 - elif backend == "migraphx": - from backend_migraphx import BackendMIGraphX - - backend = BackendMIGraphX(**kwargs) - - elif backend == "debug": - from backend_debug import BackendDebug - - backend = BackendDebug() - else: - raise ValueError("unknown backend: " + backend) - return backend - - -class Item: - """An item that we queue for processing by the thread pool.""" - - def __init__(self, query_id, content_id, inputs, img=None): - self.query_id = query_id - self.content_id = content_id - self.img = img - self.inputs = inputs - self.start = time.time() - - -class RunnerBase: - def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): - self.take_accuracy = False - self.ds = ds - self.model = model - self.post_process = post_proc - self.threads = threads - self.take_accuracy = False - self.max_batchsize = max_batchsize - self.result_timing = [] - - def handle_tasks(self, tasks_queue): - pass - - def start_run(self, result_dict, take_accuracy): - self.result_dict = result_dict - self.result_timing = [] - self.take_accuracy = take_accuracy - self.post_process.start() - - def run_one_item(self, qitem: Item): - # run the prediction - processed_results = [] - try: - results = self.model.predict(qitem.inputs) - # log.info("[Line 254] runs fine after results") - processed_results = self.post_process( - results, qitem.content_id, qitem.inputs, self.result_dict - ) - # log.info("[Line 258] runs fine after processed_results") - if self.take_accuracy: - self.post_process.add_results(processed_results) - self.result_timing.append(time.time() - qitem.start) - except Exception as ex: # pylint: disable=broad-except - src = [self.ds.get_item_loc(i) for i in qitem.content_id] - log.error("[Line 262] thread: failed on contentid=%s, %s", src, ex) - # since post_process will not run, fake empty responses - processed_results = [[]] * len(qitem.query_id) - finally: - response_array_refs = [] - response = [] - for idx, query_id in enumerate(qitem.query_id): - response_array = array.array( - "B", np.array(processed_results[idx], np.uint8).tobytes() - # "B", np.array(processed_results[idx], np.uint64).tobytes() - ) - response_array_refs.append(response_array) - bi = response_array.buffer_info() - response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) - lg.QuerySamplesComplete(response) - - def enqueue(self, query_samples): - idx = [q.index for q in query_samples] - query_id = [q.id for q in query_samples] - if len(query_samples) < self.max_batchsize: - data, label = self.ds.get_samples(idx) - self.run_one_item(Item(query_id, idx, data, label)) - else: - bs = self.max_batchsize - for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i : i + bs]) - self.run_one_item( - Item(query_id[i : i + bs], idx[i : i + bs], data, label) - ) - - def finish(self): - pass - - -class QueueRunner(RunnerBase): - def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): - super().__init__(model, ds, threads, post_proc, max_batchsize) - self.tasks = Queue(maxsize=threads * 4) - self.workers = [] - self.result_dict = {} - - for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) - worker.daemon = True - self.workers.append(worker) - worker.start() - - def handle_tasks(self, tasks_queue): - """Worker thread.""" - while True: - # log.info ('getting tasks') - qitem = tasks_queue.get() - # log.info ('getten tasks') - if qitem is None: - # None in the queue indicates the parent want us to exit - 
tasks_queue.task_done() - break - self.run_one_item(qitem) - # log.info ('going to task_done') - tasks_queue.task_done() - # log.info ('tasks done') - def enqueue(self, query_samples): - idx = [q.index for q in query_samples] - query_id = [q.id for q in query_samples] - if len(query_samples) < self.max_batchsize: - data, label = self.ds.get_samples(idx) - self.tasks.put(Item(query_id, idx, data, label)) - else: - bs = self.max_batchsize - for i in range(0, len(idx), bs): - ie = i + bs - data, label = self.ds.get_samples(idx[i:ie]) - self.tasks.put(Item(query_id[i:ie], idx[i:ie], data, label)) - def finish(self): - # exit all threads - for _ in self.workers: - self.tasks.put(None) - for worker in self.workers: - worker.join() - - -def main(): +def main(): + args = get_args() - log.info(args) - - # find backend - # backend = get_backend( - # args.backend, - # precision=args.dtype, - # device=args.device, - # model_path=args.model_path, - # batch_size=args.max_batchsize - # ) - # Zixian: Oct 21: create a list of backends for multi-gpu - # backends = [get_backend( - # args.backend, - # precision=args.dtype, - # device=f'cuda:{i}', - # model_path=args.model_path, - # batch_size=args.max_batchsize - # ) - # for i in [0, 1, 2, 3]] + # Define the command and arguments + # command = ['python', 'script_to_run.py', '--num', '10', '--text', 'Hello, world!'] - backends = [get_backend( - args.backend, - precision=args.dtype, - device=f'cuda:{int (i/int (args.gpu_num / 4))}', - model_path=args.model_path, - batch_size=args.max_batchsize - ) - for i in np.arange (args.gpu_num)] - - log.info(f"Zixian: Returned from get_backends") - - - if args.dtype == "fp16": - dtype = torch.float16 - elif args.dtype == "bf16": - dtype = torch.bfloat16 - else: - dtype = torch.float32 - - # --count applies to accuracy mode only and can be used to limit the number of images - # for testing. 
- count_override = False - count = args.count - if count: - count_override = True - - # load model to backend - # model = backend.load() - log.info(f"Zixian: entering backend.load") - # Zixian: Oct 21: create a list of models corresponding to each backend - models = [backend.load() for backend in backends] - log.info(f"Zixian: loaded models from all backend") - - # dataset to use - dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] - ds = dataset_class( - data_path=args.dataset_path, - name=args.dataset, - pre_process=pre_proc, - count=count, - threads=args.threads, - # pipe_tokenizer=model.pipe.tokenizer, - # pipe_tokenizer_2=model.pipe.tokenizer_2, - pipe_tokenizer=models[0].pipe.tokenizer, - pipe_tokenizer_2=models[0].pipe.tokenizer_2, - latent_dtype=dtype, - latent_device=args.device, - latent_framework=args.latent_framework, - pipe_type=args.backend, - **kwargs, - ) - final_results = { - # "runtime": model.name(), - # "version": model.version(), - "runtime": models[0].name(), - "version": models[0].version(), - "time": int(time.time()), - "args": vars(args), - "cmdline": str(args), - } - - mlperf_conf = os.path.abspath(args.mlperf_conf) - if not os.path.exists(mlperf_conf): - log.error("{} not found".format(mlperf_conf)) - sys.exit(1) - - user_conf = os.path.abspath(args.user_conf) - if not os.path.exists(user_conf): - log.error("{} not found".format(user_conf)) - sys.exit(1) - - audit_config = os.path.abspath(args.audit_conf) - - if args.accuracy: - ids_path = os.path.abspath(args.ids_path) - with open(ids_path) as f: - saved_images_ids = [int(_) for _ in f.readlines()] - - if args.output: - output_dir = os.path.abspath(args.output) - os.makedirs(output_dir, exist_ok=True) - os.chdir(output_dir) - - # - # make one pass over the dataset to validate accuracy - # - count = ds.get_item_count() - - # warmup - syntetic_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit" - latents_pt = torch.rand(ds.latents.shape, dtype=dtype).to(args.device) - # warmup_samples = [ - # { - # "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), - # "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), - # "latents": latents_pt, - # } - # for _ in range(args.max_batchsize) - # ] - warmup_samples_gpus = [ - [ - { - "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), - "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), - "caption": syntetic_str, - "latents": latents_pt - } - for _ in range(int(args.max_batchsize)) - ] - for model in models] + server_main (args) - # Zixian: Oct 21: warm up each backend - for idx, backend in enumerate (backends): - for i in range(1): - _ = backend.predict(warmup_samples_gpus[idx]) - - scenario = SCENARIO_MAP[args.scenario] - runner_map = { - lg.TestScenario.SingleStream: RunnerBase, - lg.TestScenario.MultiStream: QueueRunner, - lg.TestScenario.Server: QueueRunner, - lg.TestScenario.Offline: QueueRunner, - } + # command = ['python', + # 'py_demo_server_lon.py', + # '--sut-server http://t007-001:8888 http://t006-001:8888', + # '--dataset=coco-1024', + # '--dataset-path=/work1/zixian/ziw081/inference/text_to_image/coco2014', + # '--profile=stable-diffusion-xl-pytorch', + # '--dtype=fp16', + # '--device=cuda', + # '--time=30', + # '--scenario=Offline', + # '--max-batchsize=4' + # ] + + + # # Run the command + # subprocess.run(command) - # Zixian: Oct 21: create a list of runner - # runner = runner_map[scenario]( - # model, ds, args.threads, post_proc=post_proc, 
max_batchsize=args.max_batchsize - # ) - runners = [runner_map[scenario]( - model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize - ) - for model in models] - - # def issue_queries(query_samples): - # runner.enqueue(query_samples) - def issue_queries(query_samples): - print (f'\n\n len (query_samples): {len (query_samples)} \n\n') - - query_samples_len = len (query_samples) - query_samples_seg_len = query_samples_len / len (runners) - splitted_query_samples = [] - for idx in range (len (runners)): - log.info (f'\n\n\n') - log.info (f'idx: {idx}') - log.info (f'query_samples_len: {query_samples_len}') - log.info (f'idx: {idx}') - # if idx == len (runners) -1: - # splitted_query_samples.append (query_samples[idx*query_samples_seg_len:]) - # else: - # splitted_query_samples.append (query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len]) - - splitted_query_samples.append (query_samples [int(round(query_samples_seg_len * idx)): int(round(query_samples_seg_len * (idx + 1)))]) - - - with ThreadPoolExecutor(max_workers=len(runners)) as executor: - # Map each runner to its respective sublist - futures = { - executor.submit(runner.enqueue, queries): runner - for runner, queries in zip(runners, splitted_query_samples) - } - - # Optionally process the results - for future in as_completed(futures): - runner = futures[future] - try: - result = future.result() - print(f'Runner {runner} enqueued successfully.') - except Exception as exc: - print(f'Runner {runner} generated an exception: {exc}') - - def flush_queries(): - pass - - log_output_settings = lg.LogOutputSettings() - log_output_settings.outdir = output_dir - log_output_settings.copy_summary_to_stdout = False - log_settings = lg.LogSettings() - log_settings.enable_trace = args.debug - log_settings.log_output = log_output_settings - - settings = lg.TestSettings() - settings.FromConfig(mlperf_conf, args.model_name, args.scenario) - settings.FromConfig(user_conf, args.model_name, args.scenario) - if os.path.exists(audit_config): - settings.FromConfig(audit_config, args.model_name, args.scenario) - settings.scenario = scenario - settings.mode = lg.TestMode.PerformanceOnly - if args.accuracy: - settings.mode = lg.TestMode.AccuracyOnly - if args.find_peak_performance: - settings.mode = lg.TestMode.FindPeakPerformance - - if args.time: - # override the time we want to run - settings.min_duration_ms = args.time * MILLI_SEC - settings.max_duration_ms = args.time * MILLI_SEC - - if args.qps: - qps = float(args.qps) - settings.server_target_qps = qps - settings.offline_expected_qps = qps - - if count_override: - settings.min_query_count = count - settings.max_query_count = count - - if args.samples_per_query: - settings.multi_stream_samples_per_query = args.samples_per_query - if args.max_latency: - settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) - - performance_sample_count = ( - args.performance_sample_count - if args.performance_sample_count - else min(count, 500) - ) - sut = lg.ConstructSUT(issue_queries, flush_queries) - #! 
[Yalu Ouyang] count here affects how many items to run (even for accuracy) - qsl = lg.ConstructQSL( - count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples - ) - - log.info("starting {}".format(scenario)) - result_dict = {"scenario": str(scenario)} - for runner in runners: - runner.start_run(result_dict, args.accuracy) - # with ThreadPoolExecutor(max_workers=len(runners)) as executor: - # # Map each runner to its respective sublist - # futures = { - # executor.submit(runner.finish(), (result_dict, args.accuracy)): runner - # for runner in runners - # } - - - lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config) - log.info("Loadgen finished tests") - - if args.accuracy: - post_proc.finalize(result_dict, ds, output_dir=args.output) - final_results["accuracy_results"] = result_dict - post_proc.save_images(saved_images_ids, ds) - - log.info("After processing accuracy") - - for runner in runners: - runner.finish() - - log.info("After runner.finish()") - # with ThreadPoolExecutor(max_workers=len(runners)) as executor: - # # Map each runner to its respective sublist - # futures = { - # executor.submit(runner.finish()): runner - # for runner in runners - # } - - - lg.DestroyQSL(qsl) - lg.DestroySUT(sut) - - # - # write final results - # - if args.output: - with open("results.json", "w") as f: - json.dump(final_results, f, sort_keys=True, indent=4) - if __name__ == "__main__": main() diff --git a/text_to_image/py_demo_server_lon.py b/text_to_image/py_demo_server_lon.py new file mode 100644 index 000000000..c2e4128ec --- /dev/null +++ b/text_to_image/py_demo_server_lon.py @@ -0,0 +1,568 @@ +""" +Python demo showing how to use the MLPerf Inference LoadGen over the Network bindings. +This program runs on the LON Node side. +It runs the demo in MLPerf server mode over the network. +It communicates over the network with Network SUT nodes, +which are running the networked SUT code. +""" + +import argparse +import threading +import requests +import array +import time +import json +import array +import collections +import logging +import os +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed + +from absl import app +# from absl import flags +import mlperf_loadgen as lg +import numpy as np +import torch + +import struct + +import dataset +import coco + +from queue import Queue + +# FLAGS = flags.FLAGS + +# flags.DEFINE_list( +# "sut_server", "http://localhost:8000", "Address of the server(s) under test." 
+# ) + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("main") + +NANO_SEC = 1e9 +MILLI_SEC = 1000 + +SUPPORTED_DATASETS = { + "coco-1024": ( + coco.Coco, + dataset.preprocess, + coco.PostProcessCoco(), + {"image_size": [3, 1024, 1024]}, + ) +} + +SUPPORTED_PROFILES = { + "defaults": { + "dataset": "coco-1024", + "backend": "pytorch", + "model-name": "stable-diffusion-xl", + }, + "debug": { + "dataset": "coco-1024", + "backend": "debug", + "model-name": "stable-diffusion-xl", + }, + "stable-diffusion-xl-pytorch": { + "dataset": "coco-1024", + "backend": "pytorch", + "model-name": "stable-diffusion-xl", + }, + "stable-diffusion-xl-pytorch-dist": { + "dataset": "coco-1024", + "backend": "pytorch-dist", + "model-name": "stable-diffusion-xl", + }, + "stable-diffusion-migraphx": { + "dataset": "coco-1024", + "backend": "migraphx", + "model-name": "stable-diffusion-xl", + }, +} + +SCENARIO_MAP = { + "SingleStream": lg.TestScenario.SingleStream, + "MultiStream": lg.TestScenario.MultiStream, + "Server": lg.TestScenario.Server, + "Offline": lg.TestScenario.Offline, +} + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--sut-server', required=True, nargs='+', help='A list of server address & port') + parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") + parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument( + "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + ) + parser.add_argument( + "--scenario", + default="SingleStream", + help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), + ) + parser.add_argument( + "--max-batchsize", + type=int, + default=1, + help="max batch size in a single inference", + ) + parser.add_argument("--threads", default=1, type=int, help="threads") + parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass", + ) + parser.add_argument("--backend", help="Name of the backend", default="migraphx") + parser.add_argument("--model-name", help="Name of the model") + parser.add_argument("--output", default="output", help="test results") + parser.add_argument("--qps", type=int, help="target qps") + parser.add_argument("--model-path", help="Path to model weights") + + parser.add_argument( + "--dtype", + default="fp32", + choices=["fp32", "fp16", "bf16"], + help="dtype of the model", + ) + parser.add_argument( + "--device", + default="cuda", + choices=["cuda", "cpu", "rocm"], + help="device to run the benchmark", + ) + parser.add_argument( + "--latent-framework", + default="torch", + choices=["torch", "numpy"], + help="framework to load the latents", + ) + + # file to use mlperf rules compliant parameters + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" + ) + # file for user LoadGen settings such as target QPS + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + # file for LoadGen audit settings + parser.add_argument( + "--audit_conf", default="audit.config", help="config for LoadGen audit settings" + ) + # arguments to save images + # pass this argument for official submission + # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images") + # do not modify this argument for official submission + 
parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt") + + # below will override mlperf rules compliant settings - don't use for official submission + parser.add_argument("--time", type=int, help="time to scan in seconds") + parser.add_argument("--count", type=int, help="dataset items to use") + parser.add_argument("--debug", action="store_true", help="debug") + parser.add_argument( + "--performance-sample-count", type=int, help="performance sample count", default=5000 + ) + parser.add_argument( + "--max-latency", type=float, help="mlperf max latency in pct tile" + ) + parser.add_argument( + "--samples-per-query", + default=8, + type=int, + help="mlperf multi-stream samples per query", + ) + args = parser.parse_args() + + # don't use defaults in argparser. Instead we default to a dict, override that with a profile + # and take this as default unless command line give + defaults = SUPPORTED_PROFILES["defaults"] + + if args.profile: + profile = SUPPORTED_PROFILES[args.profile] + defaults.update(profile) + for k, v in defaults.items(): + kc = k.replace("-", "_") + if getattr(args, kc) is None: + setattr(args, kc, v) + + if args.scenario not in SCENARIO_MAP: + parser.error("valid scanarios:" + str(list(SCENARIO_MAP.keys()))) + return args + +def get_backend(backend, **kwargs): + if backend == "pytorch": + from backend_pytorch import BackendPytorch + + backend = BackendPytorch(**kwargs) + + elif backend == "migraphx": + from backend_migraphx import BackendMIGraphX + + backend = BackendMIGraphX(**kwargs) + + elif backend == "debug": + from backend_debug import BackendDebug + + backend = BackendDebug() + else: + raise ValueError("unknown backend: " + backend) + return backend + +class QSL: + def __init__(self, total_sample_count, performance_sample_count, ds=None): + # self.eval_features = { + # i: {"index": i, "id": i} for i in range(total_sample_count) + # } + self.qsl = lg.ConstructQSL( + total_sample_count, + performance_sample_count, + ds.load_query_samples, + ds.unload_query_samples + ) + + def __del__(self): + lg.DestroyQSL(self.qsl) + +class QDL: + """QDL acting as a proxy to the SUT. + This QDL communicates with the SUT via HTTP. + It uses two endpoints to communicate with the SUT: + - /predict/ : Send a query to the SUT and get a response. + - /getname/ : Get the name of the SUT. Send a getname to the SUT and get a response. + """ + + def __init__(self, qsl: QSL, sut_server_addr: list, ds=None): + """ + Constructor for the QDL. + Args: + qsl: The QSL to use. + sut_server_addr: A list of addresses of the SUT. + """ + self.qsl = qsl + + # Construct QDL from the python binding + self.qdl = lg.ConstructQDL( + self.issue_query, self.flush_queries, self.client_get_name + ) + self.sut_server_addr = sut_server_addr + self.ds = ds + + + def issue_query(self, query_samples): + """Process the query to send to the SUT""" + threading.Thread( + target=self.process_query_async, + args=[query_samples], + daemon=True # remove + ).start() + + def flush_queries(self): + """Flush the queries. 
Dummy implementation.""" + pass + + def process_query_async(self, query_samples): + """Serialize the query, send it to the SUT in round robin, and return the deserialized response.""" + + query_samples_len = len (query_samples) + query_samples_seg_len = int (query_samples_len / len (self.sut_server_addr)) + splitted_query_samples = [] + for idx in range (len (self.sut_server_addr)): + if idx == len (self.sut_server_addr) -1: + splitted_query_samples.append (query_samples[idx*query_samples_seg_len:]) + else: + splitted_query_samples.append (query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len]) + + responses = [] + with ThreadPoolExecutor(max_workers=len(self.sut_server_addr)) as executor: + futures = { + executor.submit(self.request_validate, '{}/predict/'.format(url), queries): self + for url, queries in zip(self.sut_server_addr, splitted_query_samples) + } + + + + # Send inference request to one host, receive the inference result + # then calls loadgen to verify the inference result + def request_validate(self, url, query_samples, backend="migraphx"): + # turn query_samples into list of json: + indexes = [q.index for q in query_samples] + ids = [q.id for q in query_samples] + data, label = self.ds.get_samples(indexes) + + if backend == "migraphx": + data = [ + { + 'caption': d['caption'], + 'latents': d['latents'].tolist() # Convert tensor to a list + } + for d in data + ] + else: + data = [ + { + 'input_tokens': d['input_tokens'], + 'input_tokens_2': d['input_tokens_2'], + 'latents': d['latents'].tolist() # Convert tensor to a list + } + for d in data + ] + + ''' + data[0]: + { + 'input_tokens': , + 'input_tokens_2': , + 'latents': + } + or + { + 'caption': + 'latents': + } + ''' + + # Todo: The response got None object when we have 2 inference nodes + # This problem doesn't exist when we just inference on one node + + query_samples = [ {'index': q[0], 'id': q[1], 'data': q[2]} + for q in zip(indexes, ids, data) ] + response = requests.post(url, json={"query_samples": query_samples}) + e = time.time() + print (f'RETURNED from requests.post on predict at time \t {e}') + + + + + # print(response.json()["result"]) + + # print("result type:", type(result)) + # print("result:", result) + # result = response.json()["result"] + # print("result type:", type(type(result))) + # print("result type:", type(result)) + # print("result:", result) + # print("result len:", len(result)) + # print("result[0]:", result[0]) + + + + # response_array_refs = [] + # response = [] + # for sample in result: + # sample_in_memory = array.array("B", sample['data']) + # bi = sample_in_memory.buffer_info() + # response_array_refs.append(sample_in_memory) + # response.append(lg.QuerySampleResponse(sample['query_id'], bi[0], bi[1])) + + response_bytes = response.content + offset = 0 + responses = [] + response_array_refs = [] + + while offset < len(response_bytes): + # Unpack the query_id + query_id = struct.unpack_from('Q', response_bytes, offset)[0] + offset += 8 + + # Unpack the data length + data_length = struct.unpack_from('I', response_bytes, offset)[0] + offset += 4 + + # Extract the data + data_bytes = response_bytes[offset:offset + data_length] + offset += data_length + + # Convert bytes to array + sample_in_memory = array.array("B", data_bytes) + bi = sample_in_memory.buffer_info() + response_array_refs.append(sample_in_memory) + + responses.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) + + + print (f'BEFORE lg.QuerySamplesComplete(response)') + lg.QuerySamplesComplete(responses) + 
print (f'AFTER lg.QuerySamplesComplete(response)') + + + ''' + query_samples[0]: + { + 'index': 1, + 'id': 1, + 'data': { + 'inputs_tokens': "this is a prompt", + 'inputs_tokens_2': "this is a prompt", + 'latents': [list converted from tensor] + } + } + or + { + 'index': 1, + 'id': 1, + 'data': { + 'caption': "this is a prompt", + 'latents': [list converted from tensor] + } + } + ''' + + + def client_get_name(self): + """Get the name of the SUT from ALL the SUTS.""" + # if len(self.sut_server_addr) == 1: + # return requests.post( + # f"{self.sut_server_addr[0]}/getname/").json()["name"] + + # sut_names = [ + # requests.post(f"{addr}/getname/").json()["name"] + # for addr in self.sut_server_addr + # ] + # return "Multi-node SUT: " + ", ".join(sut_names) + return "Multi-node SUT: N1, N2" + + def __del__(self): + lg.DestroyQDL(self.qdl) + +def main(args): + # args = get_args() + + backend = get_backend( + args.backend, + precision=args.dtype, + device='cuda:0', + model_path=args.model_path, + batch_size=args.max_batchsize + ) + model = backend.load() + + + if args.dtype == "fp16": + dtype = torch.float16 + elif args.dtype == "bf16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + + # --count applies to accuracy mode only and can be used to limit the number of images + # for testing. + count_override = False + count = args.count + if count: + count_override = True + + scenario = SCENARIO_MAP[args.scenario] + + dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] + ds = dataset_class( + data_path=args.dataset_path, + name=args.dataset, + pre_process=pre_proc, + count=count, + threads=args.threads, + # pipe_tokenizer=models[0].pipe.tokenizer, + # pipe_tokenizer_2=models[0].pipe.tokenizer_2, + pipe_tokenizer=model.pipe.tokenizer, + pipe_tokenizer_2=model.pipe.tokenizer_2, + latent_dtype=dtype, + latent_device=args.device, + latent_framework=args.latent_framework, + pipe_type=args.backend, + **kwargs, + ) + count = ds.get_item_count() + + + mlperf_conf = os.path.abspath(args.mlperf_conf) + if not os.path.exists(mlperf_conf): + log.error("{} not found".format(mlperf_conf)) + sys.exit(1) + + user_conf = os.path.abspath(args.user_conf) + if not os.path.exists(user_conf): + log.error("{} not found".format(user_conf)) + sys.exit(1) + + audit_config = os.path.abspath(args.audit_conf) + + if args.accuracy: + ids_path = os.path.abspath(args.ids_path) + with open(ids_path) as f: + saved_images_ids = [int(_) for _ in f.readlines()] + + if args.output: + output_dir = os.path.abspath(args.output) + os.makedirs(output_dir, exist_ok=True) + os.chdir(output_dir) + + performance_sample_count = ( + args.performance_sample_count + if args.performance_sample_count + else min(count, 500) + ) + + + + log_output_settings = lg.LogOutputSettings() + log_output_settings.outdir = output_dir + log_output_settings.copy_summary_to_stdout = False + log_settings = lg.LogSettings() + log_settings.enable_trace = args.debug + log_settings.log_output = log_output_settings + + settings = lg.TestSettings() + settings.FromConfig(mlperf_conf, args.model_name, args.scenario) + settings.FromConfig(user_conf, args.model_name, args.scenario) + if os.path.exists(audit_config): + settings.FromConfig(audit_config, args.model_name, args.scenario) + settings.scenario = scenario + settings.mode = lg.TestMode.PerformanceOnly + if args.accuracy: + settings.mode = lg.TestMode.AccuracyOnly + if args.find_peak_performance: + settings.mode = lg.TestMode.FindPeakPerformance + + if args.time: + # override the time we 
want to run + settings.min_duration_ms = args.time * MILLI_SEC + settings.max_duration_ms = args.time * MILLI_SEC + + if args.qps: + qps = float(args.qps) + settings.server_target_qps = qps + settings.offline_expected_qps = qps + + if count_override: + settings.min_query_count = count + settings.max_query_count = count + + if args.samples_per_query: + settings.multi_stream_samples_per_query = args.samples_per_query + if args.max_latency: + settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + + performance_sample_count = ( + args.performance_sample_count + if args.performance_sample_count + else min(count, 500) + ) + + # QDL and QSL + qsl = QSL(count, performance_sample_count, ds=ds) + # qsl = QSL(50, performance_sample_count, ds=ds) + qdl = QDL(qsl, sut_server_addr=args.sut_server, ds=ds) + + lg.StartTest(qdl.qdl, qsl.qsl, settings) + + del qsl + del qdl + + +if __name__ == "__main__": + # app.run(main) + main(None) \ No newline at end of file diff --git a/text_to_image/sut_over_network_demo.py b/text_to_image/sut_over_network_demo.py new file mode 100644 index 000000000..eb83ccba9 --- /dev/null +++ b/text_to_image/sut_over_network_demo.py @@ -0,0 +1,455 @@ +""" +MLPerf Inference Benchmarking Tool - SUT Node +""" + +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import array +import collections +import json +import logging +import os +import sys +import threading +import time +import socket +import struct + +import numpy as np +import torch + +from flask import Flask, request, jsonify, Response +from concurrent.futures import ThreadPoolExecutor, as_completed +import subprocess + +import dataset +import coco + +from queue import Queue + +import mlperf_loadgen as lg # Only needed if you plan to run LoadGen locally + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("main") + +NANO_SEC = 1e9 +MILLI_SEC = 1000 + +SUPPORTED_DATASETS = { + "coco-1024": ( + coco.Coco, + dataset.preprocess, + coco.PostProcessCoco(), + {"image_size": [3, 1024, 1024]}, + ) +} + +SUPPORTED_PROFILES = { + "defaults": { + "dataset": "coco-1024", + "backend": "pytorch", + "model-name": "stable-diffusion-xl", + }, + "debug": { + "dataset": "coco-1024", + "backend": "debug", + "model-name": "stable-diffusion-xl", + }, + "stable-diffusion-xl-pytorch": { + "dataset": "coco-1024", + "backend": "pytorch", + "model-name": "stable-diffusion-xl", + }, + "stable-diffusion-xl-pytorch-dist": { + "dataset": "coco-1024", + "backend": "pytorch-dist", + "model-name": "stable-diffusion-xl", + }, + "stable-diffusion-xl-migraphx": { + "dataset": "coco-1024", + "backend": "migraphx", + "model-name": "stable-diffusion-xl", + } +} + +SCENARIO_MAP = { + "SingleStream": lg.TestScenario.SingleStream, + "MultiStream": lg.TestScenario.MultiStream, + "Server": lg.TestScenario.Server, + "Offline": lg.TestScenario.Offline, +} + +app = Flask(__name__) + +# Global variables to hold models and runners +backends = [] +models = [] +runners = [] +ds = None +args = None + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") + parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument( + "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + ) + parser.add_argument( + "--scenario", + default="SingleStream", + 
help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), + ) + parser.add_argument( + "--max-batchsize", + type=int, + default=1, + help="max batch size in a single inference", + ) + parser.add_argument("--threads", default=1, type=int, help="threads") + parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass", + ) + parser.add_argument("--backend", help="Name of the backend", default="migraphx") + parser.add_argument("--model-name", help="Name of the model") + parser.add_argument("--output", default="output", help="test results") + parser.add_argument("--qps", type=int, help="target qps") + parser.add_argument("--model-path", help="Path to model weights") + + parser.add_argument( + "--dtype", + default="fp32", + choices=["fp32", "fp16", "bf16"], + help="dtype of the model", + ) + parser.add_argument( + "--device", + default="cuda", + choices=["cuda", "cpu", "rocm"], + help="device to run the benchmark", + ) + parser.add_argument( + "--latent-framework", + default="torch", + choices=["torch", "numpy"], + help="framework to load the latents", + ) + + # file to use mlperf rules compliant parameters + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" + ) + # file for user LoadGen settings such as target QPS + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + # file for LoadGen audit settings + parser.add_argument( + "--audit_conf", default="audit.config", help="config for LoadGen audit settings" + ) + # arguments to save images + parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt") + + # below will override mlperf rules compliant settings - don't use for official submission + parser.add_argument("--time", type=int, help="time to scan in seconds") + parser.add_argument("--count", type=int, help="dataset items to use") + parser.add_argument("--debug", action="store_true", help="debug") + parser.add_argument( + "--performance-sample-count", type=int, help="performance sample count", default=5000 + ) + parser.add_argument( + "--max-latency", type=float, help="mlperf max latency in pct tile" + ) + parser.add_argument( + "--samples-per-query", + default=8, + type=int, + help="mlperf multi-stream samples per query", + ) + args = parser.parse_args() + + # don't use defaults in argparser. 
Instead we default to a dict, override that with a profile + # and take this as default unless command line give + defaults = SUPPORTED_PROFILES["defaults"] + + if args.profile: + profile = SUPPORTED_PROFILES[args.profile] + defaults.update(profile) + for k, v in defaults.items(): + kc = k.replace("-", "_") + if getattr(args, kc) is None: + setattr(args, kc, v) + + if args.scenario not in SCENARIO_MAP: + parser.error("valid scenarios:" + str(list(SCENARIO_MAP.keys()))) + return args + +def get_backend(backend, **kwargs): + if backend == "pytorch": + from backend_pytorch import BackendPytorch + + backend = BackendPytorch(**kwargs) + + elif backend == "migraphx": + from backend_migraphx import BackendMIGraphX + + backend = BackendMIGraphX(**kwargs) + + elif backend == "debug": + from backend_debug import BackendDebug + + backend = BackendDebug() + else: + raise ValueError("unknown backend: " + backend) + return backend + +class Item: + """An item that we queue for processing by the thread pool.""" + + def __init__(self, query_id, content_id, inputs, img=None): + self.query_id = query_id + self.content_id = content_id + self.img = img + self.inputs = inputs + self.start = time.time() + +class RunnerBase: + def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): + self.take_accuracy = False + self.ds = ds + self.model = model + self.post_process = post_proc + self.threads = threads + self.take_accuracy = False + self.max_batchsize = max_batchsize + self.result_timing = [] + self.result_dict = {} + + def handle_tasks(self, tasks_queue): + pass + + def start_run(self, result_dict, take_accuracy): + self.result_dict = result_dict + self.result_timing = [] + self.take_accuracy = take_accuracy + self.post_process.start() + + def run_one_item(self, qitem: Item): + # print("in run_one_item") + # run the prediction + processed_results = [] + + # preprocess the prompts: + qitem.inputs = [ + { + # "input_tokens": ds.preprocess(input['input_tokens'], ds.pipe_tokenizer), + # "input_tokens_2": ds.preprocess(input['input_tokens_2'], ds.pipe_tokenizer_2), + "caption": input['caption'], + "latents": torch.tensor(input['latents']).half(), #.half() + } + for input in qitem.inputs + ] + # + try: + # log.info(f"[Yalu] qitem.inputs[0]['caption'] -> {qitem.inputs[0].get('caption')}") + # log.info(f"[Yalu] qitem.inputs[0]['latents'] -> {qitem.inputs[0].get('latents')}") + # log.info(f"[Yalu] qitem.inputs length -> {len(qitem.inputs)}") + results = self.model.predict(qitem.inputs) + processed_results = self.post_process( + results, qitem.content_id, qitem.inputs, self.result_dict + ) + if self.take_accuracy: + self.post_process.add_results(processed_results) + self.result_timing.append(time.time() - qitem.start) + except Exception as ex: # pylint: disable=broad-except + src = [self.ds.get_item_loc(i) for i in qitem.content_id] + log.error("thread: failed on contentid=%s, %s", src, ex) + print("thread: failed on contentid=%s, %s", src, ex) + # since post_process will not run, fake empty responses + processed_results = [[]] * len(qitem.query_id) + finally: + response_array_refs = [] + response = [] + for idx, query_id in enumerate(qitem.query_id): + response_array = array.array( + "B", np.array(processed_results[idx], np.uint8).tobytes() + ) + # response_array_refs.append(response_array) + # bi = response_array.buffer_info() + # response.append({'query_id': query_id, 'data': bi[0], 'size': bi[1]}) + response.append({'query_id': query_id, 'data': response_array.tolist()}) + return response # Return 
the response instead of calling QuerySamplesComplete + + def enqueue(self, query_samples): + try: + idx = [q['index'] for q in query_samples] + query_id = [q['id'] for q in query_samples] + data = [q['data'] for q in query_samples] + label = None # label is never used in any functions + + responses = [] + if len(idx) < self.max_batchsize: + responses.extend(self.run_one_item(Item(query_id, idx, data, label))) + else: + bs = self.max_batchsize + for i in range(0, len(idx), bs): + # print("samples obtained") + responses.extend( + self.run_one_item( + Item(query_id[i : i + bs], idx[i : i + bs], data[i : i + bs], label) + ) + ) + except Exception as e: + print(f'An error occured in enqueue: {e}') + return responses + + def finish(self): + pass + +def initialize(): + global backends, models, runners, ds, args, post_proc + args = get_args() + + log.info(args) + + # Initialize backends and models + backends = [get_backend( + args.backend, + precision=args.dtype, + device=f'cuda:{i}', + model_path=args.model_path, + batch_size=args.max_batchsize + ) + for i in [0,1,2,3]] # Adjust GPU indices as needed + + models = [backend.load() for backend in backends] + + if args.dtype == "fp16": + dtype = torch.float16 + elif args.dtype == "bf16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + + # Load dataset + dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] + ds = dataset_class( + data_path=args.dataset_path, + name=args.dataset, + pre_process=pre_proc, + count=args.count, + threads=args.threads, + pipe_tokenizer=models[0].pipe.tokenizer, + pipe_tokenizer_2=models[0].pipe.tokenizer_2, + latent_dtype=dtype, + latent_device=args.device, + latent_framework=args.latent_framework, + pipe_type=args.backend, + **kwargs, + ) + + scenario = SCENARIO_MAP[args.scenario] + runner_map = { + lg.TestScenario.SingleStream: RunnerBase, + lg.TestScenario.MultiStream: RunnerBase, + lg.TestScenario.Server: RunnerBase, + lg.TestScenario.Offline: RunnerBase, + } + + runners = [runner_map[scenario]( + model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize + ) + for model in models] + + # added because we need to pass result_dict to the runner class + log.info("starting {}".format(scenario)) + result_dict = {"scenario": str(scenario)} + for runner in runners: + runner.start_run(result_dict, args.accuracy) + +@app.route('/predict/', methods=['POST']) +def predict(): + query_data = request.get_json(force=True) + query_samples = query_data['query_samples'] + + # Distribute queries among runners + query_samples_len = len(query_samples) + num_runners = len(runners) + query_samples_seg_len = int(query_samples_len / num_runners) + splitted_query_samples = [] + for idx in range(num_runners): + if idx == num_runners -1: + splitted_query_samples.append(query_samples[idx*query_samples_seg_len:]) + else: + splitted_query_samples.append(query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len]) + + # Use ThreadPoolExecutor to run queries concurrently + responses = [] + with ThreadPoolExecutor(max_workers=num_runners) as executor: + futures = { + executor.submit(runner.enqueue, queries): runner + for runner, queries in zip(runners, splitted_query_samples) + } + + for future in as_completed(futures): + runner = futures[future] + try: + result = future.result() + responses.extend(result) + except Exception as exc: + log.error(f'Runner {runner} generated an exception: {exc}') + + print(f'response of len {len(responses)} returned') + print (f'RETURNING from predict') + 
+ s = time.time() + # output = jsonify(result=responses) + response_bytes = bytearray() + for resp in responses: + query_id = resp['query_id'] + data_array = np.array(resp['data'], dtype=np.uint8) + data_bytes = data_array.tobytes() + + # Pack the query_id (8 bytes) and the length of data (4 bytes), then the data + packed_data = struct.pack('Q', query_id) + packed_data += struct.pack('I', len(data_bytes)) + packed_data += data_bytes + response_bytes.extend(packed_data) + e = time.time() + + print (f'\n Time to jsonify output is: \t {e-s} \n') + print (f'\n Mark Time to return: \t {e} \n') + # Todo: send samples back + # return output + print(f'Type of response_bytes: {type(response_bytes)}') + return Response(bytes(response_bytes), mimetype='application/octet-stream') + +@app.route('/getname/', methods=['POST', 'GET']) +def getname(): + return jsonify(name=f"SUT Node running on {socket.gethostname()}") + +def issue_queries(query_samples): + # This function is not used in the networked version + pass + +def flush_queries(): + pass + +if __name__ == "__main__": + initialize() + + # get public ip addr of current node + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + ip_address = s.getsockname()[0] + + # Change host ip addr and port number + app.run(host=ip_address, port=8008) From b164d1f6dd88299dd9a75097e803357954bae3f9 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sun, 17 Nov 2024 19:48:28 +0000 Subject: [PATCH 18/21] Update format.yml | Fix codeformat action --- .github/workflows/format.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index b98a3c37b..be62a71b2 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -3,7 +3,7 @@ name: "format" on: pull_request: branches: [ master ] - types: [opened, closed, synchronize] + types: [opened, synchronize] env: @@ -13,9 +13,10 @@ jobs: format-code: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.head_ref }} - name: Set up Python ${{ env.python_version }} uses: actions/setup-python@v3 with: @@ -41,9 +42,8 @@ jobs: run: | HAS_CHANGES=$(git diff --staged --name-only) if [ ${#HAS_CHANGES} -gt 0 ]; then - git checkout -B "${{ github.head_ref }}" git config --global user.email "${{ github.actor }}@users.noreply.github.com" git config --global user.name "${{ github.actor }}" git commit -m '[Automated Commit] Format Codebase' - git push origin "${{ github.head_ref }}" + git push fi From 7d1cb6d8ff6a4c4f3c5c8a27bd3f691ad7a55dfb Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sun, 17 Nov 2024 19:56:50 +0000 Subject: [PATCH 19/21] Update format.yml --- .github/workflows/format.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index be62a71b2..5e02360d9 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -17,6 +17,7 @@ jobs: with: fetch-depth: 0 ref: ${{ github.head_ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} - name: Set up Python ${{ env.python_version }} uses: actions/setup-python@v3 with: From 9e9561e57d29e44cb54b7ce1b0cb921fc57de1b2 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sun, 17 Nov 2024 19:58:56 +0000 Subject: [PATCH 20/21] Update format.yml --- .github/workflows/format.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 
5e02360d9..47f701d9b 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -18,6 +18,7 @@ jobs: fetch-depth: 0 ref: ${{ github.head_ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} + token: ${{ secrets.GITHUB_TOKEN }} - name: Set up Python ${{ env.python_version }} uses: actions/setup-python@v3 with: From 0654f7ea707b7192167be20948643b41cdf740b9 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sun, 17 Nov 2024 22:17:03 +0000 Subject: [PATCH 21/21] Delete .github/workflows/format.yml --- .github/workflows/format.yml | 51 ------------------------------------ 1 file changed, 51 deletions(-) delete mode 100644 .github/workflows/format.yml diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml deleted file mode 100644 index 47f701d9b..000000000 --- a/.github/workflows/format.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Automatic code formatting -name: "format" -on: - pull_request: - branches: [ master ] - types: [opened, synchronize] - - -env: - python_version: "3.9" - -jobs: - format-code: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.head_ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - token: ${{ secrets.GITHUB_TOKEN }} - - name: Set up Python ${{ env.python_version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ env.python_version }} - - - name: Install dependencies - run: | - python3 -m pip install autopep8 - - - name: Grant permissions - run: | - chmod +x "${GITHUB_WORKSPACE}/.github/scripts/format-cpp.sh" - chmod +x "${GITHUB_WORKSPACE}/.github/scripts/format-py.sh" - - - name: Format Codebase - run: | - git remote add upstream ${{ github.event.pull_request.base.repo.clone_url }} - git fetch upstream ${{ github.event.pull_request.base.ref }} - ".github/scripts/format-cpp.sh" "upstream" "${{ github.event.pull_request.base.ref }}" - ".github/scripts/format-py.sh" "upstream" "${{ github.event.pull_request.base.ref }}" - - - name: Commit - run: | - HAS_CHANGES=$(git diff --staged --name-only) - if [ ${#HAS_CHANGES} -gt 0 ]; then - git config --global user.email "${{ github.actor }}@users.noreply.github.com" - git config --global user.name "${{ github.actor }}" - git commit -m '[Automated Commit] Format Codebase' - git push - fi
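
Note on the wire format used by the LON/SUT demo scripts added earlier in this series (py_demo_server_lon.py and sut_over_network_demo.py): the /predict/ endpoint does not return JSON; it returns a packed byte stream in which each record is an 8-byte query_id ("Q"), a 4-byte payload length ("I"), and the raw uint8 image payload. The snippet below is a minimal standalone sketch of that framing only, not a drop-in replacement for either script; it assumes both nodes share the same native byte order and deliberately omits Flask, LoadGen, and the model backends.

    # Minimal sketch of the /predict/ response framing shared by
    # py_demo_server_lon.py (request_validate) and sut_over_network_demo.py
    # (the predict() route): 8-byte query_id, 4-byte length, raw payload.
    import struct
    import numpy as np

    def pack_responses(responses):
        """responses: iterable of (query_id, uint8 payload) pairs."""
        buf = bytearray()
        for query_id, data in responses:
            data_bytes = np.asarray(data, dtype=np.uint8).tobytes()
            buf += struct.pack("Q", query_id)       # 8-byte query id
            buf += struct.pack("I", len(data_bytes))  # 4-byte payload length
            buf += data_bytes                        # raw payload bytes
        return bytes(buf)

    def unpack_responses(blob):
        """Inverse of pack_responses; yields (query_id, payload bytes)."""
        offset = 0
        while offset < len(blob):
            query_id = struct.unpack_from("Q", blob, offset)[0]
            offset += 8
            length = struct.unpack_from("I", blob, offset)[0]
            offset += 4
            yield query_id, blob[offset:offset + length]
            offset += length

    if __name__ == "__main__":
        wire = pack_responses([(1, [0, 255, 128]), (2, [42])])
        for qid, payload in unpack_responses(wire):
            print(qid, list(payload))

Because struct.pack("Q", ...) with no byte-order prefix uses the platform's native layout, this framing only round-trips cleanly when the LON node and the SUT nodes run on machines with the same endianness, which the demo scripts implicitly assume; pinning the format strings to "<Q" and "<I" would make the framing portable across hosts.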