-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmarks.py
executable file
·230 lines (184 loc) · 10.3 KB
/
benchmarks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python3
# %% Set up benchmark parameters
SYCL_setvars = '/opt/intel/oneapi/setvars.sh'
batch_size = 2**15
gen_batch_size = 3000000
Ngpu_runs = 20   # Set to 100 for more accurate results, much smaller standard deviation.
Ncpu_runs = 20   # Set to 100 for more accurate results, much smaller standard deviation.
Ncpu_warmup = 1  # Warmup caches and branch predictor.
Ngpu_warmup = 1  # No branch prediction on GPU, but SYCL runtime incurs overhead the first time each kernel is run.
N_warmup = {"cpu": 20, "gpu": 60}
N_runs = {"cpu": 30, "gpu": 30}
# Change this number if the simulation is taking too long.
# Setting this number to -1 will reduce the batch sizes by 1 power of 2 in the kernel dualisation benchmark.
Batchsize_Offset = {"cpu": -4, "gpu": 0}
# Backward-compatible alias: the rest of this script refers to the original
# misspelled name, so keep it bound to the same dict.
Bathcsize_Offset = Batchsize_Offset
#----------------- BENCHMARK DEFINITIONS --------------------
import numpy as np, pandas as pd
import os, subprocess, platform, pathlib, sys

# Benchmark results are grouped per host so results from several machines can
# coexist under the same checkout.
hostname = platform.node()
path = f'{os.getcwd()}/output/{hostname}/'
buildpath = f'{os.getcwd()}/build/'
pathlib.Path(f"{path}/figures").mkdir(parents=True, exist_ok=True)
if len(sys.argv) < 2:
    print(f"Syntax: {sys.argv[0]} <task>\n"
          "Where task is one of: 'all', 'batchsize', 'baseline', 'dualize_gpu', 'dualize_cpu', 'generate', 'pipeline', or 'validate'.\n")
    # sys.exit instead of the interactive-only exit() builtin (exit code unchanged).
    sys.exit(-1)
task = sys.argv[1]
# The benchmark output filenames are:
# f'{path}/base.csv'
# f'{path}/one_gpu_v0.csv'
# f'{path}/one_gpu_v1.csv'
# f'{path}/multi_gpu_v0.csv'
# f'{path}/multi_gpu_v1.csv'
# f'{path}/multi_gpu_weak.csv'
# f'{path}/single_gpu_bs.csv'
# f'{path}/base_pipeline.csv'
# f'{path}/full_pipeline.csv'

# TODO: Use SYCL to make benchmark cross-vendor
import shutil

# shutil.which avoids spawning a shell and (unlike `os.system('which ...')`)
# does not leak the lookup's output onto stdout.
if shutil.which('nvidia-smi') is not None:
    # `nvidia-smi -L` prints exactly one line per visible GPU.
    gpu_listing = subprocess.check_output(['nvidia-smi', '-L']).decode('utf-8')
    num_gpus = len(gpu_listing.strip().split('\n'))
else:
    num_gpus = 0
# Print the number of GPUs found
print(f'Found {num_gpus} GPUs on {hostname}')
def source_and_get_environment(script_path, base_environment=None):
    """Source *script_path* in a bash subshell and return the resulting environment.

    The returned dict also has OpenMP / DPC++ CPU pinning variables forced on
    top of whatever the sourced script exported.
    """
    base = os.environ.copy() if base_environment is None else base_environment
    # Run `source <script> ; env` and parse the KEY=VALUE dump it prints.
    result = subprocess.Popen(['/bin/bash', '-c', f'source {script_path} ; env'],
                              stdout=subprocess.PIPE, env=base)
    dump, _ = result.communicate()
    env = {}
    for entry in dump.decode().splitlines():
        key, sep, value = entry.partition("=")
        if sep:
            env[key] = value
    # Pin threads, leaving two cores free for the OS / benchmark driver.
    n_threads = str(max(os.cpu_count() - 2, 1))
    env["OMP_PROC_BIND"] = "1"
    env["OMP_NUM_THREADS"] = n_threads
    env["DPCPP_CPU_NUM_CUS"] = n_threads
    env["DPCPP_CPU_CU_AFFINITY"] = "spread"
    env["DPCPP_CPU_PLACES"] = "cores"
    return env
def reset_file(filename, header=""):
    """Truncate *filename*, optionally writing a single header line first."""
    with open(filename, 'w', newline='') as out:
        if header != "":
            out.write(header + "\n")
# Shared environment for every benchmark subprocess: oneAPI variables from
# SYCL_setvars plus the OMP/DPC++ pinning set above.
env = source_and_get_environment(SYCL_setvars)
def validate_kernel():
    """Validate the SYCL kernels against the baseline dualisation (GPU run)."""
    binary = f'{buildpath}validation/sycl/sycl_validation'
    if not os.path.exists(binary):
        print("SYCL validation kernel not found. Make sure your SYCL environment is set up correctly. Then run `make all` again.")
        return
    subprocess.Popen(['/bin/bash', '-c', f'{binary} gpu'], env=env).wait()
def bench_batchsize():
    """Sweep single-GPU batch sizes 2^0 .. 2^21 for the dualisation kernel at N=200."""
    # The output file is truncated before any existence checks, matching the
    # original control flow.
    reset_file(f'{path}/single_gpu_bs.csv')
    binary = f'{buildpath}benchmarks/sycl/dualisation'
    if not os.path.exists(binary):
        print("SYCL dualisation kernel not found. Make sure your SYCL environment is set up correctly. Then run `make all` again.")
        return
    if num_gpus == 0:
        print("No GPUs found. Skipping batch size experiment.")
        return
    for power in range(0, 22):
        cmd = (f'{binary} gpu 200 {2**power} {N_runs["gpu"]} {N_warmup["gpu"]} '
               f'4 1 {path}/single_gpu_bs.csv')
        subprocess.Popen(['/bin/bash', '-c', cmd], env=env).wait()
# # ### Run the benchmarks
def bench_baseline():
    """Run the sequential baseline dualisation for N = 20..200 (step 2), batch size 256."""
    binary = f'{buildpath}benchmarks/baseline'
    if not os.path.exists(binary):
        print("Baseline binary not found. Skipping baseline benchmark")
        return
    reset_file(f'{path}/base.csv')
    for n in range(20, 201, 2):
        os.system(f'{binary} {n} {2**8} {N_runs["cpu"]} {N_warmup["cpu"]} 0 {path}/base.csv')
def bench_dualize_cpu_scaling():
    """CPU thread-scaling study.

    Runs the OpenMP task-parallel kernel and SYCL CPU kernels v1-v4 at N=200
    with 1 .. os.cpu_count() threads, writing one CSV per kernel version.
    """
    if not os.path.exists(f'{buildpath}benchmarks/sycl/dualisation'):
        print("SYCL dualisation kernel not found. Skipping cpu scaling benchmark. Make sure your SYCL environment is set up correctly. Then run `make all` again.")
        return
    if not os.path.exists(f'{buildpath}benchmarks/omp_multicore'):
        print("OpenMP kernel not found. Skipping OpenMP scaling benchmark")
        return
    for v in range(1, 5):
        reset_file(f'{path}/one_cpu_v{v}_scaling.csv')
    reset_file(f'{path}/omp_multicore_tp_scaling.csv')
    # BUGFIX: operate on a copy of the shared environment. The previous code
    # mutated the module-level `env`, leaving OMP_NUM_THREADS and
    # DPCPP_CPU_NUM_CUS pinned to cpu_count() for every benchmark run later.
    scaling_env = env.copy()
    batch = 2**(20 + Bathcsize_Offset["cpu"])
    for n_threads in range(1, os.cpu_count() + 1):
        scaling_env["DPCPP_CPU_NUM_CUS"] = str(n_threads)
        scaling_env["OMP_NUM_THREADS"] = str(n_threads)
        subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/omp_multicore 200 {batch} {N_runs["cpu"]} {N_warmup["cpu"]} 1 {path}/omp_multicore_tp_scaling.csv'], env=scaling_env).wait()
        for v in range(1, 5):
            subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/sycl/dualisation cpu 200 {batch} {N_runs["cpu"]} {N_warmup["cpu"]} {v} 1 {path}/one_cpu_v{v}_scaling.csv'], env=scaling_env).wait()
def bench_dualize(kernel_versions="all", devices="cpu"):
    """Run the dualisation benchmarks over fullerene sizes N = 20..200 (step 2).

    kernel_versions: "all" (kernels 1-4) or a single version number.
    devices: "cpu", "gpu", or "both" (case-insensitive).
    Writes per-kernel/per-device CSVs under `path`; GPU runs also record
    multi-GPU strong and weak scaling. Ends with the CPU thread-scaling study
    when CPU benchmarks were requested.
    """
    if not os.path.exists(f'{buildpath}benchmarks/sycl/dualisation'):
        print("SYCL dualisation kernel not found. Skipping dualisation benchmark. Make sure your SYCL environment is set up correctly. Then run `make all` again.")
        return
    devices = devices.lower()
    kernel_range = range(1, 5) if kernel_versions == "all" else [int(kernel_versions)]
    device_range = ["cpu", "gpu"] if devices == "both" else [devices]
    if "gpu" in device_range and num_gpus==0:
        print("No GPUs found. Skipping dualisation benchmark.")
        return
    # Truncate every output file up front so each invocation starts clean.
    if "gpu" in device_range:
        reset_file(f'{path}/multi_gpu_weak.csv')
    if "cpu" in device_range:
        #reset_file(f'{path}/omp_multicore_sm.csv')
        reset_file(f'{path}/omp_multicore_tp.csv')
    for j in kernel_range:
        if "gpu" in device_range:
            reset_file(f'{path}/multi_gpu_v{j}.csv')
        for device in device_range:
            reset_file(f'{path}/one_{device}_v{j}.csv')
    for i in range(20,201,2):
        #Currently just running weak scaling for multi-GPU using the fastest kernel (v1)
        if "cpu" in device_range:
            #OpenMP Benchmark (2 Different Versions: Shared Memory Parallelism and Task Parallelism)
            #proc = subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/omp_multicore {i} {2**(20+Bathcsize_Offset["cpu"])} {N_runs["cpu"]} {N_warmup["cpu"]} 0 {path}/omp_multicore_sm.csv'], env=env); proc.wait()
            subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/omp_multicore {i} {2**(20+Bathcsize_Offset["cpu"])} {N_runs["cpu"]} {N_warmup["cpu"]} 1 {path}/omp_multicore_tp.csv'], env=env).wait()
        if "gpu" in device_range:
            # Weak scaling: total batch size grows with the number of GPUs.
            subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/sycl/dualisation gpu {i} {num_gpus*2**(20+Bathcsize_Offset["gpu"])} {N_runs["gpu"]} {N_warmup["gpu"]} 1 {num_gpus} {path}/multi_gpu_weak.csv'], env=env).wait()
        for j in kernel_range:
            if "gpu" in device_range:
                # Strong scaling across all available GPUs for kernel version j.
                subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/sycl/dualisation gpu {i} {2**(20+Bathcsize_Offset["gpu"])} {N_runs["gpu"]} {N_warmup["gpu"]} {j} {num_gpus} {path}/multi_gpu_v{j}.csv'], env=env).wait()
            for device in device_range:
                # Single-device run for kernel version j on each requested device.
                subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/sycl/dualisation {device} {i} {2**(20+Bathcsize_Offset[device])} {N_runs[device]} {N_warmup[device]} {j} 1 {path}/one_{device}_v{j}.csv'], env=env).wait()
    if "cpu" in device_range:
        bench_dualize_cpu_scaling()
def bench_generate():
    """Benchmark isomerspace generation (buckybench) for N = 72..200 (step 2)."""
    reset_file(f"{path}/buckybench.csv", header="N,BS,T_gen,TSD_gen")
    if not os.path.exists(f'{buildpath}benchmarks/buckybench'):
        print("Buckybench kernel not found. Skipping buckybench benchmark. Make sure requirements are all met and run `make all` again.")
        return
    for N in range(72, 201, 2):
        # Output is appended via shell redirection rather than a file argument.
        # (Dropped the old `proc =` binding: .wait() returns an int exit code,
        # not a process handle, and it was never used.)
        subprocess.Popen(['/bin/bash', '-c',
                          f'{buildpath}benchmarks/buckybench {N} {gen_batch_size} 1 0 >> {path}/buckybench.csv'],
                         env=env).wait()
# Benchmarking breakdown of timings
def bench_pipeline():
    """Benchmark the baseline and full GPU pipelines for N = 72..200 (step 2)."""
    reset_file(f'{path}/base_pipeline.csv')
    reset_file(f'{path}/full_pipeline.csv')
    if not os.path.exists(f'{buildpath}benchmarks/sycl/baseline_pipeline'):
        print("SYCL baseline pipeline kernel not found. Skipping baseline pipeline benchmark. Make sure your SYCL environment is set up correctly. Then run `make all` again.")
        return
    if num_gpus == 0:
        print("No GPUs found. Skipping pipeline benchmark.")
        return
    for N in range(72, 201, 2):
        # Dropped the old unused `proc =` bindings: .wait() returns an int exit
        # code, not a process handle.
        subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/sycl/baseline_pipeline gpu {N} {batch_size} {N_runs["gpu"]} {N_warmup["gpu"]} 1 1 {path}/base_pipeline.csv'], env=env).wait()
        subprocess.Popen(['/bin/bash', '-c', f'{buildpath}benchmarks/sycl/pipeline gpu {N} {batch_size} {N_runs["gpu"]} {N_warmup["gpu"]} 1 1 {path}/full_pipeline.csv'], env=env).wait()
def bench_dualize_cpu():
    """Convenience wrapper: all dualisation kernel versions, CPU only."""
    bench_dualize("all", "cpu")
def bench_dualize_gpu():
    """Convenience wrapper: all dualisation kernel versions, GPU only."""
    bench_dualize("all", "gpu")
# Map task names (the script's single CLI argument) to benchmark entry points.
tasks = {'batchsize': bench_batchsize,
         'baseline': bench_baseline,
         'dualize_cpu': bench_dualize_cpu,
         'dualize_gpu': bench_dualize_gpu,
         'dualize_cpu_scaling': bench_dualize_cpu_scaling,
         'generate': bench_generate,
         'pipeline': bench_pipeline,
         'validate': validate_kernel}

if task == "all":
    # Run every benchmark in definition order.
    for benchmark in tasks.values():
        benchmark()
elif task in tasks:
    tasks[task]()
else:
    # Previously an unknown task crashed with a bare KeyError traceback.
    raise SystemExit(f"Unknown task '{task}'. Valid tasks: all, {', '.join(tasks)}")