Skip to content

Commit

Permalink
Collect timing data for EFA Provider
Browse files Browse the repository at this point in the history
1. Gather timing data from the following code paths
   a. fi_senddata -> rdma-core, rdma-core, rdma-core -> caller
   b. fi_writedata -> rdma-core, rdma-core, rdma-core -> caller
   c. Posting a recv buffer
   d. An empty progress
   e. A fruitful progress
   d. How many messages were progressed in each progress

Modify fi_rdm_pingpong, and fi_rma_pingpong to print these results to a
file.

Create a python script to parse the data, and generate statistics.

Signed-off-by: Seth Zegelstein <szegel@amazon.com>
  • Loading branch information
a-szegel committed Jan 3, 2024
1 parent ea0b742 commit a2f0b02
Show file tree
Hide file tree
Showing 14 changed files with 5,558 additions and 17 deletions.
83 changes: 73 additions & 10 deletions fabtests/benchmarks/benchmark_shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,44 @@ int pingpong(void)
}
ft_stop();

if (opts.machr)
show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2,
opts.argc, opts.argv);
else
show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 2);
// if (opts.machr)
// show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2,
// opts.argc, opts.argv);
// else
// show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 2);

FILE* fptr;

if (opts.dst_addr) {
fptr = fopen("/home/ec2-user/libfabric/client_fi_senddata_output.txt", "w");
} else {
fptr = fopen("/home/ec2-user/libfabric/server_fi_senddata_output.txt", "w");
}

for (int i = 0; i < ep->iterations; i++) {
fprintf(fptr, "<fi_senddata> libfabric to rdma_core (ns): %ld, rdma_core (ns): %ld, rdma-core to user (ns): %ld\n",
ep->libfabric_start_to_rdma_time[i],
ep->rdma_core_time[i],
ep->libfabric_from_rdma_to_end_time[i]);
}

for (int i = 0; i < ep->iterations; i++) {
fprintf(fptr, "<fi_senddata> post recv buff (ns): %ld\n", ep->post_recv_buf_time[i]);
}

for (int i = 0; i < rxcq->iterations; i++) {
fprintf(fptr, "<fi_senddata> empty cq progress (ns): %ld\n", rxcq->empty_progress[i]);
}

for (int i = 0; i < rxcq->iterations; i++) {
fprintf(fptr, "<fi_senddata> fruitful cq progress (ns): %ld\n", rxcq->fruitful_progress[i]);
}

for (int i = 0; i < rxcq->iterations; i++) {
fprintf(fptr, "<fi_senddata> num completion events in fruitful progress: %d\n", rxcq->fruitful_progress_num_events[i]);
}

fclose(fptr);

return 0;
}
Expand Down Expand Up @@ -219,11 +252,41 @@ int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
}
ft_stop();

if (opts.machr)
show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2,
opts.argc, opts.argv);
else
show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 2);
// if (opts.machr)
// show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2,
// opts.argc, opts.argv);
// else
// show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 2);


FILE* fptr;

if (opts.dst_addr) {
fptr = fopen("/home/ec2-user/libfabric/client_fi_writedata_output.txt", "w");
} else {
fptr = fopen("/home/ec2-user/libfabric/server_fi_writedata_output.txt", "w");
}

for (int i = 0; i < ep->iterations; i++) {
fprintf(fptr, "<fi_write> libfabric to rdma_core (ns): %ld, rdma_core (ns): %ld, rdma-core to user (ns): %ld\n",
ep->libfabric_start_to_rdma_time[i],
ep->rdma_core_time[i],
ep->libfabric_from_rdma_to_end_time[i]);
}

for (int i = 0; i < rxcq->iterations; i++) {
fprintf(fptr, "<fi_write> empty cq progress (ns): %ld\n", rxcq->empty_progress[i]);
}

for (int i = 0; i < rxcq->iterations; i++) {
fprintf(fptr, "<fi_write> fruitful cq progress (ns): %ld\n", rxcq->fruitful_progress[i]);
}

for (int i = 0; i < rxcq->iterations; i++) {
fprintf(fptr, "<fi_write> num completion events in fruitful progress: %d\n", rxcq->fruitful_progress_num_events[i]);
}

fclose(fptr);

return 0;
}
Expand Down
4 changes: 2 additions & 2 deletions fabtests/common/shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -2137,9 +2137,9 @@ ssize_t ft_post_tx_buf(struct fid_ep *ep, fi_addr_t fi_addr, size_t size,
op_mr_desc, data, fi_addr, ctx);

} else {
FT_POST(fi_send, ft_progress, txcq, tx_seq,
FT_POST(fi_senddata, ft_progress, txcq, tx_seq,
&tx_cq_cntr, "transmit", ep, op_buf, size,
op_mr_desc, fi_addr, ctx);
op_mr_desc, data, fi_addr, ctx);
}
}
return 0;
Expand Down
243 changes: 243 additions & 0 deletions generate_statistics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
#!/bin/python3
# source ~/env/bin/activate to get PF's env
# This script needs to be kicked off on the server IP's host
from matplotlib import pyplot as plt
from PortaFiducia.tests.utils import run_command
import subprocess
from time import sleep


# Configurable server IP (in case we change instances)
sever_ip = "172.31.35.142"
client_ip = "172.31.35.207"
iterations = 2000000
warmup_iterations = 100

# Step 1: Compile Libfabric ~20s
print("Compile libfabric and fabtests:")
autogen = "./autogen.sh"
configure = "./configure --prefix=$HOME/libfabric/install --disable-verbs --disable-psm3 --disable-opx --disable-usnic --disable-rstream --enable-efa --with-cuda=/usr/local/cuda --enable-cuda-dlopen --with-gdrcopy --enable-gdrcopy-dlopen"
clean = "rm -rf $HOME/libfabric/install && make clean"
install = "make -j install"
run_command(f"cd $HOME/libfabric; {autogen} && {configure} && {clean} && {install}")

# Step 2: Compile Fabtests ~10s
configure = "./configure --prefix=$HOME/libfabric/fabtests/install --with-libfabric=$HOME/libfabric/install --with-cuda=/usr/local/cuda"
clean = "rm -rf $HOME/libfabric/fabtests/install && make clean"
run_command(f"cd $HOME/libfabric/fabtests; {autogen} && {configure} && {clean} && {install}")


# Step 3: Run fi_rdm_pingpong
print("\n\n Running Eager Send Data Tests \n\n")

executable = "/home/ec2-user/libfabric/fabtests/install/bin/fi_rdm_pingpong"
server_args = f" --pin-core 1 -i 0 -p efa -D cuda -w {warmup_iterations} -I {iterations} -S 1024 -E "
server_command = executable + server_args
client_command = "ssh " + client_ip + " " + executable + server_args + sever_ip

print(f"Running Server Command: {server_command}")
print(f"Running Client Command: {client_command}")
server_process = subprocess.Popen(server_command, stdout=subprocess.PIPE, stderr=None,
shell=True, universal_newlines=True)
sleep(5)

client_process = subprocess.Popen(client_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
shell=True, universal_newlines=True)

sleep(5)

try:
_, _ = server_process.communicate(timeout=720)
sleep(5)
_, _ = client_process.communicate(timeout=720)
except subprocess.TimeoutExpired:
server_process.terminate()
client_process.terminate()
raise RuntimeError("Client/Server Timed Out")


def parse_rdm_pingpong_output(filename):
libfabric_to_rdma_core = []
rdma_core = []
rdma_core_to_libfabric = []
post_recv_buff = []
empty_cq_progress = []
fruitful_cq_progress = []
fruitful_cq_progress_num_completions = []

with open(filename, "r") as f:
lines = f.readlines()

for line in lines:
if line.startswith("<fi_senddata> libfabric to rdma_core"):
data = line.strip().split(" ")
libfabric_to_rdma_core.append(int(data[5].strip(",")))
rdma_core.append(int(data[8].strip(",")))
rdma_core_to_libfabric.append(int(data[-1]))

elif line.startswith("<fi_senddata> post recv buff"):
post_recv_buff.append(int(line.strip().split(" ")[-1]))

elif line.startswith("<fi_senddata> empty cq progress"):
empty_cq_progress.append(int(line.strip().split(" ")[-1]))

elif line.startswith("<fi_senddata> fruitful cq progress"):
fruitful_cq_progress.append(int(line.strip().split(" ")[-1]))

elif line.startswith("<fi_senddata> num completion events in fruitful progress"):
fruitful_cq_progress_num_completions.append(int(line.strip().split(" ")[-1]))

return libfabric_to_rdma_core, rdma_core, rdma_core_to_libfabric, post_recv_buff, empty_cq_progress, fruitful_cq_progress, fruitful_cq_progress_num_completions

(server_libfabric_to_rdma_core, server_rdma_core, server_rdma_core_to_libfabric,
server_post_recv_buff, server_empty_cq_progress, server_fruitful_cq_progress,
server_fruitful_cq_progress_num_completions) = parse_rdm_pingpong_output("/home/ec2-user/libfabric/server_fi_senddata_output.txt")
(client_libfabric_to_rdma_core, client_rdma_core, client_rdma_core_to_libfabric,
client_post_recv_buff, client_empty_cq_progress, client_fruitful_cq_progress,
client_fruitful_cq_progress_num_completions) = parse_rdm_pingpong_output("/home/ec2-user/libfabric/client_fi_senddata_output.txt")

print("\n")

# Step 3: Get/print statistics

def get_and_print_statistics(vector, name):
import statistics
avg = statistics.mean(vector)
num_outliers = sum(i > avg * 5 for i in vector)
# TODO Remove this logic
# Remove outliers from vector
vector = [i for i in vector if i <= avg * 5]

# get a new average
avg = statistics.mean(vector)
pstd = statistics.pstdev(vector)
minimum = min(vector)
maximum = max(vector)
median = sorted(vector)[len(vector) // 2]
num_samples = len(vector)
unique = len(set(vector))

unit = "ns"
if "fruitful cq progress num completion" in name:
unit = None

print(f"{name} average {unit}: {avg}, pstd: {pstd}, min: {minimum}, max: {maximum}, median: {median}, num_samples: {num_samples}, unique: {unique}, num_outliers removed (5x mean): {num_outliers}")

if "fruitful cq progress num completion" in name:
# num completions is usually always 1, no hist
return

plt.figure()
plt.hist(vector, round(unique/5), range=[minimum-10, avg*2], align='mid')
plt.title(name)
plt.xlabel("Time (ns)")
plt.ylabel("Frequency")
plt.savefig(f"/home/ec2-user/libfabric/plots/{name}.png")
plt.close()


print("Server's fi_senddata stats:")
get_and_print_statistics(server_libfabric_to_rdma_core, "Server fi_senddata libfabric to rdma_core")
get_and_print_statistics(server_rdma_core, "Server fi_senddata rdma_core")
get_and_print_statistics(server_rdma_core_to_libfabric, "Server fi_senddata rdma_core to libfabric")
get_and_print_statistics(server_post_recv_buff, "Server fi_senddata post recv buff")
get_and_print_statistics(server_empty_cq_progress, "Server fi_senddata empty cq progress")
get_and_print_statistics(server_fruitful_cq_progress, "Server fi_senddata fruitful cq progress")
get_and_print_statistics(server_fruitful_cq_progress_num_completions, "Server fi_senddata fruitful cq progress num completion")

print("\n")

print("Client's fi_senddata stats:")
get_and_print_statistics(client_libfabric_to_rdma_core, "Client fi_senddata libfabric to rdma_core")
get_and_print_statistics(client_rdma_core, "Client fi_senddata rdma_core")
get_and_print_statistics(client_rdma_core_to_libfabric, "Client fi_senddata rdma_core to libfabric")
get_and_print_statistics(client_post_recv_buff, "Client fi_senddata post recv buff")
get_and_print_statistics(client_empty_cq_progress, "Client fi_senddata empty cq progress")
get_and_print_statistics(client_fruitful_cq_progress, "Client fi_senddata fruitful cq progress")
get_and_print_statistics(client_fruitful_cq_progress_num_completions, "Client fi_senddata fruitful cq progress num completion")


# Step 4: Run fi_rdm_rma_pingpong
print("\n\n Running RMA Write Data Tests \n\n")
executable = "/home/ec2-user/libfabric/fabtests/install/bin/fi_rma_pingpong"
server_args = f" --pin-core 1 -i 0 -o writedata -p efa -D cuda -w {warmup_iterations} -I {iterations} -S 131072 -E "
server_command = executable + server_args
client_command = "ssh " + client_ip + " " + executable + server_args + sever_ip

print(f"Running Server Command: {server_command}")
print(f"Running Client Command: {client_command}")
server_process = subprocess.Popen(server_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
shell=True, universal_newlines=True)

sleep(5)

client_process = subprocess.Popen(client_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
shell=True, universal_newlines=True)

sleep(5)

try:
server_output, _ = server_process.communicate(timeout=720)
sleep(5)
client_output, _ = client_process.communicate(timeout=720)
except subprocess.TimeoutExpired:
server_process.terminate()
client_process.terminate()
raise RuntimeError("Client/Server Timed Out")

def parse_rma_pingpong_output(filename):
libfabric_to_rdma_core = []
rdma_core = []
rdma_core_to_libfabric = []
empty_cq_progress = []
fruitful_cq_progress = []
fruitful_cq_progress_num_completions = []

with open(filename, "r") as f:
lines = f.readlines()

for line in lines:
if line.startswith("<fi_write> libfabric to rdma_core"):
data = line.strip().split(" ")
libfabric_to_rdma_core.append(int(data[5].strip(",")))
rdma_core.append(int(data[8].strip(",")))
rdma_core_to_libfabric.append(int(data[-1]))

elif line.startswith("<fi_write> empty cq progress"):
empty_cq_progress.append(int(line.strip().split(" ")[-1]))

elif line.startswith("<fi_write> fruitful cq progress"):
fruitful_cq_progress.append(int(line.strip().split(" ")[-1]))

elif line.startswith("<fi_write> num completion events in fruitful progress"):
fruitful_cq_progress_num_completions.append(int(line.strip().split(" ")[-1]))

return libfabric_to_rdma_core, rdma_core, rdma_core_to_libfabric, empty_cq_progress, fruitful_cq_progress, fruitful_cq_progress_num_completions

(server_libfabric_to_rdma_core, server_rdma_core, server_rdma_core_to_libfabric,
server_empty_cq_progress, server_fruitful_cq_progress,
server_fruitful_cq_progress_num_completions) = parse_rma_pingpong_output("/home/ec2-user/libfabric/server_fi_writedata_output.txt")
(client_libfabric_to_rdma_core, client_rdma_core, client_rdma_core_to_libfabric,
client_empty_cq_progress, client_fruitful_cq_progress,
client_fruitful_cq_progress_num_completions) = parse_rma_pingpong_output("/home/ec2-user/libfabric/client_fi_writedata_output.txt")


print("\n")

# Setp 5: Get/print statistics
print("Server's fi_writedata stats:")
get_and_print_statistics(server_libfabric_to_rdma_core, "Server fi_writedata libfabric to rdma_core")
get_and_print_statistics(server_rdma_core, "Server fi_writedata rdma_core")
get_and_print_statistics(server_rdma_core_to_libfabric, "Server fi_writedata rdma_core to libfabric")
get_and_print_statistics(server_empty_cq_progress, "Server fi_writedata empty cq progress")
get_and_print_statistics(server_fruitful_cq_progress, "Server fi_writedata fruitful cq progress")
get_and_print_statistics(server_fruitful_cq_progress_num_completions, "Server fi_writedata fruitful cq progress num completion")

print("\n")
print("Client's fi_writedata stats:")
get_and_print_statistics(client_libfabric_to_rdma_core, "Client fi_writedata libfabric to rdma_core")
get_and_print_statistics(client_rdma_core, "Client fi_writedata rdma_core")
get_and_print_statistics(client_rdma_core_to_libfabric, "Client fi_writedata rdma_core to libfabric")
get_and_print_statistics(client_empty_cq_progress, "Client fi_writedata empty cq progress")
get_and_print_statistics(client_fruitful_cq_progress, "Client fi_writedata fruitful cq progress")
get_and_print_statistics(client_fruitful_cq_progress_num_completions, "Client fi_writedata fruitful cq progress num completion")
Loading

0 comments on commit a2f0b02

Please sign in to comment.