2 Commits

Author SHA1 Message Date
d494228d77 alg benchmarking runner and processor 2025-11-05 12:08:34 +01:00
Erik Fabrizzi
79bc443bcb templating algsupport 2025-10-31 14:08:04 +01:00
45 changed files with 331 additions and 15908 deletions

11
.gitignore vendored
View File

@@ -0,0 +1,11 @@
# Ignore everything
*
# But not these!
!.gitignore
!README.md
!*.py
!*.template
# Optional: Keep subdirectories and their Python files
!*/

203
launch_alg_bench.py Executable file
View File

@@ -0,0 +1,203 @@
import os
import subprocess
from datetime import datetime
################ HELPER FUNCTIONS ################
def load_template(template_path: str) -> str:
    """Return the complete text of the template file at ``template_path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the locale of the machine the launcher runs on.

    Args:
        template_path: Path of the template file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(template_path, "r", encoding="utf-8") as handle:
        return handle.read()
def write_batch(batch_fpath: str, batch_content: str) -> None:
    """Write ``batch_content`` to ``batch_fpath``, replacing an existing file.

    The text is encoded as UTF-8 explicitly, so the written bytes do not
    depend on the locale of the machine the launcher runs on.

    Args:
        batch_fpath: Path of the batch script to create or overwrite.
        batch_content: Full text of the batch script.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    with open(batch_fpath, "w", encoding="utf-8") as handle:
        handle.write(batch_content)
################### SETUP DIRS ###################
# Working directories, all located under the current working directory.
# Each value keeps its trailing slash: it is part of the string the rest of
# the script receives.
output_dir = os.getcwd()+"/output/"
err_dir = os.getcwd()+"/error/"
batch_files_dir = os.getcwd()+"/batchs/"
data_dir = os.getcwd()+"/data/"
# exist_ok=True makes the creation idempotent and removes the gap between
# "check whether it exists" and "create it" that a separate isdir() test has.
for _work_dir in (output_dir, err_dir, data_dir, batch_files_dir):
    os.makedirs(_work_dir, exist_ok=True)
################ GLOBAL DEFAULTS #################
# Path of the IMB-MPI1 executable that the jobs run.
mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/mpi-benchmark-tool/bin/IMB-MPI1"
# Baseline values for the batch-script template placeholders. The time stamp
# is taken once, when this module is executed.
default_parameter = dict(
    time_stamp=datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
    job_name="",
    output_dir=os.getcwd()+"/output/",
    err_dir=os.getcwd()+"/error/",
    data_dir=os.getcwd()+"/data/",
    n_procs=18,
    off_cache_flag="",
    bin=mpi1_bin,
    n_nodes=1,
)
# Catalogue of the collectives to benchmark. Despite its name this is a list
# with one dict per collective:
#   'name' - the collective's name
#   'flag' - the I_MPI_ADJUST_* environment variable associated with it
#   'algs' - human-readable algorithm names (kept verbatim, including the
#            trailing space and the backtick used as apostrophe)
# NOTE(review): the entries appear to follow the order in which the Intel MPI
# documentation lists the algorithms of each I_MPI_ADJUST_<op> variable, and
# that documentation numbers them from 1 - confirm that any code deriving an
# algorithm id from the list position accounts for this.
algs_dic = [{'name': "Allgather",
'flag': "I_MPI_ADJUST_ALLGATHER",
'algs': [
"Recursive doubling ",
"Bruck`s ",
"Ring ",
"Topology aware Gatherv + Bcast ",
"Knomial ",
]},
{'name': "Allreduce",
'flag': "I_MPI_ADJUST_ALLREDUCE",
'algs': [
"Recursive doubling ",
"Rabenseifner`s ",
"Reduce + Bcast ",
"Topology aware Reduce + Bcast ",
"Binomial gather + scatter ",
"Topology aware binominal gather + scatter ",
"Shumilin`s ring ",
"Ring ",
"Knomial ",
"Topology aware SHM-based flat ",
"Topology aware SHM-based Knomial ",
"Topology aware SHM-based Knary ",
]},
{'name': "Alltoall",
'flag': "I_MPI_ADJUST_ALLTOALL",
'algs': [
"Bruck`s ",
"Isend/Irecv + waitall ",
"Pair wise exchange ",
"Plum`s ",
]},
{'name': "Barrier",
'flag': "I_MPI_ADJUST_BARRIER",
'algs': [
"Dissemination ",
"Recursive doubling ",
"Topology aware dissemination ",
"Topology aware recursive doubling ",
"Binominal gather + scatter ",
"Topology aware binominal gather + scatter ",
"Topology aware SHM-based flat ",
"Topology aware SHM-based Knomial ",
"Topology aware SHM-based Knary ",
]},
{'name': "Bcast",
'flag': "I_MPI_ADJUST_BCAST",
'algs': [
"Binomial ",
"Recursive doubling ",
"Ring ",
"Topology aware binomial ",
"Topology aware recursive doubling ",
"Topology aware ring ",
"Shumilin`s ",
"Knomial ",
"Topology aware SHM-based flat ",
"Topology aware SHM-based Knomial ",
"Topology aware SHM-based Knary ",
"NUMA aware SHM-based (SSE4.2) ",
"NUMA aware SHM-based (AVX2) ",
"NUMA aware SHM-based (AVX512) ",
]},
{'name': "Gather",
'flag': "I_MPI_ADJUST_GATHER",
'algs': [
"Binomial ",
"Topology aware binomial ",
"Shumilin`s ",
"Binomial with segmentation ",
]},
{'name': "Reduce_scatter",
'flag': "I_MPI_ADJUST_REDUCE_SCATTER",
'algs': [
"Recursive halving ",
"Pair wise exchange ",
"Recursive doubling ",
"Reduce + Scatterv ",
"Topology aware Reduce + Scatterv ",
]},
{'name': "Reduce",
'flag': "I_MPI_ADJUST_REDUCE",
'algs': [
"Shumilin`s ",
"Binomial ",
"Topology aware Shumilin`s ",
"Topology aware binomial ",
"Rabenseifner`s ",
"Topology aware Rabenseifner`s ",
"Knomial ",
"Topology aware SHM-based flat ",
"Topology aware SHM-based Knomial ",
"Topology aware SHM-based Knary ",
"Topology aware SHM-based binomial ",
]},
{'name': "Scatter",
'flag': "I_MPI_ADJUST_SCATTER",
'algs': [
"Binomial ",
"Topology aware binomial ",
"Shumilin`s ",
]},
]
log = ""
############## MULTIPLE-NODE LAUNCH ##############
# For every (off-cache flag, node count, collective, algorithm) combination:
# render the multinode template, write it to a batch file and submit it with
# sbatch. What sbatch printed is collected and shown once at the end.
off_cache_flags = [
    "-off_cache -1",
    "-off_cache 50",
    "",  # no off-cache option on the command line
]
ndcnt = [
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
]
proc_per_node = 72
# Work on a copy so the shared defaults are never modified.
multiple_node_parameter = dict(default_parameter)
multiple_node_template = load_template("./templates/multinode_algs.template")
log_entries = []
for flag in off_cache_flags:
    multiple_node_parameter["off_cache_flag"] = flag
    for n_nodes in ndcnt:
        n_procs = n_nodes*proc_per_node
        multiple_node_parameter["n_procs"] = int(n_procs)
        multiple_node_parameter["n_nodes"] = n_nodes
        for alg_conf in algs_dic:
            collective = alg_conf['name']
            multiple_node_parameter["job_name"] = collective
            multiple_node_parameter["alg_flag"] = alg_conf['flag']
            algs = alg_conf["algs"]
            # Intel MPI numbers the algorithms of each I_MPI_ADJUST_<op>
            # variable from 1; the value 0 requests the library's automatic
            # selection. Counting from 0 would run every algorithm under the
            # name of its successor and never run the last one.
            for idx, alg in enumerate(algs, start=1):
                multiple_node_parameter["alg_name"] = alg
                multiple_node_parameter["alg_idx"] = idx
                # NOTE: the file name depends only on collective and algorithm,
                # so later flag/node-count iterations overwrite the script
                # after it has been submitted.
                batch_file = os.path.join(batch_files_dir,
                                          f"{collective}_{alg.strip().replace('`','').replace(' ','_').replace('/','_')}.sh")
                write_batch(batch_file,
                            multiple_node_template.format(**multiple_node_parameter))
                result = subprocess.run(["sbatch", batch_file],
                                        capture_output=True, text=True)
                log_entries.append(f"#{collective} {n_procs}" + "\n")
                log_entries.append("\tSTDOUT:" + result.stdout + "\n")
                log_entries.append("\tSTDERR:" + result.stderr + "\n")
log = "".join(log_entries)
print(log)

112
postprocess_data_algs.py Executable file
View File

@@ -0,0 +1,112 @@
from venv import create
import pandas as pd
import os
# Substrings used to recognise the relevant lines of a benchmark data file.
# A marker "matches" when it occurs anywhere in a line (substring test).
# "min_bytelen" and "max_bytelen" are not referenced by the parsing code
# visible in this file.
data_markers = {
"block_separator": "#----------------------------------------------------------------",
"benchmark_type": "# Benchmarking",
"processes_num": "# #processes = ",
"min_bytelen": "# Minimum message length in bytes",
"max_bytelen": "# Maximum message length in bytes",
"mpi_datatype": "# MPI_Datatype :",
"mpi_red_datatype": "# MPI_Datatype for reductions :",
"mpi_red_op": "# MPI_Op",
"end_of_table": "# All processes entering MPI_Finalize",
# The four markers below identify extra metadata lines; the value is taken
# from the text after the marker.
"creation_time": "# CREATION_TIME :",
"n_nodes": "# N_NODES :",
"off_cache_flag": "# OFF_CACHE_FLAG :",
"algorithm":"# ALGORITHM :"
}
# Column headers of the resulting table, in row order: benchmark type and
# process count first, then the numeric fields of one result line, then the
# per-file metadata values. The order must stay in sync with the rows built
# by the parsing loop.
column_names = [
"benchmark_type",
"proc_num",
"msg_size_bytes",
"repetitions",
"t_min_usec",
"t_max_usec",
"t_avg_usec",
"mpi_datatype",
"mpi_red_datatype",
"mpi_red_op",
"creation_time",
"n_nodes",
"off_cache_flag",
"algorithm"
]
# Parse every file in data/ into rows of the result table and write data.csv.
#
# A file is read with a small state machine driven by the separator lines:
#   * before the first separator: metadata lines (creation time, node count,
#     off-cache flag, algorithm);
#   * separators toggle `in_header`; closing the first header block sets
#     `past_preheader`, closing the second one opens the result table
#     (`in_body`);
#   * between the two header blocks: MPI datatype / reduction settings;
#   * inside the second header block: benchmark name and process count.

# Numeric fields of a complete result line: msg_size_bytes, repetitions,
# t_min_usec, t_max_usec, t_avg_usec.
numeric_field_count = 5

data = list()
for file in os.listdir("data/"):
    with open("data/"+file, 'r') as f:
        lines = f.readlines()
    # Parser state and per-file values; "NA" marks a value not found.
    past_preheader = False
    in_header = False
    in_body = False
    btype = "NA"
    proc_num = "NA"
    mpi_datatype = "NA"
    mpi_red_datatype = "NA"
    mpi_red_op = "NA"
    creation_time = "NA"
    n_nodes = "NA"
    off_cache_flag = "NA"
    algorithm = "NA"
    for line in lines:
        if data_markers["block_separator"] in line:
            if in_header and not past_preheader:
                past_preheader = True
            elif in_header and past_preheader:
                in_body = True
            in_header = not in_header
            continue
        if not in_header and not in_body and past_preheader:
            if data_markers["mpi_datatype"] in line:
                mpi_datatype = line.split()[-1]
            elif data_markers["mpi_red_datatype"] in line:
                mpi_red_datatype = line.split()[-1]
            elif data_markers["mpi_red_op"] in line:
                mpi_red_op = line.split()[-1]
        if not in_header and not in_body and not past_preheader:
            if data_markers["n_nodes"] in line:
                n_nodes = line.split()[-1]
            if data_markers["creation_time"] in line:
                creation_time = line.split()[-1]
            if data_markers["off_cache_flag"] in line:
                off_cache_flag = line.split(":")[-1].strip()
                if off_cache_flag == "":
                    off_cache_flag = "NA"
                else:
                    off_cache_flag = off_cache_flag.replace("-off_cache", "")
            if data_markers["algorithm"] in line:
                algorithm = line.split(":")[-1].strip()
        if past_preheader and in_header:
            if data_markers["benchmark_type"] in line:
                btype = line.split()[2]
            if data_markers["processes_num"] in line:
                proc_num = int(line.split()[3])
        if in_body:
            # The end-of-table marker contains "#", so it must be tested
            # before the generic comment/blank-line skip; otherwise the
            # break can never be reached.
            if data_markers["end_of_table"] in line:
                break
            if "#" in line or "".join(line.split()) == "":
                continue
            # Lines flagged by the benchmark as unusable carry no timings.
            if "int-overflow" in line or "out-of-mem" in line:
                continue
            values = [int(s) if s.isdigit() else float(s)
                      for s in line.split()]
            # IMB prints the Barrier table without a #bytes column; pad the
            # missing message size so the remaining fields stay aligned
            # with column_names.
            if len(values) == numeric_field_count - 1:
                values.insert(0, None)
            data.append([btype, proc_num] + values +
                        [
                            mpi_datatype,
                            mpi_red_datatype,
                            mpi_red_op,
                            creation_time,
                            n_nodes,
                            off_cache_flag,
                            algorithm
                        ])
df = pd.DataFrame(data, columns=column_names)
df.to_csv("data.csv", index=False)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 645 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 696 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 683 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 376 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 389 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 243 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 266 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 232 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 581 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 203 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 197 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 657 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 637 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 580 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 623 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 586 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 588 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 644 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 527 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 572 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 662 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 648 KiB

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,175 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "da7c16b4",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.optimize import curve_fit\n",
"from matplotlib.cm import get_cmap"
]
},
{
"cell_type": "markdown",
"id": "47341b1d",
"metadata": {},
"source": [
"# Alltoall "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cc39aab",
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mnotebook controller is DISPOSED. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"df_multinode = pd.read_csv(\"../data/data-multi-defand100cflag.csv\",delimiter = \",\")\n",
"df_multinode['benchmark_type'].unique()\n",
"df_gather = df_multinode[df_multinode[\"benchmark_type\"]==\"Bcast\"][df_multinode['msg_size_bytes']>1024][df_multinode['off_cache_flag']==-1]\n",
"df_gather.columns.tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4336d3c6",
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mnotebook controller is DISPOSED. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"def model(proc_num, alpha, beta, msg_size):\n",
" return (alpha * msg_size * (proc_num - 72) * 72) / (12.5 * 1e3) + 1e6*beta\n",
"\n",
"results = []\n",
"msg_sizes = sorted(df_gather['msg_size_bytes'].unique())\n",
"n_rows = int(np.ceil(len(msg_sizes) / 3))\n",
"n_cols = min(len(msg_sizes), 3)\n",
"fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows), squeeze=False)\n",
"cmap = get_cmap('tab10')\n",
"\n",
"for idx, (msg_size, group) in enumerate(df_gather.groupby('msg_size_bytes')):\n",
" x = group['proc_num'].values.copy()\n",
" y = group['t_avg_usec'].values.copy()\n",
" sorted_indices = np.argsort(x)\n",
" x = x[sorted_indices]\n",
" y = y[sorted_indices]\n",
" fit_func = lambda proc_num, alpha, beta: model(proc_num, alpha, beta, msg_size)\n",
" popt, _ = curve_fit(fit_func, x, y, bounds=([1, 0], [np.inf, np.inf]))\n",
" alpha, beta = popt\n",
" results.append({'msg_size_bytes': msg_size, 'alpha': alpha, 'beta': beta})\n",
"\n",
" x_fit = np.linspace(min(x), max(x), 100)\n",
" y_fit = fit_func(x_fit, alpha, beta)\n",
" y_speed = model(x_fit,1,0,msg_size)\n",
" row, col = divmod(idx, n_cols)\n",
" ax = axes[row][col]\n",
"\n",
" color = cmap(idx % 10)\n",
" # ax.scatter(x, y/1e6, label='Data', color=color)\n",
" ax.plot(x, y/1e6, label='Data', color=color)\n",
" # ax.plot(x_fit, y_fit/1e6, linestyle='--', color=color, alpha=0.5, label='Fit')\n",
" # ax.plot(x_fit, y_speed/1e6, linestyle='--', color='red', alpha=0.1, label='Fit')\n",
" ax.set_title(f'msg_size: {msg_size} bytes')\n",
" ax.set_xlabel('num. proc.')\n",
" ax.set_ylabel('Average Time [s]')\n",
" ax.set_xticks(x)\n",
" ax.grid(True)\n",
" max_data =(x[-1]-72)*72*msg_size\n",
" min_data =(x[0]-72)*72*msg_size\n",
"\n",
" textstr = \"\"\n",
" # if(max_data > 1e9):\n",
" # textstr+=f\"max data = {max_data/1e9:0.2f}GB\\n\" \n",
" # else:\n",
" # textstr+=f\"max data = {max_data/1e6:0.2f}MB\\n\" \n",
"\n",
" # if(min_data > 1e9):\n",
" # textstr+=f\"min data = {min_data/1e9:0.2f}GB\\n\" \n",
" # else:\n",
" # textstr+=f\"min data = {min_data/1e6:0.2f}MB\\n\" \n",
" # textstr += r\"$\\alpha$\" +f\"= {alpha:.3e}\\n\"+r\"$b_{eff}=$\"+f\"{12.5/alpha:0.3f}Gbps\\n\"+\\\n",
" # r\"$\\beta$\"+f\"= {beta:.3e} s\"\n",
" # ax.text(0.95, 0.05, textstr, transform=ax.transAxes,\n",
" # fontsize=10, verticalalignment='bottom',\n",
" # horizontalalignment='right',\n",
" # bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))\n",
"\n",
"fig.suptitle('Alltoall Time Fit per Message Size\\nDots = Data Points | Dashed Lines = Fits\\n off_mem=-1', fontsize=14)\n",
"fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n",
"# plt.savefig(\"plots/alltoall_analysis.png\",dpi=300)\n",
"plt.show()\n",
"\n",
"fit_results = pd.DataFrame(results)\n",
"fit_results['inv_alpha'] = 1 / fit_results['alpha']\n",
"print(fit_results)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce632d6f",
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mnotebook controller is DISPOSED. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"df_gather[df_gather['msg_size_bytes']==1048576]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "data",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,69 +0,0 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit
import matplotlib.cm as cm
def max_transfer_size(msg_size, np_procs, benchmark_type):
    """Return the transfer-size estimate used for ``benchmark_type``.

    Args:
        msg_size: Message size of the measurement.
        np_procs: Number of processes of the measurement.
        benchmark_type: Benchmark name, e.g. ``'Allgather'`` or ``'Bcast'``.

    Returns:
        The estimate for the given benchmark, or ``None`` when the benchmark
        name is not one of the handled ones.

    NOTE(review): the constant 72 is presumably the number of processes per
    node - confirm. The Scatter, Gather and Reduce_scatter formulas were
    marked as uncertain ("?") by the original author.
    """
    if benchmark_type == 'Bcast':
        return msg_size
    if benchmark_type == 'Gather':
        return (np_procs)*msg_size
    if benchmark_type == 'Alltoall':
        return 72*(np_procs-72)*msg_size
    # Allgather and Scatter share one formula.
    if benchmark_type in ('Allgather', 'Scatter'):
        return (np_procs-72)*msg_size
    # The three reduction-style benchmarks share one formula.
    if benchmark_type in ('Reduce_scatter', 'Allreduce', 'Reduce'):
        return 0.25*(np_procs-72)*(1/72)*msg_size
    return None
data_file = "data/data-multi-defand100cflag.csv"
df_multinode = pd.read_csv(data_file, delimiter=',')
# Keep only the measurements recorded with off_cache_flag == 100.
df_multinode_offdef = df_multinode[df_multinode['off_cache_flag'] == 100]
benchmarks = df_multinode_offdef['benchmark_type'].unique().tolist()
# Skip benchmarks whose name ends in 'v' (presumably the vector variants such
# as Allgatherv - confirm).
benchmarks = [x for x in benchmarks if not x.endswith('v')]
print(benchmarks)
# Both conditions are combined into a single mask built from the frame being
# filtered (chained [..][..] indexing relies on pandas re-aligning the second
# mask and triggers a warning). .copy() makes the result an independent frame
# so the column assignments below do not write into a slice.
df_multinode_offdef = df_multinode_offdef[
    df_multinode_offdef['benchmark_type'].isin(benchmarks)
    & (df_multinode_offdef['msg_size_bytes'] > 1000)
].copy()
df_multinode_offdef["max_transfer"] = df_multinode_offdef.apply(
    lambda row: max_transfer_size(
        msg_size=row["msg_size_bytes"],
        np_procs=row["proc_num"],
        benchmark_type=row["benchmark_type"]
    ),
    axis=1
)
df_multinode_offdef["bytes/usec"] = df_multinode_offdef["max_transfer"] / \
    df_multinode_offdef["t_avg_usec"]
df_multinode_offdef = df_multinode_offdef[
    ~df_multinode_offdef['benchmark_type'].isin(['Allgather', 'Alltoall'])
]
# NOTE: this selection drops "max_transfer" and "bytes/usec" again, so the
# plot below only shows the average time.
df_multinode_offdef = df_multinode_offdef[['benchmark_type','msg_size_bytes','t_avg_usec','proc_num']]
plt.figure(figsize=(16, 9))
sns.barplot(
    data=df_multinode_offdef,
    x="benchmark_type",
    y="t_avg_usec",
    dodge=True,
    # Cast to str so every message size becomes its own hue category.
    hue=df_multinode_offdef["msg_size_bytes"].astype(str),
)
# plt.yscale("log")
plt.title("Average Time (usec) per Benchmark Type and Message Size")
plt.ylabel("Average Time (usec)")
plt.xlabel("Benchmark Type")
plt.xticks(rotation=45)
plt.legend(title="Message Size (bytes)")
plt.tight_layout()
# plt.show()
plt.savefig("./plots/benchmark_avg_time_barplot.png", dpi=300)
plt.close()

View File

@@ -1,64 +0,0 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
data_file = "data/data-multi-defand100cflag.csv"
df_multinode = pd.read_csv(data_file, delimiter=',')
# Keep only the measurements recorded with off_cache_flag == 100.
df_multinode_offdef = df_multinode[df_multinode['off_cache_flag'] == 100]
df_multinode_offdef = df_multinode_offdef[['benchmark_type','msg_size_bytes','t_avg_usec','proc_num']]
benchmarks = df_multinode_offdef['benchmark_type'].unique().tolist()
# Skip benchmarks whose name ends in 'v' (presumably the vector variants such
# as Allgatherv - confirm).
benchmarks = [x for x in benchmarks if not x.endswith('v')]
# One combined mask instead of chained [..][..] indexing, which relies on
# pandas re-aligning the second mask and triggers a warning.
df_multinode_offdef = df_multinode_offdef[
    df_multinode_offdef['benchmark_type'].isin(benchmarks)
    & (df_multinode_offdef['msg_size_bytes'] > 1000)
]
fast_benchmarks = ["Allreduce","Bcast","Reduce","Reduce_scatter"]
df_multinode_offdef = df_multinode_offdef[df_multinode_offdef["benchmark_type"].isin(fast_benchmarks)]

# --- Bar plot: average time per benchmark, one bar per message size ---
plt.figure(figsize=(16, 9))
sns.barplot(
    data=df_multinode_offdef,
    x="benchmark_type",
    y="t_avg_usec",
    dodge=True,
    # Cast to str so every message size becomes its own hue category.
    hue=df_multinode_offdef["msg_size_bytes"].astype(str),
)
plt.ylim(0)
plt.title("Average Time (usec) per Benchmark Type and Message Size")
plt.ylabel("Average Time (usec)")
plt.xlabel("Benchmark Type")
plt.xticks(rotation=45)
plt.legend(title="Message Size (bytes)")
plt.tight_layout()
plt.savefig("./plots/fbenchmarks_avg_time_barplot.png", dpi=300)
plt.close()

# --- Surface plot: Allreduce average time over process count x message size ---
df_allreduce = df_multinode_offdef[df_multinode_offdef["benchmark_type"]=="Allreduce"]
df_allreduce = df_allreduce[['msg_size_bytes','t_avg_usec','proc_num']]
df_allreduce = df_allreduce[df_allreduce['msg_size_bytes']>2**17]
# Rows = message sizes, columns = process counts, cells = average time.
pivot = df_allreduce.pivot(index="msg_size_bytes", columns="proc_num", values="t_avg_usec")
X = pivot.columns.values  # proc_num
Y = pivot.index.values  # msg_size_bytes
X, Y = np.meshgrid(X, Y)
Z = pivot.values
fig = plt.figure(figsize=(16, 9))
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap="viridis", edgecolor='k')
cbar = fig.colorbar(surf, ax=ax, shrink=0.6, pad=0.01, location='left')
cbar.set_label("Average Time (μs)")
ax.set_xlabel("Process Count")
ax.set_ylabel("Message Size (B)")
ax.set_zlabel("Average Time (μs)")
ax.set_title("Allreduce")
ax.set_xticks(pivot.columns.values)  # use the actual process count values
ax.set_xticklabels(pivot.columns.values)
ax.set_yticks(Y[:, 0])
ymin, ymax = ax.get_ylim()
# Scale the lower y-limit to 80% of its value; the upper limit is unchanged.
ax.set_ylim(ymin*0.8, ymax)
# Label the message-size ticks as powers of two.
ax.set_yticklabels([f"$2^{{{int(np.log2(v))}}}$" for v in Y[:, 0]])
plt.tight_layout()
plt.savefig("./plots/allreduce_surface.png", dpi=300)
plt.close()

View File

@@ -1,66 +0,0 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
data_file = "data/data-multi-defand100cflag.csv"
df_multinode = pd.read_csv(data_file, delimiter=',')
# Keep only the measurements recorded with off_cache_flag == 100.
df_multinode_offdef = df_multinode[df_multinode['off_cache_flag'] == 100]
df_multinode_offdef = df_multinode_offdef[['benchmark_type','msg_size_bytes','t_avg_usec','proc_num']]
benchmarks = df_multinode_offdef['benchmark_type'].unique().tolist()
# Skip benchmarks whose name ends in 'v' (presumably the vector variants such
# as Allgatherv - confirm).
benchmarks = [x for x in benchmarks if not x.endswith('v')]
# One combined mask instead of chained [..][..] indexing, which relies on
# pandas re-aligning the second mask and triggers a warning.
df_multinode_offdef = df_multinode_offdef[
    df_multinode_offdef['benchmark_type'].isin(benchmarks)
    & (df_multinode_offdef['msg_size_bytes'] > 1000)
]
medium_benchmarks = ["Gather","Scatter"]
df_multinode_offdef = df_multinode_offdef[df_multinode_offdef["benchmark_type"].isin(medium_benchmarks)]

# --- Bar plot: average time per benchmark, one bar per message size ---
plt.figure(figsize=(16, 9))
sns.barplot(
    data=df_multinode_offdef,
    x="benchmark_type",
    y="t_avg_usec",
    dodge=True,
    # Cast to str so every message size becomes its own hue category.
    hue=df_multinode_offdef["msg_size_bytes"].astype(str),
)
plt.ylim(0)
plt.title("Average Time (usec) per Benchmark Type and Message Size")
plt.ylabel("Average Time (usec)")
plt.xlabel("Benchmark Type")
plt.xticks(rotation=45)
plt.legend(title="Message Size (bytes)")
plt.tight_layout()
plt.savefig("./plots/mbenchmarks_avg_time_barplot.png", dpi=300)
plt.close()

# --- Surface plot: Gather average time over process count x message size ---
df_gather = df_multinode_offdef[df_multinode_offdef['benchmark_type']=='Gather']
df_gather = df_gather[['msg_size_bytes','t_avg_usec','proc_num']]
df_gather = df_gather[df_gather['msg_size_bytes']>2**17]
# Rows = message sizes, columns = process counts, cells = average time.
pivot = df_gather.pivot(index="msg_size_bytes", columns="proc_num", values="t_avg_usec")
X = pivot.columns.values  # proc_num
Y = pivot.index.values  # msg_size_bytes
X, Y = np.meshgrid(X, Y)
Z = pivot.values
fig = plt.figure(figsize=(16, 9))
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap="viridis", edgecolor='k')
cbar = fig.colorbar(surf, ax=ax, shrink=0.6, pad=0.01, location='left')
cbar.set_label("Average Time (μs)")
ax.set_xlabel("Process Count")
ax.set_ylabel("Message Size (B)")
ax.set_zlabel("Average Time (μs)")
ax.set_title("Gather")
ax.set_xticks(pivot.columns.values)  # use the actual process count values
ax.set_xticklabels(pivot.columns.values)
ax.set_yticks(Y[:, 0])
ymin, ymax = ax.get_ylim()
# Scale the lower y-limit to 80% of its value; the upper limit is unchanged.
ax.set_ylim(ymin*0.8, ymax)
# Label the message-size ticks as powers of two.
ax.set_yticklabels([f"$2^{{{int(np.log2(v))}}}$" for v in Y[:, 0]])
plt.tight_layout()
plt.savefig("./plots/gather_surface.png", dpi=300)
plt.close()

View File

@@ -1,93 +0,0 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
def _save_surface_plot(df, benchmark, out_path):
    """Save a 3-D surface of average time for one benchmark as a PNG.

    Args:
        df: Frame with the columns benchmark_type, msg_size_bytes,
            t_avg_usec and proc_num.
        benchmark: Value of benchmark_type to plot; also used as the title.
        out_path: Path of the image file to write.
    """
    subset = df[df['benchmark_type'] == benchmark]
    subset = subset[['msg_size_bytes', 't_avg_usec', 'proc_num']]
    subset = subset[subset['msg_size_bytes'] > 2**17]
    # Rows = message sizes, columns = process counts, cells = average time.
    pivot = subset.pivot(index="msg_size_bytes", columns="proc_num", values="t_avg_usec")
    X = pivot.columns.values  # proc_num
    Y = pivot.index.values  # msg_size_bytes
    X, Y = np.meshgrid(X, Y)
    Z = pivot.values
    fig = plt.figure(figsize=(16, 9))
    ax = fig.add_subplot(111, projection='3d')
    surf = ax.plot_surface(X, Y, Z, cmap="viridis", edgecolor='k')
    cbar = fig.colorbar(surf, ax=ax, shrink=0.6, pad=0.01, location='left')
    cbar.set_label("Average Time (μs)")
    ax.set_xlabel("Process Count")
    ax.set_ylabel("Message Size (B)")
    ax.set_zlabel("Average Time (μs)")
    ax.set_title(benchmark)
    ax.set_xticks(pivot.columns.values)  # use the actual process count values
    ax.set_xticklabels(pivot.columns.values)
    ax.set_yticks(Y[:, 0])
    ymin, ymax = ax.get_ylim()
    # Scale the lower y-limit to 80% of its value; the upper limit is unchanged.
    ax.set_ylim(ymin*0.8, ymax)
    # Label the message-size ticks as powers of two.
    ax.set_yticklabels([f"$2^{{{int(np.log2(v))}}}$" for v in Y[:, 0]])
    plt.tight_layout()
    plt.savefig(out_path, dpi=300)
    plt.close()


data_file = "data/data-multi-defand100cflag.csv"
df_multinode = pd.read_csv(data_file, delimiter=',')
# Keep only the measurements recorded with off_cache_flag == 100.
df_multinode_offdef = df_multinode[df_multinode['off_cache_flag'] == 100]
df_multinode_offdef = df_multinode_offdef[['benchmark_type','msg_size_bytes','t_avg_usec','proc_num']]
benchmarks = df_multinode_offdef['benchmark_type'].unique().tolist()
# Skip benchmarks whose name ends in 'v' (presumably the vector variants such
# as Allgatherv - confirm).
benchmarks = [x for x in benchmarks if not x.endswith('v')]
# One combined mask instead of chained [..][..] indexing, which relies on
# pandas re-aligning the second mask and triggers a warning.
df_multinode_offdef = df_multinode_offdef[
    df_multinode_offdef['benchmark_type'].isin(benchmarks)
    & (df_multinode_offdef['msg_size_bytes'] > 1000)
]
slow_benchmarks = ["Alltoall","Allgather"]
df_multinode_offdef = df_multinode_offdef[df_multinode_offdef["benchmark_type"].isin(slow_benchmarks)]

# --- Bar plot: average time per benchmark, one bar per message size ---
plt.figure(figsize=(16, 9))
sns.barplot(
    data=df_multinode_offdef,
    x="benchmark_type",
    y="t_avg_usec",
    dodge=True,
    # Cast to str so every message size becomes its own hue category.
    hue=df_multinode_offdef["msg_size_bytes"].astype(str),
)
plt.ylim(0)
plt.title("Average Time (usec) per Benchmark Type and Message Size")
plt.ylabel("Average Time (usec)")
plt.xlabel("Benchmark Type")
plt.xticks(rotation=45)
plt.legend(title="Message Size (bytes)")
plt.tight_layout()
plt.savefig("./plots/sbenchmarks_avg_time_barplot.png", dpi=300)
plt.close()

# --- Surface plots: one per slow benchmark ---
_save_surface_plot(df_multinode_offdef, "Alltoall", "./plots/alltoall_surface.png")
_save_surface_plot(df_multinode_offdef, "Allgather", "./plots/allgather_surface.png")

View File

@@ -1,11 +1,12 @@
#!/bin/bash -l #!/bin/bash -l
#SBATCH --job-name={job_name}_{n_procs} #SBATCH --job-name={job_name}_{n_procs}_{alg_idx}
#SBATCH --output={output_dir}{job_name}_{n_procs}.out #SBATCH --output={output_dir}{job_name}_{n_procs}.out
#SBATCH --error={err_dir}{job_name}_{n_procs}.err #SBATCH --error={err_dir}{job_name}_{n_procs}.err
#SBATCH --nodes={n_nodes} #SBATCH --nodes={n_nodes}
#SBATCH --nodelist=f01[01-64] #SBATCH --nodelist=f01[01-64]
#SBATCH --time=00:30:00 #SBATCH --time=00:30:00
#SBATCH --export=NONE #SBATCH --export=NONE
# SwitchName=fswibl01 Level=0 LinkSpeed=1 Nodes=f01[01-64] # SwitchName=fswibl01 Level=0 LinkSpeed=1 Nodes=f01[01-64]
# SwitchName=fswibl02 Level=0 LinkSpeed=1 Nodes=f02[01-64] # SwitchName=fswibl02 Level=0 LinkSpeed=1 Nodes=f02[01-64]
# SwitchName=fswibl03 Level=0 LinkSpeed=1 Nodes=f03[01-64] # SwitchName=fswibl03 Level=0 LinkSpeed=1 Nodes=f03[01-64]
@@ -19,17 +20,19 @@
# SwitchName=fswibl11 Level=0 LinkSpeed=1 Nodes=f08[01-64] # SwitchName=fswibl11 Level=0 LinkSpeed=1 Nodes=f08[01-64]
# SwitchName=fswibl12 Level=0 LinkSpeed=1 Nodes=f09[01-64] # SwitchName=fswibl12 Level=0 LinkSpeed=1 Nodes=f09[01-64]
# SwitchName=fswibl13 Level=0 LinkSpeed=1 Nodes=f10[01-64] # SwitchName=fswibl13 Level=0 LinkSpeed=1 Nodes=f10[01-64]
unset SLURM_EXPORT_ENV unset SLURM_EXPORT_ENV
module load intel intelmpi module load intel intelmpi
export I_MPI_ADJUST_{capital_jobname}={algnumber} export {alg_flag}={alg_idx}
OUTPUT_FILENAME="{data_dir}/{job_name}_$SLURM_JOB_ID.dat" OUTPUT_FILENAME="{data_dir}/{job_name}_$SLURM_JOB_ID.dat"
echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME
echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME
echo "# OFF_CACHE_FLAG : {off_cache_flag}">> $OUTPUT_FILENAME echo "# OFF_CACHE_FLAG : {off_cache_flag}">> $OUTPUT_FILENAME
echo "# ALGORITHM : {alg_name}">> $OUTPUT_FILENAME
srun --cpu-freq=2000000-2000000:performance -N {n_nodes} -n{n_procs} {bin} {job_name} -npmin {n_procs} {off_cache_flag} -mem 2 -time 60 >> $OUTPUT_FILENAME srun --cpu-freq=2000000-2000000:performance -N {n_nodes} -n{n_procs} {bin} {job_name} -npmin {n_procs} {off_cache_flag} -mem 2 -time 60 >> $OUTPUT_FILENAME