From 8b80f1fd28961957bbb2725cc686e86851aeca4d Mon Sep 17 00:00:00 2001
From: Erik Fabrizzi
Date: Wed, 28 May 2025 19:46:24 +0200
Subject: [PATCH] initial_commit

---
 .gitignore                 | 11 ++++++
 README.md                  |  2 +
 launch_bench_multinode.py  | 79 ++++++++++++++++++++++++++++++++++++++
 launch_bench_singlenode.py | 79 ++++++++++++++++++++++++++++++++++++++
 postprocess_data.py        | 72 ++++++++++++++++++++++++++++++
 templates/bench.template   | 18 +++++++++
 6 files changed, 261 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 launch_bench_multinode.py
 create mode 100755 launch_bench_singlenode.py
 create mode 100755 postprocess_data.py
 create mode 100644 templates/bench.template

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2985d00
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+# Ignore everything
+*
+
+# But not these!
+!.gitignore
+!README.md
+!*.py
+!*.template
+# Optional: Keep subdirectories and their Python files
+
+!*/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9d3e00b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# Benchmarking Scripts for IMB
+
diff --git a/launch_bench_multinode.py b/launch_bench_multinode.py
new file mode 100755
index 0000000..d9c9439
--- /dev/null
+++ b/launch_bench_multinode.py
@@ -0,0 +1,79 @@
+import os
+import subprocess
+from datetime import datetime
+
+
+def load_template(template_path: str):
+    output_template = ""
+    with open(template_path, "r") as handle:
+        output_template = handle.read()
+    return output_template
+
+
+def write_batch(batch_fpath: str, batch_content: str):
+    with open(batch_fpath, "w") as handle:
+        _ = handle.write(batch_content)
+
+
+collectives = ["Reduce",
+               # "Reduce_scatter",
+               # "Reduce_scatter_block",
+               # "Allreduce",
+               # "Allgather",
+               # "Allgatherv",
+               # "Scatter",
+               # "Scatterv",
+               # "Gather",
+               # "Gatherv",
+               # "Alltoall",
+               # "Bcast",
+               # "Barrier"
+               ]
+
+procnt = [
+    18,
+    # 36,
+    # 54,
+    # 72
+]
+mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin"
+slurm_template = load_template("templates/bench.template")
+
+template_parameter = {"time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
+                      "job_name": "",
+                      "output_dir": os.getcwd()+"/output/",
+                      "err_dir": os.getcwd()+"/error/",
+                      "data_dir": os.getcwd()+"/data/",
+                      "n_procs": 18,
+                      "off_mem_flag": "",
+                      "bin": mpi1_bin
+                      }
+
+output_dir = os.getcwd()+"/output/"
+err_dir = os.getcwd()+"/error/"
+batch_files_dir = os.getcwd()+"/batchs/"
+data_dir = os.getcwd()+"/data/"
+
+if os.path.isdir(output_dir) == False:
+    os.mkdir(output_dir)
+if os.path.isdir(err_dir) == False:
+    os.mkdir(err_dir)
+if os.path.isdir(data_dir) == False:
+    os.mkdir(data_dir)
+if os.path.isdir(batch_files_dir) == False:
+    os.mkdir(batch_files_dir)
+
+log = ""
+
+for n_procs in procnt:
+    template_parameter["n_procs"] = n_procs
+    for collective in collectives:
+        template_parameter["job_name"] = collective
+        write_batch(batch_files_dir+collective+".sh",
+                    slurm_template.format(**template_parameter))
+        result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"],
+                                capture_output=True, text=True)
+        log += f"#{collective} {n_procs}" + "\n"
+        log += "\tSTDOUT:" + result.stdout + "\n"
+        log += "\tSTDERR:" + result.stderr + "\n"
+print(log)
diff --git a/launch_bench_singlenode.py b/launch_bench_singlenode.py
new file mode 100755
index 0000000..78a2c1e
--- /dev/null
+++ b/launch_bench_singlenode.py
@@ -0,0 +1,79 @@
+import os
+import subprocess
+from datetime import datetime
+
+
+def load_template(template_path: str):
+    output_template = ""
+    with open(template_path, "r") as handle:
+        output_template = handle.read()
+    return output_template
+
+
+def write_batch(batch_fpath: str, batch_content: str):
+    with open(batch_fpath, "w") as handle:
+        _ = handle.write(batch_content)
+
+
+collectives = ["Reduce",
+               "Reduce_scatter",
+               "Allreduce",
+               "Allgather",
+               "Allgatherv",
+               "Scatter",
+               "Scatterv",
+               "Gather",
+               "Gatherv",
+               "Alltoall",
+               "Bcast",
+               # "Barrier"
+               ]
+
+procnt = [
+    18,
+    36,
+    54,
+    72
+]
+mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin/IMB-MPI1"
+slurm_template = load_template("templates/bench.template")
+
+template_parameter = {"time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
+                      "job_name": "",
+                      "output_dir": os.getcwd()+"/output/",
+                      "err_dir": os.getcwd()+"/error/",
+                      "data_dir": os.getcwd()+"/data/",
+                      "n_procs": 18,
+                      "off_mem_flag": "",
+                      "bin": mpi1_bin
+                      }
+
+output_dir = os.getcwd()+"/output/"
+err_dir = os.getcwd()+"/error/"
+batch_files_dir = os.getcwd()+"/batchs/"
+data_dir = os.getcwd()+"/data/"
+
+if os.path.isdir(output_dir) == False:
+    os.mkdir(output_dir)
+if os.path.isdir(err_dir) == False:
+    os.mkdir(err_dir)
+if os.path.isdir(data_dir) == False:
+    os.mkdir(data_dir)
+if os.path.isdir(batch_files_dir) == False:
+    os.mkdir(batch_files_dir)
+
+log = ""
+
+for n_procs in procnt:
+    template_parameter["n_procs"] = n_procs
+    for collective in collectives:
+        template_parameter["job_name"] = collective
+        write_batch(batch_files_dir+collective+".sh",
+                    slurm_template.format(**template_parameter))
+        result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"],
+                                capture_output=True, text=True)
+        log += f"#{collective} {n_procs}" + "\n"
+        log += "\tSTDOUT:" + result.stdout + "\n"
+        log += "\tSTDERR:" + result.stderr + "\n"
+print(log)
+_ = subprocess.run(["./clean.sh"])
diff --git a/postprocess_data.py b/postprocess_data.py
new file mode 100755
index 0000000..ec6bf58
--- /dev/null
+++ b/postprocess_data.py
@@ -0,0 +1,72 @@
+import pandas as pd
+import os
+
+data_markers = {
+    "block_separator": "#----------------------------------------------------------------",
+    "benchmark_type": "# Benchmarking",
+    "processes_num": "# #processes = ",
+    "min_bytelen": "# Minimum message length in bytes",
+    "max_bytelen": "# Maximum message length in bytes",
+    "mpi_datatype": "# MPI_Datatype :",
+    "mpi_red_datatype": "# MPI_Datatype for reductions :",
+    "mpi_red_op": "# MPI_Op",
+    "end_of_table": "# All processes entering MPI_Finalize",
+}
+
+column_names = [
+    "benchmark_type",
+    "proc_num",
+    "msg_size_bytes",
+    "repetitions",
+    "t_min_usec",
+    "t_max_usec",
+    "t_avg_usec",
+    "mpi_datatype",
+    "mpi_red_datatype",
+    "mpi_red_op",
+]
+
+data = list()
+for file in os.listdir("data/"):
+    with open("data/"+file, 'r') as f:
+        lines = f.readlines()
+    past_preheader = False
+    in_header = False
+    in_body = False
+    btype = None
+    proc_num = None
+    mpi_datatype = None
+    mpi_red_datatype = None
+    mpi_red_op = None
+
+    for line in lines:
+        if data_markers["block_separator"] in line:
+            if in_header and not past_preheader:
+                past_preheader = True
+            elif in_header and past_preheader:
+                in_body = True
+            in_header = not in_header
+            continue
+        if not in_header and not in_body and past_preheader:
+            if data_markers["mpi_datatype"] in line:
+                mpi_datatype = line.split()[-1]
+            elif data_markers["mpi_red_datatype"] in line:
+                mpi_red_datatype = line.split()[-1]
+            elif data_markers["mpi_red_op"] in line:
+                mpi_red_op = line.split()[-1]
+
+        if past_preheader and in_header:
+            if data_markers["benchmark_type"] in line:
+                btype = line.split()[2]
+            if data_markers["processes_num"] in line:
+                proc_num = int(line.split()[3])
+        if in_body:
+            if "#" in line or "".join(line.split()) == "":
+                continue
+            if data_markers["end_of_table"] in line:
+                break
+            data.append([btype, proc_num]+[int(s) if s.isdigit()
+                                           else float(s) for s in line.split()] + [mpi_datatype, mpi_red_datatype, mpi_red_op])
+
+df = pd.DataFrame(data, columns=column_names)
+df.to_csv("data.csv", index=False)
diff --git a/templates/bench.template b/templates/bench.template
new file mode 100644
index 0000000..960f894
--- /dev/null
+++ b/templates/bench.template
@@ -0,0 +1,18 @@
+#!/bin/bash -l
+#SBATCH --job-name={job_name}_{n_procs}
+#SBATCH --output={output_dir}{job_name}_{n_procs}.out
+#SBATCH --error={err_dir}{job_name}_{n_procs}.err
+#SBATCH --nodes=1
+#SBATCH --time=00:10:00
+#SBATCH --export=NONE
+
+unset SLURM_EXPORT_ENV
+
+module load intel intelmpi likwid
+
+unset I_MPI_PMI_LIBRARY
+export LIKWID_SILENT=1
+echo CREATION_TIME {time_stamp}
+
+srun --cpu-freq=2000000-2000000:performance ./likwid-mpirun -np {n_procs} -mpi intelmpi -omp intel -nperdomain M:18 {bin} {job_name} -npmin {n_procs} {off_mem_flag} > {data_dir}/{job_name}_{n_procs}.dat
+