diff --git a/launch_bench.py b/launch_bench.py new file mode 100755 index 0000000..fc55c06 --- /dev/null +++ b/launch_bench.py @@ -0,0 +1,127 @@ +import os +import subprocess +from datetime import datetime + +################ HELPER FUNCTIONS ################ + + +def load_template(template_path: str): + output_template = "" + with open(template_path, "r") as handle: + output_template = handle.read() + return output_template + + +def write_batch(batch_fpath: str, batch_content: str): + with open(batch_fpath, "w") as handle: + _ = handle.write(batch_content) + + +################### SETUP DIRS ################### +output_dir = os.getcwd()+"/output/" +err_dir = os.getcwd()+"/error/" +batch_files_dir = os.getcwd()+"/batchs/" +data_dir = os.getcwd()+"/data/" + +if os.path.isdir(output_dir) == False: + os.mkdir(output_dir) +if os.path.isdir(err_dir) == False: + os.mkdir(err_dir) +if os.path.isdir(data_dir) == False: + os.mkdir(data_dir) +if os.path.isdir(batch_files_dir) == False: + os.mkdir(batch_files_dir) + +################ GLOBAL DEFAULTS ################# +mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin/IMB-MPI1" +default_parameter = { + "time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"), + "job_name": "", + "output_dir": os.getcwd()+"/output/", + "err_dir": os.getcwd()+"/error/", + "data_dir": os.getcwd()+"/data/", + "n_procs": 18, + "off_cache_flag": "", + "bin": mpi1_bin, + "n_nodes": 1 +} + +collectives = [ + "Reduce", + "Reduce_scatter", + "Allreduce", + "Allgather", + "Allgatherv", + "Scatter", + "Scatterv", + "Gather", + "Gatherv", + "Alltoall", + "Bcast", +] + +log = "" + +############### SINGLE-NODE LAUNCH ############### +procnt = [ + 18, + 36, + 54, + 72 +] + +off_cache_flags = [ + "-off_cache -1", + "-off_cache 50", + "" +] + +single_node_parameter = dict(default_parameter) +single_node_template = load_template("templates/singlenode.template") + +for flag in off_cache_flags: + single_node_parameter["off_cache_flag"] = flag + for n_procs in procnt: + single_node_parameter["n_procs"] = n_procs + for collective in collectives: + single_node_parameter["job_name"] = collective + write_batch(batch_files_dir+collective+".sh", + single_node_template.format(**single_node_parameter)) + result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"], + capture_output=True, text=True) + log += f"#{collective} {n_procs}" + "\n" + log += "\tSTDOUT:" + result.stdout + "\n" + log += "\tSTDERR:" + result.stderr + "\n" +############## MULTIPLE-NODE LAUNCH ############## +off_cache_flags = [ + "-off_cache -1", + "-off_cache 50", + "" +] + +ndcnt = [ + 2, + 3, + 4 +] + +proc_per_node = 72 +multiple_node_parameter = dict(default_parameter) +multiple_node_template = load_template("templates/multinode.template") + +for flag in off_cache_flags: + multiple_node_parameter["off_cache_flag"] = flag + for n_nodes in ndcnt: + n_procs = n_nodes*proc_per_node + multiple_node_parameter["n_procs"] = int(n_procs) + multiple_node_parameter["n_nodes"] = n_nodes + for collective in collectives: + multiple_node_parameter["job_name"] = collective + write_batch(batch_files_dir+collective+".sh", + multiple_node_template.format(**multiple_node_parameter)) + result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"], + capture_output=True, text=True) + log += f"#{collective} {n_procs}" + "\n" + log += "\tSTDOUT:" + result.stdout + "\n" + log += "\tSTDERR:" + result.stderr + "\n" +print(log) diff --git a/launch_bench_multinode.py b/launch_bench_multinode.py deleted file mode 100755 index d9c9439..0000000 --- a/launch_bench_multinode.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -import subprocess -from datetime import datetime - - -def load_template(template_path: str): - output_template = "" - with open(template_path, "r") as handle: - output_template = handle.read() - return output_template - - -def write_batch(batch_fpath: str, batch_content: str): - with open(batch_fpath, "w") as handle: - _ = handle.write(batch_content) - - -collectives = ["Reduce", - # "Reduce_scatter", - # "Reduce_scatter_block", - # "Allreduce", - # "Allgather", - # "Allgatherv", - # "Scatter", - # "Scatterv", - # "Gather", - # "Gatherv", - # "Alltoall", - # "Bcast", - # "Barrier" - ] - -procnt = [ - 18, - # 36, - # 54, - # 72 -] -mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin" -slurm_template = load_template("templates/bench.template") - -template_parameter = {"time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"), - "job_name": "", - "output_dir": os.getcwd()+"/output/", - "err_dir": os.getcwd()+"/error/", - "data_dir": os.getcwd()+"/data/", - "n_procs": 18, - "off_mem_flag": "", - "bin": mpi1_bin - } - -output_dir = os.getcwd()+"/output/" -err_dir = os.getcwd()+"/error/" -batch_files_dir = os.getcwd()+"/batchs/" -data_dir = os.getcwd()+"/data/" - -if os.path.isdir(output_dir) == False: - os.mkdir(output_dir) -if os.path.isdir(err_dir) == False: - os.mkdir(err_dir) -if os.path.isdir(data_dir) == False: - os.mkdir(data_dir) -if os.path.isdir(batch_files_dir) == False: - os.mkdir(batch_files_dir) - -log = "" - -for n_procs in procnt: - template_parameter["n_procs"] = n_procs - for collective in collectives: - template_parameter["job_name"] = collective - write_batch(batch_files_dir+collective+".sh", - slurm_template.format(**template_parameter)) - result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"], - capture_output=True, text=True) - log += f"#{collective} {n_procs}" + "\n" - log += "\tSTDOUT:" + result.stdout + "\n" - log += "\tSTDERR:" + result.stderr + "\n" -print(log) diff --git a/launch_bench_singlenode.py b/launch_bench_singlenode.py deleted file mode 100755 index fb662ea..0000000 --- a/launch_bench_singlenode.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import subprocess -from datetime import datetime - - -def load_template(template_path: str): - output_template = "" - with open(template_path, "r") as handle: - output_template = handle.read() - return output_template - - -def write_batch(batch_fpath: str, batch_content: str): - with open(batch_fpath, "w") as handle: - _ = handle.write(batch_content) - - -collectives = [ - "Reduce", - "Reduce_scatter", - "Allreduce", - "Allgather", - "Allgatherv", - "Scatter", - "Scatterv", - "Gather", - "Gatherv", - "Alltoall", - "Bcast", - # "Barrier" -] - -procnt = [ - 18, - 36, - 54, - 72 -] - -mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin/IMB-MPI1" -slurm_template = load_template("templates/bench.template") - -template_parameter = { - "time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"), - "job_name": "", - "output_dir": os.getcwd()+"/output/", - "err_dir": os.getcwd()+"/error/", - "data_dir": os.getcwd()+"/data/", - "n_procs": 18, - "off_mem_flag": "-off_cache 50", - "bin": mpi1_bin, - "n_nodes": 1 -} - -output_dir = os.getcwd()+"/output/" -err_dir = os.getcwd()+"/error/" -batch_files_dir = os.getcwd()+"/batchs/" -data_dir = os.getcwd()+"/data/" - -if os.path.isdir(output_dir) == False: - os.mkdir(output_dir) -if os.path.isdir(err_dir) == False: - os.mkdir(err_dir) -if os.path.isdir(data_dir) == False: - os.mkdir(data_dir) -if os.path.isdir(batch_files_dir) == False: - os.mkdir(batch_files_dir) - -log = "" - -for n_procs in procnt: - template_parameter["n_procs"] = n_procs - for collective in collectives: - template_parameter["job_name"] = collective - write_batch(batch_files_dir+collective+".sh", - slurm_template.format(**template_parameter)) - result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"], - capture_output=True, text=True) - log += f"#{collective} {n_procs}" + "\n" - log += "\tSTDOUT:" + result.stdout + "\n" - log += "\tSTDERR:" + result.stderr + "\n" -print(log) -# _ = subprocess.run(["./clean.sh"]) diff --git a/postprocess_data.py b/postprocess_data.py index df7a88b..ab13abf 100755 --- a/postprocess_data.py +++ b/postprocess_data.py @@ -14,7 +14,7 @@ data_markers = { "end_of_table": "# All processes entering MPI_Finalize", "creation_time": "# CREATION_TIME :", "n_nodes": "# N_NODES :", - "off_mem_flag": "# OFF_MEM_FLAG :" + "off_cache_flag": "# OFF_CACHE_FLAG :" } column_names = [ @@ -30,7 +30,7 @@ column_names = [ "mpi_red_op", "creation_time", "n_nodes", - "off_mem_flag", + "off_cache_flag", ] data = list() @@ -50,7 +50,7 @@ for file in os.listdir("data/"): mpi_red_op = "NA" creation_time = "NA" n_nodes = "NA" - off_mem_flag = "NA" + off_cache_flag = "NA" for line in lines: if data_markers["block_separator"] in line: @@ -73,10 +73,10 @@ for file in os.listdir("data/"): n_nodes = line.split()[-1] if data_markers["creation_time"] in line: creation_time = line.split()[-1] - if data_markers["off_mem_flag"] in line: - off_mem_flag = line.split(":")[-1].strip() - if off_mem_flag == "": off_mem_flag = "NA" - else: off_mem_flag = off_mem_flag.replace("-off_cache","") + if data_markers["off_cache_flag"] in line: + off_cache_flag = line.split(":")[-1].strip() + if off_cache_flag == "": off_cache_flag = "NA" + else: off_cache_flag = off_cache_flag.replace("-off_cache","") if past_preheader and in_header: if data_markers["benchmark_type"] in line: @@ -96,7 +96,7 @@ for file in os.listdir("data/"): mpi_red_op, creation_time, n_nodes, - off_mem_flag, + off_cache_flag, ]) df = pd.DataFrame(data, columns=column_names) diff --git a/templates/multinode.template b/templates/multinode.template new file mode 100644 index 0000000..1f3d6a9 --- /dev/null +++ b/templates/multinode.template @@ -0,0 +1,22 @@ +#!/bin/bash -l +#SBATCH --job-name={job_name}_{n_procs} +#SBATCH --output={output_dir}{job_name}_{n_procs}.out +#SBATCH --error={err_dir}{job_name}_{n_procs}.err +#SBATCH --nodes={n_nodes} +#SBATCH --time=00:10:00 +#SBATCH --export=NONE + +unset SLURM_EXPORT_ENV + +module load intel intelmpi + + +OUTPUT_FILENAME="{data_dir}/{job_name}_$SLURM_JOB_ID.dat" + +echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME +echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME +echo "# OFF_CACHE_FLAG : {off_cache_flag}">> $OUTPUT_FILENAME + +srun --cpu-freq=2000000-2000000:performance -N {n_nodes} -n{n_procs} {bin} {job_name} -npmin {n_procs} {off_cache_flag} >> $OUTPUT_FILENAME + + diff --git a/templates/bench.template b/templates/singlenode.template similarity index 83% rename from templates/bench.template rename to templates/singlenode.template index 333f51c..2cabd08 100644 --- a/templates/bench.template +++ b/templates/singlenode.template @@ -17,8 +17,8 @@ OUTPUT_FILENAME="{data_dir}/{job_name}_$SLURM_JOB_ID.dat" echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME -echo "# OFF_MEM_FLAG : {off_mem_flag}">> $OUTPUT_FILENAME +echo "# OFF_CACHE_FLAG : {off_cache_flag}">> $OUTPUT_FILENAME -srun --cpu-freq=2000000-2000000:performance ./likwid-mpirun -np {n_procs} -mpi intelmpi -omp intel -nperdomain M:18 {bin} {job_name} -npmin {n_procs} {off_mem_flag} >> $OUTPUT_FILENAME +srun --cpu-freq=2000000-2000000:performance ./likwid-mpirun -np {n_procs} -mpi intelmpi -omp intel -nperdomain M:18 {bin} {job_name} -npmin {n_procs} {off_cache_flag} >> $OUTPUT_FILENAME