Improved metadata handling; extended the benchmark launch script and templates to multi-node benchmarks
This commit is contained in:
parent
ba8cb1ae01
commit
a25f8ffec6
127
launch_bench.py
Executable file
127
launch_bench.py
Executable file
@ -0,0 +1,127 @@
|
|||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
################ HELPER FUNCTIONS ################
|
||||||
|
|
||||||
|
|
||||||
|
def load_template(template_path: str) -> str:
    """Read a batch-script template file and return its text.

    Args:
        template_path: Path to the template file
            (e.g. "templates/singlenode.template").

    Returns:
        The full template text, with `{placeholder}` fields intact for
        later substitution via str.format().
    """
    # Return straight from the context manager: the file is closed on
    # exit either way, and the pre-initialised accumulator the original
    # used was redundant.
    with open(template_path, "r") as handle:
        return handle.read()
|
||||||
|
|
||||||
|
|
||||||
|
def write_batch(batch_fpath: str, batch_content: str) -> None:
    """Write a rendered batch script to disk, overwriting any existing file.

    Args:
        batch_fpath: Destination path of the batch file
            (e.g. "batchs/Reduce.sh").
        batch_content: Fully rendered script text to write.
    """
    with open(batch_fpath, "w") as handle:
        # write() returns the character count; it is intentionally ignored
        # (the original bound it to `_`, which is unnecessary).
        handle.write(batch_content)
|
||||||
|
|
||||||
|
|
||||||
|
################### SETUP DIRS ###################
# Working directories, all rooted at the current working directory:
#   output/ - Slurm stdout files,   error/ - Slurm stderr files,
#   batchs/ - generated sbatch scripts,   data/ - benchmark result tables.
output_dir = os.getcwd()+"/output/"
err_dir = os.getcwd()+"/error/"
batch_files_dir = os.getcwd()+"/batchs/"
data_dir = os.getcwd()+"/data/"

# makedirs(exist_ok=True) replaces the original isdir()-then-mkdir()
# pattern (`if os.path.isdir(d) == False: os.mkdir(d)`): it is race-free
# and avoids the un-idiomatic `== False` comparison.
for _dir in (output_dir, err_dir, data_dir, batch_files_dir):
    os.makedirs(_dir, exist_ok=True)
|
||||||
|
|
||||||
|
################ GLOBAL DEFAULTS #################
# Absolute path of the Intel MPI Benchmarks MPI1 binary run by every job.
mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin/IMB-MPI1"

# Baseline template parameters. The launch sections below take a copy of
# this dict and overwrite the fields they sweep over.
_cwd = os.getcwd()
default_parameter = dict(
    time_stamp=datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
    job_name="",
    output_dir=_cwd + "/output/",
    err_dir=_cwd + "/error/",
    data_dir=_cwd + "/data/",
    n_procs=18,
    off_cache_flag="",
    bin=mpi1_bin,
    n_nodes=1,
)
|
||||||
|
|
||||||
|
# MPI collective benchmarks to submit, in IMB-MPI1 benchmark-name form.
collectives = (
    "Reduce Reduce_scatter Allreduce Allgather Allgatherv "
    "Scatter Scatterv Gather Gatherv Alltoall Bcast"
).split()

# Accumulates sbatch stdout/stderr per submission for the final summary print.
log = ""
|
||||||
|
|
||||||
|
############### SINGLE-NODE LAUNCH ###############
# Process counts to sweep on one node.
procnt = [18, 36, 54, 72]

# IMB -off_cache settings: full cache bypass (-1), 50 MB, and no flag.
off_cache_flags = ["-off_cache -1", "-off_cache 50", ""]

single_node_parameter = dict(default_parameter)
single_node_template = load_template("templates/singlenode.template")

# Sweep cache flag x process count x collective: render a batch script
# for each combination, submit it with sbatch, and record the result.
for flag in off_cache_flags:
    single_node_parameter["off_cache_flag"] = flag
    for n_procs in procnt:
        single_node_parameter["n_procs"] = n_procs
        for collective in collectives:
            single_node_parameter["job_name"] = collective
            batch_path = batch_files_dir + collective + ".sh"
            write_batch(batch_path,
                        single_node_template.format(**single_node_parameter))
            result = subprocess.run(["sbatch", batch_path],
                                    capture_output=True, text=True)
            log += f"#{collective} {n_procs}" + "\n"
            log += "\tSTDOUT:" + result.stdout + "\n"
            log += "\tSTDERR:" + result.stderr + "\n"
|
||||||
|
############## MULTIPLE-NODE LAUNCH ##############
# Same IMB -off_cache sweep as the single-node section.
off_cache_flags = ["-off_cache -1", "-off_cache 50", ""]

# Node counts to sweep; every node contributes proc_per_node MPI ranks.
ndcnt = [2, 3, 4]
proc_per_node = 72

multiple_node_parameter = dict(default_parameter)
multiple_node_template = load_template("templates/multinode.template")

# Sweep cache flag x node count x collective: render a batch script for
# each combination, submit it with sbatch, and record the result.
for flag in off_cache_flags:
    multiple_node_parameter["off_cache_flag"] = flag
    for n_nodes in ndcnt:
        n_procs = n_nodes * proc_per_node
        multiple_node_parameter["n_procs"] = int(n_procs)
        multiple_node_parameter["n_nodes"] = n_nodes
        for collective in collectives:
            multiple_node_parameter["job_name"] = collective
            batch_path = batch_files_dir + collective + ".sh"
            write_batch(batch_path,
                        multiple_node_template.format(**multiple_node_parameter))
            result = subprocess.run(["sbatch", batch_path],
                                    capture_output=True, text=True)
            log += f"#{collective} {n_procs}" + "\n"
            log += "\tSTDOUT:" + result.stdout + "\n"
            log += "\tSTDERR:" + result.stderr + "\n"

print(log)
|
@ -1,79 +0,0 @@
|
|||||||
import os
|
|
||||||
import subprocess
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
def load_template(template_path: str):
|
|
||||||
output_template = ""
|
|
||||||
with open(template_path, "r") as handle:
|
|
||||||
output_template = handle.read()
|
|
||||||
return output_template
|
|
||||||
|
|
||||||
|
|
||||||
def write_batch(batch_fpath: str, batch_content: str):
|
|
||||||
with open(batch_fpath, "w") as handle:
|
|
||||||
_ = handle.write(batch_content)
|
|
||||||
|
|
||||||
|
|
||||||
collectives = ["Reduce",
|
|
||||||
# "Reduce_scatter",
|
|
||||||
# "Reduce_scatter_block",
|
|
||||||
# "Allreduce",
|
|
||||||
# "Allgather",
|
|
||||||
# "Allgatherv",
|
|
||||||
# "Scatter",
|
|
||||||
# "Scatterv",
|
|
||||||
# "Gather",
|
|
||||||
# "Gatherv",
|
|
||||||
# "Alltoall",
|
|
||||||
# "Bcast",
|
|
||||||
# "Barrier"
|
|
||||||
]
|
|
||||||
|
|
||||||
procnt = [
|
|
||||||
18,
|
|
||||||
# 36,
|
|
||||||
# 54,
|
|
||||||
# 72
|
|
||||||
]
|
|
||||||
mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin"
|
|
||||||
slurm_template = load_template("templates/bench.template")
|
|
||||||
|
|
||||||
template_parameter = {"time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
|
|
||||||
"job_name": "",
|
|
||||||
"output_dir": os.getcwd()+"/output/",
|
|
||||||
"err_dir": os.getcwd()+"/error/",
|
|
||||||
"data_dir": os.getcwd()+"/data/",
|
|
||||||
"n_procs": 18,
|
|
||||||
"off_mem_flag": "",
|
|
||||||
"bin": mpi1_bin
|
|
||||||
}
|
|
||||||
|
|
||||||
output_dir = os.getcwd()+"/output/"
|
|
||||||
err_dir = os.getcwd()+"/error/"
|
|
||||||
batch_files_dir = os.getcwd()+"/batchs/"
|
|
||||||
data_dir = os.getcwd()+"/data/"
|
|
||||||
|
|
||||||
if os.path.isdir(output_dir) == False:
|
|
||||||
os.mkdir(output_dir)
|
|
||||||
if os.path.isdir(err_dir) == False:
|
|
||||||
os.mkdir(err_dir)
|
|
||||||
if os.path.isdir(data_dir) == False:
|
|
||||||
os.mkdir(data_dir)
|
|
||||||
if os.path.isdir(batch_files_dir) == False:
|
|
||||||
os.mkdir(batch_files_dir)
|
|
||||||
|
|
||||||
log = ""
|
|
||||||
|
|
||||||
for n_procs in procnt:
|
|
||||||
template_parameter["n_procs"] = n_procs
|
|
||||||
for collective in collectives:
|
|
||||||
template_parameter["job_name"] = collective
|
|
||||||
write_batch(batch_files_dir+collective+".sh",
|
|
||||||
slurm_template.format(**template_parameter))
|
|
||||||
result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"],
|
|
||||||
capture_output=True, text=True)
|
|
||||||
log += f"#{collective} {n_procs}" + "\n"
|
|
||||||
log += "\tSTDOUT:" + result.stdout + "\n"
|
|
||||||
log += "\tSTDERR:" + result.stderr + "\n"
|
|
||||||
print(log)
|
|
@ -1,83 +0,0 @@
|
|||||||
import os
|
|
||||||
import subprocess
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
def load_template(template_path: str):
|
|
||||||
output_template = ""
|
|
||||||
with open(template_path, "r") as handle:
|
|
||||||
output_template = handle.read()
|
|
||||||
return output_template
|
|
||||||
|
|
||||||
|
|
||||||
def write_batch(batch_fpath: str, batch_content: str):
|
|
||||||
with open(batch_fpath, "w") as handle:
|
|
||||||
_ = handle.write(batch_content)
|
|
||||||
|
|
||||||
|
|
||||||
collectives = [
|
|
||||||
"Reduce",
|
|
||||||
"Reduce_scatter",
|
|
||||||
"Allreduce",
|
|
||||||
"Allgather",
|
|
||||||
"Allgatherv",
|
|
||||||
"Scatter",
|
|
||||||
"Scatterv",
|
|
||||||
"Gather",
|
|
||||||
"Gatherv",
|
|
||||||
"Alltoall",
|
|
||||||
"Bcast",
|
|
||||||
# "Barrier"
|
|
||||||
]
|
|
||||||
|
|
||||||
procnt = [
|
|
||||||
18,
|
|
||||||
36,
|
|
||||||
54,
|
|
||||||
72
|
|
||||||
]
|
|
||||||
|
|
||||||
mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin/IMB-MPI1"
|
|
||||||
slurm_template = load_template("templates/bench.template")
|
|
||||||
|
|
||||||
template_parameter = {
|
|
||||||
"time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
|
|
||||||
"job_name": "",
|
|
||||||
"output_dir": os.getcwd()+"/output/",
|
|
||||||
"err_dir": os.getcwd()+"/error/",
|
|
||||||
"data_dir": os.getcwd()+"/data/",
|
|
||||||
"n_procs": 18,
|
|
||||||
"off_mem_flag": "-off_cache 50",
|
|
||||||
"bin": mpi1_bin,
|
|
||||||
"n_nodes": 1
|
|
||||||
}
|
|
||||||
|
|
||||||
output_dir = os.getcwd()+"/output/"
|
|
||||||
err_dir = os.getcwd()+"/error/"
|
|
||||||
batch_files_dir = os.getcwd()+"/batchs/"
|
|
||||||
data_dir = os.getcwd()+"/data/"
|
|
||||||
|
|
||||||
if os.path.isdir(output_dir) == False:
|
|
||||||
os.mkdir(output_dir)
|
|
||||||
if os.path.isdir(err_dir) == False:
|
|
||||||
os.mkdir(err_dir)
|
|
||||||
if os.path.isdir(data_dir) == False:
|
|
||||||
os.mkdir(data_dir)
|
|
||||||
if os.path.isdir(batch_files_dir) == False:
|
|
||||||
os.mkdir(batch_files_dir)
|
|
||||||
|
|
||||||
log = ""
|
|
||||||
|
|
||||||
for n_procs in procnt:
|
|
||||||
template_parameter["n_procs"] = n_procs
|
|
||||||
for collective in collectives:
|
|
||||||
template_parameter["job_name"] = collective
|
|
||||||
write_batch(batch_files_dir+collective+".sh",
|
|
||||||
slurm_template.format(**template_parameter))
|
|
||||||
result = subprocess.run(["sbatch", batch_files_dir+collective+".sh"],
|
|
||||||
capture_output=True, text=True)
|
|
||||||
log += f"#{collective} {n_procs}" + "\n"
|
|
||||||
log += "\tSTDOUT:" + result.stdout + "\n"
|
|
||||||
log += "\tSTDERR:" + result.stderr + "\n"
|
|
||||||
print(log)
|
|
||||||
# _ = subprocess.run(["./clean.sh"])
|
|
@ -14,7 +14,7 @@ data_markers = {
|
|||||||
"end_of_table": "# All processes entering MPI_Finalize",
|
"end_of_table": "# All processes entering MPI_Finalize",
|
||||||
"creation_time": "# CREATION_TIME :",
|
"creation_time": "# CREATION_TIME :",
|
||||||
"n_nodes": "# N_NODES :",
|
"n_nodes": "# N_NODES :",
|
||||||
"off_mem_flag": "# OFF_MEM_FLAG :"
|
"off_cache_flag": "# OFF_CACHE_FLAG :"
|
||||||
}
|
}
|
||||||
|
|
||||||
column_names = [
|
column_names = [
|
||||||
@ -30,7 +30,7 @@ column_names = [
|
|||||||
"mpi_red_op",
|
"mpi_red_op",
|
||||||
"creation_time",
|
"creation_time",
|
||||||
"n_nodes",
|
"n_nodes",
|
||||||
"off_mem_flag",
|
"off_cache_flag",
|
||||||
]
|
]
|
||||||
|
|
||||||
data = list()
|
data = list()
|
||||||
@ -50,7 +50,7 @@ for file in os.listdir("data/"):
|
|||||||
mpi_red_op = "NA"
|
mpi_red_op = "NA"
|
||||||
creation_time = "NA"
|
creation_time = "NA"
|
||||||
n_nodes = "NA"
|
n_nodes = "NA"
|
||||||
off_mem_flag = "NA"
|
off_cache_flag = "NA"
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if data_markers["block_separator"] in line:
|
if data_markers["block_separator"] in line:
|
||||||
@ -73,10 +73,10 @@ for file in os.listdir("data/"):
|
|||||||
n_nodes = line.split()[-1]
|
n_nodes = line.split()[-1]
|
||||||
if data_markers["creation_time"] in line:
|
if data_markers["creation_time"] in line:
|
||||||
creation_time = line.split()[-1]
|
creation_time = line.split()[-1]
|
||||||
if data_markers["off_mem_flag"] in line:
|
if data_markers["off_cache_flag"] in line:
|
||||||
off_mem_flag = line.split(":")[-1].strip()
|
off_cache_flag = line.split(":")[-1].strip()
|
||||||
if off_mem_flag == "": off_mem_flag = "NA"
|
if off_cache_flag == "": off_cache_flag = "NA"
|
||||||
else: off_mem_flag = off_mem_flag.replace("-off_cache","")
|
else: off_cache_flag = off_cache_flag.replace("-off_cache","")
|
||||||
|
|
||||||
if past_preheader and in_header:
|
if past_preheader and in_header:
|
||||||
if data_markers["benchmark_type"] in line:
|
if data_markers["benchmark_type"] in line:
|
||||||
@ -96,7 +96,7 @@ for file in os.listdir("data/"):
|
|||||||
mpi_red_op,
|
mpi_red_op,
|
||||||
creation_time,
|
creation_time,
|
||||||
n_nodes,
|
n_nodes,
|
||||||
off_mem_flag,
|
off_cache_flag,
|
||||||
])
|
])
|
||||||
|
|
||||||
df = pd.DataFrame(data, columns=column_names)
|
df = pd.DataFrame(data, columns=column_names)
|
||||||
|
22
templates/multinode.template
Normal file
22
templates/multinode.template
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
#!/bin/bash -l
|
||||||
|
#SBATCH --job-name={job_name}_{n_procs}
|
||||||
|
#SBATCH --output={output_dir}{job_name}_{n_procs}.out
|
||||||
|
#SBATCH --error={err_dir}{job_name}_{n_procs}.err
|
||||||
|
#SBATCH --nodes={n_nodes}
|
||||||
|
#SBATCH --time=00:10:00
|
||||||
|
#SBATCH --export=NONE
|
||||||
|
|
||||||
|
unset SLURM_EXPORT_ENV
|
||||||
|
|
||||||
|
module load intel intelmpi
|
||||||
|
|
||||||
|
|
||||||
|
OUTPUT_FILENAME="{data_dir}/{job_name}_$SLURM_JOB_ID.dat"
|
||||||
|
|
||||||
|
echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME
|
||||||
|
echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME
|
||||||
|
echo "# OFF_CACHE_FLAG : {off_cache_flag}">> $OUTPUT_FILENAME
|
||||||
|
|
||||||
|
srun --cpu-freq=2000000-2000000:performance -N {n_nodes} -n{n_procs} {bin} {job_name} -npmin {n_procs} {off_cache_flag} >> $OUTPUT_FILENAME
|
||||||
|
|
||||||
|
|
@ -17,8 +17,8 @@ OUTPUT_FILENAME="{data_dir}/{job_name}_$SLURM_JOB_ID.dat"
|
|||||||
|
|
||||||
echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME
|
echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME
|
||||||
echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME
|
echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME
|
||||||
echo "# OFF_MEM_FLAG : {off_mem_flag}">> $OUTPUT_FILENAME
|
echo "# OFF_CACHE_FLAG : {off_cache_flag}">> $OUTPUT_FILENAME
|
||||||
|
|
||||||
srun --cpu-freq=2000000-2000000:performance ./likwid-mpirun -np {n_procs} -mpi intelmpi -omp intel -nperdomain M:18 {bin} {job_name} -npmin {n_procs} {off_mem_flag} >> $OUTPUT_FILENAME
|
srun --cpu-freq=2000000-2000000:performance ./likwid-mpirun -np {n_procs} -mpi intelmpi -omp intel -nperdomain M:18 {bin} {job_name} -npmin {n_procs} {off_cache_flag} >> $OUTPUT_FILENAME
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user