"""Generate and submit Slurm batch jobs for Intel MPI Benchmarks (IMB-MPI1).

Sweeps a list of MPI collectives over process counts, node counts, and
``-off_cache`` flag variants; renders a batch script from a template for each
combination, submits it with ``sbatch``, and prints a log of every
submission's stdout/stderr at the end.
"""

import os
import subprocess
from datetime import datetime


################ HELPER FUNCTIONS ################


def load_template(template_path: str) -> str:
    """Return the full text of the template file at *template_path*."""
    with open(template_path, "r") as handle:
        return handle.read()


def write_batch(batch_fpath: str, batch_content: str) -> None:
    """Write *batch_content* to *batch_fpath*, overwriting any existing file."""
    with open(batch_fpath, "w") as handle:
        _ = handle.write(batch_content)


################### SETUP DIRS ###################

output_dir = os.getcwd() + "/output/"
err_dir = os.getcwd() + "/error/"
# NOTE(review): "batchs" spelling kept as-is — existing templates/jobs may
# depend on this exact directory name.
batch_files_dir = os.getcwd() + "/batchs/"
data_dir = os.getcwd() + "/data/"

# exist_ok=True replaces the original racy "isdir then mkdir" pattern with a
# single atomic-enough call; behavior is otherwise identical.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(err_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)
os.makedirs(batch_files_dir, exist_ok=True)

################ GLOBAL DEFAULTS #################

mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin/IMB-MPI1"

# Base set of template placeholders; each sweep below mutates its own copy.
default_parameter = {
    "time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
    "job_name": "",
    "output_dir": output_dir,
    "err_dir": err_dir,
    "data_dir": data_dir,
    "n_procs": 18,
    "off_cache_flag": "",
    "bin": mpi1_bin,
    "n_nodes": 1,
}

# IMB-MPI1 collective benchmarks to launch, one Slurm job per collective.
collectives = [
    "Reduce",
    "Reduce_scatter",
    "Allreduce",
    "Allgather",
    "Allgatherv",
    "Scatter",
    "Scatterv",
    "Gather",
    "Gatherv",
    "Alltoall",
    "Bcast",
]

log = ""

############### SINGLE-NODE LAUNCH ###############

procnt = [18, 36, 54, 72]
off_cache_flags = ["-off_cache -1", "-off_cache 50", ""]

single_node_parameter = dict(default_parameter)
single_node_template = load_template("templates/singlenode.template")

for flag in off_cache_flags:
    single_node_parameter["off_cache_flag"] = flag
    for n_procs in procnt:
        single_node_parameter["n_procs"] = n_procs
        for collective in collectives:
            single_node_parameter["job_name"] = collective
            # NOTE(review): the file name carries only the collective, so each
            # flag/proc-count combination overwrites the previous script.
            # sbatch reads the file at submission time, so submissions are
            # unaffected, but the on-disk script only reflects the last combo.
            batch_path = batch_files_dir + collective + ".sh"
            write_batch(batch_path, single_node_template.format(**single_node_parameter))
            result = subprocess.run(
                ["sbatch", batch_path], capture_output=True, text=True
            )
            log += f"#{collective} {n_procs}" + "\n"
            log += "\tSTDOUT:" + result.stdout + "\n"
            log += "\tSTDERR:" + result.stderr + "\n"

############## MULTIPLE-NODE LAUNCH ##############

off_cache_flags = ["-off_cache -1", "-off_cache 50", ""]
ndcnt = [2, 3, 4]
proc_per_node = 72

multiple_node_parameter = dict(default_parameter)
multiple_node_template = load_template("templates/multinode.template")

for flag in off_cache_flags:
    multiple_node_parameter["off_cache_flag"] = flag
    for n_nodes in ndcnt:
        # Total ranks scale with node count; already an int, so the original
        # redundant int() conversion is dropped.
        n_procs = n_nodes * proc_per_node
        multiple_node_parameter["n_procs"] = n_procs
        multiple_node_parameter["n_nodes"] = n_nodes
        for collective in collectives:
            multiple_node_parameter["job_name"] = collective
            batch_path = batch_files_dir + collective + ".sh"
            write_batch(batch_path, multiple_node_template.format(**multiple_node_parameter))
            result = subprocess.run(
                ["sbatch", batch_path], capture_output=True, text=True
            )
            log += f"#{collective} {n_procs}" + "\n"
            log += "\tSTDOUT:" + result.stdout + "\n"
            log += "\tSTDERR:" + result.stderr + "\n"

print(log)