diff --git a/launch_alg_bench.py b/launch_alg_bench.py
new file mode 100755
index 0000000..4b10d29
--- /dev/null
+++ b/launch_alg_bench.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+import os
+import subprocess
+from datetime import datetime
+
+################ HELPER FUNCTIONS ################
+
+
+def load_template(template_path: str) -> str:
+    """Read a batch-script template file and return its contents."""
+    with open(template_path, "r") as handle:
+        return handle.read()
+
+
+def write_batch(batch_fpath: str, batch_content: str) -> None:
+    """Write a rendered batch script to disk."""
+    with open(batch_fpath, "w") as handle:
+        _ = handle.write(batch_content)
+
+
+def sanitize(name: str) -> str:
+    """Make a free-text name safe for use inside a file name."""
+    return name.strip().replace('`', '').replace(' ', '_').replace('/', '_')
+
+
+################### SETUP DIRS ###################
+output_dir = os.getcwd()+"/output/"
+err_dir = os.getcwd()+"/error/"
+batch_files_dir = os.getcwd()+"/batchs/"
+data_dir = os.getcwd()+"/data/"
+
+# makedirs(exist_ok=True) replaces the racy isdir-then-mkdir pattern.
+for directory in (output_dir, err_dir, data_dir, batch_files_dir):
+    os.makedirs(directory, exist_ok=True)
+
+################ GLOBAL DEFAULTS #################
+mpi1_bin = "/home/hpc/ihpc/ihpc136h/workspace/prototyping/bin/IMB-MPI1"
+default_parameter = {
+    "time_stamp": datetime.now().strftime("%y_%m_%d_%H-%M-%S"),
+    "job_name": "",
+    "output_dir": os.getcwd()+"/output/",
+    "err_dir": os.getcwd()+"/error/",
+    "data_dir": os.getcwd()+"/data/",
+    "n_procs": 18,
+    "off_cache_flag": "",
+    "bin": mpi1_bin,
+    "n_nodes": 1
+}
+
+# Collectives and their tunable algorithms.  List order matches Intel
+# MPI's 1-based I_MPI_ADJUST_* algorithm numbering (0 = library default).
+algs_dic = [
+    {'name': "Allgather",
+     'flag': "I_MPI_ADJUST_ALLGATHER",
+     'algs': [
+         "Recursive doubling ",
+         "Bruck`s ",
+         "Ring ",
+         "Topology aware Gatherv + Bcast ",
+         "Knomial ",
+     ]},
+    {'name': "Allreduce",
+     'flag': "I_MPI_ADJUST_ALLREDUCE",
+     'algs': [
+         "Recursive doubling ",
+         "Rabenseifner`s ",
+         "Reduce + Bcast ",
+         "Topology aware Reduce + Bcast ",
+         "Binomial gather + scatter ",
+         "Topology aware binominal gather + scatter ",
+         "Shumilin`s ring ",
+         "Ring ",
+         "Knomial ",
+         "Topology aware SHM-based flat ",
+         "Topology aware SHM-based Knomial ",
+         "Topology aware SHM-based Knary ",
+     ]},
+    {'name': "Alltoall",
+     'flag': "I_MPI_ADJUST_ALLTOALL",
+     'algs': [
+         "Bruck`s ",
+         "Isend/Irecv + waitall ",
+         "Pair wise exchange ",
+         "Plum`s ",
+     ]},
+    {'name': "Barrier",
+     'flag': "I_MPI_ADJUST_BARRIER",
+     'algs': [
+         "Dissemination ",
+         "Recursive doubling ",
+         "Topology aware dissemination ",
+         "Topology aware recursive doubling ",
+         "Binominal gather + scatter ",
+         "Topology aware binominal gather + scatter ",
+         "Topology aware SHM-based flat ",
+         "Topology aware SHM-based Knomial ",
+         "Topology aware SHM-based Knary ",
+     ]},
+    {'name': "Bcast",
+     'flag': "I_MPI_ADJUST_BCAST",
+     'algs': [
+         "Binomial ",
+         "Recursive doubling ",
+         "Ring ",
+         "Topology aware binomial ",
+         "Topology aware recursive doubling ",
+         "Topology aware ring ",
+         "Shumilin`s ",
+         "Knomial ",
+         "Topology aware SHM-based flat ",
+         "Topology aware SHM-based Knomial ",
+         "Topology aware SHM-based Knary ",
+         "NUMA aware SHM-based (SSE4.2) ",
+         "NUMA aware SHM-based (AVX2) ",
+         "NUMA aware SHM-based (AVX512) ",
+     ]},
+    {'name': "Gather",
+     'flag': "I_MPI_ADJUST_GATHER",
+     'algs': [
+         "Binomial ",
+         "Topology aware binomial ",
+         "Shumilin`s ",
+         "Binomial with segmentation ",
+     ]},
+    {'name': "Reduce_scatter",
+     'flag': "I_MPI_ADJUST_REDUCE_SCATTER",
+     'algs': [
+         "Recursive halving ",
+         "Pair wise exchange ",
+         "Recursive doubling ",
+         "Reduce + Scatterv ",
+         "Topology aware Reduce + Scatterv ",
+     ]},
+    {'name': "Reduce",
+     'flag': "I_MPI_ADJUST_REDUCE",
+     'algs': [
+         "Shumilin`s ",
+         "Binomial ",
+         "Topology aware Shumilin`s ",
+         "Topology aware binomial ",
+         "Rabenseifner`s ",
+         "Topology aware Rabenseifner`s ",
+         "Knomial ",
+         "Topology aware SHM-based flat ",
+         "Topology aware SHM-based Knomial ",
+         "Topology aware SHM-based Knary ",
+         "Topology aware SHM-based binomial ",
+     ]},
+    {'name': "Scatter",
+     'flag': "I_MPI_ADJUST_SCATTER",
+     'algs': [
+         "Binomial ",
+         "Topology aware binomial ",
+         "Shumilin`s ",
+     ]},
+]
+
+log = ""
+
+############## MULTIPLE-NODE LAUNCH ##############
+off_cache_flags = [
+    "-off_cache -1",
+    "-off_cache 50",
+    ""
+]
+
+# Node counts to benchmark (2..10 inclusive).
+ndcnt = list(range(2, 11))
+
+proc_per_node = 72
+multiple_node_parameter = dict(default_parameter)
+multiple_node_template = load_template("./templates/multinode_algs.template")
+
+for flag in off_cache_flags:
+    multiple_node_parameter["off_cache_flag"] = flag
+    # Tag so batch files for different cache flags do not collide.
+    cache_tag = sanitize(flag) if flag else "no_off_cache"
+    for n_nodes in ndcnt:
+        n_procs = n_nodes*proc_per_node
+        multiple_node_parameter["n_procs"] = int(n_procs)
+        multiple_node_parameter["n_nodes"] = n_nodes
+        for alg_conf in algs_dic:
+            collective = alg_conf['name']
+            multiple_node_parameter["job_name"] = collective
+            multiple_node_parameter["alg_flag"] = alg_conf['flag']
+            algs = alg_conf["algs"]
+            # Intel MPI algorithm ids are 1-based (0 selects the library
+            # default), so enumerate from 1 to reach every listed algorithm.
+            for idx, alg in enumerate(algs, start=1):
+                multiple_node_parameter["alg_name"] = alg
+                multiple_node_parameter["alg_idx"] = idx
+                # Node count and cache tag in the file name keep earlier
+                # loop iterations from being overwritten on disk.
+                batch_file = os.path.join(
+                    batch_files_dir,
+                    f"{collective}_{sanitize(alg)}_{n_nodes}n_{cache_tag}.sh")
+                write_batch(batch_file,
+                            multiple_node_template.format(**multiple_node_parameter))
+                # result = subprocess.run(["sbatch", batch_file],
+                #                         capture_output=True, text=True)
+                # log += f"#{collective} {n_procs}" + "\n"
+                # log += "\tSTDOUT:" + result.stdout + "\n"
+                # log += "\tSTDERR:" + result.stderr + "\n"
+print(log)
diff --git a/templates/multinode_algs.template b/templates/multinode_algs.template
index 55cdb63..ebab059 100644
--- a/templates/multinode_algs.template
+++ b/templates/multinode_algs.template
@@ -1,11 +1,12 @@
 #!/bin/bash -l
-#SBATCH --job-name={job_name}_{n_procs}
-#SBATCH --output={output_dir}{job_name}_{n_procs}.out
-#SBATCH --error={err_dir}{job_name}_{n_procs}.err
+#SBATCH --job-name={job_name}_{n_procs}_{alg_idx}
+#SBATCH --output={output_dir}{job_name}_{n_procs}_{alg_idx}.out
+#SBATCH --error={err_dir}{job_name}_{n_procs}_{alg_idx}.err
 #SBATCH --nodes={n_nodes}
 #SBATCH --nodelist=f01[01-64]
 #SBATCH --time=00:30:00
 #SBATCH --export=NONE
+
 # SwitchName=fswibl01 Level=0 LinkSpeed=1 Nodes=f01[01-64]
 # SwitchName=fswibl02 Level=0 LinkSpeed=1 Nodes=f02[01-64]
 # SwitchName=fswibl03 Level=0 LinkSpeed=1 Nodes=f03[01-64]
@@ -19,11 +20,13 @@
 # SwitchName=fswibl11 Level=0 LinkSpeed=1 Nodes=f08[01-64]
 # SwitchName=fswibl12 Level=0 LinkSpeed=1 Nodes=f09[01-64]
 # SwitchName=fswibl13 Level=0 LinkSpeed=1 Nodes=f10[01-64]
+
 unset SLURM_EXPORT_ENV
 module load intel intelmpi
-export I_MPI_ADJUST_{capital_jobname}={algnumber}
+export {alg_flag}={alg_idx}
 OUTPUT_FILENAME="{data_dir}/{job_name}_$SLURM_JOB_ID.dat"
 echo "# CREATION_TIME : {time_stamp}" > $OUTPUT_FILENAME
 echo "# N_NODES : {n_nodes}" >> $OUTPUT_FILENAME
 echo "# OFF_CACHE_FLAG : {off_cache_flag}">> $OUTPUT_FILENAME
+echo "# ALGORITHM : {alg_name}">> $OUTPUT_FILENAME
 srun --cpu-freq=2000000-2000000:performance -N {n_nodes} -n{n_procs} {bin} {job_name} -npmin {n_procs} {off_cache_flag} -mem 2 -time 60 >> $OUTPUT_FILENAME