alg benchmarking runner and processor

2025-11-05 12:08:34 +01:00
parent 79bc443bcb
commit d494228d77
2 changed files with 118 additions and 9 deletions
--- a/postprocess_data_algs.py
+++ b/postprocess_data_algs.py
@@ -0,0 +1,112 @@
+from venv import create
+import pandas as pd
+import os
+
+data_markers = {  # substrings looked up with `in` to classify each line of an input file
+    "block_separator": "#----------------------------------------------------------------",  # toggles the header state
+    "benchmark_type": "# Benchmarking",  # header line; 3rd whitespace token is stored
+    "processes_num": "# #processes = ",  # header line; 4th whitespace token is stored as int
+    "min_bytelen": "# Minimum message length in bytes",  # not referenced by the parsing loop
+    "max_bytelen": "# Maximum message length in bytes",  # not referenced by the parsing loop
+    "mpi_datatype": "# MPI_Datatype                   :",  # last whitespace token is stored
+    "mpi_red_datatype": "# MPI_Datatype for reductions    :",  # last whitespace token is stored
+    "mpi_red_op": "# MPI_Op",  # last whitespace token is stored
+    "end_of_table": "# All processes entering MPI_Finalize",  # end marker the table parser tests for
+    "creation_time": "# CREATION_TIME :",  # read before the first separator; last token is stored
+    "n_nodes": "# N_NODES       :",  # read before the first separator; last token is stored
+    "off_cache_flag": "# OFF_CACHE_FLAG  :",  # read before the first separator; text after the last ':'
+    "algorithm":"# ALGORITHM  :"  # read before the first separator; text after the last ':'
+}
+
+column_names = [  # CSV header; order must match the order in which each row list is assembled
+    "benchmark_type",  # token taken from the "# Benchmarking" header line
+    "proc_num",  # integer taken from the "# #processes = " header line
+    "msg_size_bytes",  # 1st numeric field of a table row
+    "repetitions",  # 2nd numeric field of a table row
+    "t_min_usec",  # 3rd numeric field of a table row
+    "t_max_usec",  # 4th numeric field of a table row
+    "t_avg_usec",  # 5th numeric field of a table row
+    "mpi_datatype",  # per-file metadata, "NA" when the marker line is absent
+    "mpi_red_datatype",  # per-file metadata, "NA" when the marker line is absent
+    "mpi_red_op",  # per-file metadata, "NA" when the marker line is absent
+    "creation_time",  # per-file metadata, "NA" when the marker line is absent
+    "n_nodes",  # per-file metadata, "NA" when the marker line is absent
+    "off_cache_flag",  # per-file metadata, "NA" when absent or empty
+    "algorithm"  # per-file metadata, "NA" when the marker line is absent
+]
+
+data = list()
+
+for file in os.listdir("data/"):
+    with open("data/"+file, 'r') as f:
+        lines = f.readlines()
+
+        past_preheader = False
+        in_header = False
+        in_body = False
+
+        btype = "NA"
+        proc_num = "NA"
+        mpi_datatype = "NA"
+        mpi_red_datatype = "NA"
+        mpi_red_op = "NA"
+        creation_time = "NA"
+        n_nodes = "NA"
+        off_cache_flag = "NA"
+        algorithm = "NA"
+
+        for line in lines:
+            if data_markers["block_separator"] in line:
+                if in_header and not past_preheader:
+                    past_preheader = True
+                elif in_header and past_preheader:
+                    in_body = True
+                in_header = not in_header
+                continue
+            if not in_header and not in_body and past_preheader:
+                if data_markers["mpi_datatype"] in line:
+                    mpi_datatype = line.split()[-1]
+                elif data_markers["mpi_red_datatype"] in line:
+                    mpi_red_datatype = line.split()[-1]
+                elif data_markers["mpi_red_op"] in line:
+                    mpi_red_op = line.split()[-1]
+
+            if not in_header and not in_body and not past_preheader:
+                if data_markers["n_nodes"] in line:
+                    n_nodes = line.split()[-1]
+                if data_markers["creation_time"] in line:
+                    creation_time = line.split()[-1]
+                if data_markers["off_cache_flag"] in line:
+                    off_cache_flag = line.split(":")[-1].strip()
+                    if off_cache_flag == "": off_cache_flag = "NA"
+                    else: off_cache_flag = off_cache_flag.replace("-off_cache","")
+                if data_markers["algorithm"] in line:
+                    algorithm = line.split(":")[-1].strip()
+
+            if past_preheader and in_header:
+                if data_markers["benchmark_type"] in line:
+                    btype = line.split()[2]
+                if data_markers["processes_num"] in line:
+                    proc_num = int(line.split()[3])
+
+            if in_body:
+                if "#" in line or "".join(line.split()) == "":
+                    continue
+                if data_markers["end_of_table"] in line:
+                    break
+                if("int-overflow" in line) : continue 
+                if("out-of-mem" in line) : continue 
+                data.append([btype, proc_num]+[int(s) if s.isdigit()
+                            else float(s) for s in line.split()] +
+                            [
+                                mpi_datatype,
+                                mpi_red_datatype,
+                                mpi_red_op,
+                                creation_time,
+                                n_nodes,
+                                off_cache_flag,
+                                algorithm
+                ])
+
+df = pd.DataFrame(data, columns=column_names)
+df.to_csv("data.csv", index=False)