From 911ba6333680d198a9692c9346bbf7d5bbafa99a Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti <rafaelravedutti@gmail.com>
Date: Tue, 16 Aug 2022 18:36:47 +0200
Subject: [PATCH] Adjust ISA options and improve output

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
---
 Makefile                                |  8 +++-
 config.mk                               |  3 ++
 gromacs/includes/simd/avx_avx2_double.h |  6 +--
 include_ISA.mk                          |  3 +-
 lammps/force_dem.c                      |  8 ++--
 lammps/force_lj.c                       | 59 ++++++++++++++-----------
 lammps/includes/util.h                  | 21 ++++++---
 lammps/main.c                           |  9 +---
 lammps/parameter.c                      |  6 ++-
 9 files changed, 70 insertions(+), 53 deletions(-)

diff --git a/Makefile b/Makefile
index 668461a..5a8f1cd 100644
--- a/Makefile
+++ b/Makefile
@@ -64,8 +64,12 @@ ifneq ($(VECTOR_WIDTH),)
     DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
 endif
 
-ifeq ($(strip $(NO_AVX2)),true)
-    DEFINES += -DNO_AVX2
+ifeq ($(strip $(MASK_REGISTERS)),true)
+    DEFINES += -DMASK_REGISTERS
+endif
+
+ifeq ($(strip $(SIMD_KERNEL_AVAILABLE)),true)
+    DEFINES += -DSIMD_KERNEL_AVAILABLE
 endif
 
 ifeq ($(strip $(AVX512)),true)
diff --git a/config.mk b/config.mk
index 696f1e5..290c150 100644
--- a/config.mk
+++ b/config.mk
@@ -2,6 +2,9 @@
 TAG ?= ICC
 # Instruction set (SSE/AVX/AVX2/AVX512)
 ISA ?= AVX512
+# Use mask registers (AVX512 must be available in the target CPU)
+# This is always true when ISA is set to AVX512
+MASK_REGISTERS ?= true
 # Optimization scheme (lammps/gromacs/clusters_per_bin)
 OPT_SCHEME ?= lammps
 # Enable likwid (true or false)
diff --git a/gromacs/includes/simd/avx_avx2_double.h b/gromacs/includes/simd/avx_avx2_double.h
index c0c52e8..637d1ed 100644
--- a/gromacs/includes/simd/avx_avx2_double.h
+++ b/gromacs/includes/simd/avx_avx2_double.h
@@ -28,10 +28,10 @@
 #define MD_SIMD_FLOAT       __m256d
 #define MD_SIMD_INT         __m128i
 
-#ifdef NO_AVX2
-#   define MD_SIMD_MASK     __m256d
-#else
+#ifdef MASK_REGISTERS
 #   define MD_SIMD_MASK     __mmask8
+#else
+#   define MD_SIMD_MASK     __m256d
 #endif
 
 static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
diff --git a/include_ISA.mk b/include_ISA.mk
index fa9b90b..7008d26 100644
--- a/include_ISA.mk
+++ b/include_ISA.mk
@@ -2,12 +2,13 @@ ifeq ($(strip $(ISA)), SSE)
 _VECTOR_WIDTH=2
 else ifeq ($(strip $(ISA)), AVX)
 # Vector width is 4 but AVX2 instruction set is not supported
-NO_AVX2=true
 _VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX2)
+#SIMD_KERNEL_AVAILABLE=true
 _VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX512)
 AVX512=true
+SIMD_KERNEL_AVAILABLE=true
 _VECTOR_WIDTH=8
 endif
 
diff --git a/lammps/force_dem.c b/lammps/force_dem.c
index 47b5564..fdae21f 100644
--- a/lammps/force_dem.c
+++ b/lammps/force_dem.c
@@ -21,16 +21,14 @@
  * =======================================================================================
  */
 #include <math.h>
+//---
+#include <atom.h>
 #include <likwid-marker.h>
-
-#include <timing.h>
 #include <neighbor.h>
 #include <parameter.h>
-#include <atom.h>
 #include <stats.h>
+#include <timing.h>
 
-// TODO: Joint common files for gromacs and lammps variants
-#include "../gromacs/includes/simd.h"
 
 double computeForceDemFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     int Nlocal = atom->Nlocal;
diff --git a/lammps/force_lj.c b/lammps/force_lj.c
index 0c72873..408bb21 100644
--- a/lammps/force_lj.c
+++ b/lammps/force_lj.c
@@ -20,25 +20,29 @@
  *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
  * =======================================================================================
  */
+#include <stdio.h>
+#include <stdlib.h>
+//---
+#include <atom.h>
 #include <likwid-marker.h>
-
-#include <timing.h>
 #include <neighbor.h>
 #include <parameter.h>
-#include <atom.h>
 #include <stats.h>
+#include <timing.h>
 
 // TODO: Joint common files for gromacs and lammps variants
+#ifdef SIMD_KERNEL_AVAILABLE
 #include "../gromacs/includes/simd.h"
+#endif
 
 double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     int Nlocal = atom->Nlocal;
     int* neighs;
-#ifndef EXPLICIT_TYPES
+    #ifndef EXPLICIT_TYPES
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
-#endif
+    #endif
 
     for(int i = 0; i < Nlocal; i++) {
         atom_fx(i) = 0.0;
@@ -49,7 +53,7 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
-#pragma omp parallel for
+    #pragma omp parallel for
     for(int i = 0; i < Nlocal; i++) {
         neighs = &neighbor->neighbors[i * neighbor->maxneighs];
         int numneighs = neighbor->numneigh[i];
@@ -60,9 +64,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
         MD_FLOAT fiy = 0;
         MD_FLOAT fiz = 0;
 
-#ifdef EXPLICIT_TYPES
+        #ifdef EXPLICIT_TYPES
         const int type_i = atom->type[i];
-#endif
+        #endif
 
         for(int k = 0; k < numneighs; k++) {
             int j = neighs[k];
@@ -71,13 +75,13 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
             MD_FLOAT delz = ztmp - atom_z(j);
             MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
 
-#ifdef EXPLICIT_TYPES
+            #ifdef EXPLICIT_TYPES
             const int type_j = atom->type[j];
             const int type_ij = type_i * atom->ntypes + type_j;
             const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
             const MD_FLOAT sigma6 = atom->sigma6[type_ij];
             const MD_FLOAT epsilon = atom->epsilon[type_ij];
-#endif
+            #endif
 
             if(rsq < cutforcesq) {
                 MD_FLOAT sr2 = 1.0 / rsq;
@@ -86,11 +90,11 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
                 fix += delx * force;
                 fiy += dely * force;
                 fiz += delz * force;
-#ifdef USE_REFERENCE_VERSION
+            #ifdef USE_REFERENCE_VERSION
                 addStat(stats->atoms_within_cutoff, 1);
             } else {
                 addStat(stats->atoms_outside_cutoff, 1);
-#endif
+            #endif
             }
         }
 
@@ -110,11 +114,11 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
 double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     int Nlocal = atom->Nlocal;
     int* neighs;
-#ifndef EXPLICIT_TYPES
+    #ifndef EXPLICIT_TYPES
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
-#endif
+    #endif
 
     for(int i = 0; i < Nlocal; i++) {
         atom_fx(i) = 0.0;
@@ -135,14 +139,14 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
         MD_FLOAT fiy = 0;
         MD_FLOAT fiz = 0;
 
-#ifdef EXPLICIT_TYPES
+        #ifdef EXPLICIT_TYPES
         const int type_i = atom->type[i];
-#endif
+        #endif
 
         // Pragma required to vectorize the inner loop
-#ifdef ENABLE_OMP_SIMD
+        #ifdef ENABLE_OMP_SIMD
         #pragma omp simd reduction(+: fix,fiy,fiz)
-#endif
+        #endif
         for(int k = 0; k < numneighs; k++) {
             int j = neighs[k];
             MD_FLOAT delx = xtmp - atom_x(j);
@@ -150,13 +154,13 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
             MD_FLOAT delz = ztmp - atom_z(j);
             MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
 
-#ifdef EXPLICIT_TYPES
+            #ifdef EXPLICIT_TYPES
             const int type_j = atom->type[j];
             const int type_ij = type_i * atom->ntypes + type_j;
             const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
             const MD_FLOAT sigma6 = atom->sigma6[type_ij];
             const MD_FLOAT epsilon = atom->epsilon[type_ij];
-#endif
+            #endif
 
             if(rsq < cutforcesq) {
                 MD_FLOAT sr2 = 1.0 / rsq;
@@ -194,11 +198,6 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
-    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
-    MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
-    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
-    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
 
     for(int i = 0; i < Nlocal; i++) {
         atom_fx(i) = 0.0;
@@ -209,10 +208,16 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
-    #ifdef NO_AVX2
-    fprintf(stderr, "Error: SIMD kernel implemented for AVX2 and AVX512 only!");
+    #ifndef SIMD_KERNEL_AVAILABLE
+    fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
     exit(-1);
     #else
+    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
+    MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
+    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
+
     #pragma omp parallel for
     for(int i = 0; i < Nlocal; i++) {
         neighs = &neighbor->neighbors[i * neighbor->maxneighs];
diff --git a/lammps/includes/util.h b/lammps/includes/util.h
index 3f188b2..680c78d 100644
--- a/lammps/includes/util.h
+++ b/lammps/includes/util.h
@@ -24,28 +24,37 @@
 #define __UTIL_H_
 
 #ifndef MIN
-#define MIN(x,y) ((x)<(y)?(x):(y))
+#   define MIN(x,y) ((x)<(y)?(x):(y))
 #endif
+
 #ifndef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
+#   define MAX(x,y) ((x)>(y)?(x):(y))
 #endif
+
 #ifndef ABS
-#define ABS(a) ((a) >= 0 ? (a) : -(a))
+#   define ABS(a) ((a) >= 0 ? (a) : -(a))
 #endif
+
 #ifdef DEBUG
-#define DEBUG_MESSAGE   printf
+#   define DEBUG_MESSAGE   printf
 #else
-#define DEBUG_MESSAGE
+#   define DEBUG_MESSAGE
 #endif
 
 #ifndef MAXLINE
-#define MAXLINE 4096
+#   define MAXLINE 4096
 #endif
 
 #define FF_LJ   0
 #define FF_EAM  1
 #define FF_DEM  2
 
+#if PRECISION == 1
+#   define PRECISION_STRING     "single"
+#else
+#   define PRECISION_STRING     "double"
+#endif
+
 extern double myrandom(int*);
 extern void random_reset(int *seed, int ibase, double *coord);
 extern int str2ff(const char *string);
diff --git a/lammps/main.c b/lammps/main.c
index 465f7a2..89a6d08 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -226,6 +226,7 @@ int main(int argc, char** argv) {
     param.cutneigh = param.cutforce + param.skin;
     setup(&param, &eam, &atom, &neighbor, &stats);
     printParameter(&param);
+    printf(HLINE);
 
     printf("step\ttemp\t\tpressure\n");
     computeThermo(0, &param, &atom);
@@ -272,14 +273,6 @@ int main(int argc, char** argv) {
     timer[TOTAL] = getTimeStamp() - timer[TOTAL];
     computeThermo(-1, &param, &atom);
 
-    printf(HLINE);
-    printf("Force field: %s\n", ff2str(param.force_field));
-    printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
-    #if PRECISION == 1
-    printf("Using single precision floating point.\n");
-    #else
-    printf("Using double precision floating point.\n");
-    #endif
     printf(HLINE);
     printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
     printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
diff --git a/lammps/parameter.c b/lammps/parameter.c
index 5606bb2..e5282d2 100644
--- a/lammps/parameter.c
+++ b/lammps/parameter.c
@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include <string.h>
 //---
+#include <atom.h>
 #include <parameter.h>
 #include <util.h>
 
@@ -154,6 +155,9 @@ void printParameter(Parameter *param) {
     }
 
     printf("\tForce field: %s\n", ff2str(param->force_field));
+    printf("\tKernel: %s\n", KERNEL_NAME);
+    printf("\tData layout: %s\n", POS_DATA_LAYOUT);
+    printf("\tFloating-point precision: %s\n", PRECISION_STRING);
     printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz);
     printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd);
     printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z);
@@ -175,5 +179,5 @@ void printParameter(Parameter *param) {
     printf("\tCutoff radius: %e\n", param->cutforce);
     printf("\tSkin: %e\n", param->skin);
     printf("\tHalf neighbor lists: %d\n", param->half_neigh);
-    printf("\tProcessor frequency (GHz): %.4f\n\n", param->proc_freq);
+    printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
 }