From ab2eb1ff50d576cca3422d749ebcdffa8b031517 Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti <rafaelravedutti@gmail.com>
Date: Tue, 5 Apr 2022 02:57:23 +0200
Subject: [PATCH] Write LAMMPS kernel with SIMD intrinsics and implement AVX512
 with double-precision functions

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
---
 Makefile                              |  4 ++
 config.mk                             |  8 ++-
 gromacs/includes/simd.h               |  9 ++++
 gromacs/includes/simd/avx512_double.h | 12 +++++
 lammps/force_lj.c                     | 73 ++++++++++++++++++++++++++-
 lammps/main.c                         | 11 +++-
 6 files changed, 113 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 4cff876..c82abdf 100644
--- a/Makefile
+++ b/Makefile
@@ -75,6 +75,10 @@ ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
     DEFINES += -DENABLE_OMP_SIMD
 endif
 
+ifeq ($(strip $(USE_SIMD_KERNEL)),true)
+    DEFINES += -DUSE_SIMD_KERNEL
+endif
+
 VPATH     = $(SRC_DIR) $(ASM_DIR)
 ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
diff --git a/config.mk b/config.mk
index 3f398fe..8140400 100644
--- a/config.mk
+++ b/config.mk
@@ -14,8 +14,6 @@ DATA_LAYOUT ?= AOS
 ASM_SYNTAX ?= ATT
 # Debug
 DEBUG ?= false
-# Use omp simd pragma for lammps halfneigh
-ENABLE_OMP_SIMD ?= true
 
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
@@ -26,6 +24,12 @@ INDEX_TRACER ?= false
 # Compute statistics
 COMPUTE_STATS ?= false
 
+# Configurations for lammps optimization scheme
+# Use omp simd pragma when running with half neighbor-lists
+ENABLE_OMP_SIMD ?= true
+# Use kernel with explicit SIMD intrinsics
+USE_SIMD_KERNEL ?= false
+
 # Configurations for gromacs optimization scheme
 # Use reference version
 USE_REFERENCE_VERSION ?= false
diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h
index 32c6020..0b37f61 100644
--- a/gromacs/includes/simd.h
+++ b/gromacs/includes/simd.h
@@ -21,11 +21,20 @@
  * =======================================================================================
  */
 
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <immintrin.h>
 #include <zmmintrin.h>
 
+#ifndef CLUSTER_M
+#   define CLUSTER_M 1
+#endif
+
+#ifndef CLUSTER_N
+#   define CLUSTER_N 1
+#endif
+
 #ifdef AVX512
 #   if PRECISION == 2
 #       include "simd/avx512_double.h"
diff --git a/gromacs/includes/simd/avx512_double.h b/gromacs/includes/simd/avx512_double.h
index 7294c54..a482185 100644
--- a/gromacs/includes/simd/avx512_double.h
+++ b/gromacs/includes/simd/avx512_double.h
@@ -26,6 +26,7 @@
 
 #define MD_SIMD_FLOAT   __m512d
 #define MD_SIMD_MASK    __mmask8
+#define MD_SIMD_INT     __m256i
 
 static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
@@ -110,3 +111,14 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
     simd_h_decr(m + CLUSTER_N, a1);
     simd_h_decr(m + CLUSTER_N * 2, a2);
 }
+
+// Functions used in LAMMPS kernel
+static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm512_i32gather_pd(vidx, m, s); }
+static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
+static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
+static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
+static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
+static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
+static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mul_epi32(a, b); }
+static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
+static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_LT); }
diff --git a/lammps/force_lj.c b/lammps/force_lj.c
index b111c4e..ba9aa8f 100644
--- a/lammps/force_lj.c
+++ b/lammps/force_lj.c
@@ -28,7 +28,10 @@
 #include <atom.h>
 #include <stats.h>
 
-double computeForceLJFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+// TODO: Joint common files for gromacs and lammps variants
+#include "../gromacs/includes/simd.h"
+
+double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     int Nlocal = atom->Nlocal;
     int* neighs;
 #ifndef EXPLICIT_TYPES
@@ -184,3 +187,71 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
     double E = getTimeStamp();
     return E-S;
 }
+
+double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
+    MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
+    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
+
+    for(int i = 0; i < Nlocal; i++) {
+        atom_fx(i) = 0.0;
+        atom_fy(i) = 0.0;
+        atom_fz(i) = 0.0;
+    }
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    #pragma omp parallel for
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs);
+        MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i));
+        MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i));
+        MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i));
+        MD_SIMD_FLOAT fix = simd_zero();
+        MD_SIMD_FLOAT fiy = simd_zero();
+        MD_SIMD_FLOAT fiz = simd_zero();
+
+        for(int k = 0; k < numneighs; k += VECTOR_WIDTH) {
+            // If the last iteration of this loop is separated from the rest, this mask can be set only there
+            MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt(simd_int_add(simd_int_broadcast(k), simd_int_seq()), numneighs_vec);
+            MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs);
+            #ifdef AOS
+            MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3
+            MD_SIMD_FLOAT delx = xtmp - simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT dely = ytmp - simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT delz = ztmp - simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT));
+            #else
+            MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT));
+            #endif
+            MD_SIMD_FLOAT rsq = simd_fma(delx, delx, simd_fma(dely, dely, simd_mul(delz, delz)));
+            MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs, simd_mask_cond_lt(rsq, cutforcesq_vec));
+            MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq);
+            MD_SIMD_FLOAT sr6 = simd_mul(sr2, simd_mul(sr2, simd_mul(sr2, sigma6_vec)));
+            MD_SIMD_FLOAT force = simd_mul(c48_vec, simd_mul(sr6, simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec))));
+
+            fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask);
+            fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask);
+            fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask);
+        }
+
+        atom_fx(i) += simd_h_reduce_sum(fix);
+        atom_fy(i) += simd_h_reduce_sum(fiy);
+        atom_fz(i) += simd_h_reduce_sum(fiz);
+    }
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
diff --git a/lammps/main.c b/lammps/main.c
index 3b7b63c..f770fa1 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -45,10 +45,19 @@
 
 #define HLINE "----------------------------------------------------------------------------\n"
 
-extern double computeForceLJFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
 
+#ifdef USE_SIMD_KERNEL
+#   define KERNEL_NAME              "SIMD"
+#   define computeForceLJFullNeigh  computeForceLJFullNeigh_simd
+#else
+#   define KERNEL_NAME              "plain-C"
+#   define computeForceLJFullNeigh  computeForceLJFullNeigh_plain_c
+#endif
+
 double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;