Adjust ISA options and improve output
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
0caeea0494
commit
911ba63336
8
Makefile
8
Makefile
@ -64,8 +64,12 @@ ifneq ($(VECTOR_WIDTH),)
|
||||
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(NO_AVX2)),true)
|
||||
DEFINES += -DNO_AVX2
|
||||
ifeq ($(strip $(MASK_REGISTERS)),true)
|
||||
DEFINES += -DMASK_REGISTERS
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(SIMD_KERNEL_AVAILABLE)),true)
|
||||
DEFINES += -DSIMD_KERNEL_AVAILABLE
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(AVX512)),true)
|
||||
|
@ -2,6 +2,9 @@
|
||||
TAG ?= ICC
|
||||
# Instruction set (SSE/AVX/AVX2/AVX512)
|
||||
ISA ?= AVX512
|
||||
# Use mask registers (AVX512 must be available in the target CPU)
|
||||
# This is always true when ISA is set to AVX512
|
||||
MASK_REGISTERS ?= true
|
||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
||||
OPT_SCHEME ?= lammps
|
||||
# Enable likwid (true or false)
|
||||
|
@ -28,10 +28,10 @@
|
||||
#define MD_SIMD_FLOAT __m256d
|
||||
#define MD_SIMD_INT __m128i
|
||||
|
||||
#ifdef NO_AVX2
|
||||
# define MD_SIMD_MASK __m256d
|
||||
#else
|
||||
#ifdef MASK_REGISTERS
|
||||
# define MD_SIMD_MASK __mmask8
|
||||
#else
|
||||
# define MD_SIMD_MASK __m256d
|
||||
#endif
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
|
||||
|
@ -2,12 +2,13 @@ ifeq ($(strip $(ISA)), SSE)
|
||||
_VECTOR_WIDTH=2
|
||||
else ifeq ($(strip $(ISA)), AVX)
|
||||
# Vector width is 4 but AVX2 instruction set is not supported
|
||||
NO_AVX2=true
|
||||
_VECTOR_WIDTH=4
|
||||
else ifeq ($(strip $(ISA)), AVX2)
|
||||
#SIMD_KERNEL_AVAILABLE=true
|
||||
_VECTOR_WIDTH=4
|
||||
else ifeq ($(strip $(ISA)), AVX512)
|
||||
AVX512=true
|
||||
SIMD_KERNEL_AVAILABLE=true
|
||||
_VECTOR_WIDTH=8
|
||||
endif
|
||||
|
||||
|
@ -21,16 +21,14 @@
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <math.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
// TODO: Joint common files for gromacs and lammps variants
|
||||
#include "../gromacs/includes/simd.h"
|
||||
|
||||
double computeForceDemFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
int Nlocal = atom->Nlocal;
|
||||
|
@ -20,25 +20,29 @@
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
// TODO: Joint common files for gromacs and lammps variants
|
||||
#ifdef SIMD_KERNEL_AVAILABLE
|
||||
#include "../gromacs/includes/simd.h"
|
||||
#endif
|
||||
|
||||
double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
@ -49,7 +53,7 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@ -60,9 +64,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
@ -71,13 +75,13 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
@ -86,11 +90,11 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
addStat(stats->atoms_within_cutoff, 1);
|
||||
} else {
|
||||
addStat(stats->atoms_outside_cutoff, 1);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -110,11 +114,11 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
@ -135,14 +139,14 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Pragma required to vectorize the inner loop
|
||||
#ifdef ENABLE_OMP_SIMD
|
||||
#ifdef ENABLE_OMP_SIMD
|
||||
#pragma omp simd reduction(+: fix,fiy,fiz)
|
||||
#endif
|
||||
#endif
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
@ -150,13 +154,13 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
@ -194,11 +198,6 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
|
||||
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
@ -209,10 +208,16 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#ifdef NO_AVX2
|
||||
fprintf(stderr, "Error: SIMD kernel implemented for AVX2 and AVX512 only!");
|
||||
#ifndef SIMD_KERNEL_AVAILABLE
|
||||
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
|
||||
exit(-1);
|
||||
#else
|
||||
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
|
||||
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
|
@ -24,28 +24,37 @@
|
||||
#define __UTIL_H_
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x,y) ((x)<(y)?(x):(y))
|
||||
# define MIN(x,y) ((x)<(y)?(x):(y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
# define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#endif
|
||||
|
||||
#ifndef ABS
|
||||
#define ABS(a) ((a) >= 0 ? (a) : -(a))
|
||||
# define ABS(a) ((a) >= 0 ? (a) : -(a))
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
#define DEBUG_MESSAGE printf
|
||||
# define DEBUG_MESSAGE printf
|
||||
#else
|
||||
#define DEBUG_MESSAGE
|
||||
# define DEBUG_MESSAGE
|
||||
#endif
|
||||
|
||||
#ifndef MAXLINE
|
||||
#define MAXLINE 4096
|
||||
# define MAXLINE 4096
|
||||
#endif
|
||||
|
||||
#define FF_LJ 0
|
||||
#define FF_EAM 1
|
||||
#define FF_DEM 2
|
||||
|
||||
#if PRECISION == 1
|
||||
# define PRECISION_STRING "single"
|
||||
#else
|
||||
# define PRECISION_STRING "double"
|
||||
#endif
|
||||
|
||||
extern double myrandom(int*);
|
||||
extern void random_reset(int *seed, int ibase, double *coord);
|
||||
extern int str2ff(const char *string);
|
||||
|
@ -226,6 +226,7 @@ int main(int argc, char** argv) {
|
||||
param.cutneigh = param.cutforce + param.skin;
|
||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||
printParameter(¶m);
|
||||
printf(HLINE);
|
||||
|
||||
printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
@ -272,14 +273,6 @@ int main(int argc, char** argv) {
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
|
||||
printf(HLINE);
|
||||
printf("Force field: %s\n", ff2str(param.force_field));
|
||||
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
|
||||
#if PRECISION == 1
|
||||
printf("Using single precision floating point.\n");
|
||||
#else
|
||||
printf("Using double precision floating point.\n");
|
||||
#endif
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
@ -154,6 +155,9 @@ void printParameter(Parameter *param) {
|
||||
}
|
||||
|
||||
printf("\tForce field: %s\n", ff2str(param->force_field));
|
||||
printf("\tKernel: %s\n", KERNEL_NAME);
|
||||
printf("\tData layout: %s\n", POS_DATA_LAYOUT);
|
||||
printf("\tFloating-point precision: %s\n", PRECISION_STRING);
|
||||
printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz);
|
||||
printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd);
|
||||
printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z);
|
||||
@ -175,5 +179,5 @@ void printParameter(Parameter *param) {
|
||||
printf("\tCutoff radius: %e\n", param->cutforce);
|
||||
printf("\tSkin: %e\n", param->skin);
|
||||
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
|
||||
printf("\tProcessor frequency (GHz): %.4f\n\n", param->proc_freq);
|
||||
printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user