Patch summary (text before the first "diff --git" header is ignored by patch tools):
  * avx512_double.h: simd_int_load uses _mm256_load_si256 instead of _mm256_load_epi32.
  * include_NVCC.mk: default CFLAGS become -O3 -march=native -ffast-math -funroll-loops,
    with unknown flags forwarded to the host compiler.
  * lammps/main.c: adds writeInput(), which writes param->xprd/yprd/zprd and every local
    atom's position and velocity to "input.in"; the call in main() stays commented out.
NOTE(review): this patch was recovered from a copy whose line breaks were collapsed.
Indentation and blank context lines are reconstructed from the hunk line counts and may
not match the target tree byte-for-byte; if it does not apply cleanly, try
`git apply --ignore-whitespace` or `patch -l`. The `index` hashes are the original ones.

diff --git a/common/includes/simd/avx512_double.h b/common/includes/simd/avx512_double.h
index 62a0d67..453af04 100644
--- a/common/includes/simd/avx512_double.h
+++ b/common/includes/simd/avx512_double.h
@@ -102,7 +102,8 @@ static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int
 static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
 static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
 static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
-static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
+// _mm256_load_si256 needs only AVX, whereas _mm256_load_epi32 needs AVX512F+AVX512VL; both expect a 32-byte aligned address.
+static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_si256((const MD_SIMD_INT *) m); }
 static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
 static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mul_epi32(a, b); }
 static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
diff --git a/include_NVCC.mk b/include_NVCC.mk
index 07edb9b..446ccbf 100644
--- a/include_NVCC.mk
+++ b/include_NVCC.mk
@@ -6,9 +6,11 @@ ANSI_CFLAGS += -std=c99
 ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra
 
-# CFLAGS = -O0 -g -std=c99 -fargument-noalias
+CFLAGS = -O3 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
+#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
+#CFLAGS = -O3 -g # -fopenmp
+#CFLAGS = -O0 -g -std=c99 -fargument-noalias
 #CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
-CFLAGS = -O3 -g # -fopenmp
 ASFLAGS = -masm=intel
 LFLAGS =
 DEFINES = -D_GNU_SOURCE -DCUDA_TARGET -DNO_ZMM_INTRIN #-DLIKWID_PERFMON
diff --git a/lammps/main.c b/lammps/main.c
index c08f1da..1cc5c46 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -116,6 +116,23 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
 #endif
 }
 
+// Writes param->xprd/yprd/zprd, then one CSV record per local atom (position and velocity), to "input.in".
+void writeInput(Parameter *param, Atom *atom) {
+    FILE *fpin = fopen("input.in", "w");
+    if(fpin == NULL) {
+        perror("input.in");
+        return;
+    }
+
+    fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        fprintf(fpin, "1,%f,%f,%f,%f,%f,%f\n", atom_x(i), atom_y(i), atom_z(i), atom_vx(i), atom_vy(i), atom_vz(i));
+    }
+
+    fclose(fpin);
+}
+
 int main(int argc, char** argv) {
     double timer[NUMTIMER];
     Eam eam;
@@ -218,6 +235,8 @@ int main(int argc, char** argv) {
     traceAddresses(&param, &atom, &neighbor, n + 1);
 #endif
 
+    //writeInput(&param, &atom);
+
     timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
     timer[NEIGH] = 0.0;
     timer[TOTAL] = getTimeStamp();