diff --git a/common/includes/simd/avx2_double.h b/common/includes/simd/avx2_double.h
index c48fe6c..e57df9b 100644
--- a/common/includes/simd/avx2_double.h
+++ b/common/includes/simd/avx2_double.h
@@ -48,11 +48,13 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
     t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
     t0 = _mm256_add_pd(t0, t2);
     t1 = _mm256_add_pd(t1, t2);
-    t0 = _mm256_blend_pd(t0, t1, 0b1100);
+    // Mask 0xC (binary 1100): elements 2-3 are taken from t1, elements 0-1 from t0.
+    t0 = _mm256_blend_pd(t0, t1, 0xC);
     t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
     _mm256_store_pd(m, t1);
 
-    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
+    // Mask 0x5 (binary 0101): swaps the two doubles inside each 128-bit half.
+    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
     a0 = _mm256_castpd256_pd128(t0);
     a1 = _mm256_extractf128_pd(t0, 0x1);
     a0 = _mm_add_sd(a0, a1);
@@ -91,7 +93,8 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
 }
 
 // Functions used in LAMMPS kernel
-static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
+// Macro rather than a function so that `s` reaches the intrinsic as a literal scale; no trailing ';' so it is usable inside expressions.
+#define simd_gather(vidx, m, s) _mm256_i32gather_pd((m), (vidx), (s))
 static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
 static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
 static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c
index dfba8f5..ba9fc82 100644
--- a/gromacs/neighbor.c
+++ b/gromacs/neighbor.c
@@ -58,6 +58,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
     neighbor->numneigh = NULL;
     neighbor->numneigh_masked = NULL;
     neighbor->neighbors = NULL;
+    neighbor->neighbors_imask = NULL;
 }
 
 void setupNeighbor(Parameter *param, Atom *atom) {
@@ -229,7 +230,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
     if(atom->Nclusters_local > nmax) {
         nmax = atom->Nclusters_local;
         if(neighbor->numneigh) free(neighbor->numneigh);
+        if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
         if(neighbor->neighbors) free(neighbor->neighbors);
+        if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
         neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
         neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
         neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
@@ -326,15 +329,17 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
                                     imask = get_imask_simd_4xn(1, ci, cj);
                                     #endif
 
-                                    if(imask == NBNXN_INTERACTION_MASK_ALL) {
-                                        neighptr[n] = cj;
-                                        neighptr_imask[n] = imask;
-                                    } else {
-                                        neighptr[n] = neighptr[nmasked];
-                                        neighptr_imask[n] = neighptr_imask[nmasked];
-                                        neighptr[nmasked] = cj;
-                                        neighptr_imask[nmasked] = imask;
-                                        nmasked++;
+                                    if(n < neighbor->maxneighs) {
+                                        if(imask == NBNXN_INTERACTION_MASK_ALL) {
+                                            neighptr[n] = cj;
+                                            neighptr_imask[n] = imask;
+                                        } else {
+                                            neighptr[n] = neighptr[nmasked];
+                                            neighptr_imask[n] = neighptr_imask[nmasked];
+                                            neighptr[nmasked] = cj;
+                                            neighptr_imask[nmasked] = imask;
+                                            nmasked++;
+                                        }
                                     }
 
                                     n++;
@@ -377,11 +382,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
         }
 
         if(resize) {
-            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
             neighbor->maxneighs = new_maxneighs * 1.2;
+            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
             free(neighbor->neighbors);
-            neighbor->neighbors = (int *) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
-            neighbor->neighbors_imask = (unsigned int *) malloc(atom->Nmax * neighbor->maxneighs * sizeof(unsigned int));
+            free(neighbor->neighbors_imask);
+            neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
+            neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
         }
     }
 