diff --git a/Makefile b/Makefile
index da24a29..2b480c0 100644
--- a/Makefile
+++ b/Makefile
@@ -33,6 +33,9 @@ ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif
+ifeq ($(strip $(PRINT_STATS)),true)
+ DEFINES += -DPRINT_STATS
+endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
diff --git a/asm/unused/force.s b/asm/unused/force.s
new file mode 100644
index 0000000..11cf275
--- /dev/null
+++ b/asm/unused/force.s
@@ -0,0 +1,376 @@
+.intel_syntax noprefix
+
+.text
+.align 16,0x90
+.globl computeForce
+computeForce:
+# parameter 1: rdi Parameter*
+# parameter 2: rsi Atom*
+# parameter 3: rdx Neighbor*
+push r12
+push r13
+push r14
+
+# r9d <- atom->Nlocal
+ mov r9d, DWORD PTR [4+rsi]
+# xmm2 <- param->cutforce
+ vmovsd xmm2, QWORD PTR [72+rdi]
+# xmm1 <- param->sigma6
+ vmovsd xmm1, QWORD PTR [8+rdi]
+# xmm0 <- param->epsilon
+ vmovsd xmm0, QWORD PTR [rdi]
+
+# r13 <- atom->fx
+ mov r13, QWORD PTR [64+rsi]
+# r14 <- atom->fy
+ mov r14, QWORD PTR [72+rsi]
+# rdi <- atom->fz
+ mov rdi, QWORD PTR [80+rsi]
+
+# atom->Nlocal <= 0
+ test r9d, r9d
+ jle ..B1.30
+
+..B1.2:
+# r10d <- 0
+ xor r10d, r10d
+# ecx <- atom->Nlocal
+ mov ecx, r9d
+# r8d <- 0
+ xor r8d, r8d
+# r11d <- 1
+ mov r11d, 1
+# eax <- 0
+ xor eax, eax
+# ecx <- atom->Nlocal >> 1
+ shr ecx, 1
+ je ..B1.6
+
+# Init forces to zero loop
+..B1.4:
+# fx[i] <- 0
+ mov QWORD PTR [r8+r13], rax
+# i++
+ inc r10
+# fy[i] <- 0
+ mov QWORD PTR [r8+r14], rax
+# fz[i] <- 0
+ mov QWORD PTR [r8+rdi], rax
+# fx[i] <- 0
+ mov QWORD PTR [8+r8+r13], rax
+# fy[i] <- 0
+ mov QWORD PTR [8+r8+r14], rax
+# fz[i] <- 0
+ mov QWORD PTR [8+r8+rdi], rax
+# i++
+ add r8, 16
+# i < Nlocal
+ cmp r10, rcx
+ jb ..B1.4
+
+..B1.5:
+# r11d <- i * 2 + 1
+ lea r11d, DWORD PTR [1+r10+r10]
+..B1.6:
+# r11d <- i * 2
+ lea ecx, DWORD PTR [-1+r11]
+# i < Nlocal
+ cmp ecx, r9d
+ jae ..B1.8
+
+..B1.7:
+
+# r11 <- i * 2
+ movsxd r11, r11d
+# fx[i] <- 0
+ mov QWORD PTR [-8+r13+r11*8], rax
+# fy[i] <- 0
+ mov QWORD PTR [-8+r14+r11*8], rax
+# fz[i] <- 0
+ mov QWORD PTR [-8+rdi+r11*8], rax
+
+..B1.8:
+
+# xmm15 <- cutforcesq
+ vmulsd xmm15, xmm2, xmm2
+# r8d <- 0
+ xor r8d, r8d
+# ymm18 <- 8
+ vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]
+# xmm0 <- 48 * epsilon
+ vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]
+# ymm17 <- [0..7]
+ vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]
+# zmm7 <- 0.5
+ vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]
+# zmm16 <- cutforcesq
+ vbroadcastsd zmm16, xmm15
+# zmm15 <- param->sigma6
+ vbroadcastsd zmm15, xmm1
+# zmm16 <- 48 * epsilon
+ vbroadcastsd zmm14, xmm0
+# r9 <- atom->Nlocal
+ movsxd r9, r9d
+# r10d <- 0 (i)
+ xor r10d, r10d
+# rcx <- neighbor->numneigh
+ mov rcx, QWORD PTR [24+rdx]
+# r11 <- neighbor->neighbors
+ mov r11, QWORD PTR [8+rdx]
+# r12 <- neighbor->maxneighs
+ movsxd r12, DWORD PTR [16+rdx]
+
+# rdx <- atom->x
+ mov rdx, QWORD PTR [16+rsi]
+# rax <- atom->y
+ mov rax, QWORD PTR [24+rsi]
+# rsi <- atom->z
+ mov rsi, QWORD PTR [32+rsi]
+
+# r12 <- neighbor->maxneighs * 4
+ shl r12, 2
+# [-32+rsp] <- atom->Nlocal
+ mov QWORD PTR [-32+rsp], r9
+# [-24+rsp] <- neighbor->numneigh
+ mov QWORD PTR [-24+rsp], rcx
+# [-16+rsp] <- atom->fy
+ mov QWORD PTR [-16+rsp], r14
+# [-8+rsp] <- atom->fx
+ mov QWORD PTR [-8+rsp], r13
+# [-40+rsp] <- r15
+ mov QWORD PTR [-40+rsp], r15
+# [-48+rsp] <- rbx
+ mov QWORD PTR [-48+rsp], rbx
+# zmm19 <- 0
+ vpxord zmm19, zmm19, zmm19
+
+# Loop over all atoms
+..B1.9:
+
+# rcx <- neighbor->numneigh
+ mov rcx, QWORD PTR [-24+rsp]
+# xmm25 <- 0
+ vxorpd xmm25, xmm25, xmm25
+# xmm20 <- 0
+ vmovapd xmm20, xmm25
+# r13d <- neighbor->numneigh[i] (numneighs)
+ mov r13d, DWORD PTR [rcx+r10*4]
+# xmm4 <- 0
+ vmovapd xmm4, xmm20
+# xmm8 <- atom->x[i]
+ vmovsd xmm8, QWORD PTR [rdx+r10*8]
+# xmm9 <- atom->y[i]
+ vmovsd xmm9, QWORD PTR [rax+r10*8]
+# xmm9 <- atom->z[i]
+ vmovsd xmm10, QWORD PTR [rsi+r10*8]
+# numneighs <= 0
+ test r13d, r13d
+ jle ..B1.27
+
+..B1.10:
+
+# zmm13 <- 0
+ vmovaps zmm13, zmm19
+# zmm12 <- 0
+ vmovaps zmm12, zmm13
+# zmm11 <- 0
+ vmovaps zmm11, zmm12
+
+# numneighs < 8
+ cmp r13d, 8
+ jl ..B1.32
+
+..B1.11:
+
+# numneighs < 1200
+ cmp r13d, 1200
+ jl ..B1.31
+
+..B1.12:
+
+ mov rcx, r12
+ imul rcx, r8
+ add rcx, r11
+ mov r9, rcx
+ and r9, 63
+ test r9d, 3
+ je ..B1.14
+
+..B1.13:
+
+ xor r9d, r9d
+ jmp ..B1.16
+
+..B1.14:
+
+ test r9d, r9d
+ je ..B1.16
+
+..B1.15:
+
+ neg r9d
+ add r9d, 64
+ shr r9d, 2
+ cmp r13d, r9d
+ cmovl r9d, r13d
+
+..B1.16:
+
+ mov ebx, r13d
+ sub ebx, r9d
+ and ebx, 7
+ neg ebx
+ add ebx, r13d
+ cmp r9d, 1
+ jb ..B1.20
+
+..B1.20:
+ lea ecx, DWORD PTR [8+r9]
+ cmp ebx, ecx
+ jl ..B1.24
+
+..B1.21:
+ mov rcx, r12
+ imul rcx, r8
+ vbroadcastsd zmm0, xmm8
+ vbroadcastsd zmm1, xmm9
+ vbroadcastsd zmm2, xmm10
+ movsxd r14, r9d
+ add rcx, r11
+
+..B1.22:
+ vpcmpeqb k2, xmm0, xmm0
+ add r9d, 8
+ vpcmpeqb k1, xmm0, xmm0
+ vpcmpeqb k3, xmm0, xmm0
+ vmovdqu ymm3, YMMWORD PTR [rcx+r14*4]
+ add r14, 8
+ vpxord zmm5, zmm5, zmm5
+ vpxord zmm4, zmm4, zmm4
+ vpxord zmm6, zmm6, zmm6
+
+ vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8]
+ vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
+ vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8]
+
+ vsubpd zmm29, zmm1, zmm5
+ vsubpd zmm28, zmm0, zmm4
+ vsubpd zmm31, zmm2, zmm6
+
+ vmulpd zmm20, zmm29, zmm29
+ vfmadd231pd zmm20, zmm28, zmm28
+ vfmadd231pd zmm20, zmm31, zmm31
+
+# if condition cutoff radius
+ vrcp14pd zmm27, zmm20 #-> sr2
+ vcmppd k5, zmm20, zmm16, 1
+
+ vmulpd zmm22, zmm27, zmm15 #-> sr2 * sigma6
+ vmulpd zmm24, zmm27, zmm14 #-> 48.0 * epsilon * sr2
+
+ vmulpd zmm25, zmm27, zmm22 #-> sr2 * sigma6 * sr2
+ vmulpd zmm23, zmm27, zmm25 #-> sr2 * sigma6 * sr2 * sr2
+
+ vfmsub213pd zmm27, zmm25, zmm7 #-> sr2 * sigma * sr2 * sr2 - 0.5
+ vmulpd zmm26, zmm23, zmm24 #-> 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
+ vmulpd zmm30, zmm26, zmm27 #->
+
+ vfmadd231pd zmm13{k5}, zmm30, zmm28
+ vfmadd231pd zmm12{k5}, zmm30, zmm29
+ vfmadd231pd zmm11{k5}, zmm30, zmm31
+ cmp r9d, ebx
+ jb ..B1.22
+#end neighbor loop
+
+..B1.26:
+ vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
+ vpermd zmm0, zmm10, zmm11
+ vpermd zmm5, zmm10, zmm12
+ vpermd zmm21, zmm10, zmm13
+ vaddpd zmm11, zmm0, zmm11
+ vaddpd zmm12, zmm5, zmm12
+ vaddpd zmm13, zmm21, zmm13
+ vpermpd zmm1, zmm11, 78
+ vpermpd zmm6, zmm12, 78
+ vpermpd zmm22, zmm13, 78
+ vaddpd zmm2, zmm11, zmm1
+ vaddpd zmm8, zmm12, zmm6
+ vaddpd zmm23, zmm13, zmm22
+ vpermpd zmm3, zmm2, 177
+ vpermpd zmm9, zmm8, 177
+ vpermpd zmm24, zmm23, 177
+ vaddpd zmm4, zmm2, zmm3
+ vaddpd zmm20, zmm8, zmm9
+ vaddpd zmm25, zmm23, zmm24
+
+#exit function
+..B1.27:
+ mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
+ mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
+ movsxd r8, r10d #55.32
+ inc r8 #55.32
+ vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
+ vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
+ vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
+ vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
+ vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
+ vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
+ inc r10 #55.5
+ cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
+ jb ..B1.9
+
+
+ vzeroupper #93.12
+ vxorpd xmm0, xmm0, xmm0 #93.12
+ pop r14 #93.12
+ pop r13 #93.12
+ pop r12 #93.12
+ ret #93.12
+
+.type computeForce,@function
+.size computeForce,.-computeForce
+
+
+..LNcomputeForce.0:
+ .data
+# -- End computeForce
+ .section .rodata, "a"
+ .align 64
+ .align 64
+.L_2il0floatpacket.2:
+ .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
+ .type .L_2il0floatpacket.2,@object
+ .size .L_2il0floatpacket.2,64
+ .align 64
+.L_2il0floatpacket.4:
+ .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
+ .type .L_2il0floatpacket.4,@object
+ .size .L_2il0floatpacket.4,64
+ .align 64
+.L_2il0floatpacket.6:
+ .long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
+ .type .L_2il0floatpacket.6,@object
+ .size .L_2il0floatpacket.6,64
+ .align 32
+.L_2il0floatpacket.0:
+ .long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
+ .type .L_2il0floatpacket.0,@object
+ .size .L_2il0floatpacket.0,32
+ .align 32
+.L_2il0floatpacket.1:
+ .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
+ .type .L_2il0floatpacket.1,@object
+ .size .L_2il0floatpacket.1,32
+ .align 8
+.L_2il0floatpacket.3:
+ .long 0x00000000,0x40480000
+ .type .L_2il0floatpacket.3,@object
+ .size .L_2il0floatpacket.3,8
+ .align 8
+.L_2il0floatpacket.5:
+ .long 0x00000000,0x3ff00000
+ .type .L_2il0floatpacket.5,@object
+ .size .L_2il0floatpacket.5,8
+ .data
+ .section .note.GNU-stack, ""
+# End
diff --git a/config.mk b/config.mk
index 066274a..312c19d 100644
--- a/config.mk
+++ b/config.mk
@@ -1,7 +1,7 @@
# Compiler tag (GCC/CLANG/ICC)
-TAG ?= ICC
+TAG ?= CLANG
# Enable likwid (true or false)
-ENABLE_LIKWID ?= true
+ENABLE_LIKWID ?= false
# SP or DP
DATA_TYPE ?= DP
# AOS or SOA
@@ -9,6 +9,8 @@ DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
ASM_SYNTAX ?= ATT
+# Output detailed statistics
+PRINT_STATS ?= true
# Number of times to run the atoms loop on stubbed variant
ATOMS_LOOP_RUNS ?= 1
# Number of times to run the neighbors loop on stubbed variant
diff --git a/src/force-tracing.c b/src/force-tracing.c
new file mode 100644
index 0000000..b73305a
--- /dev/null
+++ b/src/force-tracing.c
@@ -0,0 +1,232 @@
+/*
+ * =======================================================================================
+ *
+ * Author: Jan Eitzinger (je), jan.eitzinger@fau.de
+ * Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ * This file is part of MD-Bench.
+ *
+ * MD-Bench is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License along
+ * with MD-Bench. If not, see .
+ * =======================================================================================
+ */
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+#include
+#include
+#endif
+
+#ifndef VECTOR_WIDTH
+# define VECTOR_WIDTH 8
+#endif
+
+#ifndef TRACER_CONDITION
+# define TRACER_CONDITION (!(timestep % param->every))
+#endif
+
+#ifdef MEM_TRACER
+# define MEM_TRACER_INIT FILE *mem_tracer_fp; \
+ if(TRACER_CONDITION) { \
+ char mem_tracer_fn[128]; \
+ snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
+ mem_tracer_fp = fopen(mem_tracer_fn, "w");
+ }
+
+# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
+# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
+#else
+# define MEM_TRACER_INIT
+# define MEM_TRACER_END
+# define MEM_TRACE(addr, op)
+#endif
+
+#ifdef INDEX_TRACER
+# define INDEX_TRACER_INIT FILE *index_tracer_fp; \
+ if(TRACER_CONDITION) { \
+ char index_tracer_fn[128]; \
+ snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
+ index_tracer_fp = fopen(index_tracer_fn, "w"); \
+ }
+
+# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); }
+# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
+# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
+# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \
+ for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+ int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+ fprintf(index_tracer_fp, "I: "); \
+ for(int __j = 0; __j < __e; ++__j) { \
+ fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
+ } \
+ fprintf(index_tracer_fp, "\n"); \
+ } \
+ }
+
+# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \
+ for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+ int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+ if(__e > 1) { \
+ for(int __j = __i; __j < __i + __e - 1; ++__j) { \
+ for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
+ if(l[__k] > l[__k + 1]) { \
+ int __t = l[__k]; \
+ l[__k] = l[__k + 1]; \
+ l[__k + 1] = __t; \
+ } \
+ } \
+ } \
+ } \
+ } \
+ }
+
+# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \
+ for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+ int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+ if(__e > 1) { \
+ fprintf(index_tracer_fp, "D: "); \
+ for(int __j = 0; __j < __e - 1; ++__j) { \
+ int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
+ fprintf(index_tracer_fp, "%d ", __dist); \
+ } \
+ fprintf(index_tracer_fp, "\n"); \
+ } \
+ } \
+ }
+#else
+# define INDEX_TRACER_INIT
+# define INDEX_TRACER_END
+# define INDEX_TRACE_NATOMS(nl, ng, mn)
+# define INDEX_TRACE_ATOM(a)
+# define INDEX_TRACE(l, e)
+# define DIST_TRACE_SORT(l, e)
+# define DIST_TRACE(l, e)
+#endif
+
+double computeForceTracing(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
+ MEM_TRACER_INIT;
+ INDEX_TRACER_INIT;
+ int Nlocal = atom->Nlocal;
+ int* neighs;
+ MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
+ #ifndef EXPLICIT_TYPES
+ MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+ MD_FLOAT sigma6 = param->sigma6;
+ MD_FLOAT epsilon = param->epsilon;
+ #endif
+
+ for(int i = 0; i < Nlocal; i++) {
+ fx[i] = 0.0;
+ fy[i] = 0.0;
+ fz[i] = 0.0;
+ }
+
+ INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
+ double S = getTimeStamp();
+ LIKWID_MARKER_START("force");
+
+ for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) {
+ #pragma omp parallel for
+ for(int i = 0; i < Nlocal; i++) {
+ neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+ int numneighs = neighbor->numneigh[i];
+ MD_FLOAT xtmp = atom_x(i);
+ MD_FLOAT ytmp = atom_y(i);
+ MD_FLOAT ztmp = atom_z(i);
+ MD_FLOAT fix = 0;
+ MD_FLOAT fiy = 0;
+ MD_FLOAT fiz = 0;
+
+ MEM_TRACE(atom_x(i), 'R');
+ MEM_TRACE(atom_y(i), 'R');
+ MEM_TRACE(atom_z(i), 'R');
+ INDEX_TRACE_ATOM(i);
+
+ #ifdef EXPLICIT_TYPES
+ const int type_i = atom->type[i];
+ MEM_TRACE(atom->type(i), 'R');
+ #endif
+
+ #if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1
+ #define REPEAT_NEIGHBORS_LOOP
+ int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS;
+ for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) {
+ #endif
+
+ //DIST_TRACE_SORT(neighs, numneighs);
+ INDEX_TRACE(neighs, numneighs);
+ //DIST_TRACE(neighs, numneighs);
+
+ for(int k = 0; k < numneighs; k++) {
+ int j = neighs[k];
+ MD_FLOAT delx = xtmp - atom_x(j);
+ MD_FLOAT dely = ytmp - atom_y(j);
+ MD_FLOAT delz = ztmp - atom_z(j);
+ MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+ MEM_TRACE(neighs[k], 'R');
+ MEM_TRACE(atom_x(j), 'R');
+ MEM_TRACE(atom_y(j), 'R');
+ MEM_TRACE(atom_z(j), 'R');
+
+ #ifdef EXPLICIT_TYPES
+ const int type_j = atom->type[j];
+ const int type_ij = type_i * atom->ntypes + type_j;
+ const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+ const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+ const MD_FLOAT epsilon = atom->epsilon[type_ij];
+ MEM_TRACE(atom->type(j), 'R');
+ #endif
+
+ if(rsq < cutforcesq) {
+ MD_FLOAT sr2 = 1.0 / rsq;
+ MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+ MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+ fix += delx * force;
+ fiy += dely * force;
+ fiz += delz * force;
+ }
+ }
+
+ #ifdef REPEAT_NEIGHBORS_LOOP
+ }
+ #endif
+
+ fx[i] += fix;
+ fy[i] += fiy;
+ fz[i] += fiz;
+
+ addStat(stats->total_force_neighs, numneighs);
+ addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+ MEM_TRACE(fx[i], 'R');
+ MEM_TRACE(fx[i], 'W');
+ MEM_TRACE(fy[i], 'R');
+ MEM_TRACE(fy[i], 'W');
+ MEM_TRACE(fz[i], 'R');
+ MEM_TRACE(fz[i], 'W');
+ }
+ }
+
+ LIKWID_MARKER_STOP("force");
+ double E = getTimeStamp();
+
+ INDEX_TRACER_END;
+ MEM_TRACER_END;
+ return E-S;
+}
diff --git a/src/force.c b/src/force.c
index 79ab32f..5ec92e7 100644
--- a/src/force.c
+++ b/src/force.c
@@ -26,110 +26,23 @@
#include
#include
#include
-#include
-#if defined(MEM_TRACER) || defined(INDEX_TRACER)
-#include
-#include
-#endif
-
-#ifndef VECTOR_WIDTH
-# define VECTOR_WIDTH 8
-#endif
-
-#ifndef TRACER_CONDITION
-# define TRACER_CONDITION (!(timestep % param->every))
-#endif
-
-#ifdef MEM_TRACER
-# define MEM_TRACER_INIT FILE *mem_tracer_fp; \
- if(TRACER_CONDITION) { \
- char mem_tracer_fn[128]; \
- snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
- mem_tracer_fp = fopen(mem_tracer_fn, "w");
- }
-
-# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
-# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
-#else
-# define MEM_TRACER_INIT
-# define MEM_TRACER_END
-# define MEM_TRACE(addr, op)
-#endif
-
-#ifdef INDEX_TRACER
-# define INDEX_TRACER_INIT FILE *index_tracer_fp; \
- if(TRACER_CONDITION) { \
- char index_tracer_fn[128]; \
- snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
- index_tracer_fp = fopen(index_tracer_fn, "w"); \
- }
-
-# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); }
-# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
-# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
-# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \
- for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
- int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
- fprintf(index_tracer_fp, "I: "); \
- for(int __j = 0; __j < __e; ++__j) { \
- fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
- } \
- fprintf(index_tracer_fp, "\n"); \
- } \
- }
-
-# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \
- for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
- int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
- if(__e > 1) { \
- for(int __j = __i; __j < __i + __e - 1; ++__j) { \
- for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
- if(l[__k] > l[__k + 1]) { \
- int __t = l[__k]; \
- l[__k] = l[__k + 1]; \
- l[__k + 1] = __t; \
- } \
- } \
- } \
- } \
- } \
- }
-
-# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \
- for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
- int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
- if(__e > 1) { \
- fprintf(index_tracer_fp, "D: "); \
- for(int __j = 0; __j < __e - 1; ++__j) { \
- int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
- fprintf(index_tracer_fp, "%d ", __dist); \
- } \
- fprintf(index_tracer_fp, "\n"); \
- } \
- } \
- }
-#else
-# define INDEX_TRACER_INIT
-# define INDEX_TRACER_END
-# define INDEX_TRACE_NATOMS(nl, ng, mn)
-# define INDEX_TRACE_ATOM(a)
-# define INDEX_TRACE(l, e)
-# define DIST_TRACE_SORT(l, e)
-# define DIST_TRACE(l, e)
-#endif
-
-double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
- MEM_TRACER_INIT;
- INDEX_TRACER_INIT;
+double computeForce(
+ Parameter *param,
+ Atom *atom,
+ Neighbor *neighbor
+ )
+{
int Nlocal = atom->Nlocal;
int* neighs;
- MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
- #ifndef EXPLICIT_TYPES
+ MD_FLOAT* fx = atom->fx;
+ MD_FLOAT* fy = atom->fy;
+ MD_FLOAT* fz = atom->fz;
+#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
- #endif
+#endif
for(int i = 0; i < Nlocal; i++) {
fx[i] = 0.0;
@@ -137,96 +50,57 @@ double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *sta
fz[i] = 0.0;
}
- INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
- for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) {
- #pragma omp parallel for
- for(int i = 0; i < Nlocal; i++) {
- neighs = &neighbor->neighbors[i * neighbor->maxneighs];
- int numneighs = neighbor->numneigh[i];
- MD_FLOAT xtmp = atom_x(i);
- MD_FLOAT ytmp = atom_y(i);
- MD_FLOAT ztmp = atom_z(i);
- MD_FLOAT fix = 0;
- MD_FLOAT fiy = 0;
- MD_FLOAT fiz = 0;
+#pragma omp parallel for
+ for(int i = 0; i < Nlocal; i++) {
+ neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+ int numneighs = neighbor->numneigh[i];
+ MD_FLOAT xtmp = atom_x(i);
+ MD_FLOAT ytmp = atom_y(i);
+ MD_FLOAT ztmp = atom_z(i);
+ MD_FLOAT fix = 0;
+ MD_FLOAT fiy = 0;
+ MD_FLOAT fiz = 0;
- MEM_TRACE(atom_x(i), 'R');
- MEM_TRACE(atom_y(i), 'R');
- MEM_TRACE(atom_z(i), 'R');
- INDEX_TRACE_ATOM(i);
+#ifdef EXPLICIT_TYPES
+ const int type_i = atom->type[i];
+#endif
- #ifdef EXPLICIT_TYPES
- const int type_i = atom->type[i];
- MEM_TRACE(atom->type(i), 'R');
- #endif
- #if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1
- #define REPEAT_NEIGHBORS_LOOP
- int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS;
- for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) {
- #endif
+ for(int k = 0; k < numneighs; k++) {
+ int j = neighs[k];
+ MD_FLOAT delx = xtmp - atom_x(j);
+ MD_FLOAT dely = ytmp - atom_y(j);
+ MD_FLOAT delz = ztmp - atom_z(j);
+ MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
- //DIST_TRACE_SORT(neighs, numneighs);
- INDEX_TRACE(neighs, numneighs);
- //DIST_TRACE(neighs, numneighs);
+#ifdef EXPLICIT_TYPES
+ const int type_j = atom->type[j];
+ const int type_ij = type_i * atom->ntypes + type_j;
+ const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+ const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+ const MD_FLOAT epsilon = atom->epsilon[type_ij];
+#endif
- for(int k = 0; k < numneighs; k++) {
- int j = neighs[k];
- MD_FLOAT delx = xtmp - atom_x(j);
- MD_FLOAT dely = ytmp - atom_y(j);
- MD_FLOAT delz = ztmp - atom_z(j);
- MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
-
- MEM_TRACE(neighs[k], 'R');
- MEM_TRACE(atom_x(j), 'R');
- MEM_TRACE(atom_y(j), 'R');
- MEM_TRACE(atom_z(j), 'R');
-
- #ifdef EXPLICIT_TYPES
- const int type_j = atom->type[j];
- const int type_ij = type_i * atom->ntypes + type_j;
- const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
- const MD_FLOAT sigma6 = atom->sigma6[type_ij];
- const MD_FLOAT epsilon = atom->epsilon[type_ij];
- MEM_TRACE(atom->type(j), 'R');
- #endif
-
- if(rsq < cutforcesq) {
- MD_FLOAT sr2 = 1.0 / rsq;
- MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
- MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
- fix += delx * force;
- fiy += dely * force;
- fiz += delz * force;
- }
- }
-
- #ifdef REPEAT_NEIGHBORS_LOOP
+ if(rsq < cutforcesq) {
+ MD_FLOAT sr2 = 1.0 / rsq;
+ MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+ MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+ fix += delx * force;
+ fiy += dely * force;
+ fiz += delz * force;
}
- #endif
-
- fx[i] += fix;
- fy[i] += fiy;
- fz[i] += fiz;
-
- addStat(stats->total_force_neighs, numneighs);
- addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
- MEM_TRACE(fx[i], 'R');
- MEM_TRACE(fx[i], 'W');
- MEM_TRACE(fy[i], 'R');
- MEM_TRACE(fy[i], 'W');
- MEM_TRACE(fz[i], 'R');
- MEM_TRACE(fz[i], 'W');
}
+
+ fx[i] += fix;
+ fy[i] += fiy;
+ fz[i] += fiz;
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
- INDEX_TRACER_END;
- MEM_TRACER_END;
return E-S;
}
diff --git a/src/main.c b/src/main.c
index bff4422..703ca9f 100644
--- a/src/main.c
+++ b/src/main.c
@@ -42,7 +42,8 @@
#define HLINE "----------------------------------------------------------------------------\n"
-extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int);
+extern double computeForce(Parameter*, Atom*, Neighbor*);
+extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
void init(Parameter *param)
{
@@ -211,7 +212,11 @@ int main (int argc, char** argv)
setup(¶m, &atom, &neighbor, &stats);
computeThermo(0, ¶m, &atom);
- computeForce(¶m, &atom, &neighbor, &stats, 1, 0);
+#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(PRINT_STATS)
+ computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0);
+#else
+ computeForce(¶m, &atom, &neighbor);
+#endif
timer[FORCE] = 0.0;
timer[NEIGH] = 0.0;
@@ -226,7 +231,11 @@ int main (int argc, char** argv)
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
}
- timer[FORCE] += computeForce(¶m, &atom, &neighbor, &stats, 0, n + 1);
+#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(PRINT_STATS)
+ timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1);
+#else
+ timer[FORCE] += computeForce(¶m, &atom, &neighbor);
+#endif
finalIntegrate(¶m, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
@@ -251,7 +260,9 @@ int main (int argc, char** argv)
printf(HLINE);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+#ifdef PRINT_STATS
displayStatistics(&atom, ¶m, &stats, timer);
+#endif
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}