From 3c3d27b48a69f2cbeeb63bf61e5bd65322478edc Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 26 Oct 2021 09:11:17 +0200 Subject: [PATCH] Introduce separate version for traced force routine. --- Makefile | 3 + asm/unused/force.s | 376 ++++++++++++++++++++++++++++++++++++++++++++ config.mk | 6 +- src/force-tracing.c | 232 +++++++++++++++++++++++++++ src/force.c | 222 ++++++-------------------- src/main.c | 17 +- 6 files changed, 677 insertions(+), 179 deletions(-) create mode 100644 asm/unused/force.s create mode 100644 src/force-tracing.c diff --git a/Makefile b/Makefile index da24a29..2b480c0 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,9 @@ ifneq ($(NEIGHBORS_LOOP_RUNS),) DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS) endif +ifeq ($(strip $(PRINT_STATS)),true) + DEFINES += -DPRINT_STATS +endif ifeq ($(strip $(EXPLICIT_TYPES)),true) DEFINES += -DEXPLICIT_TYPES endif diff --git a/asm/unused/force.s b/asm/unused/force.s new file mode 100644 index 0000000..11cf275 --- /dev/null +++ b/asm/unused/force.s @@ -0,0 +1,376 @@ +.intel_syntax noprefix + +.text +.align 16,0x90 +.globl computeForce +computeForce: +# parameter 1: rdi Parameter* +# parameter 2: rsi Atom* +# parameter 3: rdx Neighbor* +push r12 +push r13 +push r14 + +# r9d <- atom->Nlocal + mov r9d, DWORD PTR [4+rsi] +# xmm2 <- param->cutforce + vmovsd xmm2, QWORD PTR [72+rdi] +# xmm1 <- param->sigma6 + vmovsd xmm1, QWORD PTR [8+rdi] +# xmm0 <- param->epsilon + vmovsd xmm0, QWORD PTR [rdi] + +# r13 <- atom->fx + mov r13, QWORD PTR [64+rsi] +# r14 <- atom->fy + mov r14, QWORD PTR [72+rsi] +# rdi <- atom->fz + mov rdi, QWORD PTR [80+rsi] + +# atom->Nlocal <= 0 + test r9d, r9d + jle ..B1.30 + +..B1.2: +# r10d <- 0 + xor r10d, r10d +# ecx <- atom->Nlocal + mov ecx, r9d +# r8d <- 0 + xor r8d, r8d +# r11d <- 1 + mov r11d, 1 +# eax <- 0 + xor eax, eax +# ecx <- atom->Nlocal >> 1 + shr ecx, 1 + je ..B1.6 + +# Init forces to zero loop +..B1.4: +# fx[i] <- 0 + mov QWORD PTR [r8+r13], rax +# i++ + inc r10 +# fy[i] <- 0 + mov QWORD PTR [r8+r14], rax +# fz[i] <- 0 + mov QWORD PTR [r8+rdi], rax +# fx[i] <- 0 + mov QWORD PTR [8+r8+r13], rax +# fy[i] <- 0 + mov QWORD PTR [8+r8+r14], rax +# fz[i] <- 0 + mov QWORD PTR [8+r8+rdi], rax +# i++ + add r8, 16 +# i < Nlocal + cmp r10, rcx + jb ..B1.4 + +..B1.5: +# r11d <- i * 2 + 1 + lea r11d, DWORD PTR [1+r10+r10] +..B1.6: +# r11d <- i * 2 + lea ecx, DWORD PTR [-1+r11] +# i < Nlocal + cmp ecx, r9d + jae ..B1.8 + +..B1.7: + +# r11 <- i * 2 + movsxd r11, r11d +# fx[i] <- 0 + mov QWORD PTR [-8+r13+r11*8], rax +# fy[i] <- 0 + mov QWORD PTR [-8+r14+r11*8], rax +# fz[i] <- 0 + mov QWORD PTR [-8+rdi+r11*8], rax + +..B1.8: + +# xmm15 <- cutforcesq + vmulsd xmm15, xmm2, xmm2 +# r8d <- 0 + xor r8d, r8d +# ymm18 <- 8 + vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] +# xmm0 <- 48 * epsilon + vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] +# ymm17 <- [0..7] + vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] +# zmm7 <- 0.5 + vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] +# zmm16 <- cutforcesq + vbroadcastsd zmm16, xmm15 +# zmm15 <- param->sigma6 + vbroadcastsd zmm15, xmm1 +# zmm16 <- 48 * epsilon + vbroadcastsd zmm14, xmm0 +# r9 <- atom->Nlocal + movsxd r9, r9d +# r10d <- 0 (i) + xor r10d, r10d +# rcx <- neighbor->numneigh + mov rcx, QWORD PTR [24+rdx] +# r11 <- neighbor->neighbors + mov r11, QWORD PTR [8+rdx] +# r12 <- neighbor->maxneighs + movsxd r12, DWORD PTR [16+rdx] + +# rdx <- atom->x + mov rdx, QWORD PTR [16+rsi] +# rax <- atom->y + mov rax, QWORD PTR [24+rsi] +# rsi <- atom->z + mov rsi, QWORD PTR [32+rsi] + +# r12 <- neighbor->maxneighs * 4 + shl r12, 2 +# [-32+rsp] <- atom->Nlocal + mov QWORD PTR [-32+rsp], r9 +# [-24+rsp] <- neighbor->numneigh + mov QWORD PTR [-24+rsp], rcx +# [-16+rsp] <- atom->fy + mov QWORD PTR [-16+rsp], r14 +# [-8+rsp] <- atom->fx + mov QWORD PTR [-8+rsp], r13 +# [-40+rsp] <- r15 + mov QWORD PTR [-40+rsp], r15 +# [-48+rsp] <- rbx + mov QWORD PTR [-48+rsp], rbx +# zmm19 <- 0 + vpxord zmm19, zmm19, zmm19 + +# Loop over all atoms +..B1.9: + +# rcx <- neighbor->numneigh + mov rcx, QWORD PTR [-24+rsp] +# xmm25 <- 0 + vxorpd xmm25, xmm25, xmm25 +# xmm20 <- 0 + vmovapd xmm20, xmm25 +# r13d <- neighbor->numneigh[i] (numneighs) + mov r13d, DWORD PTR [rcx+r10*4] +# xmm4 <- 0 + vmovapd xmm4, xmm20 +# xmm8 <- atom->x[i] + vmovsd xmm8, QWORD PTR [rdx+r10*8] +# xmm9 <- atom->y[i] + vmovsd xmm9, QWORD PTR [rax+r10*8] +# xmm9 <- atom->z[i] + vmovsd xmm10, QWORD PTR [rsi+r10*8] +# numneighs <= 0 + test r13d, r13d + jle ..B1.27 + +..B1.10: + +# zmm13 <- 0 + vmovaps zmm13, zmm19 +# zmm12 <- 0 + vmovaps zmm12, zmm13 +# zmm11 <- 0 + vmovaps zmm11, zmm12 + +# numneighs < 8 + cmp r13d, 8 + jl ..B1.32 + +..B1.11: + +# numneighs < 1200 + cmp r13d, 1200 + jl ..B1.31 + +..B1.12: + + mov rcx, r12 + imul rcx, r8 + add rcx, r11 + mov r9, rcx + and r9, 63 + test r9d, 3 + je ..B1.14 + +..B1.13: + + xor r9d, r9d + jmp ..B1.16 + +..B1.14: + + test r9d, r9d + je ..B1.16 + +..B1.15: + + neg r9d + add r9d, 64 + shr r9d, 2 + cmp r13d, r9d + cmovl r9d, r13d + +..B1.16: + + mov ebx, r13d + sub ebx, r9d + and ebx, 7 + neg ebx + add ebx, r13d + cmp r9d, 1 + jb ..B1.20 + +..B1.20: + lea ecx, DWORD PTR [8+r9] + cmp ebx, ecx + jl ..B1.24 + +..B1.21: + mov rcx, r12 + imul rcx, r8 + vbroadcastsd zmm0, xmm8 + vbroadcastsd zmm1, xmm9 + vbroadcastsd zmm2, xmm10 + movsxd r14, r9d + add rcx, r11 + +..B1.22: + vpcmpeqb k2, xmm0, xmm0 + add r9d, 8 + vpcmpeqb k1, xmm0, xmm0 + vpcmpeqb k3, xmm0, xmm0 + vmovdqu ymm3, YMMWORD PTR [rcx+r14*4] + add r14, 8 + vpxord zmm5, zmm5, zmm5 + vpxord zmm4, zmm4, zmm4 + vpxord zmm6, zmm6, zmm6 + + vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] + vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] + vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] + + vsubpd zmm29, zmm1, zmm5 + vsubpd zmm28, zmm0, zmm4 + vsubpd zmm31, zmm2, zmm6 + + vmulpd zmm20, zmm29, zmm29 + vfmadd231pd zmm20, zmm28, zmm28 + vfmadd231pd zmm20, zmm31, zmm31 + +# if condition cutoff radius + vrcp14pd zmm27, zmm20 #-> sr2 + vcmppd k5, zmm20, zmm16, 1 + + vmulpd zmm22, zmm27, zmm15 #-> sr2 * sigma6 + vmulpd zmm24, zmm27, zmm14 #-> 48.0 * epsilon * sr2 + + vmulpd zmm25, zmm27, zmm22 #-> sr2 * sigma6 * sr2 + vmulpd zmm23, zmm27, zmm25 #-> sr2 * sigma6 * sr2 * sr2 + + vfmsub213pd zmm27, zmm25, zmm7 #-> sr2 * sigma * sr2 * sr2 - 0.5 + vmulpd zmm26, zmm23, zmm24 #-> 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2 + vmulpd zmm30, zmm26, zmm27 #-> + + vfmadd231pd zmm13{k5}, zmm30, zmm28 + vfmadd231pd zmm12{k5}, zmm30, zmm29 + vfmadd231pd zmm11{k5}, zmm30, zmm31 + cmp r9d, ebx + jb ..B1.22 +#end neighbor loop + +..B1.26: + vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip] + vpermd zmm0, zmm10, zmm11 + vpermd zmm5, zmm10, zmm12 + vpermd zmm21, zmm10, zmm13 + vaddpd zmm11, zmm0, zmm11 + vaddpd zmm12, zmm5, zmm12 + vaddpd zmm13, zmm21, zmm13 + vpermpd zmm1, zmm11, 78 + vpermpd zmm6, zmm12, 78 + vpermpd zmm22, zmm13, 78 + vaddpd zmm2, zmm11, zmm1 + vaddpd zmm8, zmm12, zmm6 + vaddpd zmm23, zmm13, zmm22 + vpermpd zmm3, zmm2, 177 + vpermpd zmm9, zmm8, 177 + vpermpd zmm24, zmm23, 177 + vaddpd zmm4, zmm2, zmm3 + vaddpd zmm20, zmm8, zmm9 + vaddpd zmm25, zmm23, zmm24 + +#exit function +..B1.27: + mov rcx, QWORD PTR [-8+rsp] #84.9[spill] + mov rbx, QWORD PTR [-16+rsp] #85.9[spill] + movsxd r8, r10d #55.32 + inc r8 #55.32 + vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9 + vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9 + vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9 + vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9 + vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9 + vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9 + inc r10 #55.5 + cmp r10, QWORD PTR [-32+rsp] #55.5[spill] + jb ..B1.9 + + + vzeroupper #93.12 + vxorpd xmm0, xmm0, xmm0 #93.12 + pop r14 #93.12 + pop r13 #93.12 + pop r12 #93.12 + ret #93.12 + +.type computeForce,@function +.size computeForce,.-computeForce + + +..LNcomputeForce.0: + .data +# -- End computeForce + .section .rodata, "a" + .align 64 + .align 64 +.L_2il0floatpacket.2: + .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,64 + .align 64 +.L_2il0floatpacket.4: + .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,64 + .align 64 +.L_2il0floatpacket.6: + .long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f + .type .L_2il0floatpacket.6,@object + .size .L_2il0floatpacket.6,64 + .align 32 +.L_2il0floatpacket.0: + .long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,32 + .align 32 +.L_2il0floatpacket.1: + .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,32 + .align 8 +.L_2il0floatpacket.3: + .long 0x00000000,0x40480000 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,8 + .align 8 +.L_2il0floatpacket.5: + .long 0x00000000,0x3ff00000 + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,8 + .data + .section .note.GNU-stack, "" +# End diff --git a/config.mk b/config.mk index 066274a..312c19d 100644 --- a/config.mk +++ b/config.mk @@ -1,7 +1,7 @@ # Compiler tag (GCC/CLANG/ICC) -TAG ?= ICC +TAG ?= CLANG # Enable likwid (true or false) -ENABLE_LIKWID ?= true +ENABLE_LIKWID ?= false # SP or DP DATA_TYPE ?= DP # AOS or SOA @@ -9,6 +9,8 @@ DATA_LAYOUT ?= AOS # Assembly syntax to generate (ATT/INTEL) ASM_SYNTAX ?= ATT +# Output detailed statistics +PRINT_STATS ?= true # Number of times to run the atoms loop on stubbed variant ATOMS_LOOP_RUNS ?= 1 # Number of times to run the neighbors loop on stubbed variant diff --git a/src/force-tracing.c b/src/force-tracing.c new file mode 100644 index 0000000..b73305a --- /dev/null +++ b/src/force-tracing.c @@ -0,0 +1,232 @@ +/* + * ======================================================================================= + * + * Author: Jan Eitzinger (je), jan.eitzinger@fau.de + * Copyright (c) 2021 RRZE, University Erlangen-Nuremberg + * + * This file is part of MD-Bench. + * + * MD-Bench is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License along + * with MD-Bench. If not, see . + * ======================================================================================= + */ +#include + +#include +#include +#include +#include +#include + +#if defined(MEM_TRACER) || defined(INDEX_TRACER) +#include +#include +#endif + +#ifndef VECTOR_WIDTH +# define VECTOR_WIDTH 8 +#endif + +#ifndef TRACER_CONDITION +# define TRACER_CONDITION (!(timestep % param->every)) +#endif + +#ifdef MEM_TRACER +# define MEM_TRACER_INIT FILE *mem_tracer_fp; \ + if(TRACER_CONDITION) { \ + char mem_tracer_fn[128]; \ + snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \ + mem_tracer_fp = fopen(mem_tracer_fn, "w"); + } + +# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); } +# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); } +#else +# define MEM_TRACER_INIT +# define MEM_TRACER_END +# define MEM_TRACE(addr, op) +#endif + +#ifdef INDEX_TRACER +# define INDEX_TRACER_INIT FILE *index_tracer_fp; \ + if(TRACER_CONDITION) { \ + char index_tracer_fn[128]; \ + snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \ + index_tracer_fp = fopen(index_tracer_fn, "w"); \ + } + +# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); } +# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); } +# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); } +# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \ + for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \ + int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \ + fprintf(index_tracer_fp, "I: "); \ + for(int __j = 0; __j < __e; ++__j) { \ + fprintf(index_tracer_fp, "%d ", l[__i + __j]); \ + } \ + fprintf(index_tracer_fp, "\n"); \ + } \ + } + +# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \ + for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \ + int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \ + if(__e > 1) { \ + for(int __j = __i; __j < __i + __e - 1; ++__j) { \ + for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \ + if(l[__k] > l[__k + 1]) { \ + int __t = l[__k]; \ + l[__k] = l[__k + 1]; \ + l[__k + 1] = __t; \ + } \ + } \ + } \ + } \ + } \ + } + +# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \ + for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \ + int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \ + if(__e > 1) { \ + fprintf(index_tracer_fp, "D: "); \ + for(int __j = 0; __j < __e - 1; ++__j) { \ + int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \ + fprintf(index_tracer_fp, "%d ", __dist); \ + } \ + fprintf(index_tracer_fp, "\n"); \ + } \ + } \ + } +#else +# define INDEX_TRACER_INIT +# define INDEX_TRACER_END +# define INDEX_TRACE_NATOMS(nl, ng, mn) +# define INDEX_TRACE_ATOM(a) +# define INDEX_TRACE(l, e) +# define DIST_TRACE_SORT(l, e) +# define DIST_TRACE(l, e) +#endif + +double computeForceTracing(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) { + MEM_TRACER_INIT; + INDEX_TRACER_INIT; + int Nlocal = atom->Nlocal; + int* neighs; + MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; + #ifndef EXPLICIT_TYPES + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + #endif + + for(int i = 0; i < Nlocal; i++) { + fx[i] = 0.0; + fy[i] = 0.0; + fz[i] = 0.0; + } + + INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs); + double S = getTimeStamp(); + LIKWID_MARKER_START("force"); + + for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) { + #pragma omp parallel for + for(int i = 0; i < Nlocal; i++) { + neighs = &neighbor->neighbors[i * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[i]; + MD_FLOAT xtmp = atom_x(i); + MD_FLOAT ytmp = atom_y(i); + MD_FLOAT ztmp = atom_z(i); + MD_FLOAT fix = 0; + MD_FLOAT fiy = 0; + MD_FLOAT fiz = 0; + + MEM_TRACE(atom_x(i), 'R'); + MEM_TRACE(atom_y(i), 'R'); + MEM_TRACE(atom_z(i), 'R'); + INDEX_TRACE_ATOM(i); + + #ifdef EXPLICIT_TYPES + const int type_i = atom->type[i]; + MEM_TRACE(atom->type(i), 'R'); + #endif + + #if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1 + #define REPEAT_NEIGHBORS_LOOP + int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS; + for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) { + #endif + + //DIST_TRACE_SORT(neighs, numneighs); + INDEX_TRACE(neighs, numneighs); + //DIST_TRACE(neighs, numneighs); + + for(int k = 0; k < numneighs; k++) { + int j = neighs[k]; + MD_FLOAT delx = xtmp - atom_x(j); + MD_FLOAT dely = ytmp - atom_y(j); + MD_FLOAT delz = ztmp - atom_z(j); + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + MEM_TRACE(neighs[k], 'R'); + MEM_TRACE(atom_x(j), 'R'); + MEM_TRACE(atom_y(j), 'R'); + MEM_TRACE(atom_z(j), 'R'); + + #ifdef EXPLICIT_TYPES + const int type_j = atom->type[j]; + const int type_ij = type_i * atom->ntypes + type_j; + const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; + const MD_FLOAT sigma6 = atom->sigma6[type_ij]; + const MD_FLOAT epsilon = atom->epsilon[type_ij]; + MEM_TRACE(atom->type(j), 'R'); + #endif + + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + fix += delx * force; + fiy += dely * force; + fiz += delz * force; + } + } + + #ifdef REPEAT_NEIGHBORS_LOOP + } + #endif + + fx[i] += fix; + fy[i] += fiy; + fz[i] += fiz; + + addStat(stats->total_force_neighs, numneighs); + addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH); + MEM_TRACE(fx[i], 'R'); + MEM_TRACE(fx[i], 'W'); + MEM_TRACE(fy[i], 'R'); + MEM_TRACE(fy[i], 'W'); + MEM_TRACE(fz[i], 'R'); + MEM_TRACE(fz[i], 'W'); + } + } + + LIKWID_MARKER_STOP("force"); + double E = getTimeStamp(); + + INDEX_TRACER_END; + MEM_TRACER_END; + return E-S; +} diff --git a/src/force.c b/src/force.c index 79ab32f..5ec92e7 100644 --- a/src/force.c +++ b/src/force.c @@ -26,110 +26,23 @@ #include #include #include -#include -#if defined(MEM_TRACER) || defined(INDEX_TRACER) -#include -#include -#endif - -#ifndef VECTOR_WIDTH -# define VECTOR_WIDTH 8 -#endif - -#ifndef TRACER_CONDITION -# define TRACER_CONDITION (!(timestep % param->every)) -#endif - -#ifdef MEM_TRACER -# define MEM_TRACER_INIT FILE *mem_tracer_fp; \ - if(TRACER_CONDITION) { \ - char mem_tracer_fn[128]; \ - snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \ - mem_tracer_fp = fopen(mem_tracer_fn, "w"); - } - -# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); } -# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); } -#else -# define MEM_TRACER_INIT -# define MEM_TRACER_END -# define MEM_TRACE(addr, op) -#endif - -#ifdef INDEX_TRACER -# define INDEX_TRACER_INIT FILE *index_tracer_fp; \ - if(TRACER_CONDITION) { \ - char index_tracer_fn[128]; \ - snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \ - index_tracer_fp = fopen(index_tracer_fn, "w"); \ - } - -# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); } -# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); } -# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); } -# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \ - for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \ - int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \ - fprintf(index_tracer_fp, "I: "); \ - for(int __j = 0; __j < __e; ++__j) { \ - fprintf(index_tracer_fp, "%d ", l[__i + __j]); \ - } \ - fprintf(index_tracer_fp, "\n"); \ - } \ - } - -# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \ - for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \ - int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \ - if(__e > 1) { \ - for(int __j = __i; __j < __i + __e - 1; ++__j) { \ - for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \ - if(l[__k] > l[__k + 1]) { \ - int __t = l[__k]; \ - l[__k] = l[__k + 1]; \ - l[__k + 1] = __t; \ - } \ - } \ - } \ - } \ - } \ - } - -# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \ - for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \ - int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \ - if(__e > 1) { \ - fprintf(index_tracer_fp, "D: "); \ - for(int __j = 0; __j < __e - 1; ++__j) { \ - int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \ - fprintf(index_tracer_fp, "%d ", __dist); \ - } \ - fprintf(index_tracer_fp, "\n"); \ - } \ - } \ - } -#else -# define INDEX_TRACER_INIT -# define INDEX_TRACER_END -# define INDEX_TRACE_NATOMS(nl, ng, mn) -# define INDEX_TRACE_ATOM(a) -# define INDEX_TRACE(l, e) -# define DIST_TRACE_SORT(l, e) -# define DIST_TRACE(l, e) -#endif - -double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) { - MEM_TRACER_INIT; - INDEX_TRACER_INIT; +double computeForce( + Parameter *param, + Atom *atom, + Neighbor *neighbor + ) +{ int Nlocal = atom->Nlocal; int* neighs; - MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; - #ifndef EXPLICIT_TYPES + MD_FLOAT* fx = atom->fx; + MD_FLOAT* fy = atom->fy; + MD_FLOAT* fz = atom->fz; +#ifndef EXPLICIT_TYPES MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; - #endif +#endif for(int i = 0; i < Nlocal; i++) { fx[i] = 0.0; @@ -137,96 +50,57 @@ double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *sta fz[i] = 0.0; } - INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs); double S = getTimeStamp(); LIKWID_MARKER_START("force"); - for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) { - #pragma omp parallel for - for(int i = 0; i < Nlocal; i++) { - neighs = &neighbor->neighbors[i * neighbor->maxneighs]; - int numneighs = neighbor->numneigh[i]; - MD_FLOAT xtmp = atom_x(i); - MD_FLOAT ytmp = atom_y(i); - MD_FLOAT ztmp = atom_z(i); - MD_FLOAT fix = 0; - MD_FLOAT fiy = 0; - MD_FLOAT fiz = 0; +#pragma omp parallel for + for(int i = 0; i < Nlocal; i++) { + neighs = &neighbor->neighbors[i * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[i]; + MD_FLOAT xtmp = atom_x(i); + MD_FLOAT ytmp = atom_y(i); + MD_FLOAT ztmp = atom_z(i); + MD_FLOAT fix = 0; + MD_FLOAT fiy = 0; + MD_FLOAT fiz = 0; - MEM_TRACE(atom_x(i), 'R'); - MEM_TRACE(atom_y(i), 'R'); - MEM_TRACE(atom_z(i), 'R'); - INDEX_TRACE_ATOM(i); +#ifdef EXPLICIT_TYPES + const int type_i = atom->type[i]; +#endif - #ifdef EXPLICIT_TYPES - const int type_i = atom->type[i]; - MEM_TRACE(atom->type(i), 'R'); - #endif - #if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1 - #define REPEAT_NEIGHBORS_LOOP - int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS; - for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) { - #endif + for(int k = 0; k < numneighs; k++) { + int j = neighs[k]; + MD_FLOAT delx = xtmp - atom_x(j); + MD_FLOAT dely = ytmp - atom_y(j); + MD_FLOAT delz = ztmp - atom_z(j); + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; - //DIST_TRACE_SORT(neighs, numneighs); - INDEX_TRACE(neighs, numneighs); - //DIST_TRACE(neighs, numneighs); +#ifdef EXPLICIT_TYPES + const int type_j = atom->type[j]; + const int type_ij = type_i * atom->ntypes + type_j; + const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; + const MD_FLOAT sigma6 = atom->sigma6[type_ij]; + const MD_FLOAT epsilon = atom->epsilon[type_ij]; +#endif - for(int k = 0; k < numneighs; k++) { - int j = neighs[k]; - MD_FLOAT delx = xtmp - atom_x(j); - MD_FLOAT dely = ytmp - atom_y(j); - MD_FLOAT delz = ztmp - atom_z(j); - MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; - - MEM_TRACE(neighs[k], 'R'); - MEM_TRACE(atom_x(j), 'R'); - MEM_TRACE(atom_y(j), 'R'); - MEM_TRACE(atom_z(j), 'R'); - - #ifdef EXPLICIT_TYPES - const int type_j = atom->type[j]; - const int type_ij = type_i * atom->ntypes + type_j; - const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; - const MD_FLOAT sigma6 = atom->sigma6[type_ij]; - const MD_FLOAT epsilon = atom->epsilon[type_ij]; - MEM_TRACE(atom->type(j), 'R'); - #endif - - if(rsq < cutforcesq) { - MD_FLOAT sr2 = 1.0 / rsq; - MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; - MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; - fix += delx * force; - fiy += dely * force; - fiz += delz * force; - } - } - - #ifdef REPEAT_NEIGHBORS_LOOP + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + fix += delx * force; + fiy += dely * force; + fiz += delz * force; } - #endif - - fx[i] += fix; - fy[i] += fiy; - fz[i] += fiz; - - addStat(stats->total_force_neighs, numneighs); - addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH); - MEM_TRACE(fx[i], 'R'); - MEM_TRACE(fx[i], 'W'); - MEM_TRACE(fy[i], 'R'); - MEM_TRACE(fy[i], 'W'); - MEM_TRACE(fz[i], 'R'); - MEM_TRACE(fz[i], 'W'); } + + fx[i] += fix; + fy[i] += fiy; + fz[i] += fiz; } LIKWID_MARKER_STOP("force"); double E = getTimeStamp(); - INDEX_TRACER_END; - MEM_TRACER_END; return E-S; } diff --git a/src/main.c b/src/main.c index bff4422..703ca9f 100644 --- a/src/main.c +++ b/src/main.c @@ -42,7 +42,8 @@ #define HLINE "----------------------------------------------------------------------------\n" -extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int); +extern double computeForce(Parameter*, Atom*, Neighbor*); +extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int); void init(Parameter *param) { @@ -211,7 +212,11 @@ int main (int argc, char** argv) setup(¶m, &atom, &neighbor, &stats); computeThermo(0, ¶m, &atom); - computeForce(¶m, &atom, &neighbor, &stats, 1, 0); +#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(PRINT_STATS) + computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0); +#else + computeForce(¶m, &atom, &neighbor); +#endif timer[FORCE] = 0.0; timer[NEIGH] = 0.0; @@ -226,7 +231,11 @@ int main (int argc, char** argv) timer[NEIGH] += reneighbour(¶m, &atom, &neighbor); } - timer[FORCE] += computeForce(¶m, &atom, &neighbor, &stats, 0, n + 1); +#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(PRINT_STATS) + timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1); +#else + timer[FORCE] += computeForce(¶m, &atom, &neighbor); +#endif finalIntegrate(¶m, &atom); if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) { @@ -251,7 +260,9 @@ int main (int argc, char** argv) printf(HLINE); printf("Performance: %.2f million atom updates per second\n", 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); +#ifdef PRINT_STATS displayStatistics(&atom, ¶m, &stats, timer); +#endif LIKWID_MARKER_CLOSE; return EXIT_SUCCESS; }