3 Commits

Author SHA1 Message Date
Andropov Arsenii
055a009dbd Neighbor list preparation 2023-05-23 16:25:00 +02:00
Andropov Arsenii
182c065fe2 Neighbor list preparation 2023-05-09 00:44:37 +02:00
Andropov Arsenii
ee3f6de050 Building of super clusters complete, force computation kernel WIP 2023-04-11 02:55:30 +02:00
94 changed files with 4391 additions and 6810 deletions

23
.gitignore vendored
View File

@@ -51,17 +51,14 @@ Module.symvers
Mkfile.old
dkms.conf
# Logs
*.log
# TODO list
todo.txt
# Build directories and executables
#GCC-*/
#ICC-*/
#ICX-*/
#CLANG-*/
#NVCC-*/
build-*/
MDBench-*
GCC/
ICC/
ICX/
CLANG/
NVCC/
MDBench-GCC*
MDBench-ICC*
MDBench-ICX*
MDBench-CLANG*
MDBench-NVCC*

View File

@@ -1,7 +1,6 @@
#CONFIGURE BUILD SYSTEM
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
TARGET = MDBench-$(IDENTIFIER)
BUILD_DIR = ./build-$(IDENTIFIER)
TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
SRC_DIR = ./$(OPT_SCHEME)
ASM_DIR = ./asm
COMMON_DIR = ./common
@@ -17,9 +16,6 @@ include $(MAKE_DIR)/include_ISA.mk
include $(MAKE_DIR)/include_GROMACS.mk
INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes
ifeq ($(strip $(OPT_SCHEME)),gromacs)
DEFINES += -DGROMACS
endif
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
@@ -33,10 +29,6 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifeq ($(strip $(SORT_ATOMS)),true)
DEFINES += -DSORT_ATOMS
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@@ -105,6 +97,10 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true)
DEFINES += -DUSE_SIMD_KERNEL
endif
ifeq ($(strip $(USE_SUPER_CLUSTERS)),true)
DEFINES += -DUSE_SUPER_CLUSTERS
endif
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
@@ -159,13 +155,6 @@ $(BUILD_DIR)/%.o: %.s
clean:
$(info ===> CLEAN)
@rm -rf $(BUILD_DIR)
@rm -rf $(TARGET)*
@rm -f tags
cleanall:
$(info ===> CLEAN)
@rm -rf build-*
@rm -rf MDBench-*
@rm -f tags
distclean: clean

0
asm/.gitkeep Normal file
View File

View File

@@ -0,0 +1,626 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
# mark_description "ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
# parameter 5: %r8d
# parameter 6: %r9d
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#121.112
pushq %rbp #121.112
.cfi_def_cfa_offset 16
movq %rsp, %rbp #121.112
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #121.112
pushq %r12 #121.112
pushq %r13 #121.112
pushq %r14 #121.112
pushq %r15 #121.112
pushq %rbx #121.112
subq $88, %rsp #121.112
xorl %eax, %eax #124.16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
movq %rdx, %r15 #121.112
movq %rsi, %r12 #121.112
movq %rdi, %rbx #121.112
..___tag_value_computeForce.11:
# getTimeStamp()
call getTimeStamp #124.16
..___tag_value_computeForce.12:
# LOE rbx r12 r15 xmm0
..B1.51: # Preds ..B1.1
# Execution count [1.00e+00]
vmovsd %xmm0, 24(%rsp) #124.16[spill]
# LOE rbx r12 r15
..B1.2: # Preds ..B1.51
# Execution count [1.00e+00]
movl 4(%r12), %r13d #125.18
movq 64(%r12), %r9 #127.20
movq 72(%r12), %r14 #127.45
movq 80(%r12), %r8 #127.70
vmovsd 72(%rbx), %xmm2 #129.27
vmovsd 8(%rbx), %xmm1 #130.23
vmovsd (%rbx), %xmm0 #131.24
testl %r13d, %r13d #134.24
jle ..B1.43 # Prob 50% #134.24
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
xorl %ebx, %ebx #134.5
movl %r13d, %edx #134.5
xorl %ecx, %ecx #134.5
movl $1, %esi #134.5
xorl %eax, %eax #135.17
shrl $1, %edx #134.5
je ..B1.7 # Prob 9% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
movq %rax, (%rcx,%r9) #135.9
incq %rbx #134.5
movq %rax, (%rcx,%r14) #136.9
movq %rax, (%rcx,%r8) #137.9
movq %rax, 8(%rcx,%r9) #135.9
movq %rax, 8(%rcx,%r14) #136.9
movq %rax, 8(%rcx,%r8) #137.9
addq $16, %rcx #134.5
cmpq %rdx, %rbx #134.5
jb ..B1.5 # Prob 63% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
lea 1(%rbx,%rbx), %esi #135.9
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
lea -1(%rsi), %edx #134.5
cmpl %r13d, %edx #134.5
jae ..B1.9 # Prob 9% #134.5
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
movslq %esi, %rsi #134.5
movq %rax, -8(%r9,%rsi,8) #135.9
movq %rax, -8(%r14,%rsi,8) #136.9
movq %rax, -8(%r8,%rsi,8) #137.9
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [5.00e-01]
movl $.L_2__STRING.0, %edi #141.5
movq %r8, 32(%rsp) #141.5[spill]
movq %r9, 80(%rsp) #141.5[spill]
vmovsd %xmm2, (%rsp) #141.5[spill]
vmovsd %xmm1, 8(%rsp) #141.5[spill]
vmovsd %xmm0, 16(%rsp) #141.5[spill]
..___tag_value_computeForce.18:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.19:
# LOE r12 r14 r15 r13d
..B1.10: # Preds ..B1.9
# Execution count [9.00e-01]
vmovsd 16(%rsp), %xmm0 #[spill]
xorl %esi, %esi #143.15
vmovsd (%rsp), %xmm2 #[spill]
xorl %eax, %eax #143.5
vmulsd %xmm2, %xmm2, %xmm13 #129.45
xorl %edi, %edi #143.5
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
vmovsd 8(%rsp), %xmm1 #[spill]
vbroadcastsd %xmm13, %zmm14 #129.25
vbroadcastsd %xmm1, %zmm13 #130.21
vbroadcastsd %xmm0, %zmm9 #197.45
movslq %r13d, %r13 #143.5
movq 24(%r15), %r10 #145.25
movslq 16(%r15), %rdx #144.43
movq 8(%r15), %rcx #144.19
movq 32(%rsp), %r8 #[spill]
movq 16(%r12), %rbx #146.25
shlq $2, %rdx #126.5
movq %r13, 64(%rsp) #143.5[spill]
movq %r10, 72(%rsp) #143.5[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.41 ..B1.10
# Execution count [5.00e+00]
movq 72(%rsp), %r9 #145.25[spill]
vxorpd %xmm24, %xmm24, %xmm24 #149.22
vmovapd %xmm24, %xmm18 #150.22
movl (%r9,%rax,4), %r10d #145.25
vmovapd %xmm18, %xmm4 #151.22
vmovsd (%rdi,%rbx), %xmm10 #146.25
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
testl %r10d, %r10d #173.32
jle ..B1.41 # Prob 50% #173.32
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
vpxord %zmm8, %zmm8, %zmm8 #149.22
vmovaps %zmm8, %zmm7 #150.22
vmovaps %zmm7, %zmm11 #151.22
cmpl $8, %r10d #173.13
jl ..B1.48 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
cmpl $1200, %r10d #173.13
jl ..B1.47 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [4.50e+00]
movq %rdx, %r15 #144.43
imulq %rsi, %r15 #144.43
addq %rcx, %r15 #126.5
movq %r15, %r11 #173.13
andq $63, %r11 #173.13
testl $3, %r11d #173.13
je ..B1.16 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.14
# Execution count [2.25e+00]
xorl %r11d, %r11d #173.13
jmp ..B1.18 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.14
# Execution count [2.25e+00]
testl %r11d, %r11d #173.13
je ..B1.18 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.16
# Execution count [2.50e+01]
negl %r11d #173.13
addl $64, %r11d #173.13
shrl $2, %r11d #173.13
cmpl %r11d, %r10d #173.13
cmovl %r10d, %r11d #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
# Execution count [5.00e+00]
movl %r10d, %r13d #173.13
subl %r11d, %r13d #173.13
andl $7, %r13d #173.13
negl %r13d #173.13
addl %r10d, %r13d #173.13
cmpl $1, %r11d #173.13
jb ..B1.26 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.19: # Preds ..B1.18
# Execution count [4.50e+00]
vmovdqa %ymm15, %ymm4 #173.13
xorl %r12d, %r12d #173.13
vpbroadcastd %r11d, %ymm3 #173.13
vbroadcastsd %xmm10, %zmm2 #146.23
vbroadcastsd %xmm6, %zmm1 #147.23
vbroadcastsd %xmm12, %zmm0 #148.23
movslq %r11d, %r9 #173.13
movq %r8, 32(%rsp) #173.13[spill]
movq %r14, (%rsp) #173.13[spill]
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.20: # Preds ..B1.24 ..B1.19
# Execution count [2.50e+01]
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
kmovw %k3, %r14d #173.13
vpaddd %ymm17, %ymm17, %ymm18 #175.40
vpaddd %ymm18, %ymm17, %ymm17 #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.23: # Preds ..B1.20
# Execution count [1.25e+01]
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vpxord %zmm19, %zmm19, %zmm19 #175.40
vpxord %zmm20, %zmm20, %zmm20 #175.40
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.24: # Preds ..B1.23
# Execution count [2.50e+01]
addq $8, %r12 #173.13
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
#vrcp14pd %zmm25, %zmm24 #195.42
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
#vfpclasspd $30, %zmm24, %k0 #195.42
#kmovw %k2, %r8d #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm25, %zmm17 #195.42
#andl %r8d, %r14d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
#kmovw %r14d, %k3 #198.21
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
cmpq %r9, %r12 #173.13
jb ..B1.20 # Prob 82% #173.13
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.25: # Preds ..B1.24
# Execution count [4.50e+00]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
cmpl %r11d, %r10d #173.13
je ..B1.40 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
# Execution count [2.50e+01]
lea 8(%r11), %r9d #173.13
cmpl %r9d, %r13d #173.13
jl ..B1.34 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.26
# Execution count [4.50e+00]
movq %rdx, %r12 #144.43
imulq %rsi, %r12 #144.43
vbroadcastsd %xmm10, %zmm1 #146.23
vbroadcastsd %xmm6, %zmm0 #147.23
vbroadcastsd %xmm12, %zmm2 #148.23
movslq %r11d, %r9 #173.13
addq %rcx, %r12 #126.5
movq %rdi, 8(%rsp) #126.5[spill]
movq %rdx, 16(%rsp) #126.5[spill]
movq %rcx, 40(%rsp) #126.5[spill]
movq %rax, 48(%rsp) #126.5[spill]
movq %rsi, 56(%rsp) #126.5[spill]
movq %r8, 32(%rsp) #126.5[spill]
movq %r14, (%rsp) #126.5[spill]
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.28: # Preds ..B1.32 ..B1.27
# Execution count [2.50e+01]
vmovdqu (%r12,%r9,4), %ymm3 #174.25
vpaddd %ymm3, %ymm3, %ymm4 #175.40
vpaddd %ymm4, %ymm3, %ymm3 #175.40
movl (%r12,%r9,4), %r14d #174.25
movl 4(%r12,%r9,4), %r8d #174.25
movl 8(%r12,%r9,4), %edi #174.25
movl 12(%r12,%r9,4), %esi #174.25
lea (%r14,%r14,2), %r14d #175.40
movl 16(%r12,%r9,4), %ecx #174.25
lea (%r8,%r8,2), %r8d #175.40
movl 20(%r12,%r9,4), %edx #174.25
lea (%rdi,%rdi,2), %edi #175.40
movl 24(%r12,%r9,4), %eax #174.25
lea (%rsi,%rsi,2), %esi #175.40
movl 28(%r12,%r9,4), %r15d #174.25
lea (%rcx,%rcx,2), %ecx #175.40
lea (%rdx,%rdx,2), %edx #175.40
lea (%rax,%rax,2), %eax #175.40
lea (%r15,%r15,2), %r15d #175.40
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.31: # Preds ..B1.28
# Execution count [1.25e+01]
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
vpxord %zmm4, %zmm4, %zmm4 #175.40
vpxord %zmm17, %zmm17, %zmm17 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.32: # Preds ..B1.31
# Execution count [2.50e+01]
addl $8, %r11d #173.13
addq $8, %r9 #173.13
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
#vrcp14pd %zmm3, %zmm22 #195.42
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
#vfpclasspd $30, %zmm22, %k0 #195.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
#knotw %k0, %k1 #195.42
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
cmpl %r13d, %r11d #173.13
jb ..B1.28 # Prob 82% #173.13
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32
# Execution count [4.50e+00]
movq 8(%rsp), %rdi #[spill]
movq 16(%rsp), %rdx #[spill]
movq 40(%rsp), %rcx #[spill]
movq 48(%rsp), %rax #[spill]
movq 56(%rsp), %rsi #[spill]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
# Execution count [5.00e+00]
lea 1(%r13), %r9d #173.13
cmpl %r10d, %r9d #173.13
ja ..B1.40 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.35: # Preds ..B1.34
# Execution count [2.50e+01]
imulq %rdx, %rsi #144.43
vbroadcastsd %xmm10, %zmm4 #146.23
subl %r13d, %r10d #173.13
addq %rcx, %rsi #126.5
vpbroadcastd %r10d, %ymm0 #173.13
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
movslq %r13d, %r13 #173.13
kmovw %k3, %r9d #173.13
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
vpaddd %ymm1, %ymm1, %ymm2 #175.40
vpaddd %ymm2, %ymm1, %ymm0 #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.38: # Preds ..B1.35
# Execution count [1.25e+01]
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm1, %zmm1, %zmm1 #175.40
vpxord %zmm2, %zmm2, %zmm2 #175.40
vpxord %zmm3, %zmm3, %zmm3 #175.40
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.39: # Preds ..B1.38
# Execution count [2.50e+01]
#vbroadcastsd %xmm6, %zmm6 #147.23
#vbroadcastsd %xmm12, %zmm12 #148.23
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
#vrcp14pd %zmm19, %zmm18 #195.42
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
#vfpclasspd $30, %zmm18, %k0 #195.42
#kmovw %k2, %esi #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm19, %zmm0 #195.42
#andl %esi, %r9d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
#kmovw %r9d, %k3 #198.21
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
# Execution count [4.50e+00]
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
vpermd %zmm11, %zmm19, %zmm0 #151.22
vpermd %zmm7, %zmm19, %zmm6 #150.22
vpermd %zmm8, %zmm19, %zmm20 #149.22
vaddpd %zmm11, %zmm0, %zmm11 #151.22
vaddpd %zmm7, %zmm6, %zmm7 #150.22
vaddpd %zmm8, %zmm20, %zmm8 #149.22
vpermpd $78, %zmm11, %zmm1 #151.22
vpermpd $78, %zmm7, %zmm10 #150.22
vpermpd $78, %zmm8, %zmm21 #149.22
vaddpd %zmm1, %zmm11, %zmm2 #151.22
vaddpd %zmm10, %zmm7, %zmm12 #150.22
vaddpd %zmm21, %zmm8, %zmm22 #149.22
vpermpd $177, %zmm2, %zmm3 #151.22
vpermpd $177, %zmm12, %zmm17 #150.22
vpermpd $177, %zmm22, %zmm23 #149.22
vaddpd %zmm3, %zmm2, %zmm4 #151.22
vaddpd %zmm17, %zmm12, %zmm18 #150.22
vaddpd %zmm23, %zmm22, %zmm24 #149.22
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40 ..B1.11
# Execution count [5.00e+00]
movq 80(%rsp), %rsi #208.9[spill]
addq $24, %rdi #143.5
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
vmovsd %xmm0, (%rsi,%rax,8) #208.9
movslq %eax, %rsi #143.32
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
vmovsd %xmm1, (%r14,%rax,8) #209.9
incq %rsi #143.32
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
vmovsd %xmm2, (%r8,%rax,8) #210.9
incq %rax #143.5
cmpq 64(%rsp), %rax #143.5[spill]
jb ..B1.11 # Prob 82% #143.5
jmp ..B1.44 # Prob 100% #143.5
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.43: # Preds ..B1.2
# Execution count [5.00e-01]
movl $.L_2__STRING.0, %edi #141.5
..___tag_value_computeForce.48:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.49:
# LOE
..B1.44: # Preds ..B1.41 ..B1.43
# Execution count [1.00e+00]
movl $.L_2__STRING.0, %edi #219.5
vzeroupper #219.5
..___tag_value_computeForce.50:
# likwid_markerStopRegion(const char *)
call likwid_markerStopRegion #219.5
..___tag_value_computeForce.51:
# LOE
..B1.45: # Preds ..B1.44
# Execution count [1.00e+00]
xorl %eax, %eax #221.16
..___tag_value_computeForce.52:
# getTimeStamp()
call getTimeStamp #221.16
..___tag_value_computeForce.53:
# LOE xmm0
..B1.46: # Preds ..B1.45
# Execution count [1.00e+00]
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
addq $88, %rsp #224.14
.cfi_restore 3
popq %rbx #224.14
.cfi_restore 15
popq %r15 #224.14
.cfi_restore 14
popq %r14 #224.14
.cfi_restore 13
popq %r13 #224.14
.cfi_restore 12
popq %r12 #224.14
movq %rbp, %rsp #224.14
popq %rbp #224.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #224.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
# LOE
..B1.47: # Preds ..B1.13
# Execution count [4.50e-01]: Infreq
movl %r10d, %r13d #173.13
xorl %r11d, %r11d #173.13
andl $-8, %r13d #173.13
jmp ..B1.26 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.48: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
xorl %r13d, %r13d #173.13
jmp ..B1.34 # Prob 100% #173.13
.align 16,0x90
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
.section .rodata, "a"
.align 64
.align 64
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
.L_2__STRING.0:
.long 1668444006
.word 101
.type .L_2__STRING.0,@object
.size .L_2__STRING.0,6
.data
.section .note.GNU-stack, ""
# End

585
asm/unused/force-mem-only.s Normal file
View File

@@ -0,0 +1,585 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#103.87
pushq %rbp #103.87
.cfi_def_cfa_offset 16
movq %rsp, %rbp #103.87
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #103.87
pushq %r12 #103.87
pushq %r13 #103.87
pushq %r14 #103.87
subq $104, %rsp #103.87
xorl %eax, %eax #106.16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
movq %rdx, %r14 #103.87
movq %rsi, %r13 #103.87
movq %rdi, %r12 #103.87
..___tag_value_computeForce.9:
# getTimeStamp()
call getTimeStamp #106.16
..___tag_value_computeForce.10:
# LOE rbx r12 r13 r14 r15 xmm0
..B1.48: # Preds ..B1.1
# Execution count [1.00e+00]
vmovsd %xmm0, 16(%rsp) #106.16[spill]
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.48
# Execution count [1.00e+00]
movl 4(%r13), %ecx #107.18
movq 64(%r13), %r11 #109.20
movq 72(%r13), %r10 #109.45
movq 80(%r13), %r9 #109.70
vmovsd 72(%r12), %xmm2 #111.27
vmovsd 8(%r12), %xmm1 #112.23
vmovsd (%r12), %xmm0 #113.24
testl %ecx, %ecx #116.24
jle ..B1.42 # Prob 50% #116.24
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
xorl %edi, %edi #116.5
movl %ecx, %edx #116.5
xorl %esi, %esi #116.5
movl $1, %r8d #116.5
xorl %eax, %eax #117.17
shrl $1, %edx #116.5
je ..B1.7 # Prob 9% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
movq %rax, (%rsi,%r11) #117.9
incq %rdi #116.5
movq %rax, (%rsi,%r10) #118.9
movq %rax, (%rsi,%r9) #119.9
movq %rax, 8(%rsi,%r11) #117.9
movq %rax, 8(%rsi,%r10) #118.9
movq %rax, 8(%rsi,%r9) #119.9
addq $16, %rsi #116.5
cmpq %rdx, %rdi #116.5
jb ..B1.5 # Prob 63% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
lea 1(%rdi,%rdi), %r8d #117.9
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
lea -1(%r8), %edx #116.5
cmpl %ecx, %edx #116.5
jae ..B1.9 # Prob 9% #116.5
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
movslq %r8d, %r8 #116.5
movq %rax, -8(%r11,%r8,8) #117.9
movq %rax, -8(%r10,%r8,8) #118.9
movq %rax, -8(%r9,%r8,8) #119.9
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [9.00e-01]
vmulsd %xmm2, %xmm2, %xmm13 #111.45
xorl %edi, %edi #124.15
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
vbroadcastsd %xmm13, %zmm14 #111.25
vbroadcastsd %xmm1, %zmm13 #112.21
vbroadcastsd %xmm0, %zmm9 #177.45
movq 16(%r13), %rdx #127.25
xorl %r8d, %r8d #124.5
movslq %ecx, %r12 #124.5
xorl %eax, %eax #124.5
movq 24(%r14), %r13 #126.25
movslq 16(%r14), %rcx #125.43
movq 8(%r14), %rsi #125.19
shlq $2, %rcx #108.5
movq %r12, 80(%rsp) #124.5[spill]
movq %r13, 88(%rsp) #124.5[spill]
movq %r11, 96(%rsp) #124.5[spill]
movq %r15, 8(%rsp) #124.5[spill]
movq %rbx, (%rsp) #124.5[spill]
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.10: # Preds ..B1.40 ..B1.9
# Execution count [5.00e+00]
movq 88(%rsp), %rbx #126.25[spill]
vxorpd %xmm24, %xmm24, %xmm24 #130.22
vmovapd %xmm24, %xmm18 #131.22
movl (%rbx,%r8,4), %r11d #126.25
vmovapd %xmm18, %xmm4 #132.22
vmovsd (%rax,%rdx), %xmm10 #127.25
vmovsd 8(%rax,%rdx), %xmm6 #128.25
vmovsd 16(%rax,%rdx), %xmm12 #129.25
testl %r11d, %r11d #153.32
jle ..B1.40 # Prob 50% #153.32
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.10
# Execution count [4.50e+00]
vpxord %zmm8, %zmm8, %zmm8 #130.22
vmovaps %zmm8, %zmm7 #131.22
vmovaps %zmm7, %zmm11 #132.22
cmpl $8, %r11d #153.13
jl ..B1.45 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
cmpl $1200, %r11d #153.13
jl ..B1.44 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
movq %rcx, %r15 #125.43
imulq %rdi, %r15 #125.43
addq %rsi, %r15 #108.5
movq %r15, %r12 #153.13
andq $63, %r12 #153.13
testl $3, %r12d #153.13
je ..B1.15 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [2.25e+00]
xorl %r12d, %r12d #153.13
jmp ..B1.17 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.13
# Execution count [2.25e+00]
testl %r12d, %r12d #153.13
je ..B1.17 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.15
# Execution count [2.50e+01]
negl %r12d #153.13
addl $64, %r12d #153.13
shrl $2, %r12d #153.13
cmpl %r12d, %r11d #153.13
cmovl %r11d, %r12d #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
# Execution count [5.00e+00]
movl %r11d, %r14d #153.13
subl %r12d, %r14d #153.13
andl $7, %r14d #153.13
negl %r14d #153.13
addl %r11d, %r14d #153.13
cmpl $1, %r12d #153.13
jb ..B1.25 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.17
# Execution count [4.50e+00]
vmovdqa %ymm15, %ymm4 #153.13
xorl %r13d, %r13d #153.13
vpbroadcastd %r12d, %ymm3 #153.13
vbroadcastsd %xmm10, %zmm2 #127.23
vbroadcastsd %xmm6, %zmm1 #128.23
vbroadcastsd %xmm12, %zmm0 #129.23
movslq %r12d, %rbx #153.13
movq %r9, 24(%rsp) #153.13[spill]
movq %r10, 32(%rsp) #153.13[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.19: # Preds ..B1.23 ..B1.18
# Execution count [2.50e+01]
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
kmovw %k3, %r10d #153.13
vpaddd %ymm17, %ymm17, %ymm18 #155.40
vpaddd %ymm18, %ymm17, %ymm17 #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.22: # Preds ..B1.19
# Execution count [1.25e+01]
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vpxord %zmm19, %zmm19, %zmm19 #155.40
vpxord %zmm20, %zmm20, %zmm20 #155.40
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.23: # Preds ..B1.22
# Execution count [2.50e+01]
addq $8, %r13 #153.13
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
#vrcp14pd %zmm25, %zmm24 #175.42
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
#vfpclasspd $30, %zmm24, %k0 #175.42
#kmovw %k2, %r9d #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm25, %zmm17 #175.42
#andl %r9d, %r10d #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
#kmovw %r10d, %k3 #178.21
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
cmpq %rbx, %r13 #153.13
jb ..B1.19 # Prob 82% #153.13
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.24: # Preds ..B1.23
# Execution count [4.50e+00]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
cmpl %r12d, %r11d #153.13
je ..B1.39 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
# Execution count [2.50e+01]
lea 8(%r12), %ebx #153.13
cmpl %ebx, %r14d #153.13
jl ..B1.33 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25
# Execution count [4.50e+00]
movq %rcx, %r13 #125.43
imulq %rdi, %r13 #125.43
vbroadcastsd %xmm10, %zmm1 #127.23
vbroadcastsd %xmm6, %zmm0 #128.23
vbroadcastsd %xmm12, %zmm2 #129.23
movslq %r12d, %rbx #153.13
addq %rsi, %r13 #108.5
movq %rax, 40(%rsp) #108.5[spill]
movq %rcx, 48(%rsp) #108.5[spill]
movq %rsi, 56(%rsp) #108.5[spill]
movq %r8, 64(%rsp) #108.5[spill]
movq %rdi, 72(%rsp) #108.5[spill]
movq %r9, 24(%rsp) #108.5[spill]
movq %r10, 32(%rsp) #108.5[spill]
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.31 ..B1.26
# Execution count [2.50e+01]
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
vpaddd %ymm3, %ymm3, %ymm4 #155.40
vpaddd %ymm4, %ymm3, %ymm3 #155.40
movl (%r13,%rbx,4), %r10d #154.25
movl 4(%r13,%rbx,4), %r9d #154.25
movl 8(%r13,%rbx,4), %r8d #154.25
movl 12(%r13,%rbx,4), %edi #154.25
lea (%r10,%r10,2), %r10d #155.40
movl 16(%r13,%rbx,4), %esi #154.25
lea (%r9,%r9,2), %r9d #155.40
movl 20(%r13,%rbx,4), %ecx #154.25
lea (%r8,%r8,2), %r8d #155.40
movl 24(%r13,%rbx,4), %eax #154.25
lea (%rdi,%rdi,2), %edi #155.40
movl 28(%r13,%rbx,4), %r15d #154.25
lea (%rsi,%rsi,2), %esi #155.40
lea (%rcx,%rcx,2), %ecx #155.40
lea (%rax,%rax,2), %eax #155.40
lea (%r15,%r15,2), %r15d #155.40
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.30: # Preds ..B1.27
# Execution count [1.25e+01]
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
vpxord %zmm4, %zmm4, %zmm4 #155.40
vpxord %zmm17, %zmm17, %zmm17 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.31: # Preds ..B1.30
# Execution count [2.50e+01]
addl $8, %r12d #153.13
addq $8, %rbx #153.13
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
#vrcp14pd %zmm3, %zmm22 #175.42
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
#vfpclasspd $30, %zmm22, %k0 #175.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
#knotw %k0, %k1 #175.42
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
cmpl %r14d, %r12d #153.13
jb ..B1.27 # Prob 82% #153.13
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.32: # Preds ..B1.31
# Execution count [4.50e+00]
movq 40(%rsp), %rax #[spill]
movq 48(%rsp), %rcx #[spill]
movq 56(%rsp), %rsi #[spill]
movq 64(%rsp), %r8 #[spill]
movq 72(%rsp), %rdi #[spill]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
# Execution count [5.00e+00]
lea 1(%r14), %ebx #153.13
cmpl %r11d, %ebx #153.13
ja ..B1.39 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33
# Execution count [2.50e+01]
imulq %rcx, %rdi #125.43
vbroadcastsd %xmm10, %zmm4 #127.23
subl %r14d, %r11d #153.13
addq %rsi, %rdi #108.5
vpbroadcastd %r11d, %ymm0 #153.13
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
movslq %r14d, %r14 #153.13
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
kmovw %k3, %edi #153.13
vpaddd %ymm1, %ymm1, %ymm2 #155.40
vpaddd %ymm2, %ymm1, %ymm0 #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.37: # Preds ..B1.34
# Execution count [1.25e+01]
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm1, %zmm1, %zmm1 #155.40
vpxord %zmm2, %zmm2, %zmm2 #155.40
vpxord %zmm3, %zmm3, %zmm3 #155.40
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.38: # Preds ..B1.37
# Execution count [2.50e+01]
#vbroadcastsd %xmm6, %zmm6 #128.23
#vbroadcastsd %xmm12, %zmm12 #129.23
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
#vrcp14pd %zmm19, %zmm18 #175.42
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
#vfpclasspd $30, %zmm18, %k0 #175.42
#kmovw %k2, %ebx #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm19, %zmm0 #175.42
#andl %ebx, %edi #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
#kmovw %edi, %k3 #178.21
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
# Execution count [4.50e+00]
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
vpermd %zmm11, %zmm19, %zmm0 #132.22
vpermd %zmm7, %zmm19, %zmm6 #131.22
vpermd %zmm8, %zmm19, %zmm20 #130.22
vaddpd %zmm11, %zmm0, %zmm11 #132.22
vaddpd %zmm7, %zmm6, %zmm7 #131.22
vaddpd %zmm8, %zmm20, %zmm8 #130.22
vpermpd $78, %zmm11, %zmm1 #132.22
vpermpd $78, %zmm7, %zmm10 #131.22
vpermpd $78, %zmm8, %zmm21 #130.22
vaddpd %zmm1, %zmm11, %zmm2 #132.22
vaddpd %zmm10, %zmm7, %zmm12 #131.22
vaddpd %zmm21, %zmm8, %zmm22 #130.22
vpermpd $177, %zmm2, %zmm3 #132.22
vpermpd $177, %zmm12, %zmm17 #131.22
vpermpd $177, %zmm22, %zmm23 #130.22
vaddpd %zmm3, %zmm2, %zmm4 #132.22
vaddpd %zmm17, %zmm12, %zmm18 #131.22
vaddpd %zmm23, %zmm22, %zmm24 #130.22
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.40: # Preds ..B1.39 ..B1.10
# Execution count [5.00e+00]
movq 96(%rsp), %rbx #188.9[spill]
addq $24, %rax #124.5
movslq %r8d, %rdi #124.32
incq %rdi #124.32
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
#vmovsd %xmm1, (%r10,%r8,8) #189.9
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
#vmovsd %xmm2, (%r9,%r8,8) #190.9
incq %r8 #124.5
cmpq 80(%rsp), %r8 #124.5[spill]
jb ..B1.10 # Prob 82% #124.5
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40
# Execution count [9.00e-01]
movq 8(%rsp), %r15 #[spill]
.cfi_restore 15
movq (%rsp), %rbx #[spill]
.cfi_restore 3
# LOE rbx r15
..B1.42: # Preds ..B1.2 ..B1.41
# Execution count [1.00e+00]
xorl %eax, %eax #201.16
vzeroupper #201.16
..___tag_value_computeForce.43:
# getTimeStamp()
call getTimeStamp #201.16
..___tag_value_computeForce.44:
# LOE rbx r15 xmm0
..B1.43: # Preds ..B1.42
# Execution count [1.00e+00]
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
addq $104, %rsp #204.14
.cfi_restore 14
popq %r14 #204.14
.cfi_restore 13
popq %r13 #204.14
.cfi_restore 12
popq %r12 #204.14
movq %rbp, %rsp #204.14
popq %rbp #204.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #204.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE
..B1.44: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
movl %r11d, %r14d #153.13
xorl %r12d, %r12d #153.13
andl $-8, %r14d #153.13
jmp ..B1.25 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.45: # Preds ..B1.11
# Execution count [4.50e-01]: Infreq
xorl %r14d, %r14d #153.13
jmp ..B1.33 # Prob 100% #153.13
.align 16,0x90
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
.section .rodata, "a"
.align 64
.align 64
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.data
.section .note.GNU-stack, ""
# End

324
asm/unused/force.s Normal file
View File

@@ -0,0 +1,324 @@
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForce
computeForce:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push rbp
push r12
push r13
push r14
push r15
push rbx
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
test r9d, r9d # atom->Nlocal <= 0
jle ..atom_loop_exit
xor r10d, r10d # r10d <- 0
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8d <- 0
mov r11d, 1 # r11d <- 1
xor eax, eax # eax <- 0
shr ecx, 1 # ecx <- atom->Nlocal >> 1
je ..zero_last_element # ecx == 0
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
add r8, 16 # i++
inc r10 # i++
cmp r10, rcx # i < Nlocal
jb ..init_force_loop
# Trick to make r11d contain value of last element to be zeroed plus 1
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
..zero_last_element:
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
cmp ecx, r9d # i >= Nlocal
jae ..before_atom_loop
# Set last element to zero
movsxd r11, r11d # r11 <- i * 2
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
xor eax, eax
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
..atom_loop_begin:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
test r13d, r13d # numneighs <= 0
jle ..atom_loop_exit
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
xor r9d, r9d # r9d <- 0 (k)
mov r14d, r13d # r14d <- numneighs
cmp r14d, 8
jl ..compute_forces_remainder
..compute_forces:
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
sub r14d, 8
add r9, 8
cmp r14d, 8
jge ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
test r14d, r14d
jle ..sum_up_forces
vpbroadcastd ymm4, r14d
vpcmpgtd k1, ymm4, ymm17
kmovw r15d, k1
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
kmovw k2, k1
kmovw k3, k1
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
kmovw r9d, k5 # r9d <- rsq < cutforcesq
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11
vpermd zmm5, zmm10, zmm12
vpermd zmm21, zmm10, zmm13
vaddpd zmm11, zmm0, zmm11
vaddpd zmm12, zmm5, zmm12
vaddpd zmm13, zmm21, zmm13
vpermpd zmm1, zmm11, 78
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1
vaddpd zmm8, zmm12, zmm6
vaddpd zmm23, zmm13, zmm22
vpermpd zmm3, zmm2, 177
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3
vaddpd zmm20, zmm8, zmm9
vaddpd zmm25, zmm23, zmm24
..atom_loop_exit:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
### AOS
add rax, 24
###
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..atom_loop_begin
vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
pop rbx
pop r15
pop r14 #93.12
pop r13 #93.12
pop r12 #93.12
pop rbp #93.12
ret #93.12
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
.section .rodata, "a"
.align 64
.align 64
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

326
asm/unused/force_lj.s Normal file
View File

@@ -0,0 +1,326 @@
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForceLJ
computeForceLJ:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push rbp
push r12
push r13
push r14
push r15
push rbx
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
test r9d, r9d # atom->Nlocal <= 0
jle ..atom_loop_exit
xor r10d, r10d # r10d <- 0
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8d <- 0
mov r11d, 1 # r11d <- 1
xor eax, eax # eax <- 0
shr ecx, 1 # ecx <- atom->Nlocal >> 1
je ..zero_last_element # ecx == 0
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
add r8, 16 # i++
inc r10 # i++
cmp r10, rcx # i < Nlocal
jb ..init_force_loop
# Trick to make r11d contain value of last element to be zeroed plus 1
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
..zero_last_element:
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
cmp ecx, r9d # i >= Nlocal
jae ..before_atom_loop
# Set last element to zero
movsxd r11, r11d # r11 <- i * 2
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
xor eax, eax
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
#sub rsp, 64
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
#add rsp, 64
..atom_loop_begin:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
test r13d, r13d # numneighs <= 0
jle ..atom_loop_exit
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
xor r9d, r9d # r9d <- 0 (k)
mov r14d, r13d # r14d <- numneighs
cmp r14d, 8
jl ..compute_forces_remainder
..compute_forces:
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
sub r14d, 8
add r9, 8
cmp r14d, 8
jge ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
test r14d, r14d
jle ..sum_up_forces
vpbroadcastd ymm4, r14d
vpcmpgtd k1, ymm4, ymm17
kmovw r15d, k1
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
kmovw k2, k1
kmovw k3, k1
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
kmovw r9d, k5 # r9d <- rsq < cutforcesq
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11
vpermd zmm5, zmm10, zmm12
vpermd zmm21, zmm10, zmm13
vaddpd zmm11, zmm0, zmm11
vaddpd zmm12, zmm5, zmm12
vaddpd zmm13, zmm21, zmm13
vpermpd zmm1, zmm11, 78
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1
vaddpd zmm8, zmm12, zmm6
vaddpd zmm23, zmm13, zmm22
vpermpd zmm3, zmm2, 177
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3
vaddpd zmm20, zmm8, zmm9
vaddpd zmm25, zmm23, zmm24
..atom_loop_exit:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
### AOS
add rax, 24
###
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..atom_loop_begin
vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
pop rbx
pop r15
pop r14 #93.12
pop r13 #93.12
pop r12 #93.12
pop rbp #93.12
ret #93.12
.type computeForceLJ,@function
.size computeForceLJ,.-computeForceLJ
..LNcomputeForce.0:
.data
# -- End computeForceLJ
.section .rodata, "a"
.align 64
.align 64
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

View File

@@ -1,97 +0,0 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <parameter.h>
#include <util.h>
#include <box.h>
#include <mpi.h>
int overlapBox(int dim, int dir, const Box* mybox, const Box* other, Box* cut, MD_FLOAT xprd, MD_FLOAT cutneigh)
{
int pbc = -100;
MD_FLOAT min[3], max[3];
int same = (mybox->id == other->id) ? 1 : 0;
//projections
min[_x] = MAX(mybox->lo[_x], other->lo[_x]); max[_x] = MIN(mybox->hi[_x], other->hi[_x]);
min[_y] = MAX(mybox->lo[_y], other->lo[_y]); max[_y] = MIN(mybox->hi[_y], other->hi[_y]);
min[_z] = MAX(mybox->lo[_z], other->lo[_z]); max[_z] = MIN(mybox->hi[_z], other->hi[_z]);
//Intersection no periodic case
if(!same){
if (dir == 0) max[dim] = MIN(mybox->hi[dim], other->hi[dim]+ cutneigh);
if (dir == 1) min[dim] = MAX(mybox->lo[dim], other->lo[dim]- cutneigh);
if ((min[_x]<max[_x]) && (min[_y]<max[_y]) && (min[_z]<max[_z])) pbc = 0;
}
//Intersection periodic case
if(pbc < 0)
{
if(dir == 0){
min[dim] = MAX(mybox->lo[dim] , other->lo[dim]- xprd);
max[dim] = MIN(mybox->hi[dim] , other->hi[dim]- xprd + cutneigh);
} else {
min[dim] = MAX(mybox->lo[dim], other->lo[dim]+ xprd - cutneigh);
max[dim] = MIN(mybox->hi[dim], other->hi[dim]+ xprd);
}
if((min[_x]<max[_x]) && (min[_y]<max[_y]) && (min[_z]<max[_z]))
pbc = (dir == 0) ? 1:-1;
}
//storing the cuts
cut->lo[_x] = min[_x]; cut->hi[_x] = max[_x];
cut->lo[_y] = min[_y]; cut->hi[_y] = max[_y];
cut->lo[_z] = min[_z]; cut->hi[_z] = max[_z];
return pbc;
}
int overlapFullBox(Parameter* param, MD_FLOAT *cutneigh ,const Box* mybox, const Box* other)
{
MD_FLOAT min[3], max[3];
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int k = -1; k < 2; k++)
{
for(int j = -1; j < 2; j++)
{
for(int i= -1; i < 2; i++)
{
min[_x] = MAX(mybox->lo[_x], other->lo[_x]-cutneigh[_x] + i*xprd);
min[_y] = MAX(mybox->lo[_y], other->lo[_y]-cutneigh[_y] + j*yprd);
min[_z] = MAX(mybox->lo[_z], other->lo[_z]-cutneigh[_z] + k*zprd);
max[_x] = MIN(mybox->hi[_x], other->hi[_x]+cutneigh[_x] + i*xprd);
max[_y] = MIN(mybox->hi[_y], other->hi[_y]+cutneigh[_y] + j*yprd);
max[_z] = MIN(mybox->hi[_z], other->hi[_z]+cutneigh[_z] + k*zprd);
if ((min[_x]<max[_x]) && (min[_y]<max[_y]) && (min[_z]<max[_z]))
return 1;
}
}
}
return 0;
}
void expandBox(int iswap, const Box* me, const Box* other, Box* cut, MD_FLOAT cutneigh)
{
if(iswap==2 || iswap==3){
if(me->lo[_x] <= other->lo[_x]) cut->lo[_x] -= cutneigh;
if(me->hi[_x] >= other->hi[_x]) cut->hi[_x] += cutneigh;
}
if(iswap==4 || iswap==5){
if(me->lo[_x] <= other->lo[_x]) cut->lo[_x] -= cutneigh;
if(me->hi[_x] >= other->hi[_x]) cut->hi[_x] += cutneigh;
if(me->lo[_y] <= other->lo[_y]) cut->lo[_y] -= cutneigh;
if(me->hi[_y] >= other->hi[_y]) cut->hi[_y] += cutneigh;
}
}

View File

@@ -1,556 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <comm.h>
#include <allocate.h>
#include <mpi.h>
#include <util.h>
#define NEIGHMIN 6
#define BUFFACTOR 2
#define BUFMIN 1000
#define BUFEXTRA 100
#define world MPI_COMM_WORLD
MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
static inline void allocDynamicBuffers(Comm*);
static inline void freeDynamicBuffers(Comm*);
static inline void freeBuffers(Comm*);
void defineReverseList(Comm* comm){
int dim = 0;
int index = 0;
int me = comm->myproc;
//Set the inverse list
for(int iswap = 0; iswap<6; iswap++){
int dim = comm->swapdim[iswap];
int dir = comm->swapdir[iswap];
int invswap = comm->swap[dim][(dir+1)%2];
for(int ineigh = comm->sendfrom[invswap]; ineigh< comm->sendtill[invswap]; ineigh++)
comm->nrecv[index++] = comm->nsend[ineigh];
comm->recvfrom[iswap] = (iswap == 0) ? 0 : comm->recvtill[iswap-1];
comm->recvtill[iswap] = index;
}
//set if myproc is unique in the swap
for(int iswap = 0; iswap<6; iswap++){
int sizeswap = comm->sendtill[iswap]-comm->sendfrom[iswap];
int index = comm->sendfrom[iswap];
int myneigh = comm->nsend[index];
comm->othersend[iswap] = (sizeswap != 1 || comm->myproc != myneigh) ? 1 : 0;
}
}
void addNeighToExchangeList(Comm* comm, int newneigh){
int numneigh = comm->numneighexch;
if(comm->numneighexch>=comm->maxneighexch){
size_t oldByteSize = comm->maxneighexch*sizeof(int);
comm->maxneighexch *=2;
comm->nexch = (int*) reallocate(comm->nexch, ALIGNMENT, comm->maxneighexch * sizeof(int), oldByteSize);
}
// Add the new element to the list
comm->nexch[numneigh] = newneigh;
comm->numneighexch++;
}
//Exported functions
void neighComm(Comm *comm, Parameter* param, Grid *grid)
{
int me = comm->myproc;
int numproc = comm ->numproc;
int PAD = 6; //number of elements for processor in the map
int ineigh = 0;
int sneigh = 0;
MD_FLOAT *map = grid->map;
MD_FLOAT cutneigh = param->cutneigh;
MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
Box mybox, other, cut;
//needed for rebalancing
freeDynamicBuffers(comm);
//Local box
mybox.id = me;
mybox.lo[_x] = map[me*PAD+0]; mybox.hi[_x] = map[me*PAD+3];
mybox.lo[_y] = map[me*PAD+1]; mybox.hi[_y] = map[me*PAD+4];
mybox.lo[_z] = map[me*PAD+2]; mybox.hi[_z] = map[me*PAD+5];
//Check for all possible neighbours only for exchange atoms
comm->numneighexch = 0;
for(int proc = 0; proc <numproc; proc++){
other.id = proc;
other.lo[_x] = map[proc*PAD+0]; other.hi[_x] = map[proc*PAD+3];
other.lo[_y] = map[proc*PAD+1]; other.hi[_y] = map[proc*PAD+4];
other.lo[_z] = map[proc*PAD+2]; other.hi[_z] = map[proc*PAD+5];
if(proc != me){
int intersection = overlapFullBox(param,grid->cutneigh,&mybox,&other);
if(intersection) addNeighToExchangeList(comm,proc);
}
}
//MAP is stored as follows: xlo,ylo,zlo,xhi,yhi,zhi
for(int iswap = 0; iswap <6; iswap++)
{
int dir = comm->swapdir[iswap];
int dim = comm->swapdim[iswap];
for(int proc = 0; proc < numproc; proc++)
{
//Check for neighbours along dimmensions, for forwardComm, backwardComm and ghostComm
other.id = proc;
other.lo[_x] = map[proc*PAD+0]; other.hi[_x] = map[proc*PAD+3];
other.lo[_y] = map[proc*PAD+1]; other.hi[_y] = map[proc*PAD+4];
other.lo[_z] = map[proc*PAD+2]; other.hi[_z] = map[proc*PAD+5];
//return if two boxes intersect: -100 not intersection, 0, 1 and -1 intersection for each different pbc.
int pbc = overlapBox(dim,dir,&mybox,&other,&cut,prd[dim],cutneigh);
if(pbc == -100) continue;
expandBox(iswap, &mybox, &other, &cut, cutneigh);
if(ineigh >= comm->maxneigh) {
size_t oldByteSize = comm->maxneigh*sizeof(int);
size_t oldBoxSize = comm->maxneigh*sizeof(Box);
comm->maxneigh = 2*ineigh;
comm->nsend = (int*) reallocate(comm->nsend, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
comm->nrecv = (int*) reallocate(comm->nrecv, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
comm->pbc_x = (int*) reallocate(comm->pbc_x, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
comm->pbc_y = (int*) reallocate(comm->pbc_y, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
comm->pbc_z = (int*) reallocate(comm->pbc_z, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
comm->boxes = (Box*) reallocate(comm->boxes, ALIGNMENT, comm->maxneigh * sizeof(Box), oldBoxSize);
}
comm->boxes[ineigh] = cut;
comm->nsend[ineigh] = proc;
comm->pbc_x[ineigh] = (dim == _x) ? pbc : 0;
comm->pbc_y[ineigh] = (dim == _y) ? pbc : 0;
comm->pbc_z[ineigh] = (dim == _z) ? pbc : 0;
ineigh++;
}
comm->sendfrom[iswap] = (iswap == 0) ? 0:comm->sendtill[iswap-1];
comm->sendtill[iswap] = ineigh;
comm->numneigh = ineigh;
}
allocDynamicBuffers(comm);
defineReverseList(comm);
}
void initComm(int* argc, char*** argv, Comm* comm)
{
//MPI Initialize
MPI_Init(argc, argv);
MPI_Comm_size(MPI_COMM_WORLD, &(comm->numproc));
MPI_Comm_rank(MPI_COMM_WORLD, &(comm->myproc));
comm->numneigh = 0;
comm->numneighexch = 0;
comm->nrecv=NULL;
comm->nsend=NULL;
comm->nexch=NULL;
comm->pbc_x=NULL;
comm->pbc_y=NULL;
comm->pbc_z=NULL;
comm->boxes=NULL;
comm->atom_send=NULL;
comm->atom_recv=NULL;
comm->off_atom_send=NULL;
comm->off_atom_recv=NULL;
comm->maxsendlist=NULL;
comm->sendlist=NULL;
comm->buf_send=NULL;
comm->buf_recv=NULL;
}
void endComm(Comm* comm)
{
comm->maxneigh = 0;
comm->maxneighexch =0;
comm->maxsend = 0;
comm->maxrecv = 0;
freeBuffers(comm);
MPI_Finalize();
}
void setupComm(Comm* comm, Parameter* param, Grid* grid){
comm->swap[_x][0] = 0; comm->swap[_x][1] =1;
comm->swap[_y][0] = 2; comm->swap[_y][1] =3;
comm->swap[_z][0] = 4; comm->swap[_z][1] =5;
comm->swapdim[0] = comm->swapdim[1] = _x;
comm->swapdim[2] = comm->swapdim[3] = _y;
comm->swapdim[4] = comm->swapdim[5] = _z;
comm->swapdir[0] = comm->swapdir[2] = comm->swapdir[4] = 0;
comm->swapdir[1] = comm->swapdir[3] = comm->swapdir[5] = 1;
for(int i = 0; i<6; i++){
comm->sendfrom[i] = 0;
comm->sendtill[i] = 0;
comm->recvfrom[i] = 0;
comm->recvtill[i] = 0;
}
comm->forwardSize = FORWARD_SIZE; //send coordiantes x,y,z
comm->reverseSize = REVERSE_SIZE; //return forces fx, fy, fz
comm->ghostSize = GHOST_SIZE; //send x,y,z,type;
comm->exchangeSize = EXCHANGE_SIZE; //send x,y,z,vx,vy,vz,type
//Allocate memory for recv buffer and recv buffer
comm->maxsend = BUFMIN;
comm->maxrecv = BUFMIN;
comm->buf_send = (MD_FLOAT*) allocate(ALIGNMENT,(comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT));
comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
comm->maxneighexch = NEIGHMIN;
comm->nexch = (int*) allocate(ALIGNMENT, comm->maxneighexch * sizeof(int));
comm->maxneigh = NEIGHMIN;
comm->nsend = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
comm->nrecv = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
comm->pbc_x = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
comm->pbc_y = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
comm->pbc_z = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
comm->boxes = (Box*) allocate(ALIGNMENT, comm->maxneigh * sizeof(Box));
neighComm(comm, param, grid);
}
void forwardComm(Comm* comm, Atom* atom, int iswap)
{
int nrqst=0, offset=0, nsend=0, nrecv=0;
int pbc[3];
int size = comm->forwardSize;
int maxrqst = comm->numneigh;
MD_FLOAT* buf;
MPI_Request requests[maxrqst];
for(int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++){
offset = comm->off_atom_send[ineigh];
pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh]; pbc[_z]=comm->pbc_z[ineigh];
packForward(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &comm->buf_send[offset*size],pbc);
}
//Receives elements
if(comm->othersend[iswap])
for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
offset = comm->off_atom_recv[ineigh]*size;
nrecv = comm->atom_recv[ineigh]*size;
MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
}
//Send elements
if(comm->othersend[iswap])
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
offset = comm->off_atom_send[ineigh]*size;
nsend = comm->atom_send[ineigh]*size;
MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world);
}
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
if(comm->othersend[iswap]) buf = comm->buf_recv;
else buf = comm->buf_send;
/* unpack buffer */
for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
offset = comm->off_atom_recv[ineigh];
unpackForward(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &buf[offset*size]);
}
}
void reverseComm(Comm* comm, Atom* atom, int iswap)
{
int nrqst=0, offset=0, nsend=0, nrecv=0 ;
int size = comm->reverseSize;
int maxrqst = comm->numneigh;
MD_FLOAT* buf;
MPI_Request requests[maxrqst];
for(int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++){
offset = comm->off_atom_recv[ineigh];
packReverse(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &comm->buf_send[offset*size]);
}
//Receives elements
if(comm->othersend[iswap])
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
offset = comm->off_atom_send[ineigh]*size;
nrecv = comm->atom_send[ineigh]*size;
MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nsend[ineigh],0,world,&requests[nrqst++]);
}
//Send elements
if(comm->othersend[iswap])
for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
offset = comm->off_atom_recv[ineigh]*size;
nsend = comm->atom_recv[ineigh]*size;
MPI_Send(&comm->buf_send[offset],nsend,type,comm->nrecv[ineigh],0,world);
}
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
if(comm->othersend[iswap]) buf = comm->buf_recv;
else buf = comm->buf_send;
/* unpack buffer */
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
offset = comm->off_atom_send[ineigh];
unpackReverse(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &buf[offset*size]);
}
}
void ghostComm(Comm* comm, Atom* atom,int iswap){
MD_FLOAT xlo=0, xhi=0, ylo=0, yhi=0, zlo=0, zhi=0;
MD_FLOAT* buf;
int nrqst=0, nsend=0, nrecv=0, offset=0, ineigh=0, pbc[3];
int all_recv=0, all_send=0, currentSend=0;
int size = comm->ghostSize;
int maxrqrst = comm->numneigh;
MPI_Request requests[maxrqrst];
for(int i = 0; i<maxrqrst; i++)
requests[maxrqrst]=MPI_REQUEST_NULL;
if(iswap%2==0) comm->iterAtom = LOCAL+GHOST;
int iter = 0;
for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
{
Box* tile = &comm->boxes[ineigh];
xlo = tile->lo[_x]; ylo = tile->lo[_y]; zlo = tile->lo[_z];
xhi = tile->hi[_x]; yhi = tile->hi[_y]; zhi = tile->hi[_z];
pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh]; pbc[_z]=comm->pbc_z[ineigh];
nsend = 0;
for(int i = 0; i < comm->iterAtom ; i++)
{
if(IsinRegionToSend(i)){
if(nsend >= comm->maxsendlist[ineigh]) growList(comm,ineigh,nsend);
if(currentSend + size >= comm->maxsend) growSend(comm,currentSend);
comm->sendlist[ineigh][nsend++] = i;
currentSend += packGhost(atom, i, &comm->buf_send[currentSend], pbc);
}
}
comm->atom_send[ineigh] = nsend; //#atoms send per neigh
comm->off_atom_send[ineigh] = all_send; //offset atom respect to neighbours in a swap
all_send += nsend; //all atoms send
}
//Receives how many elements to be received.
if(comm->othersend[iswap])
for(nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++)
MPI_Irecv(&comm->atom_recv[ineigh],1,MPI_INT,comm->nrecv[ineigh],0,world,&requests[nrqst++]);
if(!comm->othersend[iswap]) comm->atom_recv[comm->recvfrom[iswap]] = nsend;
//Communicate how many elements to be sent.
if(comm->othersend[iswap])
for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
MPI_Send(&comm->atom_send[ineigh],1,MPI_INT,comm->nsend[ineigh],0,world);
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
//Define offset to store in the recv_buff
for(int ineigh = comm->recvfrom[iswap]; ineigh<comm->recvtill[iswap]; ineigh++){
comm->off_atom_recv[ineigh] = all_recv;
all_recv += comm->atom_recv[ineigh];
}
if(all_recv*size>=comm->maxrecv) growRecv(comm,all_recv*size);
//Receives elements
if(comm->othersend[iswap])
for (nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
offset = comm->off_atom_recv[ineigh]*size;
nrecv = comm->atom_recv[ineigh]*size;
MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
}
//Send elements
if(comm->othersend[iswap])
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
offset = comm->off_atom_send[ineigh]*size;
nsend = comm->atom_send[ineigh]*size;
MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world);
}
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
if(comm->othersend[iswap]) buf = comm->buf_recv;
else buf = comm->buf_send;
//unpack elements
comm->firstrecv[iswap] = LOCAL+GHOST;
for(int i = 0; i < all_recv; i++)
unpackGhost(atom, LOCAL+GHOST, &buf[i*size]);
//Increases the buffer if needed
int max_size = MAX(comm->forwardSize,comm->reverseSize);
int max_buf = max_size * MAX(all_recv, all_send);
if(max_buf>=comm->maxrecv) growRecv(comm,max_buf);
if(max_buf>=comm->maxsend) growSend(comm,max_buf);
}
void exchangeComm(Comm* comm, Atom* atom){
MD_FLOAT x,y,z;
MD_FLOAT *lo = atom->mybox.lo;
MD_FLOAT *hi = atom->mybox.hi;
int size = comm->exchangeSize;
int numneigh = comm->numneighexch;
int offset_recv[numneigh];
int size_recv[numneigh];
MPI_Request requests[numneigh];
int i =0, nsend = 0, nrecv = 0;
int nrqst = 0;
int nlocal, offset,m;
/* enforce PBC */
pbc(atom);
if(comm->numneigh == 0) return;
nlocal = atom->Nlocal;
while(i < nlocal) {
if(atom_x(i) < lo[_x] || atom_x(i) >= hi[_x] ||
atom_y(i) < lo[_y] || atom_y(i) >= hi[_y] ||
atom_z(i) < lo[_z] || atom_z(i) >= hi[_z]) {
if(nsend+size >= comm->maxsend) growSend(comm, nsend);
nsend += packExchange(atom, i, &comm->buf_send[nsend]);
copy(atom, i, nlocal-1);
nlocal--;
} else i++;
}
atom->Nlocal = nlocal;
/* send/recv number of to share atoms with neighbouring procs*/
for(int ineigh = 0; ineigh < numneigh; ineigh++)
MPI_Irecv(&size_recv[ineigh],1,MPI_INT,comm->nexch[ineigh],0,world,&requests[nrqst++]);
for (int ineigh = 0; ineigh < numneigh; ineigh++)
MPI_Send(&nsend,1,MPI_INT,comm->nexch[ineigh],0,world);
MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
//Define offset to store in the recv_buff
for(int ineigh = 0; ineigh<numneigh; ineigh++){
offset_recv[ineigh] = nrecv;
nrecv += size_recv[ineigh];
}
if(nrecv >= comm->maxrecv) growRecv(comm,nrecv);
//Receives elements
nrqst=0;
for (int ineigh = 0; ineigh< numneigh; ineigh++){
offset = offset_recv[ineigh];
MPI_Irecv(&comm->buf_recv[offset], size_recv[ineigh], type, comm->nexch[ineigh],0,world,&requests[nrqst++]);
}
//Send elements
for (int ineigh = 0; ineigh< numneigh; ineigh++)
MPI_Send(comm->buf_send,nsend,type,comm->nexch[ineigh],0,world);
MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
nlocal = atom->Nlocal;
m = 0;
while(m < nrecv) {
x = comm->buf_recv[m + _x];
y = comm->buf_recv[m + _y];
z = comm->buf_recv[m + _z];
if(x >= lo[_x] && x < hi[_x] &&
y >= lo[_y] && y < hi[_y] &&
z >= lo[_z] && z < hi[_z]){
m += unpackExchange(atom, nlocal++, &comm->buf_recv[m]);
} else {
m += size;
}
}
atom->Nlocal = nlocal;
int all_atoms=0;
MPI_Allreduce(&atom->Nlocal, &all_atoms, 1, MPI_INT, MPI_SUM, world);
if(atom->Natoms!=all_atoms && comm->myproc ==0){
printf("Losing atoms! current atoms:%d expected atoms:%d\n",all_atoms,atom->Natoms);
}
}
//Internal functions
inline void growRecv(Comm* comm, int n)
{
comm -> maxrecv = BUFFACTOR * n;
if(comm->buf_recv) free(comm -> buf_recv);
comm -> buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
}
inline void growSend(Comm* comm, int n)
{
size_t oldByteSize = (comm->maxsend+BUFEXTRA)*sizeof(MD_FLOAT);
comm -> maxsend = BUFFACTOR * n;
comm -> buf_send = (MD_FLOAT*) reallocate(comm->buf_send, ALIGNMENT, (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT), oldByteSize);
}
inline void growList(Comm* comm, int ineigh, int n)
{
size_t oldByteSize = comm->maxsendlist[ineigh]*sizeof(int);
comm->maxsendlist[ineigh] = BUFFACTOR * n;
comm->sendlist[ineigh] = (int*) reallocate(comm->sendlist[ineigh],ALIGNMENT, comm->maxsendlist[ineigh] * sizeof(int), oldByteSize);
}
static inline void allocDynamicBuffers(Comm* comm)
{
//Buffers depending on the # of my neighs
int numneigh = comm->numneigh;
comm->atom_send = (int*) allocate(ALIGNMENT, numneigh * sizeof(int));
comm->atom_recv = (int*) allocate(ALIGNMENT, numneigh * sizeof(int));
comm->off_atom_send = (int*) allocate(ALIGNMENT,numneigh * sizeof(int));
comm->off_atom_recv = (int*) allocate(ALIGNMENT,numneigh * sizeof(int));
comm->maxsendlist = (int*) allocate(ALIGNMENT,numneigh * sizeof(int));
for(int i = 0; i < numneigh; i++)
comm->maxsendlist[i] = BUFMIN;
comm->sendlist = (int**) allocate(ALIGNMENT, numneigh * sizeof(int*));
for(int i = 0; i < numneigh; i++)
comm->sendlist[i] = (int*) allocate(ALIGNMENT, comm->maxsendlist[i] * sizeof(int));
}
static inline void freeDynamicBuffers(Comm* comm)
{
int numneigh =comm->numneigh;
if(comm->atom_send) free(comm->atom_send);
if(comm->atom_recv) free(comm->atom_recv);
if(comm->off_atom_send) free(comm->off_atom_send);
if(comm->off_atom_recv) free(comm->off_atom_recv);
if(comm->maxsendlist) free(comm->maxsendlist);
if(comm->sendlist){
for(int i = 0; i < numneigh; i++)
if(comm->sendlist[i]) free(comm->sendlist[i]);
}
if(comm->sendlist) free(comm->sendlist);
}
static inline void freeBuffers(Comm* comm)
{
if(comm->nrecv) free(comm->nrecv);
if(comm->nsend) free(comm->nsend);
if(comm->nexch) free(comm->nexch);
if(comm->pbc_x) free(comm->pbc_x);
if(comm->pbc_y) free(comm->pbc_y);
if(comm->pbc_z) free(comm->pbc_z);
if(comm->boxes) free(comm->boxes);
if(comm->atom_send) free(comm->atom_send);
if(comm->atom_recv) free(comm->atom_recv);
if(comm->off_atom_send) free(comm->off_atom_send);
if(comm->off_atom_recv) free(comm->off_atom_recv);
if(comm->maxsendlist) free(comm->maxsendlist);
if(comm->sendlist){
for(int i = 0; i < comm->numneigh; i++)
if(comm->sendlist[i]) free(comm->sendlist[i]);
}
if(comm->sendlist) free(comm->sendlist);
if(comm->buf_send) free(comm->buf_send);
if(comm->buf_recv) free(comm->buf_recv);
}

View File

@@ -1,490 +0,0 @@
#include <stdio.h>
#include <grid.h>
#include <mpi.h>
#include <parameter.h>
#include <allocate.h>
#include <util.h>
#include <math.h>
static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
//Grommacs Balancing
MD_FLOAT f_normalization(MD_FLOAT* x,MD_FLOAT* fx, MD_FLOAT minx, int nprocs) {
MD_FLOAT sum=0;
for(int n = 0; n<nprocs; n++){
fx[n] = MAX(minx,x[n]);
sum+=fx[n];
}
for(int n = 0; n<nprocs; n++)
fx[n] /= sum;
}
void fixedPointIteration(MD_FLOAT* x0, int nprocs, MD_FLOAT minx)
{
MD_FLOAT tolerance = 1e-3;
MD_FLOAT alpha = 0.5;
MD_FLOAT *fx = (MD_FLOAT*) malloc(nprocs*sizeof(MD_FLOAT));
int maxIterations = 100;
for (int i = 0; i < maxIterations; i++) {
int converged = 1;
f_normalization(x0,fx,minx,nprocs);
for(int n=0; n<nprocs; n++)
fx[n]= (1-alpha) * x0[n] + alpha * fx[n];
for (int n=0; n<nprocs; n++) {
if (fabs(fx[n] - x0[n]) >= tolerance) {
converged = 0;
break;
}
}
for (int n=0; n<nprocs; n++)
x0[n] = fx[n];
if(converged){
for(int n = 0; n<nprocs; n++)
return;
}
}
}
void staggeredBalance(Grid* grid, Atom* atom, Parameter* param, double newTime)
{
int me;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
int *coord = grid->coord;
int *nprocs = grid ->nprocs;
//Elapsed time since the last rebalance
double time = newTime - grid->Timer;
grid->Timer = newTime;
//store the older dimm to compare later for exchange
MD_FLOAT lo[3], hi[3];
for(int dim = 0; dim< 3; dim++){
lo[dim] = atom->mybox.lo[dim];
hi[dim] = atom->mybox.hi[dim];
}
//Define parameters
MPI_Comm subComm[3];
int color[3] = {0,0,0};
int id[3] = {0,0,0};
MD_FLOAT ** load = (MD_FLOAT**) malloc(3*sizeof(MD_FLOAT*));
for(int dim = 0; dim<3; dim++)
load[dim] = (MD_FLOAT*) malloc(nprocs[dim]*sizeof(MD_FLOAT));
int maxprocs = MAX(MAX(nprocs[_x],nprocs[_y]),nprocs[_z]);
MD_FLOAT* cellSize = (MD_FLOAT*) malloc(maxprocs*sizeof(MD_FLOAT));
MD_FLOAT* limits = (MD_FLOAT*) malloc(2*maxprocs*sizeof(MD_FLOAT)); //limits: (x0, x1), (x1, x2)... Repeat values in between to perfom MPI_Scatter later
MD_FLOAT t_sum[3] = {0,0,0};
MD_FLOAT recv_buf[2] = {0,0}; //Each proc only receives 2 elments per dimension xlo and xhi
MD_FLOAT balancedLoad[3] = {0,0,0}; //1/nprocs
MD_FLOAT minLoad[3] = {0,0,0}; //beta*(1/nprocs)
MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
MD_FLOAT boundaries[6] ={0,0,0,0,0,0}; // xlo,xhi,ylo,yhi,zlo,zhi
//Create sub-communications along each dimension
for(int dim = 0; dim<3; dim++){
if(dim == _x){
color[_x] = (coord[_y] == 0 && coord[_z] ==0) ? 1:MPI_UNDEFINED;
id[_x] = me;
} else if(dim == _y) {
color[_y] = coord[_z] == 0 ? coord[_x]:MPI_UNDEFINED;
id[_y] = (coord[_y] == 0 && coord[_z] == 0) ? 0:me;
} else {
color[_z]= coord[_y]*nprocs[_x]+coord[_x];
id[_z] = coord[_z] == 0 ? 0 : me;
}
MPI_Comm_split(world, color[dim], id[dim], &subComm[dim]);
}
//Set the minimum load and the balance load
for(int dim = 0; dim<3; dim++){
balancedLoad[dim] = 1./nprocs[dim];
minLoad[dim] = 0.8*balancedLoad[dim];
}
//set and communicate the workload in reverse order
for(int dim = _z; dim>= _x; dim--)
{
if(subComm[dim] != MPI_COMM_NULL){
MPI_Gather(&time,1,type,load[dim],1,type,0,subComm[dim]);
if(id[dim] == 0)
{
for(int n=0; n<nprocs[dim]; n++)
t_sum[dim] += load[dim][n];
for(int n=0; n<nprocs[dim]; n++)
load[dim][n] /= t_sum[dim];
}
time =t_sum[dim];
}
MPI_Barrier(world);
}
//Brodacast the new boundaries along dimensions
for(int dim=0; dim<3; dim++){
if(subComm[dim] != MPI_COMM_NULL){
MPI_Bcast(boundaries,6,type,0,subComm[dim]);
if(id[dim] == 0) {
fixedPointIteration(load[dim], nprocs[dim], minLoad[dim]);
MD_FLOAT inv_sum=0;
for(int n=0; n<nprocs[dim];n++)
inv_sum +=(1/load[dim][n]);
for(int n=0; n<nprocs[dim];n++)
cellSize[n] = (prd[dim]/load[dim][n])*(1./inv_sum);
MD_FLOAT sum=0;
for(int n=0; n<nprocs[dim]; n++){
limits[2*n] = sum;
limits[2*n+1] = sum+cellSize[n];
sum+= cellSize[n];
}
limits[2*nprocs[dim]-1] = prd[dim];
}
MPI_Scatter(limits,2,type,recv_buf,2,type,0,subComm[dim]);
boundaries[2*dim] = recv_buf[0];
boundaries[2*dim+1] = recv_buf[1];
}
MPI_Barrier(world);
}
atom->mybox.lo[_x]=boundaries[0]; atom->mybox.hi[_x]=boundaries[1];
atom->mybox.lo[_y]=boundaries[2]; atom->mybox.hi[_y]=boundaries[3];
atom->mybox.lo[_z]=boundaries[4]; atom->mybox.hi[_z]=boundaries[5];
MD_FLOAT domain[6] = {boundaries[0], boundaries[2], boundaries[4], boundaries[1], boundaries[3], boundaries[5]};
MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
//because cells change dynamically, It is required to increase the neighbouring exchange region
for(int dim =_x; dim<=_z; dim++){
MD_FLOAT dr,dr_max;
int n = grid->nprocs[dim];
MD_FLOAT maxdelta = 0.2*prd[dim];
dr = MAX(fabs(lo[dim] - atom->mybox.lo[dim]),fabs(hi[dim] - atom->mybox.hi[dim]));
MPI_Allreduce(&dr, &dr_max, 1, type, MPI_MAX, world);
grid->cutneigh[dim] = param->cutneigh+dr_max;
}
for(int dim=0; dim<3; dim++) {
if(subComm[dim] != MPI_COMM_NULL){
MPI_Comm_free(&subComm[dim]);
}
free(load[dim]);
}
free(load);
free(limits);
}
//RCB Balancing
MD_FLOAT meanTimeBisect(Atom *atom, MPI_Comm subComm, int dim, double time)
{
MD_FLOAT mean=0, sum=0, total_sum=0, weightAtoms= 0, total_weight=0;
for(int i=0; i<atom->Nlocal; i++){
sum += atom_pos(i);
}
sum*=time;
weightAtoms = atom->Nlocal*time;
MPI_Allreduce(&sum, &total_sum, 1, type, MPI_SUM, subComm);
MPI_Allreduce(&weightAtoms, &total_weight, 1, type, MPI_SUM, subComm);
mean = total_sum/total_weight;
return mean;
}
MD_FLOAT meanBisect(Atom* atom, MPI_Comm subComm, int dim, double time)
{
int Natoms = 0;
MD_FLOAT sum=0, mean=0, total_sum=0;
for(int i=0; i<atom->Nlocal; i++){
sum += atom_pos(i);
}
MPI_Allreduce(&sum, &total_sum, 1, type, MPI_SUM, subComm);
MPI_Allreduce(&atom->Nlocal, &Natoms, 1, MPI_INT, MPI_SUM, subComm);
mean = total_sum/Natoms;
return mean;
}
void nextBisectionLevel(Grid* grid, Atom* atom, RCB_Method method, MPI_Comm subComm, int dim ,int* color, int ilevel, double time)
{
int rank, size;
int branch = 0, i = 0, m = 0;
int nsend = 0, nrecv = 0, nrecv2 = 0;
int values_per_atom = 7;
MD_FLOAT bisection, pos;
MPI_Request request[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
MPI_Comm_rank(subComm,&rank);
MPI_Comm_size(subComm,&size);
int odd = size%2;
int extraProc = odd ? size-1:size;
int half = (int) (0.5*size);
int partner = (rank<half) ? rank+half:rank-half;
if(odd && rank == extraProc) partner = 0;
//Apply the bisection
bisection = method(atom,subComm,dim,time);
//Define the new boundaries
if(rank<half){
atom->mybox.hi[dim] = bisection;
branch = 0;
} else {
atom->mybox.lo[dim] = bisection;
branch = 1;
}
//Define new color for the further communicaton
*color = (branch << ilevel) | *color;
//Grow the send buffer
if(atom->Nlocal>=grid->maxsend){
if(grid->buf_send) free(grid->buf_send);
grid->buf_send = (MD_FLOAT*) malloc(atom->Nlocal*values_per_atom* sizeof(MD_FLOAT));
grid->maxsend = atom->Nlocal;
}
//buffer particles to send
while(i < atom->Nlocal) {
pos = atom_pos(i);
if(pos < atom->mybox.lo[dim] || pos >= atom->mybox.hi[dim]) {
nsend += packExchange(atom, i, &grid->buf_send[nsend]);
copy(atom, i, atom->Nlocal-1);
atom->Nlocal--;
} else i++;
}
//Communicate the number of elements to be sent
if(rank < extraProc){
MPI_Irecv(&nrecv,1,MPI_INT,partner,0,subComm,&request[0]);
}
if(odd && rank == 0){
MPI_Irecv(&nrecv2,1,MPI_INT,extraProc,0,subComm,&request[1]);
}
MPI_Send(&nsend,1,MPI_INT,partner,0,subComm);
MPI_Waitall(2,request,MPI_STATUS_IGNORE);
//Grow the recv buffer
if(nrecv+nrecv2>=grid->maxrecv){
if(grid->buf_recv) free(grid->buf_recv);
grid->buf_recv = (MD_FLOAT*) malloc((nrecv+nrecv2)*values_per_atom*sizeof(MD_FLOAT));
grid->maxrecv = nrecv+nrecv2;
}
//communicate elements in the buffer
request[0] = MPI_REQUEST_NULL;
request[1] = MPI_REQUEST_NULL;
if(rank < extraProc){
MPI_Irecv(grid->buf_recv,nrecv,type,partner,0,subComm,&request[0]);
}
if(odd && rank == 0){
MPI_Irecv(&grid->buf_recv[nrecv],nrecv2,type,extraProc,0,subComm,&request[1]);
}
MPI_Send (grid->buf_send,nsend,type,partner,0,subComm);
MPI_Waitall(2,request,MPI_STATUS_IGNORE);
//store atoms in atom list
while(m < nrecv+nrecv2){
m += unpackExchange(atom, atom->Nlocal++, &grid->buf_recv[m]);
}
}
void rcbBalance(Grid* grid, Atom* atom, Parameter* param, RCB_Method method, int ndim, double newTime)
{
int me, nprocs=0, ilevel=0, nboxes=1;
int color = 0, size =0;
int index, prd[3];
MPI_Comm subComm;
MPI_Comm_size(world, &nprocs);
MPI_Comm_rank(world, &me);
//set the elapsed time since the last dynamic balance
double time = newTime - grid->Timer;
prd[_x] = atom->mybox.xprd = param->xprd;
prd[_y] = atom->mybox.yprd = param->yprd;
prd[_z] = atom->mybox.zprd = param->zprd;
//Sort by larger dimension
int largerDim[3] ={_x, _y, _z};
for(int i = 0; i< 2; i++){
for(int j = i+1; j<3; j++)
{
if(prd[largerDim[j]]>prd[largerDim[i]]){
MD_FLOAT tmp = largerDim[j];
largerDim[j] = largerDim[i];
largerDim[i] = tmp;
}
}
}
//Initial Partition
atom->mybox.lo[_x] = 0; atom->mybox.hi[_x] = atom->mybox.xprd;
atom->mybox.lo[_y] = 0; atom->mybox.hi[_y] = atom->mybox.yprd;
atom->mybox.lo[_z] = 0; atom->mybox.hi[_z] = atom->mybox.zprd;
//Recursion tree
while(nboxes<nprocs)
{
index = ilevel%ndim;
MPI_Comm_split(world, color, me, &subComm);
MPI_Comm_size(subComm,&size);
if(size > 1){
nextBisectionLevel(grid, atom, method, subComm, largerDim[index], &color, ilevel, time);
}
MPI_Comm_free(&subComm);
nboxes = pow(2,++ilevel);
}
//Set the new timer grid
grid->Timer = newTime;
//Creating the global map
MD_FLOAT domain[6] = {atom->mybox.lo[_x], atom->mybox.lo[_y], atom->mybox.lo[_z], atom->mybox.hi[_x], atom->mybox.hi[_y], atom->mybox.hi[_z]};
MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
//Define the same cutneighbour in all dimensions for the exchange communication
for(int dim =_x; dim<=_z; dim++)
grid->cutneigh[dim] = param->cutneigh;
}
//Regular grid
void cartisian3d(Grid* grid, Parameter* param, Box* box)
{
int me, nproc;
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
int numdim=3;
int reorder=0;
int periods[3]={1,1,1};
int mycoord[3]={0,0,0};
int griddim[3]={0,0,0};
MD_FLOAT len[3];
MPI_Comm cartesian;
box->xprd = param->xprd;
box->yprd = param->yprd;
box->zprd = param->zprd;
//Creates a cartesian 3d grid
MPI_Dims_create(nproc, numdim, griddim);
MPI_Cart_create(world,numdim,griddim,periods,reorder,&cartesian);
grid->nprocs[_x] = griddim[_x];
grid->nprocs[_y] = griddim[_y];
grid->nprocs[_z] = griddim[_z];
//Coordinates position in the grid
MPI_Cart_coords(cartesian,me,3,mycoord);
grid->coord[_x] = mycoord[_x];
grid->coord[_y] = mycoord[_y];
grid->coord[_z] = mycoord[_z];
//boundaries of my local box, with origin in (0,0,0).
len[_x] = param->xprd / griddim[_x];
len[_y] = param->yprd / griddim[_y];
len[_z] = param->zprd / griddim[_z];
box->lo[_x] = mycoord[_x] * len[_x];
box->hi[_x] = (mycoord[_x] + 1) * len[_x];
box->lo[_y] = mycoord[_y] * len[_y];
box->hi[_y] = (mycoord[_y] + 1) * len[_y];
box->lo[_z] = mycoord[_z] * len[_z];
box->hi[_z] = (mycoord[_z] + 1) * len[_z];
MD_FLOAT domain[6] = {box->lo[_x], box->lo[_y], box->lo[_z], box->hi[_x], box->hi[_y], box->hi[_z]};
MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
MPI_Comm_free(&cartesian);
//Define the same cutneighbour in all dimensions for the exchange communication
for(int dim =_x; dim<=_z; dim++)
grid->cutneigh[dim] = param->cutneigh;
}
//Other Functions from the grid
void initGrid(Grid* grid)
{ //start with regular grid
int nprocs;
MPI_Comm_size(world, &nprocs);
grid->map_size = 6 * nprocs;
grid->map = (MD_FLOAT*) allocate(ALIGNMENT, grid->map_size * sizeof(MD_FLOAT));
//========rcb=======
grid->maxsend = 0;
grid->maxrecv = 0;
grid->buf_send = NULL;
grid->buf_recv = NULL;
//====staggered=====
grid->Timer = 0.;
}
void setupGrid(Grid* grid, Atom* atom, Parameter* param)
{
int me;
MD_FLOAT xlo, ylo, zlo, xhi, yhi, zhi;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
initGrid(grid);
//Set the origin at (0,0,0)
if(param->input_file){
for(int i=0; i<atom->Nlocal; i++){
atom_x(i) = atom_x(i) - param->xlo;
atom_y(i) = atom_y(i) - param->ylo;
atom_z(i) = atom_z(i) - param->zlo;
}
}
cartisian3d(grid, param, &atom->mybox);
xlo = atom->mybox.lo[_x]; xhi = atom->mybox.hi[_x];
ylo = atom->mybox.lo[_y]; yhi = atom->mybox.hi[_y];
zlo = atom->mybox.lo[_z]; zhi = atom->mybox.hi[_z];
int i = 0;
while(i < atom->Nlocal)
{
if(atom_x(i) >= xlo && atom_x(i)< xhi &&
atom_y(i) >= ylo && atom_y(i)< yhi &&
atom_z(i) >= zlo && atom_z(i)< zhi)
{
i++;
} else {
copy(atom, i, atom->Nlocal-1);
atom->Nlocal--;
}
}
//printGrid(grid);
if(!param->balance){
MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms);
MPI_Barrier(world);
}
}
void printGrid(Grid* grid)
{
int me, nprocs;
MPI_Comm_size(world, &nprocs);
MPI_Comm_rank(world, &me);
MD_FLOAT* map = grid->map;
if(me==0)
{
printf("GRID:\n");
printf("===================================================================================================\n");
for(int i=0; i<nprocs; i++)
printf("Box:%i\txlo:%.4f\txhi:%.4f\tylo:%.4f\tyhi:%.4f\tzlo:%.4f\tzhi:%.4f\n", i,map[6*i],map[6*i+3],map[6*i+1],map[6*i+4],map[6*i+2],map[6*i+5]);
printf("\n\n");
//printf("Box processor:%i\n xlo:%.4f\txhi:%.4f\n ylo:%.4f\tyhi:%.4f\n zlo:%.4f\tzhi:%.4f\n", i,map[6*i],map[6*i+3],map[6*i+1],map[6*i+4],map[6*i+2],map[6*i+5]);
}
MPI_Barrier(world);
}

View File

@@ -1,22 +0,0 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#ifndef __BOX_H_
#define __BOX_H_
typedef struct {
int id;
MD_FLOAT xprd, yprd, zprd; //Domain Dimension
MD_FLOAT lo[3]; //smallest coordinate of my subdomain
MD_FLOAT hi[3]; //Highest coordinate of my subdomain
} Box;
int overlapBox(int, int , const Box*, const Box* , Box* , MD_FLOAT , MD_FLOAT);
int overlapFullBox(Parameter*, MD_FLOAT*, const Box*, const Box*);
void expandBox(int , const Box*, const Box* , Box* , MD_FLOAT);
#endif

View File

@@ -1,104 +0,0 @@
#include <atom.h>
#include <parameter.h>
#include <box.h>
#include <grid.h>
#ifndef COMM_H
#define COMM_H
#ifdef GROMACS
#define FORWARD_SIZE (3*CLUSTER_N)
#define REVERSE_SIZE (3*CLUSTER_N)
#define GHOST_SIZE (4*CLUSTER_N+10)
#define EXCHANGE_SIZE 7
#define JFAC MAX(1, CLUSTER_N / CLUSTER_M)
#define LOCAL atom->Nclusters_local / JFAC
#define GHOST atom->Nclusters_ghost
#define IsinRegionToSend(cj) \
((atom->jclusters[(cj)].bbminx >= xlo || atom->jclusters[(cj)].bbmaxx >= xlo) && \
(atom->jclusters[(cj)].bbminx < xhi || atom->jclusters[(cj)].bbmaxx < xhi) && \
(atom->jclusters[(cj)].bbminy >= ylo || atom->jclusters[(cj)].bbmaxy >= ylo) && \
(atom->jclusters[(cj)].bbminy < yhi || atom->jclusters[(cj)].bbmaxy < yhi) && \
(atom->jclusters[(cj)].bbminz >= zlo || atom->jclusters[(cj)].bbmaxz >= zlo) && \
(atom->jclusters[(cj)].bbminz < zhi || atom->jclusters[(cj)].bbmaxz < zhi))
#else
#define FORWARD_SIZE 3
#define REVERSE_SIZE 3
#define GHOST_SIZE 4
#define EXCHANGE_SIZE 7
#define LOCAL atom->Nlocal
#define GHOST atom->Nghost
#define IsinRegionToSend(i) \
((atom_x((i)) >= xlo && atom_x((i)) < xhi) && \
(atom_y((i)) >= ylo && atom_y((i)) < yhi) && \
(atom_z((i)) >= zlo && atom_z((i)) < zhi))
#endif
typedef struct {
int myproc; // my proc ID
int numproc; // # of processors
int numneigh; // # of all my neighs along all swaps
int maxneigh; // Buffer size for my neighs
int sendfrom[6]; //return the lowest neigh index to send in each swap
int sendtill[6]; //return the highest neigh index to send in each swao
int recvfrom[6]; //return the lowest neigh index to recv in each swap
int recvtill[6]; //return the highest neigh index to recv in each swap
int* nsend; // neigh whose I want to send
int* nrecv; // neigh whose I want to recv
int* pbc_x; // if pbc in x
int* pbc_y; // if pbc in y
int* pbc_z; // if pbc in z
int* atom_send, *atom_recv; // # of atoms to send/recv for each of my neighs
int* off_atom_send; // atom offset to send, inside of a swap
int* off_atom_recv; // atom offset to recv, inside of a swap
int* nexch; //procs to exchange
int numneighexch; //# of neighbours to exchange
int maxneighexch; //max buff size to store neighbours
int numswap; // # of swaps to perform, it is 6
int swapdim[6]; // dimension of the swap (_x, _y or _z)
int swapdir[6]; // direction of the swap 0 or 1
int swap[3][2]; // given a dim and dir, knows the swap
int othersend[6]; // Determine if a proc interact with more procs in a given swap
int firstrecv[6]; // where to put 1st recv atom in each swap
int** sendlist; // list of atoms to send in each swap
int* maxsendlist; // max # of atoms send in each list-swap
int maxsend; // max elements in buff sender
int maxrecv; // max elements in buff receiver
MD_FLOAT* buf_send; // sender buffer for all comm
MD_FLOAT* buf_recv; // receicer buffer for all comm
int forwardSize; // # of paramaters per atom in forward comm.
int reverseSize; // # of parameters per atom in reverse
int exchangeSize; // # of parameters per atom in exchange
int ghostSize; // # of parameters per atom in ghost list
int iterAtom; //last atom to iterate in each swap.
Box* boxes; // Boundaries to be sent to other procs as ghost.
} Comm;
void initComm(int*, char***, Comm*); //Init MPI
void endComm(Comm*); //End MPI
void setupComm(Comm*,Parameter*,Grid*); //Creates a 3d grid or rcb grid
void neighComm(Comm*,Parameter*,Grid*); //Find neighbours within cut-off and defines ghost regions
void forwardComm(Comm*,Atom*,int); //Send info in one direction
void reverseComm(Comm*,Atom*,int); //Return info after forward communication
void exchangeComm(Comm*,Atom*); //Exchange info between procs
void ghostComm(Comm*, Atom*,int); //Build the ghost neighbours to send during next forwards
void growSend(Comm*,int); //Grows the size of the buffer sender
void growRecv(Comm*,int); //Grows the size of the buffer receiver
void growList(Comm*, int, int); //Grows the size of the list to send
#endif

View File

@@ -1,51 +0,0 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#include <box.h>
#include <atom.h>
#include <mpi.h>
#ifndef __MAP_H_
#define __MAP_H_
#define world MPI_COMM_WORLD
#define atom_pos(i) ((dim == _x) ? atom_x((i)) : (dim == _y) ? atom_y((i)) : atom_z((i)))
enum {RCB=1, meanTimeRCB, Staggered};
typedef struct {
int balance_every;
int map_size;
MD_FLOAT* map;
//===Param for Staggerd balance
int nprocs[3];
int coord[3];
MD_FLOAT cutneigh[3];
double Timer;
//===Param for RCB balance
MD_FLOAT* buf_send;
MD_FLOAT* buf_recv;
int maxsend;
int maxrecv;
} Grid;
typedef MD_FLOAT(*RCB_Method)(Atom*,MPI_Comm,int,double);
void setupGrid(Grid*, Atom*, Parameter*);
void cartisian3d(Grid*, Parameter*, Box*);
void rcbBalance(Grid*, Atom*, Parameter* ,RCB_Method, int, double);
void staggeredBalance(Grid*, Atom*, Parameter*, double);
void printGrid(Grid*);
//rcb methods
MD_FLOAT meanBisect(Atom* , MPI_Comm, int, double);
MD_FLOAT meanTimeBisect(Atom*, MPI_Comm, int, double);
#endif

View File

@@ -8,11 +8,9 @@
#define __PARAMETER_H_
#if PRECISION == 1
# define MD_FLOAT float
# define MD_UINT unsigned int
#define MD_FLOAT float
#else
# define MD_FLOAT double
# define MD_UINT unsigned long long int
#define MD_FLOAT double
#endif
typedef struct {
@@ -21,7 +19,6 @@ typedef struct {
char* input_file;
char* vtk_file;
char* xtc_file;
char* write_atom_file;
MD_FLOAT epsilon;
MD_FLOAT sigma;
MD_FLOAT sigma6;
@@ -53,10 +50,6 @@ typedef struct {
MD_FLOAT k_dn;
MD_FLOAT gx, gy, gz;
MD_FLOAT reflect_x, reflect_y, reflect_z;
//MPI implementation
int balance;
int method;
int balance_every;
} Parameter;
void initParameter(Parameter*);

View File

@@ -1,71 +0,0 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <math.h>
#include <comm.h>
#include <atom.h>
#include <timing.h>
#include <parameter.h>
#include <util.h>
//static void addDummyCluster(Atom*);
double forward(Comm* comm, Atom *atom, Parameter* param){
double S, E;
S = getTimeStamp();
if(param->method == halfShell){
for(int iswap = 0; iswap < 5; iswap++)
forwardComm(comm, atom, iswap);
} else if(param->method == eightShell){
for(int iswap = 0; iswap < 6; iswap+=2)
forwardComm(comm, atom, iswap);
} else {
for(int iswap = 0; iswap < 6; iswap++)
forwardComm(comm, atom, iswap);
}
E = getTimeStamp();
return E-S;
}
double reverse(Comm* comm, Atom *atom, Parameter* param){
double S, E;
S = getTimeStamp();
if(param->method == halfShell){
for(int iswap = 4; iswap >= 0; iswap--)
reverseComm(comm, atom, iswap);
} else if(param->method == eightShell){
for(int iswap = 4; iswap >= 0; iswap-=2)
reverseComm(comm, atom, iswap);
} else if(param->method == halfStencil){
for(int iswap = 5; iswap >= 0; iswap--)
reverseComm(comm, atom, iswap);
} else { } //Full Shell Reverse does nothing
E = getTimeStamp();
return E-S;
}
void ghostNeighbor(Comm* comm, Atom* atom, Parameter* param)
{
#ifdef GROMACS
atom->Nclusters_ghost = 0;
#endif
atom->Nghost = 0;
if(param->method == halfShell){
for(int iswap=0; iswap<5; iswap++)
ghostComm(comm,atom,iswap);
} else if(param->method == eightShell){
for(int iswap = 0; iswap<6; iswap+=2)
ghostComm(comm, atom,iswap);
} else {
for(int iswap=0; iswap<6; iswap++)
ghostComm(comm,atom,iswap);
}
}

View File

@@ -48,13 +48,11 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2);
t0 = _mm256_blend_pd(t0, t1, 0xC);
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
t0 = _mm256_blend_pd(t0, t1, 0b1100);
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1);
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1);
@@ -93,7 +91,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
}
// Functions used in LAMMPS kernel
#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s);
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }

View File

@@ -9,13 +9,10 @@
# include <zmmintrin.h>
#endif
#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
#define MD_SIMD_INT __m256i
#define MD_SIMD_BITMASK MD_SIMD_INT
#define MD_SIMD_IBOOL __mmask16
#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
#define MD_SIMD_INT __m256i
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }

View File

@@ -7,30 +7,11 @@
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef NO_ZMM_INTRIN
# include <zmmintrin.h>
#endif
#include <zmmintrin.h>
#define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16
#define MD_SIMD_INT __m256i
#define MD_SIMD_IBOOL __mmask16
#define MD_SIMD_INT32 __m512i
#define MD_SIMD_BITMASK MD_SIMD_INT32
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
return _mm512_load_si512(m);
}
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
return _mm512_set1_epi32(a);
}
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
}
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
@@ -88,7 +69,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
return _mm_cvtss_f32(t3);
}
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m256 t;
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
t = _mm256_load_ps(m);

View File

@@ -9,15 +9,9 @@
typedef enum {
TOTAL = 0,
FORCE,
NEIGH,
FORWARD,
REVERSE,
UPDATE,
BALANCE,
SETUP,
REST,
FORCE,
NUMTIMER
} timerComm;
} timertype;
#endif

View File

@@ -7,8 +7,8 @@
#ifndef __TIMING_H_
#define __TIMING_H_
extern double getTimeStamp(void);
extern double getTimeResolution(void);
extern double getTimeStamp_(void);
extern double getTimeStamp();
extern double getTimeResolution();
extern double getTimeStamp_();
#endif

View File

@@ -4,8 +4,6 @@
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <math.h>
#ifndef __UTIL_H_
#define __UTIL_H_
@@ -37,19 +35,12 @@
# define PRECISION_STRING "double"
#endif
#define BigOrEqual(a,b) (fabs((a)-(b))<1e-9 || (a)>(b))
#define Equal(a,b) (fabs((a)-(b))<1e-9)
enum {_x=0, _y, _z};
enum {fullShell=0, halfShell, eightShell, halfStencil};
extern double myrandom(int*);
extern void random_reset(int *seed, int ibase, double *coord);
extern int str2ff(const char *string);
extern const char* ff2str(int ff);
extern int get_num_threads();
extern void readline(char *line, FILE *fp);
extern void debug_printf(const char *format, ...);
extern int get_cuda_num_threads();
#endif

View File

@@ -11,14 +11,12 @@
#include <atom.h>
#include <parameter.h>
#include <util.h>
#include <mpi.h>
void initParameter(Parameter *param) {
param->input_file = NULL;
param->vtk_file = NULL;
param->xtc_file = NULL;
param->eam_file = NULL;
param->write_atom_file = NULL;
param->force_field = FF_LJ;
param->epsilon = 1.0;
param->sigma = 1.0;
@@ -55,10 +53,6 @@ void initParameter(Parameter *param) {
param->reflect_x = 0.0;
param->reflect_y = 0.0;
param->reflect_z = 0.0;
//MPI
param->balance = 0;
param->method = 0;
param->balance_every =param->reneigh_every;
}
void readParameter(Parameter *param, const char *filename) {
@@ -77,8 +71,8 @@ void readParameter(Parameter *param, const char *filename) {
for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
line[i] = '\0';
char *tok = strtok(line, "\t ");
char *val = strtok(NULL, "\t ");
char *tok = strtok(line, " ");
char *val = strtok(NULL, " ");
#define PARSE_PARAM(p,f) if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
#define PARSE_STRING(p) PARSE_PARAM(p, strdup)
@@ -122,39 +116,34 @@ void readParameter(Parameter *param, const char *filename) {
PARSE_INT(x_out_every);
PARSE_INT(v_out_every);
PARSE_INT(half_neigh);
PARSE_INT(method);
PARSE_INT(balance);
PARSE_INT(balance_every);
}
}
// Update dtforce
param->dtforce = 0.5 * param->dt;
// Update sigma6 parameter
MD_FLOAT s2 = param->sigma * param->sigma;
param->sigma6 = s2 * s2 * s2;
//Update balance parameter, 10 could be change
param->balance_every *=param->reneigh_every;
fclose(fp);
}
void printParameter(Parameter *param) {
printf("Parameters:\n");
if(param->input_file != NULL) {
printf("\tInput file: %s\n", param->input_file);
printf("Input file: %s\n", param->input_file);
}
if(param->vtk_file != NULL) {
printf("\tVTK file: %s\n", param->vtk_file);
printf("VTK file: %s\n", param->vtk_file);
}
if(param->xtc_file != NULL) {
printf("\tXTC file: %s\n", param->xtc_file);
printf("XTC file: %s\n", param->xtc_file);
}
if(param->eam_file != NULL) {
printf("\tEAM file: %s\n", param->eam_file);
printf("EAM file: %s\n", param->eam_file);
}
printf("\tForce field: %s\n", ff2str(param->force_field));
@@ -180,11 +169,6 @@ void printParameter(Parameter *param) {
printf("\tNumber of timesteps: %d\n", param->ntimes);
printf("\tReport stats every (timesteps): %d\n", param->nstat);
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
#ifdef SORT_ATOMS
printf("\tSort atoms when reneighboring: yes\n");
#else
printf("\tSort atoms when reneighboring: no\n");
#endif
printf("\tPrune every (timesteps): %d\n", param->prune_every);
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
@@ -193,19 +177,4 @@ void printParameter(Parameter *param) {
printf("\tSkin: %e\n", param->skin);
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
// ================ New MPI features =============
char str[20];
strcpy(str, (param->method == 1) ? "Half Shell" :
(param->method == 2) ? "Eight Shell" :
(param->method == 3) ? "Half Stencil":
"Full Shell");
printf("\tMethod: %s\n", str);
strcpy(str, (param->balance == 1) ? "mean RCB" :
(param->balance == 2) ? "mean Time RCB" :
(param->balance == 3) ? "Staggered" :
"cartisian");
printf("\tPartition: %s\n", str);
if(param->balance)
printf("\tRebalancing every (timesteps): %d\n",param->balance_every);
}

View File

@@ -10,7 +10,6 @@
#include <thermo.h>
#include <util.h>
#include <mpi.h>
static int *steparr;
static MD_FLOAT *tmparr;
@@ -25,7 +24,6 @@ static MD_FLOAT t_act;
static MD_FLOAT p_act;
static MD_FLOAT e_act;
static int mstat;
static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
/* exported subroutines */
void setupThermo(Parameter *param, int natoms)
@@ -55,42 +53,33 @@ void setupThermo(Parameter *param, int natoms)
void computeThermo(int iflag, Parameter *param, Atom *atom)
{
MD_FLOAT t_sum = 0.0, t = 0.0, p;
int me;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MD_FLOAT t = 0.0, p;
for(int i = 0; i < atom->Nlocal; i++) {
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
}
MPI_Reduce(&t, &t_sum, 1, type, MPI_SUM, 0 ,MPI_COMM_WORLD);
if(me == 0)
{
t = t_sum * t_scale;
p = (t * dof_boltz) * p_scale;
int istep = iflag;
t = t * t_scale;
p = (t * dof_boltz) * p_scale;
int istep = iflag;
if(iflag == -1){
istep = param->ntimes;
}
if(iflag == 0){
mstat = 0;
}
steparr[mstat] = istep;
tmparr[mstat] = t;
prsarr[mstat] = p;
mstat++;
fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
if(iflag == -1){
istep = param->ntimes;
}
if(iflag == 0){
mstat = 0;
}
steparr[mstat] = istep;
tmparr[mstat] = t;
prsarr[mstat] = p;
mstat++;
fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
}
void adjustThermo(Parameter *param, Atom *atom)
{
/* zero center-of-mass motion */
MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
MD_FLOAT v_sum[3], vtot[3];
for(int i = 0; i < atom->Nlocal; i++) {
vxtot += atom_vx(i);
@@ -98,13 +87,9 @@ void adjustThermo(Parameter *param, Atom *atom)
vztot += atom_vz(i);
}
vtot[0] = vxtot; vtot[1] = vytot; vtot[2] = vztot;
MPI_Allreduce(vtot, v_sum, 3, type, MPI_SUM, MPI_COMM_WORLD);
vxtot = v_sum[0] / atom->Natoms;
vytot = v_sum[1] / atom->Natoms;
vztot = v_sum[2] / atom->Natoms;
vxtot = vxtot / atom->Natoms;
vytot = vytot / atom->Natoms;
vztot = vztot / atom->Natoms;
for(int i = 0; i < atom->Nlocal; i++) {
atom_vx(i) -= vxtot;
@@ -112,16 +97,13 @@ void adjustThermo(Parameter *param, Atom *atom)
atom_vz(i) -= vztot;
}
t_act = 0;
MD_FLOAT t = 0.0;
MD_FLOAT t_sum = 0.0;
for(int i = 0; i < atom->Nlocal; i++) {
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
}
MPI_Allreduce(&t, &t_sum, 1,type, MPI_SUM,MPI_COMM_WORLD);
t = t_sum;
t *= t_scale;
MD_FLOAT factor = sqrt(param->temp / t);

View File

@@ -10,7 +10,6 @@
#include <stdlib.h>
#include <string.h>
#include <util.h>
#include <math.h>
/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
#define IA 16807
@@ -80,14 +79,13 @@ const char* ff2str(int ff) {
return "invalid";
}
int get_cuda_num_threads() {
int get_num_threads() {
const char *num_threads_env = getenv("NUM_THREADS");
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
}
void readline(char *line, FILE *fp) {
if(fgets(line, MAXLINE, fp) == NULL) {
printf("error %i\n",errno);
if(errno != 0) {
perror("readline()");
exit(-1);

View File

@@ -1,22 +1,20 @@
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
TAG ?= MPIICC
TAG ?= NVCC
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= gromacs
# Enable likwid (true or false)
ENABLE_LIKWID ?= false
ENABLE_LIKWID ?= true
# SP or DP
DATA_TYPE ?= DP
# AOS or SOA
DATA_LAYOUT ?= SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= false
DEBUG ?= true
# Sort atoms when reneighboring (true or false)
SORT_ATOMS ?= true
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
@@ -24,13 +22,13 @@ MEM_TRACER ?= false
# Trace indexes and distances for gather-md (true or false)
INDEX_TRACER ?= false
# Compute statistics
COMPUTE_STATS ?= false
COMPUTE_STATS ?= true
# Configurations for lammps optimization scheme
# Use omp simd pragma when running with half neighbor-lists
ENABLE_OMP_SIMD ?= false
ENABLE_OMP_SIMD ?= true
# Use kernel with explicit SIMD intrinsics
USE_SIMD_KERNEL ?= true
USE_SIMD_KERNEL ?= false
# Configurations for gromacs optimization scheme
# Use reference version
@@ -43,6 +41,7 @@ HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
# Configurations for CUDA
# Use CUDA host memory to optimize transfers
USE_CUDA_HOST_MEMORY ?= false
USE_SUPER_CLUSTERS ?= true
#Feature options
OPTIONS = -DALIGNMENT=64

View File

@@ -6,7 +6,7 @@ dt 0.001
temp 80
x_out_freq 500
v_out_freq 5
cutforce 1.8
skin 0.1
cutforce 0.9
skin 0.0
reneigh_every 100
nstat 125000

1
gather-bench Submodule

Submodule gather-bench added at 2f654cb043

View File

@@ -12,7 +12,6 @@
#include <atom.h>
#include <allocate.h>
#include <util.h>
#include <mpi.h>
void initAtom(Atom *atom) {
atom->x = NULL; atom->y = NULL; atom->z = NULL;
@@ -28,7 +27,6 @@ void initAtom(Atom *atom) {
atom->Nclusters = 0;
atom->Nclusters_local = 0;
atom->Nclusters_ghost = 0;
atom->NmaxGhost = 0; //Temporal
atom->Nclusters_max = 0;
atom->type = NULL;
atom->ntypes = 0;
@@ -39,19 +37,27 @@ void initAtom(Atom *atom) {
atom->iclusters = NULL;
atom->jclusters = NULL;
atom->icluster_bin = NULL;
atom->PBCx = NULL;
atom->PBCy = NULL;
atom->PBCz = NULL;
initMasks(atom);
//MPI
Box *mybox = &(atom->mybox);
mybox->xprd = mybox->yprd = mybox->zprd = 0;
mybox->lo[0] = mybox->lo[1] = mybox->lo[2] = 0;
mybox->hi[0] = mybox->hi[1] = mybox->hi[2] = 0;
#ifdef USE_SUPER_CLUSTERS
atom->scl_x = NULL;
atom->scl_v = NULL;
atom->scl_f = NULL;
atom->Nsclusters = 0;
atom->Nsclusters_local = 0;
atom->Nsclusters_ghost = 0;
atom->Nsclusters_max = 0;
atom->scl_type = NULL;
atom->siclusters = NULL;
atom->icluster_idx = NULL;
atom->sicluster_bin = NULL;
#endif //USE_SUPER_CLUSTERS
}
void createAtom(Atom *atom, Parameter *param) {
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
@@ -62,7 +68,6 @@ void createAtom(Atom *atom, Parameter *param) {
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
@@ -139,26 +144,22 @@ int type_str2int(const char *type) {
}
int readAtom(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
int len = strlen(param->input_file);
if(strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
if(strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
if(strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
exit(-1);
return -1;
}
int readAtom_pdb(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int read_atoms = 0;
if(!fp) {
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
@@ -168,11 +169,11 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
char *item = strtok(line, " ");
if(strncmp(item, "CRYST1", 6) == 0) {
param->xlo = 0.0;
param->xhi = atof(strtok(NULL, "\t "));
param->xhi = atof(strtok(NULL, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, "\t "));
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, "\t "));
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
@@ -181,23 +182,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
char *label;
int atom_id, comp_id;
MD_FLOAT occupancy, charge;
atom_id = atoi(strtok(NULL, "\t ")) - 1;
atom_id = atoi(strtok(NULL, " ")) - 1;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
label = strtok(NULL, "\t ");
comp_id = atoi(strtok(NULL, "\t "));
atom_x(atom_id) = atof(strtok(NULL, "\t "));
atom_y(atom_id) = atof(strtok(NULL, "\t "));
atom_z(atom_id) = atof(strtok(NULL, "\t "));
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
label = strtok(NULL, " ");
comp_id = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom->vx[atom_id] = 0.0;
atom->vy[atom_id] = 0.0;
atom->vz[atom_id] = 0.0;
occupancy = atof(strtok(NULL, "\t "));
charge = atof(strtok(NULL, "\t "));
occupancy = atof(strtok(NULL, " "));
charge = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
@@ -209,14 +210,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
strncmp(item, "ENDMDL", 6) == 0) {
// Do nothing
} else {
if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
}
if(!read_atoms) {
if(me==0) fprintf(stderr, "Input error: No atoms read!\n");
fprintf(stderr, "Input error: No atoms read!\n");
exit(-1);
return -1;
}
@@ -232,15 +233,12 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_gro(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
char desc[MAXLINE];
@@ -249,7 +247,7 @@ int readAtom_gro(Atom* atom, Parameter* param) {
int i = 0;
if(!fp) {
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
@@ -259,25 +257,25 @@ int readAtom_gro(Atom* atom, Parameter* param) {
desc[i] = '\0';
readline(line, fp);
atoms_to_read = atoi(strtok(line, " "));
if(me==0) fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
while(!feof(fp) && read_atoms < atoms_to_read) {
readline(line, fp);
char *label = strtok(line, "\t ");
int type = type_str2int(strtok(NULL, "\t "));
int atom_id = atoi(strtok(NULL, "\t ")) - 1;
char *label = strtok(line, " ");
int type = type_str2int(strtok(NULL, " "));
int atom_id = atoi(strtok(NULL, " ")) - 1;
atom_id = read_atoms;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type;
atom_x(atom_id) = atof(strtok(NULL, "\t "));
atom_y(atom_id) = atof(strtok(NULL, "\t "));
atom_z(atom_id) = atof(strtok(NULL, "\t "));
atom->vx[atom_id] = atof(strtok(NULL, "\t "));
atom->vy[atom_id] = atof(strtok(NULL, "\t "));
atom->vz[atom_id] = atof(strtok(NULL, "\t "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom->vx[atom_id] = atof(strtok(NULL, " "));
atom->vy[atom_id] = atof(strtok(NULL, " "));
atom->vz[atom_id] = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
@@ -287,18 +285,18 @@ int readAtom_gro(Atom* atom, Parameter* param) {
if(!feof(fp)) {
readline(line, fp);
param->xlo = 0.0;
param->xhi = atof(strtok(line, "\t "));
param->xhi = atof(strtok(line, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, "\t "));
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, "\t "));
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
}
if(read_atoms != atoms_to_read) {
if(me==0) fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
exit(-1);
return -1;
}
@@ -314,14 +312,12 @@ int readAtom_gro(Atom* atom, Parameter* param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_dmp(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int natoms = 0;
@@ -330,7 +326,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
int ts = -1;
if(!fp) {
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
@@ -353,47 +349,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
}
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
readline(line, fp);
param->xlo = atof(strtok(line, "\t "));
param->xhi = atof(strtok(NULL, "\t "));
param->xlo = atof(strtok(line, " "));
param->xhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
readline(line, fp);
param->ylo = atof(strtok(line, "\t "));
param->yhi = atof(strtok(NULL, "\t "));
param->ylo = atof(strtok(line, " "));
param->yhi = atof(strtok(NULL, " "));
param->yprd = param->yhi - param->ylo;
readline(line, fp);
param->zlo = atof(strtok(line, "\t "));
param->zhi = atof(strtok(NULL, "\t "));
param->zlo = atof(strtok(line, " "));
param->zhi = atof(strtok(NULL, " "));
param->zprd = param->zhi - param->zlo;
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
for(int i = 0; i < natoms; i++) {
readline(line, fp);
atom_id = atoi(strtok(line, "\t ")) - 1;
atom->type[atom_id] = atoi(strtok(NULL, "\t "));
atom_x(atom_id) = atof(strtok(NULL, "\t "));
atom_y(atom_id) = atof(strtok(NULL, "\t "));
atom_z(atom_id) = atof(strtok(NULL, "\t "));
atom->vx[atom_id] = atof(strtok(NULL, "\t "));
atom->vy[atom_id] = atof(strtok(NULL, "\t "));
atom->vz[atom_id] = atof(strtok(NULL, "\t "));
atom_id = atoi(strtok(line, " ")) - 1;
atom->type[atom_id] = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom->vx[atom_id] = atof(strtok(NULL, " "));
atom->vy[atom_id] = atof(strtok(NULL, " "));
atom->vz[atom_id] = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
read_atoms++;
}
} else {
if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
} else {
if(me==0) fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
exit(-1);
return -1;
}
}
if(ts < 0 || !natoms || !read_atoms) {
if(me==0) fprintf(stderr, "Input error: atom data was not read!\n");
fprintf(stderr, "Input error: atom data was not read!\n");
exit(-1);
return -1;
}
@@ -409,118 +405,11 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
fclose(fp);
return natoms;
}
void initMasks(Atom *atom) {
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
unsigned int mask0, mask1, mask2, mask3;
atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));
//atom->masks_2xnn = allocate(ALIGNMENT, 8 * sizeof(unsigned int));
for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
}
for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
}
for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
atom->exclusion_filter[i] = (1U << i);
}
#if CLUSTER_M == CLUSTER_N
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x3 * cond0);
mask2 = (unsigned int)(0xf - 0x7 * cond0);
mask3 = (unsigned int)(0xf - 0xf * cond0);
atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x2 * cond0);
mask2 = (unsigned int)(0xf - 0x4 * cond0);
mask3 = (unsigned int)(0xf - 0x8 * cond0);
atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);
atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
}
#else
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
#if CLUSTER_M < CLUSTER_N
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
#if CLUSTER_M < CLUSTER_N
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
#if CLUSTER_M < CLUSTER_N
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond0);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond1);
#endif
}
}
#endif
}
void growAtom(Atom *atom) {
int nold = atom->Nmax;
atom->Nmax += DELTA;
@@ -551,248 +440,17 @@ void growClusters(Atom *atom) {
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
}
/* MPI added*/
void growPbc(Atom* atom) {
int nold = atom->NmaxGhost;
atom->NmaxGhost += DELTA;
#ifdef USE_SUPER_CLUSTERS
void growSuperClusters(Atom *atom) {
int nold = atom->Nsclusters_max;
atom->Nsclusters_max += DELTA;
atom->siclusters = (SuperCluster*) reallocate(atom->siclusters, ALIGNMENT, atom->Nsclusters_max * sizeof(SuperCluster), nold * sizeof(SuperCluster));
atom->icluster_idx = (int*) reallocate(atom->icluster_idx, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int), nold * SCLUSTER_SIZE * sizeof(int));
atom->sicluster_bin = (int*) reallocate(atom->sicluster_bin, ALIGNMENT, atom->Nsclusters_max * sizeof(int), nold * sizeof(int));
//atom->scl_type = (int*) reallocate(atom->scl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * SCLUSTER_SIZE * sizeof(int), nold * CLUSTER_M * SCLUSTER_SIZE * sizeof(int));
if (atom->PBCx || atom->PBCy || atom->PBCz){
atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
} else {
atom->PBCx = (int*) malloc(atom->NmaxGhost * sizeof(int));
atom->PBCy = (int*) malloc(atom->NmaxGhost * sizeof(int));
atom->PBCz = (int*) malloc(atom->NmaxGhost * sizeof(int));
}
}
void packForward(Atom* atom, int nc, int* list, MD_FLOAT* buf, int* pbc)
{
for(int i = 0; i < nc; i++) {
int cj = list[i];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
int displ = i*CLUSTER_N;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
buf[3*(displ+cjj)+0] = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd;
buf[3*(displ+cjj)+1] = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd;
buf[3*(displ+cjj)+2] = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd;
}
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
buf[3*(displ+cjj)+0] = -1; //x
buf[3*(displ+cjj)+1] = -1; //y
buf[3*(displ+cjj)+2] = -1; //z
}
}
}
void unpackForward(Atom* atom, int nc, int c0, MD_FLOAT* buf)
{
for(int i = 0; i < nc; i++) {
int cj = c0+i;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
int displ = i*CLUSTER_N;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
if(cj_x[CL_X_OFFSET + cjj]<INFINITY) cj_x[CL_X_OFFSET + cjj] = buf[3*(displ+cjj)+0];
if(cj_x[CL_Y_OFFSET + cjj]<INFINITY) cj_x[CL_Y_OFFSET + cjj] = buf[3*(displ+cjj)+1];
if(cj_x[CL_Z_OFFSET + cjj]<INFINITY) cj_x[CL_Z_OFFSET + cjj] = buf[3*(displ+cjj)+2];
}
}
}
int packGhost(Atom* atom, int cj, MD_FLOAT* buf, int* pbc)
{
//#of elements per cluster natoms,x0,y0,z0,type_0, . . ,xn,yn,zn,type_n,bbminx,bbmaxxy,bbminy,bbmaxy,bbminz,bbmaxz
//count = 4*N_CLUSTER+7, if N_CLUSTER =4 => count = 23 value/cluster + trackpbc[x] + trackpbc[y] + trackpbc[z]
int m = 0;
if(atom->jclusters[cj].natoms > 0) {
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
buf[m++] = atom->jclusters[cj].natoms;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT xtmp = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd;
MD_FLOAT ytmp = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd;
MD_FLOAT ztmp = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd;
buf[m++] = xtmp;
buf[m++] = ytmp;
buf[m++] = ztmp;
buf[m++]= atom->cl_type[cj_sca_base + cjj];
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
buf[m++] = -1; //x
buf[m++] = -1; //y
buf[m++] = -1; //z
buf[m++] = -1; //type
}
buf[m++] = bbminx;
buf[m++] = bbmaxx;
buf[m++] = bbminy;
buf[m++] = bbmaxy;
buf[m++] = bbminz;
buf[m++] = bbmaxz;
//TODO: check atom->ncj
int ghostId = cj-atom->ncj;
//check for ghost particles
buf[m++] = (cj-atom->ncj>=0) ? pbc[_x]+atom->PBCx[ghostId]:pbc[_x];
buf[m++] = (cj-atom->ncj>=0) ? pbc[_y]+atom->PBCy[ghostId]:pbc[_y];
buf[m++] = (cj-atom->ncj>=0) ? pbc[_z]+atom->PBCz[ghostId]:pbc[_z];
}
return m;
}
int unpackGhost(Atom* atom, int cj, MD_FLOAT* buf)
{
int m = 0;
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
if(cj*jfac>=atom->Nclusters_max) growClusters(atom);
if(atom->Nclusters_ghost>=atom->NmaxGhost) growPbc(atom);
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
atom->jclusters[cj].natoms = buf[m++];
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
cj_x[CL_X_OFFSET + cjj] = buf[m++];
cj_x[CL_Y_OFFSET + cjj] = buf[m++];
cj_x[CL_Z_OFFSET + cjj] = buf[m++];
atom->cl_type[cj_sca_base + cjj] = buf[m++];
atom->Nghost++;
}
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
atom->cl_type[cj_sca_base + cjj] = -1;
m+=4;
}
atom->jclusters[cj].bbminx = buf[m++];
atom->jclusters[cj].bbmaxx = buf[m++];
atom->jclusters[cj].bbminy = buf[m++];
atom->jclusters[cj].bbmaxy = buf[m++];
atom->jclusters[cj].bbminz = buf[m++];
atom->jclusters[cj].bbmaxz = buf[m++];
atom->PBCx[atom->Nclusters_ghost] = buf[m++];
atom->PBCy[atom->Nclusters_ghost] = buf[m++];
atom->PBCz[atom->Nclusters_ghost] = buf[m++];
atom->Nclusters_ghost++;
}
void packReverse(Atom* atom, int nc, int c0, MD_FLOAT* buf)
{
for(int i = 0; i < nc; i++) {
int cj = c0+i;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
int displ = i*CLUSTER_N;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
buf[3*(displ+cjj)+0] = cj_f[CL_X_OFFSET + cjj];
buf[3*(displ+cjj)+1] = cj_f[CL_Y_OFFSET + cjj];
buf[3*(displ+cjj)+2] = cj_f[CL_Z_OFFSET + cjj];
}
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
buf[3*(displ+cjj)+0] = -1; //x
buf[3*(displ+cjj)+1] = -1; //y
buf[3*(displ+cjj)+2] = -1; //z
}
}
}
void unpackReverse(Atom* atom, int nc, int* list, MD_FLOAT* buf)
{
for(int i = 0; i < nc; i++) {
int cj = list[i];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
int displ = i*CLUSTER_N;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
cj_f[CL_X_OFFSET + cjj] += buf[3*(displ+cjj)+0];
cj_f[CL_Y_OFFSET + cjj] += buf[3*(displ+cjj)+1];
cj_f[CL_Z_OFFSET + cjj] += buf[3*(displ+cjj)+2];
}
}
}
int packExchange(Atom* atom, int i, MD_FLOAT* buf)
{
int m = 0;
buf[m++] = atom_x(i);
buf[m++] = atom_y(i);
buf[m++] = atom_z(i);
buf[m++] = atom_vx(i);
buf[m++] = atom_vy(i);
buf[m++] = atom_vz(i);
buf[m++] = atom->type[i];
return m;
}
int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
{
while(i >= atom->Nmax) growAtom(atom);
int m = 0;
atom_x(i) = buf[m++];
atom_y(i) = buf[m++];
atom_z(i) = buf[m++];
atom_vx(i) = buf[m++];
atom_vy(i) = buf[m++];
atom_vz(i) = buf[m++];
atom->type[i] = buf[m++];
return m;
}
void pbc(Atom* atom)
{
for(int i = 0; i < atom->Nlocal; i++) {
MD_FLOAT xprd = atom->mybox.xprd;
MD_FLOAT yprd = atom->mybox.yprd;
MD_FLOAT zprd = atom->mybox.zprd;
if(atom_x(i) < 0.0) atom_x(i) += xprd;
if(atom_y(i) < 0.0) atom_y(i) += yprd;
if(atom_z(i) < 0.0) atom_z(i) +=zprd;
if(atom_x(i) >= xprd) atom_x(i) -=xprd;
if(atom_y(i) >= yprd) atom_y(i) -=yprd;
if(atom_z(i) >= zprd) atom_z(i) -=zprd;
}
}
void copy(Atom* atom, int i, int j)
{
atom_x(i) = atom_x(j);
atom_y(i) = atom_y(j);
atom_z(i) = atom_z(j);
atom_vx(i) = atom_vx(j);
atom_vy(i) = atom_vy(j);
atom_vz(i) = atom_vz(j);
atom->type[i] = atom->type[j];
atom->scl_x = (MD_FLOAT*) reallocate(atom->scl_x, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_f = (MD_FLOAT*) reallocate(atom->scl_f, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_v = (MD_FLOAT*) reallocate(atom->scl_v, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -39,8 +39,29 @@ extern "C" {
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
int isReneighboured;
int *cuda_iclusters;
int *cuda_nclusters;
int cuda_max_scl;
MD_FLOAT *cuda_scl_x;
MD_FLOAT *cuda_scl_v;
MD_FLOAT *cuda_scl_f;
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
}
extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters,
int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt);
extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce);
extern "C"
void initDevice(Atom *atom, Neighbor *neighbor) {
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
@@ -59,10 +80,23 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
isReneighboured = 1;
#ifdef USE_SUPER_CLUSTERS
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
}
extern "C"
void copyDataToCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataToCUDADevice start\r\n");
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
@@ -85,13 +119,49 @@ void copyDataToCUDADevice(Atom *atom) {
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
#ifdef USE_SUPER_CLUSTERS
//alignDataToSuperclusters(atom);
if (cuda_max_scl < atom->Nsclusters_max) {
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n");
}
extern "C"
void copyDataFromCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n");
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
#ifdef USE_SUPER_CLUSTERS
memcpyFromGPU(atom->scl_x, cuda_scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
//alignDataFromSuperclusters(atom);
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
}
extern "C"
@@ -109,6 +179,12 @@ void cudaDeviceFree() {
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
free(natoms);
free(ngatoms);
#ifdef USE_SUPER_CLUSTERS
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
#endif //USE_SUPER_CLUSTERS
}
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
@@ -165,6 +241,39 @@ __global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
}
}
__global__ void cudaUpdatePbcSup_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
int *cuda_jclusters_natoms,
int *cuda_PBCx,
int *cuda_PBCy,
int *cuda_PBCz,
int Nsclusters_local,
int Nclusters_ghost,
MD_FLOAT param_xprd,
MD_FLOAT param_yprd,
MD_FLOAT param_zprd) {
unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
if (cg >= Nclusters_ghost) return;
//int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int jfac = SCLUSTER_SIZE / CLUSTER_M;
int ncj = Nsclusters_local / jfac;
MD_FLOAT xprd = param_xprd;
MD_FLOAT yprd = param_yprd;
MD_FLOAT zprd = param_zprd;
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
}
}
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
int Nclusters_local, int Nclusters_max,
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
@@ -251,9 +360,17 @@ extern "C"
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_v, cuda_scl_f,
cuda_nclusters,
cuda_natoms, atom->Nsclusters_local, param->dtforce, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
}
@@ -264,11 +381,19 @@ extern "C"
void cudaUpdatePbc(Atom *atom, Parameter *param) {
const int threads_num = 512;
dim3 block_size = dim3(threads_num, 1, 1);;
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);;
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);
#ifdef USE_SUPER_CLUSTERS
cudaUpdatePbcSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_border_map,
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
atom->Nclusters_local, atom->Nclusters_ghost,
param->xprd, param->yprd, param->zprd);
#else
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
atom->Nclusters_local, atom->Nclusters_ghost,
param->xprd, param->yprd, param->zprd);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
}
@@ -310,8 +435,17 @@ extern "C"
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_v, cuda_scl_f,
cuda_nclusters, cuda_natoms,
atom->Nsclusters_local, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt);
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
atom->Nclusters_local, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
}

View File

@@ -0,0 +1,288 @@
extern "C" {
#include <stdio.h>
//---
#include <cuda.h>
#include <driver_types.h>
//---
#include <likwid-marker.h>
//---
#include <atom.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <util.h>
}
extern "C" {
extern MD_FLOAT *cuda_cl_x;
extern MD_FLOAT *cuda_cl_v;
extern MD_FLOAT *cuda_cl_f;
extern int *cuda_neighbors;
extern int *cuda_numneigh;
extern int *cuda_natoms;
extern int *natoms;
extern int *ngatoms;
extern int *cuda_border_map;
extern int *cuda_jclusters_natoms;
extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
extern int isReneighboured;
extern int *cuda_iclusters;
extern int *cuda_nclusters;
extern MD_FLOAT *cuda_scl_x;
extern MD_FLOAT *cuda_scl_v;
extern MD_FLOAT *cuda_scl_f;
}
#ifdef USE_SUPER_CLUSTERS
extern "C"
void alignDataToSuperclusters(Atom *atom) {
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
/*
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
*/
memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
}
extern "C"
void alignDataFromSuperclusters(Atom *atom) {
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
/*
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
*/
memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
}
__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters,
int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
if (sci_pos >= Nsclusters_local) return;
//unsigned int ci_pos = cii_pos / CLUSTER_M;
//unsigned int scii_pos = cii_pos % CLUSTER_M;
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
//if (scii_pos >= cuda_natoms[ci_pos]) return;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos];
ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos];
ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos];
}
}
__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
if (sci_pos >= Nsclusters_local) return;
//unsigned int ci_pos = cii_pos / CLUSTER_M;
//unsigned int scii_pos = cii_pos % CLUSTER_M;
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
//if (scii_pos >= cuda_natoms[ci_pos]) return;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
}
}
__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_iclusters,
int Nsclusters_local,
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
unsigned int ci_pos = scii_pos / CLUSTER_M;
unsigned int cii_pos = scii_pos % CLUSTER_M;
if (ci_pos >= cuda_nclusters[sci_pos]) return;
int ci_cj0 = CJ0_FROM_CI(ci_pos);
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
//int numneighs = cuda_numneigh[ci_pos];
int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];
for(int k = 0; k < numneighs; k++) {
int glob_j = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
int scj = glob_j / SCLUSTER_SIZE;
// TODO Make cj accessible from super cluster data alignment (not reachable right now)
int cj = SCJ_VECTOR_BASE_INDEX(scj) + CLUSTER_M * (glob_j % SCLUSTER_SIZE);
int cj_vec_base = cj;
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
//int cond = ci_cj0 != cj || cii_pos != cjj_pos || scj != sci_pos;
int cond = (glob_j != cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] && cii_pos != cjj_pos);
if(cond) {
MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
if(half_neigh) {
atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
}
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix);
atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy);
atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz);
}
}
}
}
extern "C"
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
if (isReneighboured) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
}
for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
//memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters);
}
memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
isReneighboured = 0;
}
const int threads_num = 1;
dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
cuda_nclusters, cuda_iclusters,
atom->Nsclusters_local,
cuda_numneigh, cuda_neighbors,
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
sigma6, epsilon);
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
return E-S;
}
#endif //USE_SUPER_CLUSTERS

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,6 @@
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#include <box.h>
#ifndef __ATOM_H_
#define __ATOM_H_
@@ -23,8 +22,25 @@
# define KERNEL_NAME "CUDA"
# define CLUSTER_M 8
# define CLUSTER_N VECTOR_WIDTH
# define UNROLL_J 1
#ifdef USE_SUPER_CLUSTERS
# define XX 0
# define YY 1
# define ZZ 2
# define SCLUSTER_SIZE_X 2
# define SCLUSTER_SIZE_Y 2
# define SCLUSTER_SIZE_Z 2
# define SCLUSTER_SIZE (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)
# define DIM_COORD(dim,coord) ((dim == XX) ? atom_x(coord) : ((dim == YY) ? atom_y(coord) : atom_z(coord)))
# define MIN(a,b) ({int _a = (a), _b = (b); _a < _b ? _a : _b; })
# define SCLUSTER_M CLUSTER_M * SCLUSTER_SIZE
# define computeForceLJ computeForceLJSup_cuda
#else
# define computeForceLJ computeForceLJ_cuda
#endif //USE_SUPER_CLUSTERS
# define initialIntegrate cudaInitialIntegrate
# define finalIntegrate cudaFinalIntegrate
# define updatePbc cudaUpdatePbc
@@ -34,15 +50,11 @@
# if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2)
# define UNROLL_I 4
# define UNROLL_J 2
# define computeForceLJ computeForceLJ_2xnn
// Simd4xN
# else
# define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH
# define UNROLL_I 4
# define UNROLL_J 1
# define computeForceLJ computeForceLJ_4xn
# endif
# ifdef USE_REFERENCE_VERSION
@@ -61,16 +73,29 @@
# define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define CJ1_FROM_SCI(a) (a)
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
# define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_M * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
# define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#else
# error "Invalid cluster configuration!"
#endif
@@ -84,14 +109,37 @@
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
#ifdef USE_SUPER_CLUSTERS
#define SCI_SCALAR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 1))
#define SCI_VECTOR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 3))
#define SCJ_SCALAR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 1))
#define SCJ_VECTOR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 3))
#endif //USE_SUPER_CLUSTERS
#if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M)
#ifdef USE_SUPER_CLUSTERS
# define SCL_CL_X_OFFSET(ci) (ci * CLUSTER_M + 0 * SCLUSTER_M)
# define SCL_CL_Y_OFFSET(ci) (ci * CLUSTER_M + 1 * SCLUSTER_M)
# define SCL_CL_Z_OFFSET(ci) (ci * CLUSTER_M + 2 * SCLUSTER_M)
# define SCL_X_OFFSET (0 * SCLUSTER_M)
# define SCL_Y_OFFSET (1 * SCLUSTER_M)
# define SCL_Z_OFFSET (2 * SCLUSTER_M)
#endif //USE_SUPER_CLUSTERS
#else
# define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N)
#ifdef USE_SUPER_CLUSTERS
# define SCL_X_OFFSET (0 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Y_OFFSET (1 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Z_OFFSET (2 * SCLUSTER_SIZE * CLUSTER_N)
#endif //USE_SUPER_CLUSTERS
#endif
typedef struct {
@@ -101,9 +149,16 @@ typedef struct {
MD_FLOAT bbminz, bbmaxz;
} Cluster;
typedef struct {
int nclusters;
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
} SuperCluster;
typedef struct {
int Natoms, Nlocal, Nghost, Nmax;
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max, NmaxGhost,ncj;
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
int *border_map;
@@ -113,7 +168,6 @@ typedef struct {
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
//track the movement of a particle along boundaries
int *PBCx, *PBCy, *PBCz;
// Data in cluster format
MD_FLOAT *cl_x;
@@ -123,20 +177,20 @@ typedef struct {
Cluster *iclusters, *jclusters;
int *icluster_bin;
int dummy_cj;
MD_UINT *exclusion_filter;
MD_FLOAT *diagonal_4xn_j_minus_i;
MD_FLOAT *diagonal_2xnn_j_minus_i;
unsigned int masks_2xnn_hn[8];
unsigned int masks_2xnn_fn[8];
unsigned int masks_4xn_hn[16];
unsigned int masks_4xn_fn[16];
//Info Subdomain
Box mybox;
#ifdef USE_SUPER_CLUSTERS
int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max;
MD_FLOAT *scl_x;
MD_FLOAT *scl_v;
MD_FLOAT *scl_f;
int *scl_type;
int *icluster_idx;
SuperCluster *siclusters;
int *sicluster_bin;
#endif //USE_SUPER_CLUSTERS
} Atom;
extern void initAtom(Atom*);
extern void initMasks(Atom*);
extern void createAtom(Atom*, Parameter*);
extern int readAtom(Atom*, Parameter*);
extern int readAtom_pdb(Atom*, Parameter*);
@@ -144,18 +198,7 @@ extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*);
extern void growClusters(Atom*);
int packGhost(Atom*, int, MD_FLOAT* , int*);
int unpackGhost(Atom*, int, MD_FLOAT*);
int packExchange(Atom*, int, MD_FLOAT*);
int unpackExchange(Atom*, int, MD_FLOAT*);
void packForward(Atom*, int, int*, MD_FLOAT*, int*);
void unpackForward(Atom*, int, int, MD_FLOAT*);
void packReverse(Atom* , int , int , MD_FLOAT*);
void unpackReverse(Atom*, int, int*, MD_FLOAT*);
void pbc(Atom*);
void copy(Atom*, int, int);
extern void growSuperClusters(Atom*);
#ifdef AOS
# define POS_DATA_LAYOUT "AoS"

View File

@@ -9,13 +9,10 @@
#include <atom.h>
#include <parameter.h>
#include <util.h>
#include <timers.h>
#include <timing.h>
#include <simd.h>
/*
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
@@ -35,9 +32,9 @@ void cpuInitialIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
}
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
@@ -49,56 +46,6 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) {
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
}
}
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
}
*/
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci+=2) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce);
MD_SIMD_FLOAT dt = simd_broadcast(param->dt);
MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET]));
MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET]));
MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET]));
MD_SIMD_FLOAT x_vector = simd_fma(vx_vector, dt, simd_load(&ci_x[CL_X_OFFSET]));
MD_SIMD_FLOAT y_vector = simd_fma(vy_vector, dt, simd_load(&ci_x[CL_Y_OFFSET]));
MD_SIMD_FLOAT z_vector = simd_fma(vz_vector, dt, simd_load(&ci_x[CL_Z_OFFSET]));
simd_store(&ci_v[CL_X_OFFSET], vx_vector);
simd_store(&ci_v[CL_Y_OFFSET], vy_vector);
simd_store(&ci_v[CL_Z_OFFSET], vz_vector);
simd_store(&ci_x[CL_X_OFFSET], x_vector);
simd_store(&ci_x[CL_Y_OFFSET], y_vector);
simd_store(&ci_x[CL_Z_OFFSET], z_vector);
}
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
}
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci+=2) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce);
MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET]));
MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET]));
MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET]));
simd_store(&ci_v[CL_X_OFFSET], vx_vector);
simd_store(&ci_v[CL_Y_OFFSET], vy_vector);
simd_store(&ci_v[CL_Z_OFFSET], vz_vector);
}
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
}
@@ -107,6 +54,3 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) {
void cudaInitialIntegrate(Parameter*, Atom*);
void cudaFinalIntegrate(Parameter*, Atom*);
#endif

View File

@@ -9,50 +9,15 @@
#ifndef __NEIGHBOR_H_
#define __NEIGHBOR_H_
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
// interaction masks (1 = interaction, 0 = no interaction)
// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
// so read them from right to left (least significant to most significant bit)
// All interaction mask is the same for all kernels
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
// 4x4 kernel diagonal mask
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
// 4x2 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
// 4x8 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
typedef struct {
int cluster;
int atom;
} Pair;
typedef struct {
int every;
int ncalls;
int* neighbors;
int maxneighs;
int* numneigh;
int* numneigh_masked;
int half_neigh;
int* neighbors;
unsigned int* neighbors_imask;
//MPI
/*
int Nshell; //# of atoms in listShell(Cluster here cover all possible ghost interactions)
int *numNeighShell; //# of neighs for each atom in listShell
Pair *neighshell; //list of neighs for each atom in listShell
Pair *listshell; //Atoms to compute the force
*/
int Nshell; //# of cluster in listShell(Cluster here cover all possible ghost interactions)
int *numNeighShell; //# of neighs for each atom in listShell
int *neighshell; //list of neighs for each atom in listShell
int *listshell; //Atoms to compute the force
} Neighbor;
extern void initNeighbor(Neighbor*, Parameter*);
extern void setupNeighbor(Parameter*, Atom*);
extern void binatoms(Atom*);
@@ -60,6 +25,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*);
extern void buildClusters(Atom*);
extern void buildClustersGPU(Atom*);
extern void defineJClusters(Atom*);
extern void binClusters(Atom*);
extern void updateSingleAtoms(Atom*);

View File

@@ -16,5 +16,8 @@ extern void setupPbc(Atom*, Parameter*);
#ifdef CUDA_TARGET
extern void cudaUpdatePbc(Atom*, Parameter*, int);
#if defined(USE_SUPER_CLUSTERS)
extern void setupPbcGPU(Atom*, Parameter*);
#endif //defined(USE_SUPER_CLUSTERS)
#endif
#endif

19
gromacs/includes/utils.h Normal file
View File

@@ -0,0 +1,19 @@
/*
* Temporal functions for debugging, remove before proceeding with pull request
*/
#ifndef MD_BENCH_UTILS_H
#define MD_BENCH_UTILS_H
#include <atom.h>
#include <neighbor.h>
#ifdef USE_SUPER_CLUSTERS
void verifyClusters(Atom *atom);
void verifyLayout(Atom *atom);
void checkAlignment(Atom *atom);
void showSuperclusters(Atom *atom);
void printNeighs(Atom *atom, Neighbor *neighbor);
#endif //USE_SUPER_CLUSTERS
#endif //MD_BENCH_UTILS_H

View File

@@ -5,15 +5,13 @@
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <comm.h>
#include <parameter.h>
#ifndef __VTK_H_
#define __VTK_H_
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep);
#endif

View File

@@ -60,15 +60,18 @@ void init(Parameter *param) {
param->eam_file = NULL;
}
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac;
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
if(pattern == P_RAND && ncj <= nneighs) {
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
@@ -77,7 +80,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
int m = (pattern == P_SEQ) ? ncj : nneighs;
int k = 0;
@@ -88,7 +90,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
do {
int cj = rand() % ncj;
neighptr[k] = cj;
neighptr_imask[k] = imask;
found = 0;
for(int l = 0; l < k; l++) {
if(neighptr[l] == cj) {
@@ -98,7 +99,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
} while(found == 1);
} else {
neighptr[k] = j;
neighptr_imask[k] = imask;
j = (j + 1) % m;
}
}
@@ -106,12 +106,10 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int r = 1; r < nreps; r++) {
for(int k = 0; k < nneighs; k++) {
neighptr[r * nneighs + k] = neighptr[k];
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
}
}
neighbor->numneigh[ci] = nneighs * nreps;
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
}
}
@@ -127,13 +125,12 @@ int main(int argc, const char *argv[]) {
int niclusters = 256; // Number of local i-clusters
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
int masked = 0; // Use masked loop
int nreps = 1;
int csv = 0;
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG_MESSAGE("Initializing parameters...\n");
DEBUG("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
@@ -159,10 +156,6 @@ int main(int argc, const char *argv[]) {
param.eam_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-m") == 0)) {
masked = 1;
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]);
continue;
@@ -213,11 +206,11 @@ int main(int argc, const char *argv[]) {
}
if(param.force_field == FF_EAM) {
DEBUG_MESSAGE("Initializing EAM parameters...\n");
DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG_MESSAGE("Initializing atoms...\n");
DEBUG("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
@@ -233,7 +226,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG_MESSAGE("Creating atoms...\n");
DEBUG("Creating atoms...\n");
while(atom->Nmax < niclusters * iclusters_natoms) {
growAtom(atom);
}
@@ -288,13 +281,13 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG_MESSAGE("Defining j-clusters...\n");
DEBUG("Defining j-clusters...\n");
defineJClusters(atom);
DEBUG_MESSAGE("Initializing neighbor lists...\n");
DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG_MESSAGE("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
DEBUG_MESSAGE("Computing forces...\n");
DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {

View File

@@ -24,10 +24,6 @@
#include <util.h>
#include <vtk.h>
#include <xtc.h>
#include <comm.h>
#include <grid.h>
#include <shell_methods.h>
#include <mpi.h>
#define HLINE "----------------------------------------------------------------------------\n"
@@ -42,57 +38,28 @@ extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighb
extern void copyDataToCUDADevice(Atom *atom);
extern void copyDataFromCUDADevice(Atom *atom);
extern void cudaDeviceFree();
#endif
double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time)
{
double S, E;
int dims = 3; //TODO: Adjust to do in 3d and 2d
S = getTimeStamp();
if(param->balance == RCB) {
rcbBalance(grid, atom, param, meanBisect,dims,0);
neighComm(comm, param, grid);
}else if(param->balance == meanTimeRCB){
rcbBalance(grid, atom, param, meanTimeBisect,dims,time);
neighComm(comm, param, grid);
}else if(param->balance == Staggered) {
staggeredBalance(grid, atom, param, time);
neighComm(comm, param, grid);
exchangeComm(comm,atom);
}else { } //Do nothing
//printGrid(grid);
E = getTimeStamp();
#ifdef USE_SUPER_CLUSTERS
#include <utils.h>
extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor);
extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor);
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
#endif //USE_SUPER_CLUSTERS
#endif //CUDA_TARGET
return E-S;
}
double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid)
{
double E,S,time;
int me;
MPI_Comm_rank(world,&me);
S = getTimeStamp();
if(param->balance == meanTimeRCB || param->balance == RCB){
rcbBalance(grid, atom, param, meanBisect,3,0);
neighComm(comm, param, grid);
}
MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms);
MPI_Barrier(world);
E = getTimeStamp();
return E-S;
}
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
param->xprd = param->nx * param->lattice;
param->yprd = param->ny * param->lattice;
param->zprd = param->nz * param->lattice;
S = getTimeStamp();
initAtom(atom);
//initPbc(atom);
initPbc(atom);
initStats(stats);
initNeighbor(neighbor, param);
if(param->input_file == NULL) {
@@ -100,48 +67,71 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
} else {
readAtom(atom, param);
}
setupGrid(grid,atom,param);
setupNeighbor(param, atom);
setupComm(comm, param, grid);
if(param->balance){
initialBalance(param, eam, atom, neighbor, stats, comm, grid);
}
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
setupPbcGPU(atom, param);
//setupPbc(atom, param);
ghostNeighbor(comm, atom, param); //change
#else
setupPbc(atom, param);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
initDevice(atom, neighbor);
E = getTimeStamp();
return E-S;
}
double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) {
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
double S, E;
S = getTimeStamp();
LIKWID_MARKER_START("reneighbour");
//updateAtomsPbc(atom, param);
updateSingleAtoms(atom);
updateAtomsPbc(atom, param);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom);
//setupPbc(atom, param);
ghostNeighbor(comm, atom, param);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
//setupPbcGPU(atom, param);
setupPbc(atom, param);
#else
setupPbc(atom, param);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp();
return E-S;
}
double updateAtoms(Comm* comm, Atom* atom){
double S,E;
S = getTimeStamp();
updateSingleAtoms(atom);
exchangeComm(comm, atom);
E = getTimeStamp();
return E-S;
void printAtomState(Atom *atom) {
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
/* int nall = atom->Nlocal + atom->Nghost; */
/* for (int i=0; i<nall; i++) { */
/* printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
/* } */
}
int main(int argc, char** argv) {
@@ -151,8 +141,7 @@ int main(int argc, char** argv) {
Neighbor neighbor;
Stats stats;
Parameter param;
Comm comm;
Grid grid;
LIKWID_MARKER_INIT;
#pragma omp parallel
{
@@ -160,10 +149,10 @@ int main(int argc, char** argv) {
//LIKWID_MARKER_REGISTER("reneighbour");
//LIKWID_MARKER_REGISTER("pbc");
}
initComm(&argc, &argv, &comm); //change
initParameter(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
if((strcmp(argv[i], "-p") == 0)) {
readParameter(&param, argv[++i]);
continue;
}
@@ -202,24 +191,6 @@ int main(int argc, char** argv) {
param.half_neigh = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-method") == 0)) {
param.method = atoi(argv[++i]);
if (param.method>2 || param.method< 0){
if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n");
endComm(&comm);
exit(0);
}
continue;
}
if((strcmp(argv[i], "-bal") == 0)) {
param.balance = atoi(argv[++i]);
if (param.balance>3 || param.balance< 0){
if(comm.myproc == 0) fprintf(stderr, "Load balance does not exist!\n");
endComm(&comm);
exit(0);
}
continue;
}
if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
param.mass = atof(argv[++i]);
continue;
@@ -250,7 +221,6 @@ int main(int argc, char** argv) {
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
//TODO: add the shell and ac print options
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
@@ -269,86 +239,115 @@ int main(int argc, char** argv) {
}
}
if(param.balance>0 && param.method == 1){
if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n");
endComm(&comm);
exit(0);
}
param.cutneigh = param.cutforce + param.skin;
timer[SETUP]=setup(&param, &eam, &atom, &neighbor, &stats, &comm, &grid);
if(comm.myproc == 0) printParameter(&param);
if(comm.myproc == 0) printf(HLINE);
if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n");
setup(&param, &eam, &atom, &neighbor, &stats);
printParameter(&param);
printf(HLINE);
//verifyNeigh(&atom, &neighbor);
printf("step\ttemp\t\tpressure\n");
computeThermo(0, &param, &atom);
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
#ifdef CUDA_TARGET
copyDataToCUDADevice(&atom);
#endif
if(param.force_field == FF_EAM) {
timer[FORCE] = computeForceEam(&eam, &param, &atom, &neighbor, &stats);
} else {
timer[FORCE] = computeForceLJ(&param, &atom, &neighbor, &stats);
}
timer[NEIGH] = 0.0;
timer[FORWARD] = 0.0;
timer[UPDATE] = 0.0;
timer[BALANCE] = 0.0;
timer[REVERSE] = reverse(&comm, &atom, &param);
MPI_Barrier(world);
timer[NEIGH] = 0.0;
timer[TOTAL] = getTimeStamp();
if(param.vtk_file != NULL) {
//write_data_to_vtk_file(param.vtk_file, &comm ,&atom, 0);
printvtk(param.vtk_file, &comm, &atom, &param, 0);
write_data_to_vtk_file(param.vtk_file, &atom, 0);
}
//TODO: modify xct
if(param.xtc_file != NULL) {
xtc_init(param.xtc_file, &atom, 0);
}
double forceTime=0.0;
double commTime=0.0;
for(int n = 0; n < param.ntimes; n++) {
//printf("Step:\t%d\r\n", n);
initialIntegrate(&param, &atom);
if((n + 1) % param.reneigh_every) {
timer[FORWARD]+=forward(&comm, &atom, &param);
if(!((n + 1) % param.prune_every)){
if(!((n + 1) % param.prune_every)) {
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
pruneNeighborGPU(&param, &atom, &neighbor);
#else
pruneNeighbor(&param, &atom, &neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
}
copyDataFromCUDADevice(&atom);
updatePbc(&atom, &param, 0);
copyDataToCUDADevice(&atom);
} else {
#ifdef CUDA_TARGET
copyDataFromCUDADevice(&atom);
#endif
timer[UPDATE] +=updateAtoms(&comm,&atom);
if(param.balance && !((n+1)%param.balance_every))
timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , &param, timer[FORCE]);
timer[NEIGH] += reneighbour(&comm, &param, &atom, &neighbor);
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
#ifdef CUDA_TARGET
copyDataToCUDADevice(&atom);
isReneighboured = 1;
#endif
}
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
/*
printf("%d\t%d\r\n", atom.Nsclusters_local, atom.Nclusters_local);
copyDataToCUDADevice(&atom);
verifyLayout(&atom);
//printClusterIndices(&atom);
*/
if(param.force_field == FF_EAM) {
timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
} else {
timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
}
timer[REVERSE] += reverse(&comm, &atom, &param);
/*
copyDataFromCUDADevice(&atom);
verifyLayout(&atom);
getchar();
*/
finalIntegrate(&param, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
computeThermo(n + 1, &param, &atom);
}
int write_pos = !((n + 1) % param.x_out_every);
int write_vel = !((n + 1) % param.v_out_every);
if(write_pos || write_vel) {
if(param.vtk_file != NULL) {
printvtk(param.vtk_file, &comm, &atom, &param, n+1);
write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
}
//TODO: xtc file
if(param.xtc_file != NULL) {
xtc_write(&atom, n + 1, write_pos, write_vel);
}
@@ -358,11 +357,11 @@ int main(int argc, char** argv) {
#ifdef CUDA_TARGET
copyDataFromCUDADevice(&atom);
#endif
MPI_Barrier(world);
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
updateAtoms(&comm,&atom);
updateSingleAtoms(&atom);
computeThermo(-1, &param, &atom);
//TODO:
if(param.xtc_file != NULL) {
xtc_end();
}
@@ -370,35 +369,17 @@ int main(int argc, char** argv) {
#ifdef CUDA_TARGET
cudaDeviceFree();
#endif
double mint[NUMTIMER];
double maxt[NUMTIMER];
double sumt[NUMTIMER];
timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE];
MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world);
MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world);
MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world);
int Nghost;
MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world);
if(comm.myproc == 0){
int n = comm.numproc;
printf(HLINE);
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes);
printf("TOTAL %.2fs\n\n",timer[TOTAL]);
printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP");
printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n");
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n);
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]);
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]);
printf(HLINE);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
printf(HLINE);
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
displayStatistics(&atom, &param, &stats, timer);
#endif
}
endComm(&comm);
#endif
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

File diff suppressed because it is too large Load Diff

View File

@@ -86,6 +86,98 @@ void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
DEBUG_MESSAGE("updatePbc end\n");
}
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void gpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
DEBUG_MESSAGE("gpuUpdatePbc start\n");
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int scj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
int sbmap_vec_base = SCJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
MD_FLOAT *scj_x = &atom->scl_x[scj_vec_base];
MD_FLOAT *sbmap_x = &atom->scl_x[sbmap_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
MD_FLOAT sbbminx = INFINITY, sbbmaxx = -INFINITY;
MD_FLOAT sbbminy = INFINITY, sbbmaxy = -INFINITY;
MD_FLOAT sbbminz = INFINITY, sbbmaxz = -INFINITY;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
MD_FLOAT sxtmp = sbmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT sytmp = sbmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT sztmp = sbmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
cj_x[CL_X_OFFSET + cjj] = xtmp;
cj_x[CL_Y_OFFSET + cjj] = ytmp;
cj_x[CL_Z_OFFSET + cjj] = ztmp;
scj_x[SCL_X_OFFSET + cjj] = sxtmp;
scj_x[SCL_Y_OFFSET + cjj] = sytmp;
scj_x[SCL_Z_OFFSET + cjj] = sztmp;
if(firstUpdate) {
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
if(sbbminx > sxtmp) { sbbminx = sxtmp; }
if(sbbmaxx < sxtmp) { sbbmaxx = sxtmp; }
if(sbbminy > sytmp) { sbbminy = sytmp; }
if(sbbmaxy < sytmp) { sbbmaxy = sytmp; }
if(sbbminz > sztmp) { sbbminz = sztmp; }
if(sbbmaxz < sztmp) { sbbmaxz = sztmp; }
}
}
if(firstUpdate) {
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
scj_x[SCL_X_OFFSET + cjj] = INFINITY;
scj_x[SCL_Y_OFFSET + cjj] = INFINITY;
scj_x[SCL_Z_OFFSET + cjj] = INFINITY;
}
atom->jclusters[cj].bbminx = bbminx;
atom->jclusters[cj].bbmaxx = bbmaxx;
atom->jclusters[cj].bbminy = bbminy;
atom->jclusters[cj].bbmaxy = bbmaxy;
atom->jclusters[cj].bbminz = bbminz;
atom->jclusters[cj].bbmaxz = bbmaxz;
}
}
DEBUG_MESSAGE("gpuUpdatePbc end\n");
}
/* relocate atoms that have left domain according
* to periodic boundary conditions */
void updateAtomsPbc(Atom *atom, Parameter *param) {
@@ -229,3 +321,91 @@ void setupPbc(Atom *atom, Parameter *param) {
cpuUpdatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n");
}
void setupPbcGPU(Atom *atom, Parameter *param) {
DEBUG_MESSAGE("setupPbcGPU start\n");
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
MD_FLOAT Cutneigh = param->cutneigh;
//int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int jfac = SCLUSTER_M / CLUSTER_M;
int ncj = atom->Nsclusters_local * jfac;
int Nghost = -1;
int Nghost_atoms = 0;
for(int cj = 0; cj < ncj; cj++) {
if(atom->jclusters[cj].natoms > 0) {
if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
growClusters(atom);
//growSuperClusters(atom);
}
if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
growPbc(atom);
}
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
/* Setup ghost atoms */
/* 6 planes */
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
/* 8 corners */
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
/* 12 edges */
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
}
if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
growClusters(atom);
//growSuperClusters(atom);
}
// Add dummy cluster at the end
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
// increase by one to make it the ghost atom count
atom->dummy_cj = ncj + Nghost + 1;
atom->Nghost = Nghost_atoms;
atom->Nclusters_ghost = Nghost + 1;
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
// Update created ghost clusters positions
gpuUpdatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbcGPU end\n");
}

View File

@@ -13,8 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
MEM_TRACER_INIT;
INDEX_TRACER_INIT;
int Nlocal = atom->Nlocal;
int *neighs;
unsigned int *neighs_imask;
int* neighs;
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@@ -35,8 +34,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
DIST_TRACE(neighs, numneighs);
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MEM_TRACE(j, 'R');
MEM_TRACE(neighs[k], 'R');
MEM_TRACE(atom_x(j), 'R');
MEM_TRACE(atom_y(j), 'R');
MEM_TRACE(atom_z(j), 'R');

332
gromacs/utils.c Normal file
View File

@@ -0,0 +1,332 @@
/*
* Temporal functions for debugging, remove before proceeding with pull request
*/
#include <stdio.h>
#include <stdlib.h>
#include <utils.h>
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
#ifdef USE_SUPER_CLUSTERS
/*
void verifyClusters(Atom *atom) {
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
for(int cii = 0; cii < CLUSTER_M; cii++, count++);
}
}
MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT));
count = 0;
unsigned int diffs = 0;
printf("######### %d #########\r\n", atom->Nsclusters_local);
for (int i = 0; i < atom->Nsclusters_local; i++) {
printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters);
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
//printf("%d\t", atom.siclusters[i].iclusters[j]);
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])];
if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++;
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
x[count] = ci_x[CL_X_OFFSET + cii];
y[count] = ci_x[CL_Y_OFFSET + cii];
z[count] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
printf("######### \t #########\r\n");
}
printf("######### Diffs: %d\t #########\r\n", diffs);
printf("\r\n");
count = 0;
diffs = 0;
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
printf("######### Diffs: %d\t #########\r\n", diffs);
}
*/
void verifyLayout(Atom *atom) {
printf("verifyLayout start\r\n");
/*
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++, count++);
}
MD_FLOAT *scl_x = malloc(atom->Nsclusters_local * SCLUSTER_SIZE * 3 * CLUSTER_M * sizeof(MD_FLOAT));
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
const unsigned int atom_offset = scci;
/*
for(int cii = 0, scii = atom_offset; cii < CLUSTER_M; cii++, scii += 3) {
scl_x[CL_X_OFFSET + scii] = ci_x[CL_X_OFFSET + cii];
scl_x[CL_Y_OFFSET + scii] = ci_x[CL_Y_OFFSET + cii];
scl_x[CL_Z_OFFSET + scii] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
memcpy(&scl_x[atom_offset], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
*/
//alignDataToSuperclusters(atom);
//for (int sci = 0; sci < 2; sci++) {
for (int sci = 4; sci < 6; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE;
MD_FLOAT *sci_x = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[cii],
sci_x[cii + SCLUSTER_SIZE * CLUSTER_M], sci_x[cii + 2 * SCLUSTER_SIZE * CLUSTER_M]);
*/
printf("%d\t%d\t%f\t%f\t%f\r\n", atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx], cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
/*
//for (int cii = 0; cii < SCLUSTER_M; ++cii) {
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + cii],
sci_x[SCL_Y_OFFSET(cl_idx) + cii], sci_x[SCL_Z_OFFSET(cl_idx) + cii]);
*/
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_Z_OFFSET(cl_idx) + ciii]);
}
*/
/*
for (int scii = scl_offset; scii < scl_offset + SCLUSTER_SIZE; scii++) {
for (int cii = 0; cii < CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[SCL_X_OFFSET(scii) + cii],
sci_x[SCL_Y_OFFSET(scii) + cii], sci_x[SCL_Z_OFFSET(scii) + cii]);
}
/*
const unsigned int cl_offset = scii * 3 * CLUSTER_M;
//MD_FLOAT *sci_x = &scl_x[CI_VECTOR_BASE_INDEX(scii)];
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[CL_X_OFFSET + cii],
sci_x[CL_Y_OFFSET + cii], sci_x[CL_Z_OFFSET + cii]);
}
*/
/*
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", scl_x[CL_X_OFFSET + cii],
scl_x[CL_Y_OFFSET + cii], scl_x[CL_Z_OFFSET + cii]);
}
*/
//}
printf("##########\t##########\r\n");
}
printf("\r\n");
//for (int ci = 0; ci < 16; ci++) {
for (int ci = 35; ci < 37; ci++) {
printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
MD_FLOAT *ci_x = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)];
//for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
for(int cii = 0; cii < CLUSTER_M; cii++) {
printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
ci_x[CL_Y_OFFSET + cii],
ci_x[CL_Z_OFFSET + cii]);
}
printf("##########\t##########\r\n");
}
printf("verifyLayout end\r\n");
/*
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
*/
}
void checkAlignment(Atom *atom) {
alignDataToSuperclusters(atom);
for (int sci = 4; sci < 6; sci++) {
MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
}
for (int ci = 35; ci < 37; ci++) {
printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
for(int cii = 0; cii < CLUSTER_M; cii++) {
printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
ci_x[CL_Y_OFFSET + cii],
ci_x[CL_Z_OFFSET + cii]);
}
printf("##########\t##########\r\n");
}
}
void showSuperclusters(Atom *atom) {
for (int sci = 4; sci < 6; sci++) {
MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
}
}
void printNeighs(Atom *atom, Neighbor *neighbor) {
for (int i = 0; i < atom->Nclusters_local; ++i) {
int neigh_num = neighbor->numneigh[i];
for (int j = 0; j < neigh_num; j++) {
printf("%d ", neighbor->neighbors[ i * neighbor->maxneighs + j]);
}
printf("\r\n");
}
}
void printClusterIndices(Atom *atom) {
for (int i = 0; i < atom->Nsclusters_local; ++i) {
int clusters_num = atom->siclusters[i].nclusters;
for (int j = 0; j < clusters_num; j++) {
printf("%d ", atom->icluster_idx[j + SCLUSTER_SIZE * i]);
}
printf("\r\n");
}
}
void verifyNeigh(Atom *atom, Neighbor *neighbor) {
buildNeighbor(atom, neighbor);
int *numneigh = (int*) malloc(atom->Nclusters_local * sizeof(int));
int *neighbors = (int*) malloc(atom->Nclusters_local * neighbor->maxneighs * sizeof(int*));
for (int i = 0; i < atom->Nclusters_local; ++i) {
int neigh_num = neighbor->numneigh[i];
numneigh[i] = neighbor->numneigh[i];
neighbor->numneigh[i] = 0;
for (int j = 0; j < neigh_num; j++) {
neighbors[i * neighbor->maxneighs + j] = neighbor->neighbors[i * neighbor->maxneighs + j];
neighbor->neighbors[i * neighbor->maxneighs + j] = 0;
}
}
buildNeighborGPU(atom, neighbor);
unsigned int num_diff = 0;
unsigned int neigh_diff = 0;
for (int i = 0; i < atom->Nclusters_local; ++i) {
int neigh_num = neighbor->numneigh[i];
if (numneigh[i] != neigh_num) num_diff++;
for (int j = 0; j < neigh_num; j++) {
if (neighbors[i * neighbor->maxneighs + j] !=
neighbor->neighbors[ i * neighbor->maxneighs + j]) neigh_diff++;
}
}
printf("%d\t%d\r\n", num_diff, neigh_diff);
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -9,19 +9,67 @@
#include <atom.h>
#include <vtk.h>
#include <mpi.h>
#include <string.h>
static MPI_File _fh;
static inline void flushBuffer(char*);
void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
write_local_atoms_to_vtk_file(filename, atom, timestep);
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
#ifdef USE_SUPER_CLUSTERS
write_super_clusters_to_vtk_file(filename, atom, timestep);
#endif //#ifdef USE_SUPER_CLUSTERS
}
#ifdef USE_SUPER_CLUSTERS
int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nsclusters_local * SCLUSTER_M);
for(int ci = 0; ci < atom->Nsclusters_local; ++ci) {
int factor = (rand() % 1000) + 1;
//double factor = ci * 10;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base];
for(int cii = 0; cii < SCLUSTER_M; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor);
}
}
fprintf(fp, "\n\n");
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1 %d\n", i);
}
fprintf(fp, "\n\n");
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1\n");
}
fprintf(fp, "\n\n");
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
fprintf(fp, "SCALARS mass double\n");
fprintf(fp, "LOOKUP_TABLE default\n");
for(int i = 0; i < atom->Nlocal; i++) {
fprintf(fp, "1.0\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
#endif //USE_SUPER_CLUSTERS
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
@@ -193,128 +241,3 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
fclose(fp);
return 0;
}
int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep)
{
char msg[256];
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh);
if(_fh == MPI_FILE_NULL) {
if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
if (comm->myproc==0){
sprintf(msg, "# vtk DataFile Version 2.0\n");
sprintf(msg, "%sParticle data\n",msg);
sprintf(msg, "%sASCII\n",msg);
sprintf(msg, "%sDATASET UNSTRUCTURED_GRID\n",msg);
sprintf(msg, "%sPOINTS %d double\n",msg, atom->Natoms);
flushBuffer(msg);
}
}
int vtkVector(Comm* comm, Atom* atom, Parameter* param)
{
if (_fh == MPI_FILE_NULL) {
if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n");
return -1;
}
int sizeline= 25; //#initial guess of characters in "%.4f %.4f %.4f\n"
int extrabuff = 100;
int sizebuff = sizeline*atom->Nlocal+extrabuff;
int mysize = 0;
char* msg = (char*) malloc(sizebuff);
sprintf(msg, "");
for(int i = 0; i < atom->Nlocal; i++){
if(mysize+extrabuff >= sizebuff){
sizebuff*= 1.5;
msg = (char*) realloc(msg, sizebuff);
}
//TODO: do not forget to add param->xlo, param->ylo, param->zlo
sprintf(msg, "%s%.4f %.4f %.4f\n",msg, atom_x(i), atom_y(i), atom_z(i));
mysize = strlen(msg);
}
int gatherSize[comm->numproc];
MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD);
int offset=0;
int globalSize = 0;
for(int i = 0; i < comm->myproc; i++)
offset+= gatherSize[i];
for(int i = 0; i < comm->numproc; i++)
globalSize+= gatherSize[i];
MPI_Offset displ;
MPI_Datatype FileType;
int GlobalSize[] = {globalSize};
int LocalSize[] = {mysize};
int Start[] = {offset};
if(LocalSize[0]>0){
MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType);
} else {
MPI_Type_vector(0,0,0,MPI_CHAR,&FileType);
}
MPI_Type_commit(&FileType);
MPI_File_get_size(_fh, &displ);
MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL);
MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE);
MPI_Barrier(MPI_COMM_WORLD);
MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL);
if (comm->myproc==0){
sprintf(msg, "\n\n");
sprintf(msg, "%sCELLS %d %d\n", msg, atom->Natoms, atom->Natoms * 2);
for(int i = 0; i < atom->Natoms; i++)
sprintf(msg, "%s1 %d\n", msg, i);
flushBuffer(msg);
sprintf(msg, "\n\n");
sprintf(msg, "%sCELL_TYPES %d\n",msg, atom->Natoms);
for(int i = 0; i < atom->Natoms; i++)
sprintf(msg, "%s1\n",msg);
flushBuffer(msg);
sprintf(msg, "\n\n");
sprintf(msg, "%sPOINT_DATA %d\n",msg,atom->Natoms);
sprintf(msg, "%sSCALARS mass double\n",msg);
sprintf(msg, "%sLOOKUP_TABLE default\n",msg);
for(int i = 0; i < atom->Natoms; i++)
sprintf(msg, "%s1.0\n",msg);
sprintf(msg, "%s\n\n",msg);
flushBuffer(msg);
}
}
void vtkClose()
{
MPI_File_close(&_fh);
_fh=MPI_FILE_NULL;
}
//TODO: print ghost and cluster using MPI
void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep)
{
if(comm->numproc == 1)
{
write_data_to_vtk_file(filename, atom, timestep);
return;
}
vtkOpen(filename, comm, atom, timestep);
vtkVector(comm, atom, param);
vtkClose();
}
static inline void flushBuffer(char* msg){
MPI_Offset displ;
MPI_File_get_size(_fh, &displ);
MPI_File_write_at(_fh, displ, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE);
}

View File

@@ -7,7 +7,6 @@ ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
ASFLAGS = -masm=intel

View File

@@ -6,29 +6,13 @@ ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
ifeq ($(ISA),AVX512)
CFLAGS = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),AVX2)
CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
CFLAGS = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),AVX)
CFLAGS = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),SSE)
CFLAGS = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
endif
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
ASFLAGS = #-masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN

View File

@@ -1,27 +1,13 @@
CC = icc
LINKER = $(CC)
OPENMP = -qopenmp
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
endif
ifeq ($(ISA),AVX2)
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX $(PROFILE)
#OPTS = -Ofast -xAVX2 $(PROFILE)
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
endif
ifeq ($(ISA),AVX)
OPTS = -Ofast -xAVX $(PROFILE)
endif
ifeq ($(ISA),SSE)
OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
#OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)

View File

@@ -3,28 +3,13 @@ LINKER = $(CC)
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
#OPTS = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
endif
ifeq ($(ISA),AVX2)
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
endif
ifeq ($(ISA),AVX)
OPTS = -Ofast -xAVX $(PROFILE)
endif
ifeq ($(ISA),SSE)
OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
#OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX $(PROFILE)
#OPTS = -Ofast -xAVX2 $(PROFILE)
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
#OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)

View File

@@ -9,15 +9,13 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
__ISA_AVX_FMA__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX2)
#__SIMD_KERNEL__=true
__ISA_AVX2__=true
#__SIMD_KERNEL__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX512)
__ISA_AVX512__=true
__SIMD_KERNEL__=true
__SIMD_WIDTH_DBL__=8
ifeq ($(strip $(DATA_TYPE)), DP)
__SIMD_KERNEL__=true
endif
endif
# SIMD width is specified in double-precision, hence it may

View File

@@ -1,32 +0,0 @@
CC = mpiicc
LINKER = $(CC)
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) #-g -debug
endif
ifeq ($(ISA),AVX2)
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX2 $(PROFILE)
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
endif
ifeq ($(ISA),AVX)
OPTS = -Ofast -xAVX $(PROFILE)
endif
ifeq ($(ISA),SSE)
OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
#OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
INCLUDES =
LIBS = -lm

View File

@@ -8,7 +8,8 @@ ANSI_CFLAGS += -Wextra
#
# A100 + Native
CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
#CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
CFLAGS = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# A40 + Native
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# Cascade Lake

View File

@@ -9,12 +9,10 @@
#include <string.h>
#include <math.h>
#include <parameter.h>
#include <atom.h>
#include <allocate.h>
#include <device.h>
#include <util.h>
#include <mpi.h>
#define DELTA 20000
@@ -23,10 +21,10 @@
#endif
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
void initAtom(Atom *atom){
void initAtom(Atom *atom) {
atom->x = NULL; atom->y = NULL; atom->z = NULL;
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
@@ -43,7 +41,6 @@ void initAtom(Atom *atom){
atom->radius = NULL;
atom->av = NULL;
atom->r = NULL;
atom->border_map = NULL;
DeviceAtom *d_atom = &(atom->d_atom);
d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL;
@@ -55,19 +52,12 @@ void initAtom(Atom *atom){
d_atom->sigma6 = NULL;
d_atom->cutforcesq = NULL;
d_atom->cutneighsq = NULL;
//MPI
Box *mybox = &(atom->mybox);
mybox->xprd = mybox->yprd = mybox->zprd = 0;
mybox->lo[_x] = mybox->lo[_y] = mybox->lo[_z] = 0;
mybox->hi[_x] = mybox->hi[_y] = mybox->hi[_z] = 0;
}
void createAtom(Atom *atom, Parameter *param) {
MD_FLOAT xlo = 0; MD_FLOAT xhi = param->xprd;
MD_FLOAT ylo = 0; MD_FLOAT yhi = param->yprd;
MD_FLOAT zlo = 0; MD_FLOAT zhi = param->zprd;
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
atom->Natoms = 4 * param->nx * param->ny * param->nz;
atom->Nlocal = 0;
atom->ntypes = param->ntypes;
@@ -141,7 +131,7 @@ void createAtom(Atom *atom, Parameter *param) {
}
vztmp = myrandom(&n);
while(atom->Nlocal >= atom->Nmax) {
if(atom->Nlocal == atom->Nmax) {
growAtom(atom);
}
@@ -173,42 +163,38 @@ int type_str2int(const char *type) {
return -1;
}
int readAtom(Atom *atom, Parameter *param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
int readAtom(Atom* atom, Parameter* param) {
int len = strlen(param->input_file);
if(strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
if(strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
if(strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
if(strncmp(&param->input_file[len - 3], ".in", 3) == 0) { return readAtom_in(atom, param); }
if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
exit(-1);
return -1;
}
int readAtom_pdb(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int read_atoms = 0;
if(!fp) {
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
while(!feof(fp)) {
readline(line, fp);
char *item = strtok(line, "\t ");
char *item = strtok(line, " ");
if(strncmp(item, "CRYST1", 6) == 0) {
param->xlo = 0.0;
param->xhi = atof(strtok(NULL, "\t "));
param->xhi = atof(strtok(NULL, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, "\t "));
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, "\t "));
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
@@ -217,23 +203,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
char *label;
int atom_id, comp_id;
MD_FLOAT occupancy, charge;
atom_id = atoi(strtok(NULL, "\t ")) - 1;
atom_id = atoi(strtok(NULL, " ")) - 1;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
label = strtok(NULL, "\t ");
comp_id = atoi(strtok(NULL, "\t "));
atom_x(atom_id) = atof(strtok(NULL, "\t "));
atom_y(atom_id) = atof(strtok(NULL, "\t "));
atom_z(atom_id) = atof(strtok(NULL, "\t "));
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
label = strtok(NULL, " ");
comp_id = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = 0.0;
atom_vy(atom_id) = 0.0;
atom_vz(atom_id) = 0.0;
occupancy = atof(strtok(NULL, "\t "));
charge = atof(strtok(NULL, "\t "));
occupancy = atof(strtok(NULL, " "));
charge = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
@@ -245,14 +231,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
strncmp(item, "ENDMDL", 6) == 0) {
// Do nothing
} else {
if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
}
if(!read_atoms) {
if(me==0)fprintf(stderr, "Input error: No atoms read!\n");
fprintf(stderr, "Input error: No atoms read!\n");
exit(-1);
return -1;
}
@@ -268,15 +254,12 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_gro(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
char desc[MAXLINE];
@@ -285,7 +268,7 @@ int readAtom_gro(Atom* atom, Parameter* param) {
int i = 0;
if(!fp) {
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
@@ -294,26 +277,26 @@ int readAtom_gro(Atom* atom, Parameter* param) {
for(i = 0; desc[i] != '\n'; i++);
desc[i] = '\0';
readline(line, fp);
atoms_to_read = atoi(strtok(line, "\t "));
if(me==0)fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
atoms_to_read = atoi(strtok(line, " "));
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
while(!feof(fp) && read_atoms < atoms_to_read) {
readline(line, fp);
char *label = strtok(line, "\t ");
int type = type_str2int(strtok(NULL, "\t "));
int atom_id = atoi(strtok(NULL, "\t ")) - 1;
char *label = strtok(line, " ");
int type = type_str2int(strtok(NULL, " "));
int atom_id = atoi(strtok(NULL, " ")) - 1;
atom_id = read_atoms;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type;
atom_x(atom_id) = atof(strtok(NULL, "\t "));
atom_y(atom_id) = atof(strtok(NULL, "\t "));
atom_z(atom_id) = atof(strtok(NULL, "\t "));
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = atof(strtok(NULL, " "));
atom_vy(atom_id) = atof(strtok(NULL, " "));
atom_vz(atom_id) = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
@@ -323,18 +306,18 @@ int readAtom_gro(Atom* atom, Parameter* param) {
if(!feof(fp)) {
readline(line, fp);
param->xlo = 0.0;
param->xhi = atof(strtok(line, "\t "));
param->xhi = atof(strtok(line, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, "\t "));
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, "\t "));
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
}
if(read_atoms != atoms_to_read) {
if(me==0)fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
exit(-1);
return -1;
}
@@ -350,14 +333,12 @@ int readAtom_gro(Atom* atom, Parameter* param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_dmp(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int natoms = 0;
@@ -366,7 +347,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
int ts = -1;
if(!fp) {
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
@@ -389,47 +370,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
}
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
readline(line, fp);
param->xlo = atof(strtok(line, "\t "));
param->xhi = atof(strtok(NULL, "\t "));
param->xlo = atof(strtok(line, " "));
param->xhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
readline(line, fp);
param->ylo = atof(strtok(line, "\t "));
param->yhi = atof(strtok(NULL, "\t "));
param->ylo = atof(strtok(line, " "));
param->yhi = atof(strtok(NULL, " "));
param->yprd = param->yhi - param->ylo;
readline(line, fp);
param->zlo = atof(strtok(line, "\t "));
param->zhi = atof(strtok(NULL, "\t "));
param->zlo = atof(strtok(line, " "));
param->zhi = atof(strtok(NULL, " "));
param->zprd = param->zhi - param->zlo;
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
for(int i = 0; i < natoms; i++) {
readline(line, fp);
atom_id = atoi(strtok(line, "\t ")) - 1;
atom->type[atom_id] = atoi(strtok(NULL, "\t "));
atom_x(atom_id) = atof(strtok(NULL, "\t "));
atom_y(atom_id) = atof(strtok(NULL, "\t "));
atom_z(atom_id) = atof(strtok(NULL, "\t "));
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
atom_id = atoi(strtok(line, " ")) - 1;
atom->type[atom_id] = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = atof(strtok(NULL, " "));
atom_vy(atom_id) = atof(strtok(NULL, " "));
atom_vz(atom_id) = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
read_atoms++;
}
} else {
if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
} else {
if(me==0)fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
exit(-1);
return -1;
}
}
if(ts < 0 || !natoms || !read_atoms) {
if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
fprintf(stderr, "Input error: atom data was not read!\n");
exit(-1);
return -1;
}
@@ -445,34 +426,30 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
return natoms;
}
int readAtom_in(Atom* atom, Parameter* param) {
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int natoms = 0;
int atom_id = 0;
if(!fp) {
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
readline(line, fp);
natoms = atoi(strtok(line, "\t "));
param->xlo = atof(strtok(NULL, "\t "));
param->xhi = atof(strtok(NULL, "\t "));
param->ylo = atof(strtok(NULL, "\t "));
param->yhi = atof(strtok(NULL, "\t "));
param->zlo = atof(strtok(NULL, "\t "));
param->zhi = atof(strtok(NULL, "\t "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
natoms = atoi(strtok(line, " "));
param->xlo = atof(strtok(NULL, " "));
param->xhi = atof(strtok(NULL, " "));
param->ylo = atof(strtok(NULL, " "));
param->yhi = atof(strtok(NULL, " "));
param->zlo = atof(strtok(NULL, " "));
param->zhi = atof(strtok(NULL, " "));
atom->Natoms = natoms;
atom->Nlocal = natoms;
atom->ntypes = 1;
@@ -485,26 +462,27 @@ int readAtom_in(Atom* atom, Parameter* param) {
readline(line, fp);
// TODO: store mass per atom
char *s_mass = strtok(line, "\t ");
char *s_mass = strtok(line, " ");
if(strncmp(s_mass, "inf", 3) == 0) {
// Set atom's mass to INFINITY
} else {
param->mass = atof(s_mass);
}
atom->radius[atom_id] = atof(strtok(NULL, "\t "));
atom_x(atom_id) = atof(strtok(NULL, "\t "));
atom_y(atom_id) = atof(strtok(NULL, "\t "));
atom_z(atom_id) = atof(strtok(NULL, "\t "));
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
atom->radius[atom_id] = atof(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = atof(strtok(NULL, " "));
atom_vy(atom_id) = atof(strtok(NULL, " "));
atom_vz(atom_id) = atof(strtok(NULL, " "));
atom->type[atom_id] = 0;
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
atom_id++;
}
if(!natoms) {
if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
fprintf(stderr, "Input error: atom data was not read!\n");
exit(-1);
return -1;
}
@@ -520,7 +498,7 @@ int readAtom_in(Atom* atom, Parameter* param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
return natoms;
}
@@ -552,125 +530,7 @@ void growAtom(Atom *atom) {
REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
// DEM
atom->radius = (MD_FLOAT*) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
atom->r = (MD_FLOAT*) reallocate(atom->r, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4);
}
/* MPI added*/
void packForward(Atom* atom, int n ,int* list, MD_FLOAT* buf, int* pbc)
{
int i, j;
for(i = 0; i < n; i++) {
j = list[i];
buf_x(i) = atom_x(j) + pbc[0] * atom->mybox.xprd;
buf_y(i) = atom_y(j) + pbc[1] * atom->mybox.yprd;
buf_z(i) = atom_z(j) + pbc[2] * atom->mybox.zprd;
}
}
void unpackForward(Atom* atom, int n, int first, MD_FLOAT* buf)
{
for(int i = 0; i < n; i++) {
atom_x((first + i)) = buf_x(i);
atom_y((first + i)) = buf_y(i);
atom_z((first + i)) = buf_z(i);
}
}
int packGhost(Atom* atom, int i, MD_FLOAT* buf, int* pbc)
{
int m = 0;
buf[m++] = atom_x(i) + pbc[_x] * atom->mybox.xprd;
buf[m++] = atom_y(i) + pbc[_y] * atom->mybox.yprd;
buf[m++] = atom_z(i) + pbc[_z] * atom->mybox.zprd;
buf[m++] = atom->type[i];
return m;
}
int unpackGhost(Atom* atom, int i, MD_FLOAT* buf)
{
while (i>=atom->Nmax) growAtom(atom);
int m = 0;
atom_x(i) = buf[m++];
atom_y(i) = buf[m++];
atom_z(i) = buf[m++];
atom->type[i] = buf[m++];
atom->Nghost++;
return m;
}
void packReverse(Atom* atom, int n, int first, MD_FLOAT* buf)
{
for(int i = 0; i < n; i++) {
buf_x(i) = atom_fx(first + i);
buf_y(i) = atom_fy(first + i);
buf_z(i) = atom_fz(first + i);
}
}
void unpackReverse(Atom* atom, int n, int* list, MD_FLOAT* buf)
{
int i, j;
for(i = 0; i < n; i++) {
j = list[i];
atom_fx(j) += buf_x(i);
atom_fy(j) += buf_y(i);
atom_fz(j) += buf_z(i);
}
}
int packExchange(Atom* atom, int i, MD_FLOAT* buf)
{
int m = 0;
buf[m++] = atom_x(i);
buf[m++] = atom_y(i);
buf[m++] = atom_z(i);
buf[m++] = atom_vx(i);
buf[m++] = atom_vy(i);
buf[m++] = atom_vz(i);
buf[m++] = atom->type[i];
return m;
}
int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
{
while(i >= atom->Nmax) growAtom(atom);
int m = 0;
atom_x(i) = buf[m++];
atom_y(i) = buf[m++];
atom_z(i) = buf[m++];
atom_vx(i) = buf[m++];
atom_vy(i) = buf[m++];
atom_vz(i) = buf[m++];
atom->type[i] = buf[m++];
return m;
}
void pbc(Atom* atom)
{
for(int i = 0; i < atom->Nlocal; i++) {
MD_FLOAT xprd = atom->mybox.xprd;
MD_FLOAT yprd = atom->mybox.yprd;
MD_FLOAT zprd = atom->mybox.zprd;
if(atom_x(i) < 0.0) atom_x(i) += xprd;
if(atom_y(i) < 0.0) atom_y(i) += yprd;
if(atom_z(i) < 0.0) atom_z(i)+= zprd;
if(atom_x(i) >= xprd) atom_x(i) -= xprd;
if(atom_y(i) >= yprd) atom_y(i) -= yprd;
if(atom_z(i) >= zprd) atom_z(i) -= zprd;
}
}
void copy(Atom* atom, int i, int j)
{
atom_x(i) = atom_x(j);
atom_y(i) = atom_y(j);
atom_z(i) = atom_z(j);
atom_vx(i) = atom_vx(j);
atom_vy(i) = atom_vy(j);
atom_vz(i) = atom_vz(j);
atom->type[i] = atom->type[j];
}

View File

@@ -29,7 +29,7 @@ extern "C" {
}
// cuda kernel
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= Nlocal) {
return;
@@ -46,10 +46,6 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
for(int k = 0; k < numneighs; k++) {
int j = neigh_neighbors[Nlocal * k + i];
MD_FLOAT delx = xtmp - atom_x(j);
@@ -59,7 +55,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * ntypes + type_j;
const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
@@ -113,7 +109,7 @@ extern "C" {
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
@@ -127,7 +123,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
@@ -140,11 +136,13 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
}
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
int Nlocal = atom->Nlocal;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
/*
int nDevices;
@@ -167,7 +165,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
double S = getTimeStamp();
LIKWID_MARKER_START("force");
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
cuda_assert("calc_force", cudaPeekAtLastError());
cuda_assert("calc_force", cudaDeviceSynchronize());
cudaProfilerStop();

View File

@@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
__global__ void compute_neighborhood(
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= nlocal) {
@@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
#ifdef EXPLICIT_TYPES
int type_j = atom->type[j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
#else
const MD_FLOAT cutoff = cutneighsq;
#endif
@@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
int nall = atom->Nlocal + atom->Nghost;
cudaProfilerStart();
@@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
c_new_maxneighs,
cutneighsq, atom->ntypes);
cutneighsq);
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());

View File

@@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
if(reneigh) {
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
@@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
}
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;

View File

@@ -14,7 +14,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
@@ -23,7 +22,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
}

View File

@@ -31,12 +31,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
double S = getTimeStamp();
#pragma omp parallel
{
LIKWID_MARKER_START("force_eam_fp");
#pragma omp for
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
@@ -99,19 +95,13 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
}
LIKWID_MARKER_STOP("force_eam_fp");
}
// We still need to update fp for PBC atoms
for(int i = 0; i < atom->Nghost; i++) {
fp[Nlocal + i] = fp[atom->border_map[i]];
}
#pragma omp parallel
{
LIKWID_MARKER_START("force_eam");
#pragma omp for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
@@ -202,8 +192,6 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
}
LIKWID_MARKER_STOP("force_eam");
}
double E = getTimeStamp();
return E-S;
}

View File

@@ -13,48 +13,39 @@
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <mpi.h>
#include <util.h>
#ifdef __SIMD_KERNEL__
#include <simd.h>
#endif
void computeForceGhostShell(Parameter*, Atom*, Neighbor*);
double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
int Nlocal = atom->Nlocal;
int Nghost = atom->Nghost;
int* neighs;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
const MD_FLOAT num1 = 1.0;
const MD_FLOAT num48 = 48.0;
const MD_FLOAT num05 = 0.5;
for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0;
atom_fy(i) = 0.0;
atom_fz(i) = 0.0;
}
double S = getTimeStamp();
#pragma omp parallel
{
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0.0;
MD_FLOAT fiy = 0.0;
MD_FLOAT fiz = 0.0;
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
@@ -74,14 +65,14 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif
if(rsq < cutforcesq) {
MD_FLOAT sr2 = num1 / rsq;
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
#ifdef USE_REFERENCE_VERSION
addStat(stats->atoms_within_cutoff, 1);
} else {
@@ -89,51 +80,38 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
#endif
}
}
atom_fx(i) += fix;
atom_fy(i) += fiy;
atom_fz(i) += fiz;
#ifdef USE_REFERENCE_VERSION
if(numneighs % VECTOR_WIDTH > 0) {
addStat(stats->atoms_outside_cutoff, VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
}
#endif
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
LIKWID_MARKER_STOP("force");
}
double E = getTimeStamp();
return E-S;
}
double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
int Nlocal = atom->Nlocal;
int Nghost = atom->Nghost;
int* neighs;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
const MD_FLOAT num1 = 1.0;
const MD_FLOAT num48 = 48.0;
const MD_FLOAT num05 = 0.5;
for(int i = 0; i < Nlocal+Nghost; i++) {
for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0;
atom_fy(i) = 0.0;
atom_fz(i) = 0.0;
}
double S = getTimeStamp();
#pragma omp parallel
{
double S = getTimeStamp();
LIKWID_MARKER_START("forceLJ-halfneigh");
#pragma omp for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
@@ -168,20 +146,22 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
#endif
if(rsq < cutforcesq) {
MD_FLOAT sr2 = num1 / rsq;
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
// We need to update forces for ghost atoms if shell_method or half stencil is requiered
if((param->half_neigh && j<Nlocal) || param->method){
// We do not need to update forces for ghost atoms
if(j < Nlocal) {
atom_fx(j) -= delx * force;
atom_fy(j) -= dely * force;
atom_fz(j) -= delz * force;
}
}
}
atom_fx(i) += fix;
atom_fy(i) += fiy;
atom_fz(i) += fiz;
@@ -190,10 +170,7 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor);
LIKWID_MARKER_STOP("forceLJ-halfneigh");
}
double E = getTimeStamp();
return E-S;
}
@@ -212,6 +189,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#ifndef __SIMD_KERNEL__
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
@@ -223,12 +201,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
#pragma omp parallel
{
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
@@ -269,66 +242,9 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
atom_fy(i) += simd_h_reduce_sum(fiy);
atom_fz(i) += simd_h_reduce_sum(fiz);
}
LIKWID_MARKER_STOP("force");
}
#endif
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
return E-S;
}
void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) {
int Nshell = neighbor->Nshell;
int* neighs;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
const MD_FLOAT num1 = 1.0;
const MD_FLOAT num48 = 48.0;
const MD_FLOAT num05 = 0.5;
for(int i = 0; i < Nshell; i++) {
neighs = &(neighbor->neighshell[i * neighbor->maxneighs]);
int numneigh = neighbor->numNeighShell[i];
int iatom = neighbor->listshell[i];
MD_FLOAT xtmp = atom_x(iatom);
MD_FLOAT ytmp = atom_y(iatom);
MD_FLOAT ztmp = atom_z(iatom);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
for(int k = 0; k < numneigh; k++) {
int jatom = neighs[k];
MD_FLOAT delx = xtmp - atom_x(jatom);
MD_FLOAT dely = ytmp - atom_y(jatom);
MD_FLOAT delz = ztmp - atom_z(jatom);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = num1 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
atom_fx(jatom) -= delx * force;
atom_fy(jatom) -= dely * force;
atom_fz(jatom) -= delz * force;
}
}
atom_fx(iatom) += fix;
atom_fy(iatom) += fiy;
atom_fz(iatom) += fiz;
}
}

View File

@@ -4,9 +4,8 @@
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <box.h>
#include <parameter.h>
#ifndef __ATOM_H_
#define __ATOM_H_
@@ -57,8 +56,6 @@ typedef struct {
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
//TODO: insert the id number
//MD_FLOAT *Atom_id;
// DEM
MD_FLOAT *radius;
@@ -67,9 +64,6 @@ typedef struct {
// Device data
DeviceAtom d_atom;
//Info Subdomain
Box mybox;
} Atom;
extern void initAtom(Atom*);
@@ -81,17 +75,6 @@ extern int readAtom_dmp(Atom*, Parameter*);
extern int readAtom_in(Atom*, Parameter*);
extern void growAtom(Atom*);
int packGhost(Atom*, int, MD_FLOAT*, int*);
int unpackGhost(Atom*, int, MD_FLOAT*);
int packExchange(Atom*, int, MD_FLOAT*);
int unpackExchange(Atom*, int, MD_FLOAT*);
void packForward(Atom*, int, int*, MD_FLOAT*, int*);
void unpackForward(Atom*, int, int, MD_FLOAT*);
void packReverse(Atom* , int , int , MD_FLOAT*);
void unpackReverse(Atom*, int, int*, MD_FLOAT*);
void pbc(Atom*);
void copy(Atom*, int, int);
#ifdef AOS
# define POS_DATA_LAYOUT "AoS"
# define atom_x(i) atom->x[(i) * 3 + 0]
@@ -116,8 +99,4 @@ void copy(Atom*, int, int);
# define atom_fz(i) atom->fz[i]
#endif
# define buf_x(i) buf[3*(i)]
# define buf_y(i) buf[3*(i)+1]
# define buf_z(i) buf[3*(i)+2]
#endif

View File

@@ -20,14 +20,9 @@ typedef struct {
int ncalls;
int maxneighs;
int half_neigh;
int half_stencil;
int *neighbors;
int *numneigh;
//MPI
int Nshell; //# of atoms in listShell
int *numNeighShell; //# of neighs for each atom in listShell
int *neighshell; //list of neighs for each atom in listShell
int *listshell; //Atoms to compute the force
// Device data
DeviceNeighbor d_neighbor;
} Neighbor;

View File

@@ -5,11 +5,8 @@
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <comm.h>
#include <parameter.h>
#ifndef __VTK_H_
#define __VTK_H_
extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep);
#endif

View File

@@ -59,6 +59,12 @@ void init(Parameter *param) {
param->eam_file = NULL;
}
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps;
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
@@ -119,7 +125,7 @@ int main(int argc, const char *argv[]) {
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG_MESSAGE("Initializing parameters...\n");
DEBUG("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
@@ -190,11 +196,11 @@ int main(int argc, const char *argv[]) {
}
if(param.force_field == FF_EAM) {
DEBUG_MESSAGE("Initializing EAM parameters...\n");
DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG_MESSAGE("Initializing atoms...\n");
DEBUG("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
@@ -210,7 +216,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG_MESSAGE("Creating atoms...\n");
DEBUG("Creating atoms...\n");
for(int i = 0; i < natoms; ++i) {
while(atom->Nlocal > atom->Nmax - natoms) {
growAtom(atom);
@@ -241,11 +247,11 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG_MESSAGE("Initializing neighbor lists...\n");
DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG_MESSAGE("Creating neighbor lists...\n");
DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG_MESSAGE("Computing forces...\n");
DEBUG("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {

View File

@@ -11,7 +11,9 @@
#include <limits.h>
#include <math.h>
#include <float.h>
#include <likwid-marker.h>
#include <allocate.h>
#include <atom.h>
#include <device.h>
@@ -21,19 +23,13 @@
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <pbc.h>
#include <stats.h>
#include <timers.h>
#include <util.h>
#include <vtk.h>
#include <comm.h>
#include <grid.h>
#include <shell_methods.h>
#include <mpi.h>
#define HLINE "----------------------------------------------------------------------------\n"
#ifdef CUDA_TARGET
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
#endif
extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
@@ -41,6 +37,62 @@ extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
#ifdef CUDA_TARGET
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
#endif
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
param->xprd = param->nx * param->lattice;
param->yprd = param->ny * param->lattice;
param->zprd = param->nz * param->lattice;
S = getTimeStamp();
initAtom(atom);
initPbc(atom);
initStats(stats);
initNeighbor(neighbor, param);
if(param->input_file == NULL) {
createAtom(atom, param);
} else {
readAtom(atom, param);
}
setupNeighbor(param);
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
setupPbc(atom, param);
initDevice(atom, neighbor);
updatePbc(atom, param, true);
buildNeighbor(atom, neighbor);
E = getTimeStamp();
return E-S;
}
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
double S, E;
S = getTimeStamp();
LIKWID_MARKER_START("reneighbour");
updateAtomsPbc(atom, param);
setupPbc(atom, param);
updatePbc(atom, param, true);
//sortAtom(atom);
buildNeighbor(atom, neighbor);
LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp();
return E-S;
}
void printAtomState(Atom *atom) {
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
// int nall = atom->Nlocal + atom->Nghost;
// for (int i=0; i<nall; i++) {
// printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]);
// }
}
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) {
return computeForceEam(eam, param, atom, neighbor, stats);
@@ -53,7 +105,7 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
}
}
if(param->half_neigh || param->method) {
if(param->half_neigh) {
return computeForceLJHalfNeigh(param, atom, neighbor, stats);
}
@@ -64,102 +116,6 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
#endif
}
double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time){
double S, E;
int dims = 3; //TODO: Adjust to do in 3d and 2d
S = getTimeStamp();
if(param->balance == RCB) {
rcbBalance(grid, atom, param, meanBisect,dims,0);
neighComm(comm, param, grid);
}else if(param->balance == meanTimeRCB){
rcbBalance(grid, atom, param, meanTimeBisect,dims,time);
neighComm(comm, param, grid);
}else if(param->balance == Staggered) {
staggeredBalance(grid, atom, param, time);
neighComm(comm, param, grid);
exchangeComm(comm,atom);
}else { } //Do nothing
//printGrid(grid);
E = getTimeStamp();
return E-S;
}
double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid)
{
double E,S,time;
int me;
MPI_Comm_rank(world,&me);
S = getTimeStamp();
if(param->balance == meanTimeRCB || param->balance == RCB){
rcbBalance(grid, atom, param, meanBisect,3,0);
neighComm(comm, param, grid);
}
MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms);
MPI_Barrier(world);
E = getTimeStamp();
return E-S;
}
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
param->xprd = param->nx * param->lattice;
param->yprd = param->ny * param->lattice;
param->zprd = param->nz * param->lattice;
S = getTimeStamp();
initAtom(atom);
initStats(stats);
initNeighbor(neighbor, param);
if(param->input_file == NULL) {
createAtom(atom, param);
} else {
readAtom(atom, param);
}
setupGrid(grid,atom,param);
setupNeighbor(param);
setupComm(comm, param, grid);
if(param->balance){
initialBalance(param, eam, atom, neighbor, stats, comm, grid);
}
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
initDevice(atom, neighbor);
ghostNeighbor(comm, atom, param);
buildNeighbor(atom, neighbor);
E = getTimeStamp();
return E-S;
}
double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) {
double S, E;
S = getTimeStamp();
LIKWID_MARKER_START("reneighbour");
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
ghostNeighbor(comm, atom, param);
buildNeighbor(atom, neighbor);
LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp();
return E-S;
}
double updateAtoms(Comm* comm, Atom* atom){
double S,E;
S = getTimeStamp();
exchangeComm(comm, atom);
E = getTimeStamp();
return E-S;
}
void writeInput(Parameter *param, Atom *atom) {
FILE *fpin = fopen("input.in", "w");
fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);
@@ -178,8 +134,7 @@ int main(int argc, char** argv) {
Neighbor neighbor;
Stats stats;
Parameter param;
Comm comm;
Grid grid;
LIKWID_MARKER_INIT;
#pragma omp parallel
{
@@ -187,7 +142,7 @@ int main(int argc, char** argv) {
//LIKWID_MARKER_REGISTER("reneighbour");
//LIKWID_MARKER_REGISTER("pbc");
}
initComm(&argc, &argv, &comm);
initParameter(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0)) {
@@ -229,24 +184,6 @@ int main(int argc, char** argv) {
param.half_neigh = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-method") == 0)) {
param.method = atoi(argv[++i]);
if (param.method>3 || param.method< 0){
if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n");
endComm(&comm);
exit(0);
}
continue;
}
if((strcmp(argv[i], "-bal") == 0)) {
param.balance = atoi(argv[++i]);
if (param.balance>3 || param.balance< 0){
if(comm.myproc == 0) fprintf(stderr, "Load Balance does not exist!\n");
endComm(&comm);
exit(0);
}
continue;
}
if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
param.cutforce = atof(argv[++i]);
continue;
@@ -264,68 +201,58 @@ int main(int argc, char** argv) {
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
if(comm.myproc ==0 ){
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
printf("-f <string>: force field (lj, eam or dem), default lj\n");
printf("-i <string>: input file with atom positions (dump)\n");
printf("-e <string>: input file for EAM\n");
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
printf("-r / --radius <real>: set cutoff radius\n");
printf("-s / --skin <real>: set skin (verlet buffer)\n");
printf("--freq <real>: processor frequency (GHz)\n");
printf("--vtk <string>: VTK file for visualization\n");
printf(HLINE);
}
exit(EXIT_SUCCESS);
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
printf("-f <string>: force field (lj, eam or dem), default lj\n");
printf("-i <string>: input file with atom positions (dump)\n");
printf("-e <string>: input file for EAM\n");
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
printf("-r / --radius <real>: set cutoff radius\n");
printf("-s / --skin <real>: set skin (verlet buffer)\n");
printf("--freq <real>: processor frequency (GHz)\n");
printf("--vtk <string>: VTK file for visualization\n");
printf(HLINE);
exit(EXIT_SUCCESS);
}
}
if(param.balance>0 && param.method == 1){
if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n");
endComm(&comm);
exit(0);
}
param.cutneigh = param.cutforce + param.skin;
timer[SETUP]=setup(&param, &eam, &atom, &neighbor, &stats, &comm, &grid);
if(comm.myproc == 0)printParameter(&param);
if(comm.myproc == 0)printf(HLINE);
if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n");
setup(&param, &eam, &atom, &neighbor, &stats);
printParameter(&param);
printf(HLINE);
printf("step\ttemp\t\tpressure\n");
computeThermo(0, &param, &atom);
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);// TODO: trace adress
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
//writeInput(&param, &atom);
timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
timer[NEIGH] = 0.0;
timer[FORWARD] = 0.0;
timer[UPDATE] = 0.0;
timer[BALANCE] = 0.0;
timer[REVERSE] = reverse(&comm, &atom, &param);
MPI_Barrier(world);
timer[TOTAL] = getTimeStamp();
timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
timer[NEIGH] = 0.0;
timer[TOTAL] = getTimeStamp();
if(param.vtk_file != NULL) {
printvtk(param.vtk_file, &comm, &atom, &param, 0);
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
}
for(int n = 0; n < param.ntimes; n++) {
bool reneigh = (n + 1) % param.reneigh_every == 0;
initialIntegrate(reneigh, &param, &atom);
if(reneigh) {
timer[UPDATE] +=updateAtoms(&comm,&atom);
if(param.balance && !((n+1)%param.balance_every))
timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , &param, timer[FORCE]);
timer[NEIGH] += reneighbour(&comm, &param, &atom, &neighbor);
if((n + 1) % param.reneigh_every) {
updatePbc(&atom, &param, false);
} else {
timer[FORWARD] += forward(&comm, &atom, &param);
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
}
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &stats);
timer[REVERSE] += reverse(&comm, &atom, &param);
finalIntegrate(reneigh, &param, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
@@ -336,42 +263,23 @@ int main(int argc, char** argv) {
}
if(param.vtk_file != NULL) {
printvtk(param.vtk_file, &comm, &atom ,&param, n+1);
write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
}
}
MPI_Barrier(world);
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
computeThermo(-1, &param, &atom);
double mint[NUMTIMER];
double maxt[NUMTIMER];
double sumt[NUMTIMER];
timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE];
MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world);
MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world);
MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world);
int Nghost;
MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world);
if(comm.myproc == 0){
int n = comm.numproc;
printf(HLINE);
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes);
printf("TOTAL %.2fs\n\n",timer[TOTAL]);
printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP");
printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n");
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n);
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]);
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]);
printf(HLINE);
printf("Performance: %.2f million atom updates per second\n",
printf(HLINE);
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
displayStatistics(&atom, &param, &stats, timer);
#endif
}
endComm(&comm);
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

View File

@@ -11,39 +11,27 @@
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <util.h>
#include <mpi.h>
#define SMALL 1.0e-6
#define FACTOR 0.999
MD_FLOAT xprd, yprd, zprd;
MD_FLOAT bininvx, bininvy, bininvz;
int pad_x, pad_y, pad_z;
int mbinxlo, mbinylo, mbinzlo;
int nbinx, nbiny, nbinz;
int mbinx, mbiny, mbinz; // m bins in x, y, z
int mbinx, mbiny, mbinz; // n bins in x, y, z
int *bincount;
int *bins;
int mbins; //total number of bins
int atoms_per_bin; // max atoms per bin
int mbins; //total number of bins
int atoms_per_bin; // max atoms per bin
MD_FLOAT cutneigh;
MD_FLOAT cutneighsq; // neighbor cutoff squared
MD_FLOAT cutneighsq; // neighbor cutoff squared
int nmax;
int nstencil; // # of bins in stencil
int* stencil; // stencil list of bin offsets
int nstencil; // # of bins in stencil
int* stencil; // stencil list of bin offsets
MD_FLOAT binsizex, binsizey, binsizez;
int me; //rank
int method; // method
int half_stencil; //If half stencil exist
int shellMethod; //If shell method exist
static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
static MD_FLOAT bindist(int, int, int);
static int ghostZone(Atom*, int);
static int eightZone(Atom*, int);
static int halfZone(Atom*, int);
static void neighborGhost(Atom*, Neighbor*);
static inline int interaction(Atom* atom, int i, int j);
/* exported subroutines */
void initNeighbor(Neighbor *neighbor, Parameter *param) {
@@ -63,25 +51,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
neighbor->maxneighs = 100;
neighbor->numneigh = NULL;
neighbor->neighbors = NULL;
//========== MPI =============
shellMethod = 0;
half_stencil = 0;
method = param->method;
if(method == halfShell || method == eightShell){
param->half_neigh = 1;
shellMethod = 1;
}
if(method == halfStencil){
param->half_neigh = 0;
half_stencil = 1;
}
me = 0;
MPI_Comm_rank(MPI_COMM_WORLD,&me);
neighbor->half_neigh = param->half_neigh;
neighbor->Nshell = 0;
neighbor->numNeighShell = NULL;
neighbor->neighshell = NULL;
neighbor->listshell = NULL;
}
void setupNeighbor(Parameter* param) {
@@ -94,6 +64,7 @@ void setupNeighbor(Parameter* param) {
yprd = param->yprd;
zprd = param->zprd;
}
// TODO: update lo and hi for standard case and use them here instead
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
@@ -122,48 +93,54 @@ void setupNeighbor(Parameter* param) {
bininvy = 1.0 / binsizey;
bininvz = 1.0 / binsizez;
}
pad_x = (int)(cutneigh*bininvx);
while(pad_x * binsizex < FACTOR * cutneigh) pad_x++;
pad_y = (int)(cutneigh*bininvy);
while(pad_y * binsizey < FACTOR * cutneigh) pad_y++;
pad_z = (int)(cutneigh*bininvz);
while(pad_z * binsizez < FACTOR * cutneigh) pad_z++;
coord = xlo - cutneigh - SMALL * xprd;
mbinxlo = (int) (coord * bininvx);
if (coord < 0.0) { mbinxlo = mbinxlo - 1; }
coord = xhi + cutneigh + SMALL * xprd;
mbinxhi = (int) (coord * bininvx);
coord = ylo - cutneigh - SMALL * yprd;
mbinylo = (int) (coord * bininvy);
if (coord < 0.0) { mbinylo = mbinylo - 1; }
coord = yhi + cutneigh + SMALL * yprd;
mbinyhi = (int) (coord * bininvy);
coord = zlo - cutneigh - SMALL * zprd;
mbinzlo = (int) (coord * bininvz);
if (coord < 0.0) { mbinzlo = mbinzlo - 1; }
coord = zhi + cutneigh + SMALL * zprd;
mbinzhi = (int) (coord * bininvz);
mbinxlo = mbinxlo - 1;
mbinxhi = mbinxhi + 1;
mbinx = mbinxhi - mbinxlo + 1;
mbinylo = mbinylo - 1;
mbinyhi = mbinyhi + 1;
mbiny = mbinyhi - mbinylo + 1;
mbinzlo = mbinzlo - 1;
mbinzhi = mbinzhi + 1;
mbinz = mbinzhi - mbinzlo + 1;
nextx = (int) (cutneigh * bininvx);
if(nextx * binsizex < FACTOR * cutneigh){
nextx++;
pad_x++;
}
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
nexty = (int) (cutneigh * bininvy);
if(nexty * binsizey < FACTOR * cutneigh){
nexty++;
pad_y++;
}
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
nextz = (int) (cutneigh * bininvz);
if(nextz * binsizez < FACTOR * cutneigh){
nextz++;
pad_z++;
}
mbinx = nbinx+4*pad_x;
mbiny = nbiny+4*pad_y;
mbinz = nbinz+4*pad_z;
if(nextz * binsizez < FACTOR * cutneigh) nextz++;
if (stencil) { free(stencil); }
stencil = (int*) malloc((2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
nstencil = 0;
int kstart = -nextz;
int jstart = -nexty;
int istart = -nextx;
int ibin = 0;
for(int k = kstart; k <= nextz; k++) {
for(int j = jstart; j <= nexty; j++) {
for(int i = istart; i <= nextx; i++) {
for(int j = -nexty; j <= nexty; j++) {
for(int i = -nextx; i <= nextx; i++) {
if(bindist(i, j, k) < cutneighsq) {
int jbin = k * mbiny * mbinx + j * mbinx + i;
if(ibin>jbin && half_stencil) continue;
stencil[nstencil++] = jbin;
stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i;
}
}
}
@@ -178,6 +155,7 @@ void setupNeighbor(Parameter* param) {
void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
int nall = atom->Nlocal + atom->Nghost;
/* extend atom arrays if necessary */
if(nall > nmax) {
nmax = nall;
@@ -186,13 +164,16 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
}
/* bin local & ghost atoms */
binatoms(atom);
int resize = 1;
/* loop over each atom, storing neighbors */
while(resize) {
int new_maxneighs = neighbor->maxneighs;
resize = 0;
for(int i = 0; i < atom->Nlocal; i++) {
int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
int n = 0;
@@ -203,22 +184,21 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
#ifdef EXPLICIT_TYPES
int type_i = atom->type[i];
#endif
for(int k = 0; k < nstencil; k++) {
int jbin = ibin + stencil[k];
int* loc_bin = &bins[jbin * atoms_per_bin];
for(int m = 0; m < bincount[jbin]; m++) {
int j = loc_bin[m];
if((j==i) || (neighbor->half_neigh && (j<i)))
continue;
if(half_stencil && ibin==jbin && !interaction(atom,i,j))
if((j == i) || (neighbor->half_neigh && (j < i))) {
continue;
}
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
int type_j = atom->type[j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
@@ -230,8 +210,8 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
}
}
}
neighbor->numneigh[i] = n;
neighbor->numneigh[i] = n;
if(n >= neighbor->maxneighs) {
resize = 1;
@@ -240,15 +220,14 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
}
}
}
if(resize) {
printf("RESIZE %d, PROC %d\n", neighbor->maxneighs,me);
printf("RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2;
free(neighbor->neighbors);
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
}
}
if(method == eightShell) neighborGhost(atom, neighbor);
}
/* internal subroutines */
@@ -278,22 +257,38 @@ MD_FLOAT bindist(int i, int j, int k) {
} else {
delz = (k + 1) * binsizez;
}
return (delx * delx + dely * dely + delz * delz);
}
int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin) {
int ix, iy, iz;
MD_FLOAT eps = 1e-9;
MD_FLOAT xlo=0.0; MD_FLOAT ylo=0.0; MD_FLOAT zlo=0.0;
xlo = fabs(xlo - pad_x*binsizex)+eps;
ylo = fabs(ylo - pad_y*binsizey)+eps;
zlo = fabs(zlo - pad_z*binsizez)+eps;
ix = (int) ((xin + xlo)*bininvx);
iy = (int) ((yin + ylo)*bininvy);
iz = (int) ((zin + zlo)*bininvz);
int ix, iy, iz;
return (iz * mbiny * mbinx + iy * mbinx + ix);
//return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
if(xin >= xprd) {
ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
} else if(xin >= 0.0) {
ix = (int)(xin * bininvx) - mbinxlo;
} else {
ix = (int)(xin * bininvx) - mbinxlo - 1;
}
if(yin >= yprd) {
iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
} else if(yin >= 0.0) {
iy = (int)(yin * bininvy) - mbinylo;
} else {
iy = (int)(yin * bininvy) - mbinylo - 1;
}
if(zin >= zprd) {
iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
} else if(zin >= 0.0) {
iz = (int)(zin * bininvz) - mbinzlo;
} else {
iz = (int)(zin * bininvz) - mbinzlo - 1;
}
return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
}
void binatoms(Atom *atom) {
@@ -309,7 +304,7 @@ void binatoms(Atom *atom) {
for(int i = 0; i < nall; i++) {
int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i));
if(shellMethod && !ghostZone(atom, i)) continue;
if(bincount[ibin] < atoms_per_bin) {
int ac = bincount[ibin]++;
bins[ibin * atoms_per_bin + ac] = i;
@@ -330,51 +325,54 @@ void sortAtom(Atom* atom) {
binatoms(atom);
int Nmax = atom->Nmax;
int* binpos = bincount;
for(int i = 1; i < mbins; i++) {
binpos[i] += binpos[i - 1];
for(int i=1; i<mbins; i++) {
binpos[i] += binpos[i-1];
}
#ifdef AOS
#ifdef AOS
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
#else
#else
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
#endif
#endif
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
for(int mybin = 0; mybin < mbins; mybin++) {
int start = mybin > 0 ? binpos[mybin - 1] : 0;
for(int mybin = 0; mybin<mbins; mybin++) {
int start = mybin>0?binpos[mybin-1]:0;
int count = binpos[mybin] - start;
for(int k = 0; k < count; k++) {
for(int k=0; k<count; k++) {
int new_i = start + k;
int old_i = bins[mybin * atoms_per_bin + k];
#ifdef AOS
#ifdef AOS
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
#else
#else
new_x[new_i] = old_x[old_i];
new_y[new_i] = old_y[old_i];
new_z[new_i] = old_z[old_i];
new_vx[new_i] = old_vx[old_i];
new_vy[new_i] = old_vy[old_i];
new_vz[new_i] = old_vz[old_i];
#endif
#endif
}
}
free(atom->x);
free(atom->vx);
atom->x = new_x;
atom->vx = new_vx;
#ifndef AOS
#ifndef AOS
free(atom->y);
free(atom->z);
free(atom->vy);
@@ -383,160 +381,5 @@ void sortAtom(Atom* atom) {
atom->z = new_z;
atom->vy = new_vy;
atom->vz = new_vz;
#endif
#endif
}
/* internal subroutines
Added with MPI*/
static int ghostZone(Atom* atom, int i){
if(i<atom->Nlocal) return 1;
else if(method == halfShell) return halfZone(atom,i);
else if(method == eightShell) return eightZone(atom,i);
else return 0;
}
static int eightZone(Atom* atom, int i)
{
//Mapping: 0->0, 1->1, 2->2, 3->6, 4->3, 5->5, 6->4, 7->7
int zoneMapping[] = {0, 1, 2, 6, 3, 5, 4, 7};
MD_FLOAT *hi = atom->mybox.hi;
int zone = 0;
if(BigOrEqual(atom_x(i),hi[_x])) {
zone += 1;
}
if(BigOrEqual(atom_y(i),hi[_y])) {
zone += 2;
}
if(BigOrEqual(atom_z(i),hi[_z])) {
zone += 4;
}
return zoneMapping[zone];
}
static int halfZone(Atom* atom, int i)
{
MD_FLOAT *hi = atom->mybox.hi;
MD_FLOAT *lo = atom->mybox.lo;
if(atom_x(i)<lo[_x] && atom_y(i)<hi[_y] && atom_z(i)<hi[_z]){
return 0;
} else if(atom_y(i)<lo[_y] && atom_z(i)<hi[_z]){
return 0;
} else if(atom_z(i)<lo[_z]){
return 0;
} else {
return 1;
}
}
static void neighborGhost(Atom *atom, Neighbor *neighbor) {
int Nshell=0;
int Nlocal = atom->Nlocal;
int Nghost = atom->Nghost;
if(neighbor->listshell) free(neighbor->listshell);
neighbor->listshell = (int*) malloc(Nghost * sizeof(int));
int* listzone = (int*) malloc(8 * Nghost * sizeof(int));
int countAtoms[8] = {0,0,0,0,0,0,0,0};
//Selecting ghost atoms for interaction
for(int i = Nlocal; i < Nlocal+Nghost; i++) {
int izone = ghostZone(atom,i);
int *list = &listzone[Nghost*izone];
int n = countAtoms[izone];
list[n] = i;
countAtoms[izone]++;
}
for(int zone = 1; zone<=3; zone++){
int *list = &listzone[Nghost*zone];
for(int n=0; n<countAtoms[zone]; n++)
neighbor->listshell[Nshell++] = list[n];
}
neighbor->Nshell = Nshell;
if(neighbor->numNeighShell) free(neighbor->numNeighShell);
if(neighbor->neighshell) free(neighbor->neighshell);
neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
neighbor->numNeighShell = (int*) malloc(Nshell * sizeof(int));
int resize = 1;
while(resize)
{
resize = 0;
for(int i = 0; i < Nshell; i++) {
int *neighshell = &(neighbor->neighshell[i*neighbor->maxneighs]);
int n = 0;
int iatom = neighbor->listshell[i];
int izone = ghostZone(atom, iatom);
MD_FLOAT xtmp = atom_x(iatom);
MD_FLOAT ytmp = atom_y(iatom);
MD_FLOAT ztmp = atom_z(iatom);
int ibin = coord2bin(xtmp, ytmp, ztmp);
#ifdef EXPLICIT_TYPES
int type_i = atom->type[iatom];
#endif
for(int k = 0; k < nstencil; k++) {
int jbin = ibin + stencil[k];
int* loc_bin = &bins[jbin * atoms_per_bin];
for(int m = 0; m < bincount[jbin]; m++) {
int jatom = loc_bin[m];
int jzone = ghostZone(atom,jatom);
if(jzone <=izone) continue;
if(izone == 1 && (jzone==5||jzone==6||jzone==7)) continue;
if(izone == 2 && (jzone==4||jzone==6||jzone==7)) continue;
if(izone == 3 && (jzone==4||jzone==5||jzone==7)) continue;
MD_FLOAT delx = xtmp - atom_x(jatom);
MD_FLOAT dely = ytmp - atom_y(jatom);
MD_FLOAT delz = ztmp - atom_z(jatom);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
int type_j = atom->type[jatom];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
#else
const MD_FLOAT cutoff = cutneighsq;
#endif
if(rsq <= cutoff) {
neighshell[n++] = jatom;
}
}
}
neighbor->numNeighShell[i] = n;
if(n >= neighbor->maxneighs){
resize = 1;
neighbor->maxneighs = n * 1.2;
break;
}
}
if(resize) {
free(neighbor->neighshell);
neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
}
}
free(listzone);
}
static inline int interaction(Atom* atom, int i, int j) {
if(i<j && j<atom->Nlocal) {
return 1;
} else if( atom_z(j)>atom_z(i) && j>=atom->Nlocal) {
return 1;
} else if(Equal(atom_z(j),atom_z(i)) && atom_y(j)<atom_y(i) && j>=atom->Nlocal){
return 1;
} else if(Equal(atom_z(j),atom_z(i)) && Equal(atom_y(j),atom_y(i)) && atom_x(j)<atom_x(i) && j>=atom->Nlocal){
return 1;
} else {
return 0;
}
}

View File

@@ -125,7 +125,7 @@ void setupPbc(Atom *atom, Parameter *param) {
if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (x < Cutneigh && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (x < Cutneigh && y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (x < Cutneigh && y >= Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (x < Cutneigh && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (x >= (xprd-Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,-1,+1); }

View File

@@ -6,12 +6,8 @@
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vtk.h>
#include <mpi.h>
static MPI_File _fh;
static inline void flushBuffer(char*);
#include <atom.h>
int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
@@ -22,12 +18,12 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
}
@@ -52,168 +48,3 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
fclose(fp);
return 0;
}
int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep)
{
char msg[256];
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh);
if(_fh == MPI_FILE_NULL) {
if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
if (comm->myproc==0){
sprintf(msg, "# vtk DataFile Version 2.0\n");
sprintf(msg, "%sParticle data\n",msg);
sprintf(msg, "%sASCII\n",msg);
sprintf(msg, "%sDATASET UNSTRUCTURED_GRID\n",msg);
sprintf(msg, "%sPOINTS %d double\n",msg, atom->Natoms);
flushBuffer(msg);
}
}
int vtkVector(Comm* comm, Atom* atom, Parameter* param)
{
if (_fh == MPI_FILE_NULL) {
if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n");
return -1;
}
int sizeline= 25; //#initial guess of characters in "%.4f %.4f %.4f\n"
int extrabuff = 100;
int sizebuff = sizeline*atom->Nlocal+extrabuff;
int mysize = 0;
char* msg = (char*) malloc(sizebuff);
sprintf(msg, "");
for(int i = 0; i < atom->Nlocal; i++){
if(mysize+extrabuff >= sizebuff){
sizebuff*= 1.5;
msg = (char*) realloc(msg, sizebuff);
}
//TODO: do not forget to add param->xlo, param->ylo, param->zlo
sprintf(msg, "%s%.4f %.4f %.4f\n",msg, atom_x(i), atom_y(i), atom_z(i));
mysize = strlen(msg);
}
int gatherSize[comm->numproc];
MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD);
int offset=0;
int globalSize = 0;
for(int i = 0; i < comm->myproc; i++)
offset+= gatherSize[i];
for(int i = 0; i < comm->numproc; i++)
globalSize+= gatherSize[i];
MPI_Offset displ;
MPI_Datatype FileType;
int GlobalSize[] = {globalSize};
int LocalSize[] = {mysize};
int Start[] = {offset};
if(LocalSize[0]>0){
MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType);
} else {
MPI_Type_vector(0,0,0,MPI_CHAR,&FileType);
}
MPI_Type_commit(&FileType);
MPI_File_get_size(_fh, &displ);
MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL);
MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE);
MPI_Barrier(MPI_COMM_WORLD);
MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL);
if (comm->myproc==0){
sprintf(msg, "\n\n");
sprintf(msg, "%sCELLS %d %d\n", msg, atom->Natoms, atom->Natoms * 2);
for(int i = 0; i < atom->Natoms; i++)
sprintf(msg, "%s1 %d\n", msg, i);
flushBuffer(msg);
sprintf(msg, "\n\n");
sprintf(msg, "%sCELL_TYPES %d\n",msg, atom->Natoms);
for(int i = 0; i < atom->Natoms; i++)
sprintf(msg, "%s1\n",msg);
flushBuffer(msg);
sprintf(msg, "\n\n");
sprintf(msg, "%sPOINT_DATA %d\n",msg,atom->Natoms);
sprintf(msg, "%sSCALARS mass double\n",msg);
sprintf(msg, "%sLOOKUP_TABLE default\n",msg);
for(int i = 0; i < atom->Natoms; i++)
sprintf(msg, "%s1.0\n",msg);
sprintf(msg, "%s\n\n",msg);
flushBuffer(msg);
}
}
void vtkClose()
{
MPI_File_close(&_fh);
_fh=MPI_FILE_NULL;
}
int printGhost(const char* filename, Atom* atom, int timestep, int me) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_%d_ghost%i.vtk", filename, timestep,me);
FILE* fp = fopen(timestep_filename, "wb");
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; ++i) {
fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
}
fprintf(fp, "\n\n");
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; ++i) {
fprintf(fp, "1 %d\n", i);
}
fprintf(fp, "\n\n");
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; ++i) {
fprintf(fp, "1\n");
}
fprintf(fp, "\n\n");
fprintf(fp, "POINT_DATA %d\n", atom->Nghost);
fprintf(fp, "SCALARS mass double\n");
fprintf(fp, "LOOKUP_TABLE default\n");
for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; i++) {
fprintf(fp, "1.0\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep)
{
if(comm->numproc == 1)
{
write_atoms_to_vtk_file(filename, atom, timestep);
return;
}
vtkOpen(filename, comm, atom, timestep);
vtkVector(comm, atom, param);
vtkClose();
//printGhost(filename, atom, timestep, comm->myproc);
}
static inline void flushBuffer(char* msg){
MPI_Offset displ;
MPI_File_get_size(_fh, &displ);
MPI_File_write_at(_fh, displ, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE);
}

View File

@@ -1,116 +1,46 @@
#!/bin/bash
[[ -z "$1" ]] && echo "Use: $0 <binary> [-c <core>] [-f <freq>] [-n <nruns>] [-l <log>] [-s]" && exit
[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit
[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit
TAG=ICX
OPT_SCHEME=gromacs
MDBENCH_BIN=./MDBench-$TAG-$OPT_SCHEME
FREQ=2.4
NRUNS=3
FIXED_PARAMS=--freq $FREQ
MDBENCH_BIN=$1
BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC
OPT_SCHEME="${BIN_INFO%%-*}"
PREC="${BIN_INFO##*-}"
BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC
BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA
TAG="${BIN_INFO%%-*}"
ISA="${BIN_INFO##*-}"
CORE="${CORE:-0}"
FREQ="${FREQ:-2.4}"
NRUNS="${NRUNS:-3}"
LOG="${LOG:-latencies_and_cfds.$(hostname).log}"
STUB_ONLY="${STUB_ONLY:-false}"
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
OPTIND=2
while getopts "c:f:n:l:s" flag; do
case "${flag}" in
c) CORE=${OPTARG};;
f) FREQ=${OPTARG};;
n) NRUNS=${OPTARG};;
l) LOG=${OPTARG};;
s) STUB_ONLY=true;;
esac
done
# Other useful variables
MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC
FIXED_PARAMS="--freq $FREQ"
CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
DEFAULT_PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
if [ "$OPT_SCHEME" = "gromacs" ]; then
STUB1_NAME=Stub-33
STUB1_PARAMS=-na 4 -nn 33
STUB2_NAME=Stub-128
STUB2_PARAMS=-na 4 -nn 128
else
ALL_PREFETCHERS=""
DEFAULT_PREFETCHERS=("IGNORE")
fi
if [ -z ${PREFETCHERS+x} ]; then
PREFETCHERS=${DEFAULT_PREFETCHERS}
fi
if [ "$OPT_SCHEME" == "gromacs" ]; then
STUB1_NAME=stub-33
STUB1_PARAMS="-na 4 -nn 33"
STUB2_NAME=stub-128
STUB2_PARAMS="-na 4 -nn 128"
else
STUB1_NAME=stub-76
STUB1_PARAMS="-nn 76"
STUB2_NAME=stub-1024
STUB2_PARAMS="-nn 1024"
STUB1_NAME=Stub-76
STUB1_PARAMS=-nn 76
STUB2_NAME=Stub-1024
STUB2_PARAMS=-nn 1024
fi
function run_benchmark() {
BEST=10000000
for i in $(seq $NRUNS); do
RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3)
if (( $(echo "$BEST > $RES" | bc -l ) )); then
BEST=$RES
fi
likwid-pin -c 0 "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3
done
}
echo "Tag: $TAG" | tee -a $LOG
echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG
echo "Instruction set: $ISA" | tee -a $LOG
echo "Precision: $PREC" | tee -a $LOG
echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG
echo "Frequency: $FREQ" | tee -a $LOG
echo "Number of runs: $NRUNS" | tee -a $LOG
echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG
echo "Tag: $TAG"
echo "Optimization scheme: $OPT_SCHEME"
echo "Binary: $MDBENCH_BIN(-stub)"
echo "Frequency: $FREQ"
echo "Number of runs: $NRUNS"
if [ "$SKIP_SET_FREQ" == "false" ]; then
echo "Fixing frequencies..."
likwid-setFrequencies -f $FREQ -t 0
fi
echo "Fixing frequencies..."
likwid-setFrequencies -f $FREQ -t 0
for p in $PREFETCHERS; do
if [ "$p" != "IGNORE" ]; then
if [ "$p" == "ALL" ]; then
likwid-features -c $CORE -e $ALL_PREFETCHERS
elif [ "$p" == "NONE" ]; then
likwid-features -c $CORE -d $ALL_PREFETCHERS
else
likwid-features -c $CORE -d $ALL_PREFETCHERS
likwid-features -c $CORE -e $p
fi
echo "Prefetcher settings: $p"
likwid-features -c $CORE -l
fi
MSG="$p: "
if [ "$STUB_ONLY" == "false" ]; then
run_benchmark $MDBENCH_BIN
MSG+="standard=$BEST, "
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
MSG+="melt=$BEST, "
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
MSG+="argon=$BEST, "
fi
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
MSG+="$STUB1_NAME=$BEST, "
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
MSG+="$STUB2_NAME=$BEST"
echo $MSG | tee -a $LOG
done
echo "Standard"
run_benchmark $MDBENCH_BIN
echo "Melt"
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
echo "Argon"
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
echo "$STUB1_NAME"
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
echo "$STUB2_NAME"
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS

View File

@@ -1,52 +0,0 @@
# Prerequisites
*.d
# Object files
*.o
*.ko
*.obj
*.elf
# Linker output
*.ilk
*.map
*.exp
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Debug files
*.dSYM/
*.su
*.idb
*.pdb
# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2021 RRZE-HPC
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,126 +0,0 @@
#CONFIGURE BUILD SYSTEM
TARGET = gather-bench-$(TAG)
BUILD_DIR = ./$(TAG)
SRC_DIR = ./src
MAKE_DIR = ./
ISA_DIR = ./src/$(ISA)
Q ?= @
#DO NOT EDIT BELOW
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
INCLUDES += -I./src/includes
VPATH = $(SRC_DIR) ${ISA_DIR}
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
ASM += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.f90))
OBJ = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
OBJ += $(patsubst $(SRC_DIR)/%.cpp, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp))
OBJ += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.f90))
OBJ += $(patsubst $(SRC_DIR)/%.F90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.F90))
OBJ += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
OBJ += $(patsubst $(ISA_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(ISA_DIR)/*.S))
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) -DISA_$(ISA)
ifneq ($(VARIANT),)
.DEFAULT_GOAL := ${TARGET}-$(VARIANT)
endif
ifeq ($(strip $(DATA_LAYOUT)),AOS)
CPPFLAGS += -DAOS
endif
ifeq ($(strip $(TEST)),true)
CPPFLAGS += -DTEST
endif
ifeq ($(strip $(PADDING)),true)
CPPFLAGS += -DPADDING
endif
ifeq ($(strip $(MEASURE_GATHER_CYCLES)),true)
CPPFLAGS += -DMEASURE_GATHER_CYCLES
endif
ifeq ($(strip $(ONLY_FIRST_DIMENSION)),true)
CPPFLAGS += -DONLY_FIRST_DIMENSION
endif
ifeq ($(strip $(MEM_TRACER)),true)
CPPFLAGS += -DMEM_TRACER
endif
${TARGET}: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main.c
@echo "===> LINKING $(TARGET)"
$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET) $(SRC_DIR)/main.c $(OBJ) $(LIBS)
${TARGET}-%: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main-%.c
@echo "===> LINKING $(TARGET)-$* "
$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET)-$* $(SRC_DIR)/main-$*.c $(OBJ) $(LIBS)
asm: $(BUILD_DIR) $(ASM)
$(BUILD_DIR)/%.o: %.c
@echo "===> COMPILE $@"
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.s: %.c
@echo "===> GENERATE ASM $@"
$(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@
$(BUILD_DIR)/%.s: %.f90
@echo "===> COMPILE $@"
$(Q)$(FC) -S $(FCFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.cc
@echo "===> COMPILE $@"
$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.o: %.cpp
@echo "===> COMPILE $@"
$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.o: %.f90
@echo "===> COMPILE $@"
$(Q)$(FC) -c $(FCFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.F90
@echo "===> COMPILE $@"
$(Q)$(FC) -c $(CPPFLAGS) $(FCFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.s
@echo "===> ASSEMBLE $@"
$(Q)$(AS) $(ASFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.S
@echo "===> ASSEMBLE $@"
$(Q)$(CC) -c $(CPPFLAGS) $< -o $@
tags:
@echo "===> GENERATE TAGS"
$(Q)ctags -R
$(BUILD_DIR):
@mkdir $(BUILD_DIR)
ifeq ($(findstring $(MAKECMDGOALS),clean),)
-include $(OBJ:.o=.d)
endif
.PHONY: clean distclean
clean:
@echo "===> CLEAN"
@rm -rf $(BUILD_DIR)
@rm -f tags
distclean: clean
@echo "===> DIST CLEAN"
@rm -f $(TARGET)
@rm -f tags

View File

@@ -1,2 +0,0 @@
# gather-bench
A X86 gather instruction performance benchmark

View File

@@ -1,22 +0,0 @@
# Supported: GCC, CLANG, ICC
TAG ?= ICC
# Supported: avx2, avx512
ISA ?= avx512
# Use likwid?
ENABLE_LIKWID ?= false
# SP or DP
DATA_TYPE ?= DP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Padding byte for AoS
PADDING ?= false
# Measure cycles for each gather separately
MEASURE_GATHER_CYCLES ?= false
# Gather data only for first dimension (one gather per iteration)
ONLY_FIRST_DIMENSION ?= false
# Trace memory addresses for cache simulator
MEM_TRACER ?= false
# Test correctness of gather kernels
TEST ?= false

View File

@@ -1,9 +0,0 @@
CC = clang
LINKER = $(CC)
OPENMP =# -fopenmp
CFLAGS = -Ofast -std=c11 -march=core-avx2 -mavx -mfma $(OPENMP)
LFLAGS = $(OPENMP) -march=core-avx2 -mavx -mfma
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -1,11 +0,0 @@
CC = gcc
AS = as
LINKER = $(CC)
OPENMP = -fopenmp
CFLAGS = -Ofast -std=c11 -mavx2 -mfma $(OPENMP)
ASFLAGS =
LFLAGS = $(OPENMP) -mavx2 -mfma
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -1,9 +0,0 @@
CC = icc
LINKER = $(CC)
OPENMP = -qopenmp
CFLAGS = -Ofast -xhost -std=c11 $(OPENMP)
LFLAGS = $(OPENMP)
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -1,10 +0,0 @@
LIKWID_INC ?= -I/usr/local/include
LIKWID_DEFINES ?= -DLIKWID_PERFMON
LIKWID_LIB ?= -L/usr/local/lib
ifeq ($(strip $(ENABLE_LIKWID)),true)
INCLUDES += ${LIKWID_INC}
DEFINES += ${LIKWID_DEFINES}
LIBS += -llikwid
LFLAGS += ${LIKWID_LIB}
endif

View File

@@ -1,57 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
void* allocate (int alignment, size_t bytesize)
{
int errorCode;
void* ptr;
errorCode = posix_memalign(&ptr, alignment, bytesize);
if (errorCode) {
if (errorCode == EINVAL) {
fprintf(stderr,
"Error: Alignment parameter is not a power of two\n");
exit(EXIT_FAILURE);
}
if (errorCode == ENOMEM) {
fprintf(stderr,
"Error: Insufficient memory to fulfill the request\n");
exit(EXIT_FAILURE);
}
}
if (ptr == NULL) {
fprintf(stderr, "Error: posix_memalign failed!\n");
exit(EXIT_FAILURE);
}
return ptr;
}

View File

@@ -1,63 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather
.type gather, @function
gather :
push rbp
mov rbp, rsp
push rbx
push r12
push r13
push r14
push r15
xor rax, rax
vpcmpeqd ymm0, ymm0, ymm0
.align 16
1:
vmovups xmm1, [rsi + rax * 4]
vmovups xmm2, [rsi + rax * 4 + 16]
vmovups xmm3, [rsi + rax * 4 + 32]
vmovups xmm4, [rsi + rax * 4 + 48]
vmovdqa ymm5, ymm0
vmovdqa ymm6, ymm0
vmovdqa ymm7, ymm0
vmovdqa ymm8, ymm0
vxorpd ymm9, ymm9, ymm9
vxorpd ymm10, ymm10, ymm10
vxorpd ymm11, ymm11, ymm11
vxorpd ymm12, ymm12, ymm12
vgatherdpd ymm9, [rdi + xmm1 * 8], ymm5
vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6
vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7
vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8
#ifdef TEST
vmovapd [rcx + rax * 8], ymm9
vmovapd [rcx + rax * 8 + 32], ymm10
vmovapd [rcx + rax * 8 + 64], ymm11
vmovapd [rcx + rax * 8 + 96], ymm12
#endif
addq rax, 16
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather, .-gather

View File

@@ -1,71 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather_aos
.type gather_aos, @function
gather_aos :
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
vpcmpeqd ymm8, ymm8, ymm8
.align 16
1:
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
vpaddd xmm4, xmm3, xmm3
#ifdef PADDING
vpaddd xmm3, xmm4, xmm4
#else
vpaddd xmm3, xmm3, xmm4
#endif
vmovdqa ymm5, ymm8
vmovdqa ymm6, ymm8
vmovdqa ymm7, ymm8
vxorpd ymm0, ymm0, ymm0
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
vgatherdpd ymm0, [ rdi + xmm3 * 8], ymm5
vgatherdpd ymm1, [8 + rdi + xmm3 * 8], ymm6
vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7
#ifdef TEST
vmovupd [rcx + rax * 8], ymm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], ymm1
lea r9, [rbx + rdx * 8]
vmovupd [r9 + rax * 8], ymm2
#endif
addq rax, 4
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_aos, .-gather_aos

View File

@@ -1,67 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather_soa
.type gather_soa, @function
gather_soa :
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
vpcmpeqd ymm8, ymm8, ymm8
lea r8, [rdi + rdx * 8]
lea r9, [r8 + rdx * 8]
.align 16
1:
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
vmovdqa ymm5, ymm8
vmovdqa ymm6, ymm8
vmovdqa ymm7, ymm8
vxorpd ymm0, ymm0, ymm0
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5
vgatherdpd ymm1, [r8 + xmm3 * 8], ymm6
vgatherdpd ymm2, [r9 + xmm3 * 8], ymm7
#ifdef TEST
vmovupd [rcx + rax * 8], ymm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], ymm1
lea r10, [rbx + rdx * 8]
vmovupd [r10 + rax * 8], ymm2
#endif
addq rax, 4
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_soa, .-gather_soa

View File

@@ -1,62 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather
.type gather, @function
gather :
push rbp
mov rbp, rsp
push rbx
push r12
push r13
push r14
push r15
xor rax, rax
.align 16
1:
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vpcmpeqb k4, xmm0, xmm0
vmovdqu ymm0, [rsi + rax * 4]
vmovdqu ymm1, [rsi + rax * 4 + 32]
vmovdqu ymm2, [rsi + rax * 4 + 64]
vmovdqu ymm3, [rsi + rax * 4 + 96]
vpxord zmm4, zmm4, zmm4
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
vpxord zmm7, zmm7, zmm7
vgatherdpd zmm4{k1}, [rdi + ymm0 * 8]
vgatherdpd zmm5{k2}, [rdi + ymm1 * 8]
vgatherdpd zmm6{k3}, [rdi + ymm2 * 8]
vgatherdpd zmm7{k4}, [rdi + ymm3 * 8]
#ifdef TEST
vmovapd [rcx + rax * 8], zmm4
vmovapd [rcx + rax * 8 + 64], zmm5
vmovapd [rcx + rax * 8 + 128], zmm6
vmovapd [rcx + rax * 8 + 192], zmm7
#endif
addq rax, 32
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather, .-gather

View File

@@ -1,151 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
# r8 -> cycles
.text
.globl gather_aos
.type gather_aos, @function
gather_aos :
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
.align 16
1:
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Prefetching instructions
#mov ebx, DWORD PTR[rsi + rax*4]
#mov r9d, DWORD PTR[4 + rsi + rax*4]
#mov r10d, DWORD PTR[8 + rsi + rax*4]
#mov r11d, DWORD PTR[12 + rsi + rax*4]
#mov r12d, DWORD PTR[16 + rsi + rax*4]
#mov r13d, DWORD PTR[20 + rsi + rax*4]
#mov r14d, DWORD PTR[24 + rsi + rax*4]
#mov r15d, DWORD PTR[28 + rsi + rax*4]
#lea ebx, DWORD PTR[rbx]
#lea r9d, DWORD PTR[r9]
#lea r10d, DWORD PTR[r10]
#lea r11d, DWORD PTR[r11]
#lea r12d, DWORD PTR[r12]
#lea r13d, DWORD PTR[r13]
#lea r14d, DWORD PTR[r14]
#lea r15d, DWORD PTR[r15]
vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
#endif
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
#ifdef MEASURE_GATHER_CYCLES
mov r9, rax
mov r10, rdx
xor r11, r11
add r11, rax
add r11, rax
add r11, rax
#shr r11, 3
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [r8 + r11], rax
movnti [r8 + r11], rax
#ifndef ONLY_FIRST_DIMENSION
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [8 + r8 + r11], rax
movnti [8 + r8 + r11], rax
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [16 + r8 + r11], rax
movnti [16 + r8 + r11], rax
#endif // ONLY_FIRST_DIMENSION
mov rax, r9
mov rdx, r10
#else // MEASURE_GATHER_CYCLES
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#endif // MEASURE_GATHER_CYCLES
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], zmm1
lea r9, [rbx + rdx * 8]
vmovupd [r9 + rax * 8], zmm2
#endif
addq rax, 8
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_aos, .-gather_aos

View File

@@ -1,147 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
.section .rodata, "a"
.align 64
.align 64
.ymm_reg_mask.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .ymm_reg_mask.1,@object
.size .ymm_reg_mask.1,32
.align 8
# rdi -> a
# rsi -> neighbors
# rdx -> numneighs[i]
# rcx -> &t[t_idx]
# r8 -> ntest
.text
.globl gather_md_aos
.type gather_md_aos, @function
gather_md_aos :
push rbp
mov rbp, rsp
push rbx
push r10
push r11
push r12
push r13
push r14
push r15
vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]
mov r15, rdx
xor rax, rax
.align 16
1:
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Prefetching instructions
#mov ebx, DWORD PTR[rsi + rax*4]
#mov r9d, DWORD PTR[4 + rsi + rax*4]
#mov r10d, DWORD PTR[8 + rsi + rax*4]
#mov r11d, DWORD PTR[12 + rsi + rax*4]
#mov r12d, DWORD PTR[16 + rsi + rax*4]
#mov r13d, DWORD PTR[20 + rsi + rax*4]
#mov r14d, DWORD PTR[24 + rsi + rax*4]
#mov r15d, DWORD PTR[28 + rsi + rax*4]
#lea ebx, DWORD PTR[rbx]
#lea r9d, DWORD PTR[r9]
#lea r10d, DWORD PTR[r10]
#lea r11d, DWORD PTR[r11]
#lea r12d, DWORD PTR[r12]
#lea r13d, DWORD PTR[r13]
#lea r14d, DWORD PTR[r14]
#lea r15d, DWORD PTR[r15]
vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
#endif
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
# TODO: see if this logic can be optimized
addq rax, 8
subq r15, 8
cmpq r15, 8
jge 1b
cmpq r15, 0
jle .end_func
vpbroadcastd ymm6, r15d
vpcmpgtd k1, ymm6, ymm7
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
vpxord zmm0, zmm1, zmm2
#ifndef ONLY_FIRST_DIMENSION
kmovw k2, k1
kmovw k3, k1
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
addq rax, r15
.end_func:
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_md_aos, .-gather_md_aos

View File

@@ -1,67 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather_soa
.type gather_soa, @function
gather_soa :
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
vpcmpeqd ymm8, ymm8, ymm8
lea r8, [rdi + rdx * 8]
lea r9, [r8 + rdx * 8]
.align 16
1:
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
vpcmpeqb k1, xmm5, xmm5
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
vpxord zmm0, zmm0, zmm0
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
vgatherdpd zmm1{k2}, [r8 + ymm3 * 8]
vgatherdpd zmm2{k3}, [r9 + ymm3 * 8]
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + rdx * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
addq rax, 8
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_soa, .-gather_soa

View File

@@ -1,23 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> &a[i * snbytes]
.text
.globl load_aos
.type load_aos, @function
load_aos :
vmovsd xmm0, QWORD PTR [rdi]
vmovsd xmm1, QWORD PTR [8 + rdi]
vmovsd xmm2, QWORD PTR [16 + rdi]
vbroadcastsd zmm3, xmm0
vbroadcastsd zmm4, xmm1
vbroadcastsd zmm5, xmm2
ret
.size load_aos, .-load_aos

View File

@@ -1,32 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#ifndef __ALLOCATE_H_
#define __ALLOCATE_H_
extern void* allocate (int alignment, size_t bytesize);
#endif

View File

@@ -1,53 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#ifndef LIKWID_MARKERS_H
#define LIKWID_MARKERS_H
#ifdef LIKWID_PERFMON
#include <likwid.h>
#define LIKWID_MARKER_INIT likwid_markerInit()
#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
#define LIKWID_MARKER_CLOSE likwid_markerClose()
#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
#else /* LIKWID_PERFMON */
#define LIKWID_MARKER_INIT
#define LIKWID_MARKER_THREADINIT
#define LIKWID_MARKER_SWITCH
#define LIKWID_MARKER_REGISTER(regionTag)
#define LIKWID_MARKER_START(regionTag)
#define LIKWID_MARKER_STOP(regionTag)
#define LIKWID_MARKER_CLOSE
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
#define LIKWID_MARKER_RESET(regionTag)
#endif /* LIKWID_PERFMON */
#endif /*LIKWID_MARKERS_H*/

View File

@@ -1,34 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#ifndef __TIMING_H_
#define __TIMING_H_
extern double getTimeStamp();
extern double getTimeResolution();
extern double getTimeStamp_();
#endif

View File

@@ -1,441 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <float.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <x86intrin.h>
//---
#include <likwid-marker.h>
//---
#include <allocate.h>
#include <timing.h>
#if !defined(ISA_avx2) && !defined (ISA_avx512)
#error "Invalid ISA macro, possible values are: avx2 and avx512"
#endif
#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
#endif
#define HLINE "----------------------------------------------------------------------------\n"
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define ARRAY_ALIGNMENT 64
#ifdef ISA_avx512
#define _VL_ 8
#define ISA_STRING "avx512"
#else
#define _VL_ 4
#define ISA_STRING "avx2"
#endif
#ifdef AOS
#define GATHER gather_md_aos
#define LOAD(a, i, d, n) load_aos(&a[i * d])
#define LAYOUT_STRING "AoS"
#else
#define GATHER gather_md_soa
#define LOAD(a, i, d, n) load_soa(a, i, n)
#define LAYOUT_STRING "SoA"
#endif
#if defined(PADDING) && defined(AOS)
#define PADDING_BYTES 1
#else
#define PADDING_BYTES 0
#endif
#ifdef MEM_TRACER
# define MEM_TRACER_INIT(trace_file) FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(trace_file), "w");
# define MEM_TRACER_END fclose(mem_tracer_fp);
# define MEM_TRACE(addr, op) fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
#else
# define MEM_TRACER_INIT
# define MEM_TRACER_END
# define MEM_TRACE(addr, op)
#endif
int gather_md_aos(double*, int*, int, double*, int);
int gather_md_soa(double*, int*, int, double*, int);
void load_aos(double*);
void load_soa(double*, int, int);
const char *get_mem_tracer_filename(const char *trace_file) {
static char fname[64];
snprintf(fname, sizeof fname, "mem_tracer_%s.txt", trace_file);
return fname;
}
int log2_uint(unsigned int x) {
int ans = 0;
while(x >>= 1) { ans++; }
return ans;
}
int main (int argc, char** argv) {
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("gather");
char *trace_file = NULL;
int cl_size = 64;
int ntimesteps = 200;
int reneigh_every = 20;
int opt = 0;
double freq = 2.5;
struct option long_opts[] = {
{"trace" , required_argument, NULL, 't'},
{"freq", required_argument, NULL, 'f'},
{"line", required_argument, NULL, 'l'},
{"timesteps", required_argument, NULL, 'n'},
{"reneigh", required_argument, NULL, 'r'},
{"help", required_argument, NULL, 'h'}
};
while((opt = getopt_long(argc, argv, "t:f:l:n:r:h", long_opts, NULL)) != -1) {
switch(opt) {
case 't':
trace_file = strdup(optarg);
break;
case 'f':
freq = atof(optarg);
break;
case 'l':
cl_size = atoi(optarg);
break;
case 'n':
ntimesteps = atoi(optarg);
break;
case 'r':
reneigh_every = atoi(optarg);
break;
case 'h':
case '?':
default:
printf("Usage: %s [OPTION]...\n", argv[0]);
printf("MD variant for gather benchmark.\n\n");
printf("Mandatory arguments to long options are also mandatory for short options.\n");
printf("\t-t, --trace=STRING input file with traced indexes from MD-Bench.\n");
printf("\t-f, --freq=REAL CPU frequency in GHz (default 2.5).\n");
printf("\t-l, --line=NUMBER cache line size in bytes (default 64).\n");
printf("\t-n, --timesteps=NUMBER number of timesteps to simulate (default 200).\n");
printf("\t-r, --reneigh=NUMBER reneighboring frequency in timesteps (default 20).\n");
printf("\t-h, --help display this help message.\n");
printf("\n\n");
return EXIT_FAILURE;
}
}
if(trace_file == NULL) {
fprintf(stderr, "Trace file not specified!\n");
return EXIT_FAILURE;
}
FILE *fp;
char *line = NULL;
int *neighborlists = NULL;
int *numneighs = NULL;
int atom = -1;
int nlocal, nghost, maxneighs;
int nall = 0;
int N_alloc = 0;
size_t ntest = 0;
size_t llen;
ssize_t read;
double *a = NULL;
double *f = NULL;
double *t = NULL;
double time = 0.0;
double E, S;
const int dims = 3;
const int snbytes = dims + PADDING_BYTES; // bytes per element (struct), includes padding
long long int niters = 0;
long long int ngathered = 0;
printf("ISA,Layout,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e)\n");
printf("%s,%s,%d,%f,%d,%d\n\n", ISA_STRING, LAYOUT_STRING, dims, freq, cl_size, _VL_);
freq = freq * 1e9;
#ifdef ONLY_FIRST_DIMENSION
const int gathered_dims = 1;
#else
const int gathered_dims = dims;
#endif
for(int ts = -1; ts < ntimesteps; ts++) {
if(!((ts + 1) % reneigh_every)) {
char ts_trace_file[128];
snprintf(ts_trace_file, sizeof ts_trace_file, "%s_%d.out", trace_file, ts + 1);
if((fp = fopen(ts_trace_file, "r")) == NULL) {
fprintf(stderr, "Error: could not open trace file!\n");
return EXIT_FAILURE;
}
while((read = getline(&line, &llen, fp)) != -1) {
int i = 2;
if(strncmp(line, "N:", 2) == 0) {
while(line[i] == ' ') { i++; }
nlocal = atoi(strtok(&line[i], " "));
nghost = atoi(strtok(NULL, " "));
nall = nlocal + nghost;
maxneighs = atoi(strtok(NULL, " "));
if(nlocal <= 0 || maxneighs <= 0) {
fprintf(stderr, "Number of local atoms and neighbor lists capacity cannot be less or equal than zero!\n");
return EXIT_FAILURE;
}
if(neighborlists == NULL) {
neighborlists = (int *) allocate( ARRAY_ALIGNMENT, nlocal * maxneighs * sizeof(int) );
numneighs = (int *) allocate( ARRAY_ALIGNMENT, nlocal * sizeof(int) );
}
}
if(strncmp(line, "A:", 2) == 0) {
while(line[i] == ' ') { i++; }
atom = atoi(strtok(&line[i], " "));
numneighs[atom] = 0;
}
if(strncmp(line, "I:", 2) == 0) {
while(line[i] == ' ') { i++; }
char *neigh_idx = strtok(&line[i], " ");
while(neigh_idx != NULL && *neigh_idx != '\n') {
int j = numneighs[atom];
neighborlists[atom * maxneighs + j] = atoi(neigh_idx);
numneighs[atom]++;
ntest++;
neigh_idx = strtok(NULL, " ");
}
}
}
fclose(fp);
}
if(N_alloc == 0) {
N_alloc = nall * 2;
a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
f = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
}
#ifdef TEST
if(t != NULL) { free(t); }
ntest += 100;
t = (double*) allocate( ARRAY_ALIGNMENT, ntest * dims * sizeof(double) );
#endif
for(int i = 0; i < N_alloc; ++i) {
#ifdef AOS
a[i * snbytes + 0] = i * dims + 0;
a[i * snbytes + 1] = i * dims + 1;
a[i * snbytes + 2] = i * dims + 2;
#else
a[N * 0 + i] = N * 0 + i;
a[N * 1 + i] = N * 1 + i;
a[N * 2 + i] = N * 2 + i;
#endif
f[i * dims + 0] = 0.0;
f[i * dims + 1] = 0.0;
f[i * dims + 2] = 0.0;
}
int t_idx = 0;
S = getTimeStamp();
LIKWID_MARKER_START("gather");
for(int i = 0; i < nlocal; i++) {
int *neighbors = &neighborlists[i * maxneighs];
// We inline the assembly for AVX512 with AoS layout to evaluate the impact
// of calling external assembly procedures in the overall runtime
#ifdef ISA_avx512
__m256i ymm_reg_mask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
__asm__ __volatile__( "vmovsd 0(%0), %%xmm3;"
"vmovsd 8(%0), %%xmm4;"
"vmovsd 16(%0), %%xmm5;"
"vbroadcastsd %%xmm3, %%zmm0;"
"vbroadcastsd %%xmm4, %%zmm1;"
"vbroadcastsd %%xmm5, %%zmm2;"
:
: "r" (&a[i * snbytes])
: "%xmm3", "%xmm4", "%xmm5", "%zmm0", "%zmm1", "%zmm2" );
__asm__ __volatile__( "xor %%rax, %%rax;"
"movq %%rdx, %%r15;"
"1: vmovdqu (%1,%%rax,4), %%ymm3;"
"vpaddd %%ymm3, %%ymm3, %%ymm4;"
#ifdef PADDING
"vpaddd %%ymm4, %%ymm4, %%ymm3;"
#else
"vpaddd %%ymm3, %%ymm4, %%ymm3;"
#endif
"vpcmpeqb %%xmm5, %%xmm5, %%k1;"
"vpcmpeqb %%xmm5, %%xmm5, %%k2;"
"vpcmpeqb %%xmm5, %%xmm5, %%k3;"
"vpxord %%zmm0, %%zmm0, %%zmm0;"
"vpxord %%zmm1, %%zmm1, %%zmm1;"
"vpxord %%zmm2, %%zmm2, %%zmm2;"
"vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};"
"vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};"
"vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};"
"addq $8, %%rax;"
"subq $8, %%r15;"
"cmpq $8, %%r15;"
"jge 1b;"
"cmpq $0, %%r15;"
"jle 2;"
"vpbroadcastd %%r15d, %%ymm5;"
"vpcmpgtd %%ymm5, %2, %%k1;"
"vmovdqu32 (%1,%%rax,4), %%ymm3{{%%k1}}{{z}};"
"vpaddd %%ymm3, %%ymm3, %%ymm4;"
#ifdef PADDING
"vpaddd %%ymm4, %%ymm4, %%ymm3;"
#else
"vpaddd %%ymm3, %%ymm4, %%ymm3;"
#endif
"vpxord %%zmm0, %%zmm0, %%zmm0;"
"kmovw %%k1, %%k2;"
"kmovw %%k1, %%k3;"
"vpxord %%zmm1, %%zmm1, %%zmm1;"
"vpxord %%zmm2, %%zmm2, %%zmm2;"
"vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};"
"vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};"
"vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};"
"addq %%r15, %%rax;"
"2:;"
:
: "d" (numneighs[i]), "r" (neighbors), "x" (ymm_reg_mask), "r" (a)
: "%rax", "%r15", "%ymm3", "%ymm4", "%ymm5", "%k1", "%k2", "%k3", "%zmm0", "%zmm1", "%zmm2" );
#else
LOAD(a, i, snbytes, N_alloc);
t_idx += GATHER(a, neighbors, numneighs[i], &t[t_idx], ntest);
#endif
f[i * dims + 0] += i;
f[i * dims + 1] += i;
f[i * dims + 2] += i;
}
LIKWID_MARKER_STOP("gather");
E = getTimeStamp();
time += E - S;
#ifdef MEM_TRACER
MEM_TRACER_INIT(trace_file);
for(int i = 0; i < nlocal; i++) {
int *neighbors = &neighborlists[i * maxneighs];
for(int d = 0; d < gathered_dims; d++) {
#ifdef AOS
MEM_TRACE('R', a[i * snbytes + d])
#else
MEM_TRACE('R', a[d * N + i])
#endif
}
for(int j = 0; j < numneighs[i]; j += _VL_) {
for(int jj = j; jj < MIN(j + _VL_, numneighs[i]); j++) {
int k = neighbors[jj];
for(int d = 0; d < gathered_dims; d++) {
#ifdef AOS
MEM_TRACE('R', a[k * snbytes + d])
#else
MEM_TRACE('R', a[d * N + k])
#endif
}
}
}
}
MEM_TRACER_END;
#endif
#ifdef TEST
int test_failed = 0;
t_idx = 0;
for(int i = 0; i < nlocal; ++i) {
int *neighbors = &neighborlists[i * maxneighs];
for(int j = 0; j < numneighs[i]; ++j) {
int k = neighbors[j];
for(int d = 0; d < dims; ++d) {
#ifdef AOS
if(t[d * ntest + t_idx] != k * dims + d) {
#else
if(t[d * ntest + t_idx] != d * N + k) {
#endif
test_failed = 1;
break;
}
}
t_idx++;
}
}
if(test_failed) {
printf("Test failed!\n");
return EXIT_FAILURE;
}
#endif
for(int i = 0; i < nlocal; i++) {
niters += (numneighs[i] / _VL_) + ((numneighs[i] % _VL_ == 0) ? 0 : 1);
ngathered += numneighs[i];
}
}
printf("%14s,%14s,%14s,%14s,%14s,%14s", "tot. time(s)", "time/step(ms)", "time/iter(us)", "cy/it", "cy/gather", "cy/elem");
printf("\n");
const double time_per_step = time * 1e3 / ((double) ntimesteps);
const double time_per_it = time * 1e6 / ((double) niters);
const double cy_per_it = time * freq * _VL_ / ((double) niters);
const double cy_per_gather = time * freq * _VL_ / ((double) niters * gathered_dims);
const double cy_per_elem = time * freq / ((double) ngathered * gathered_dims);
printf("%14.6f,%14.6f,%14.6f,%14.6f,%14.6f,%14.6f\n", time, time_per_step, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
#ifdef TEST
printf("Test passed!\n");
#endif
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

View File

@@ -1,361 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <float.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
//---
#include <likwid-marker.h>
//---
#include <allocate.h>
#include <timing.h>
#if !defined(ISA_avx2) && !defined (ISA_avx512)
#error "Invalid ISA macro, possible values are: avx2 and avx512"
#endif
#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
#endif
#define HLINE "----------------------------------------------------------------------------\n"
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define ARRAY_ALIGNMENT 64
#define SIZE 20000
#ifdef ISA_avx512
#define _VL_ 8
#define ISA_STRING "avx512"
#else
#define _VL_ 4
#define ISA_STRING "avx2"
#endif
#ifdef AOS
#define GATHER gather_aos
#define LAYOUT_STRING "AoS"
#else
#define GATHER gather_soa
#define LAYOUT_STRING "SoA"
#endif
#if defined(PADDING) && defined(AOS)
#define PADDING_BYTES 1
#else
#define PADDING_BYTES 0
#endif
#ifdef MEM_TRACER
# define MEM_TRACER_INIT(stride, size) FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(stride, size), "w");
# define MEM_TRACER_END fclose(mem_tracer_fp);
# define MEM_TRACE(addr, op) fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
#else
# define MEM_TRACER_INIT
# define MEM_TRACER_END
# define MEM_TRACE(addr, op)
#endif
extern void gather_aos(double*, int*, int, double*, long int*);
extern void gather_soa(double*, int*, int, double*, long int*);
const char *get_mem_tracer_filename(int stride, int size) {
static char fname[64];
snprintf(fname, sizeof fname, "mem_tracer_%d_%d.txt", stride, size);
return fname;
}
int log2_uint(unsigned int x) {
int ans = 0;
while(x >>= 1) { ans++; }
return ans;
}
int main (int argc, char** argv) {
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("gather");
int stride = 1;
int cl_size = 64;
int opt = 0;
double freq = 2.5;
struct option long_opts[] = {
{"stride", required_argument, NULL, 's'},
{"freq", required_argument, NULL, 'f'},
{"line", required_argument, NULL, 'l'},
{"help", required_argument, NULL, 'h'}
};
while((opt = getopt_long(argc, argv, "s:f:l:h", long_opts, NULL)) != -1) {
switch(opt) {
case 's':
stride = atoi(optarg);
break;
case 'f':
freq = atof(optarg);
break;
case 'l':
cl_size = atoi(optarg);
break;
case 'h':
case '?':
default:
printf("Usage: %s [OPTION]...\n", argv[0]);
printf("MD variant for gather benchmark.\n\n");
printf("Mandatory arguments to long options are also mandatory for short options.\n");
printf("\t-s, --stride=NUMBER stride between two successive elements (default 1).\n");
printf("\t-f, --freq=REAL CPU frequency in GHz (default 2.5).\n");
printf("\t-l, --line=NUMBER cache line size in bytes (default 64).\n");
printf("\t-h, --help display this help message.\n");
printf("\n\n");
return EXIT_FAILURE;
}
}
size_t bytesPerWord = sizeof(double);
const int dims = 3;
const int snbytes = dims + PADDING_BYTES; // bytes per element (struct), includes padding
#ifdef AOS
size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ * snbytes / (cl_size / sizeof(double)), 1), _VL_);
#else
size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_) * dims;
#endif
size_t N = SIZE;
double E, S;
printf("ISA,Layout,Stride,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e),Cache Lines/Gather\n");
printf("%s,%s,%d,%d,%f,%d,%d,%lu\n\n", ISA_STRING, LAYOUT_STRING, stride, dims, freq, cl_size, _VL_, cacheLinesPerGather);
printf("%14s,%14s,%14s,", "N", "Size(kB)", "cut CLs");
#ifndef MEASURE_GATHER_CYCLES
printf("%14s,%14s,%14s,%14s,%14s", "tot. time", "time/LUP(ms)", "cy/it", "cy/gather", "cy/elem");
#else
#ifdef ONLY_FIRST_DIMENSION
printf("%27s,%27s,%27s", "min/max/avg cy(x)", "min/max/avg cy(y)", "min/max/avg cy(z)");
#else
printf("%27s", "min/max/avg cy(x)");
#endif
#endif
printf("\n");
freq = freq * 1e9;
for(int N = 512; N < 80000000; N = 1.5 * N) {
// Currently this only works when the array size (in elements) is multiple of the vector length (no preamble and prelude)
if(N % _VL_ != 0) {
N += _VL_ - (N % _VL_);
}
MEM_TRACER_INIT(stride, N);
int N_gathers_per_dim = N / _VL_;
int N_alloc = N * 2;
int N_cycles_alloc = N_gathers_per_dim * 2;
int cut_cl = 0;
double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
int rep;
double time;
#ifdef TEST
double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
#else
double* t = (double*) NULL;
#endif
#ifdef MEASURE_GATHER_CYCLES
long int* cycles = (long int*) allocate( ARRAY_ALIGNMENT, N_cycles_alloc * dims * sizeof(long int)) ;
#else
long int* cycles = (long int*) NULL;
#endif
for(int i = 0; i < N_alloc; ++i) {
#ifdef AOS
a[i * snbytes + 0] = i * dims + 0;
a[i * snbytes + 1] = i * dims + 1;
a[i * snbytes + 2] = i * dims + 2;
#else
a[N * 0 + i] = N * 0 + i;
a[N * 1 + i] = N * 1 + i;
a[N * 2 + i] = N * 2 + i;
#endif
idx[i] = (i * stride) % N;
}
#ifdef ONLY_FIRST_DIMENSION
const int gathered_dims = 1;
#else
const int gathered_dims = dims;
#endif
#ifdef MEM_TRACER
for(int i = 0; i < N; i += _VL_) {
for(int j = 0; j < _VL_; j++) {
MEM_TRACE(idx[i + j], 'R');
}
for(int d = 0; d < gathered_dims; d++) {
for(int j = 0; j < _VL_; j++) {
#ifdef AOS
MEM_TRACE(a[idx[i + j] * snbytes + d], 'R');
#else
MEM_TRACE(a[N * d + idx[i + j]], 'R');
#endif
}
}
}
#endif
#ifdef AOS
const int cl_shift = log2_uint((unsigned int) cl_size);
for(int i = 0; i < N; i++) {
const int first_cl = (idx[i] * snbytes * sizeof(double)) >> cl_shift;
const int last_cl = ((idx[i] * snbytes + gathered_dims - 1) * sizeof(double)) >> cl_shift;
if(first_cl != last_cl) {
cut_cl++;
}
}
#endif
S = getTimeStamp();
for(int r = 0; r < 100; ++r) {
GATHER(a, idx, N, t, cycles);
}
E = getTimeStamp();
#ifdef MEASURE_GATHER_CYCLES
for(int i = 0; i < N_cycles_alloc; i++) {
cycles[i * 3 + 0] = 0;
cycles[i * 3 + 1] = 0;
cycles[i * 3 + 2] = 0;
}
#endif
rep = 100 * (0.5 / (E - S));
S = getTimeStamp();
LIKWID_MARKER_START("gather");
for(int r = 0; r < rep; ++r) {
GATHER(a, idx, N, t, cycles);
}
LIKWID_MARKER_STOP("gather");
E = getTimeStamp();
time = E - S;
#ifdef TEST
int test_failed = 0;
for(int i = 0; i < N; ++i) {
for(int d = 0; d < dims; ++d) {
#ifdef AOS
if(t[d * N + i] != ((i * stride) % N) * dims + d) {
#else
if(t[d * N + i] != d * N + ((i * stride) % N)) {
#endif
test_failed = 1;
break;
}
}
}
if(test_failed) {
printf("Test failed!\n");
return EXIT_FAILURE;
} else {
printf("Test passed!\n");
}
#endif
const double size = N * (dims * sizeof(double) + sizeof(int)) / 1000.0;
printf("%14d,%14.2f,%14d,", N, size, cut_cl);
#ifndef MEASURE_GATHER_CYCLES
const double time_per_it = time * 1e6 / ((double) N * rep);
const double cy_per_it = time * freq * _VL_ / ((double) N * rep);
const double cy_per_gather = time * freq * _VL_ / ((double) N * rep * gathered_dims);
const double cy_per_elem = time * freq / ((double) N * rep * gathered_dims);
printf("%14.10f,%14.10f,%14.6f,%14.6f,%14.6f", time, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
#else
double cy_min[dims];
double cy_max[dims];
double cy_avg[dims];
for(int d = 0; d < dims; d++) {
cy_min[d] = 100000.0;
cy_max[d] = 0.0;
cy_avg[d] = 0.0;
}
for(int i = 0; i < N_gathers_per_dim; ++i) {
for(int d = 0; d < gathered_dims; d++) {
const double cy_d = (double)(cycles[i * 3 + d]);
cy_min[d] = MIN(cy_min[d], cy_d);
cy_max[d] = MAX(cy_max[d], cy_d);
cy_avg[d] += cy_d;
}
}
for(int d = 0; d < gathered_dims; d++) {
char tmp_str[64];
cy_avg[d] /= (double) N_gathers_per_dim;
snprintf(tmp_str, sizeof tmp_str, "%4.4f/%4.4f/%4.4f", cy_min[d], cy_max[d], cy_avg[d]);
printf("%27s%c", tmp_str, (d < gathered_dims - 1) ? ',' : ' ');
}
#endif
printf("\n");
free(a);
free(idx);
#ifdef TEST
free(t);
#endif
#ifdef MEASURE_GATHER_CYCLES
free(cycles);
#endif
MEM_TRACER_END;
}
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

View File

@@ -1,166 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <limits.h>
#include <float.h>
//---
#include <likwid-marker.h>
//---
#include <timing.h>
#include <allocate.h>
#if !defined(ISA_avx2) && !defined (ISA_avx512)
#error "Invalid ISA macro, possible values are: avx2 and avx512"
#endif
#define HLINE "----------------------------------------------------------------------------\n"
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define ARRAY_ALIGNMENT 64
#define SIZE 20000
#ifdef ISA_avx512
#define _VL_ 8
#define ISA_STRING "avx512"
#else
#define _VL_ 4
#define ISA_STRING "avx2"
#endif
#ifdef TEST
extern void gather(double*, int*, int, double*);
#else
extern void gather(double*, int*, int);
#endif
int main (int argc, char** argv) {
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("gather");
if (argc < 3) {
printf("Please provide stride and frequency\n");
printf("%s <stride> <freq (GHz)> [cache line size (B)]\n", argv[0]);
return -1;
}
int stride = atoi(argv[1]);
double freq = atof(argv[2]);
int cl_size = (argc == 3) ? 64 : atoi(argv[3]);
size_t bytesPerWord = sizeof(double);
size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_);
size_t N = SIZE;
double E, S;
printf("ISA,Stride (elems),Frequency (GHz),Cache Line Size (B),Vector Width (elems),Cache Lines/Gather\n");
printf("%s,%d,%f,%d,%d,%lu\n\n", ISA_STRING, stride, freq, cl_size, _VL_, cacheLinesPerGather);
printf("%14s,%14s,%14s,%14s,%14s,%14s\n", "N", "Size(kB)", "tot. time", "time/LUP(ms)", "cy/gather", "cy/elem");
freq = freq * 1e9;
for(int N = 1024; N < 400000; N = 1.5 * N) {
int N_alloc = N * 2;
double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
int rep;
double time;
#ifdef TEST
double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
#endif
for(int i = 0; i < N_alloc; ++i) {
a[i] = i;
idx[i] = (i * stride) % N;
}
S = getTimeStamp();
for(int r = 0; r < 100; ++r) {
#ifdef TEST
gather(a, idx, N, t);
#else
gather(a, idx, N);
#endif
}
E = getTimeStamp();
rep = 100 * (0.5 / (E - S));
S = getTimeStamp();
LIKWID_MARKER_START("gather");
for(int r = 0; r < rep; ++r) {
#ifdef TEST
gather(a, idx, N, t);
#else
gather(a, idx, N);
#endif
}
LIKWID_MARKER_STOP("gather");
E = getTimeStamp();
time = E - S;
#ifdef TEST
int test_failed = 0;
for(int i = 0; i < N; ++i) {
if(t[i] != i * stride % N) {
test_failed = 1;
break;
}
}
if(test_failed) {
printf("Test failed!\n");
return EXIT_FAILURE;
} else {
printf("Test passed!\n");
}
#endif
const double size = N * (sizeof(double) + sizeof(int)) / 1000.0;
const double time_per_it = time * 1e6 / ((double) N * rep);
const double cy_per_gather = time * freq * _VL_ / ((double) N * rep);
const double cy_per_elem = time * freq / ((double) N * rep);
printf("%14d,%14.2f,%14.10f,%14.10f,%14.6f,%14.6f\n", N, size, time, time_per_it, cy_per_gather, cy_per_elem);
free(a);
free(idx);
#ifdef TEST
free(t);
#endif
}
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

View File

@@ -1,47 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <stdlib.h>
#include <time.h>
double getTimeStamp()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
}
double getTimeResolution()
{
struct timespec ts;
clock_getres(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
}
double getTimeStamp_()
{
return getTimeStamp();
}

View File

@@ -1,28 +0,0 @@
import sys
import re
if len(sys.argv) != 6:
print("Usage: python preds.py <iaca> <mca> <osaca> <uica> <div_factor>")
sys.exit(1)
iaca_pred = float(sys.argv[1])
mca_pred = float(sys.argv[2])
osaca_pred = float(sys.argv[3])
uica_pred = float(sys.argv[4])
div_factor = float(sys.argv[5])
preds = [x / div_factor for x in [iaca_pred, mca_pred, osaca_pred, uica_pred]]
start = -4.0
end = 36.0
npoints = 50
offset = (end - start) / (npoints - 1)
i = 0
for pred in preds:
print(f"@target G0.S{i+6}")
print(f"@type xy")
for j in range(npoints):
pos = start + offset * j
print("{:.6f} {}".format(pos, pred))
print("&")
i += 1

View File

@@ -1,34 +0,0 @@
import sys
import re
if len(sys.argv) != 3:
print("Usage: python string_to_agr.py <input_filename> <div_factor>")
sys.exit(1)
input_filename = sys.argv[1]
div_factor = float(sys.argv[2])
result_list = []
with open(input_filename, 'r') as file:
for line in file:
numbers = re.findall(r'\d+\.\d+', line)
divided_numbers = [float(number) / div_factor for number in numbers]
result_list.append(divided_numbers)
start = -2.5
bar_offset = 1.0
group_offset = 8.0
i = 0
for group in result_list:
print(f"@target G0.S{i}")
print(f"@type bar")
j = 0
for meas in group:
pos = start + i * bar_offset + j * group_offset
print(f"{pos} {meas}")
j += 1
print("&")
i += 1