Compare commits
70 Commits
MPI
...
bc7b523979
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bc7b523979 | ||
|
|
eeba125a52 | ||
|
|
b32254b03f | ||
|
|
4dac820784 | ||
|
|
fe56c50efd | ||
|
|
7a61cbbabf | ||
|
|
176de0525b | ||
|
|
7bad7e84b6 | ||
|
|
fb304f240b | ||
|
|
5a6d1851ed | ||
|
|
f61f59ba3f | ||
|
|
d1c2249b55 | ||
|
|
c9db6e45fa | ||
|
|
0967e8f671 | ||
|
|
fa409c016c | ||
|
|
b65199308d | ||
|
|
71798f5ec5 | ||
|
|
4f0403d3ea | ||
|
|
fa86e44f90 | ||
|
|
7e8fd96fa4 | ||
|
|
463de5b1ed | ||
|
|
4a32a62a98 | ||
|
|
16e8b76012 | ||
|
|
60ed524dd8 | ||
|
|
45f83c7607 | ||
|
|
c49278cb21 | ||
|
|
757d4329f3 | ||
|
|
67f9c769ef | ||
|
|
b5b4d23c0c | ||
|
|
fea1e41daa | ||
|
|
f1998b7acc | ||
|
|
2fe3cd80a0 | ||
|
|
f4313f64e5 | ||
|
|
7f068a6959 | ||
|
|
62cfc22856 | ||
|
|
b024adaf5b | ||
|
|
696e6da01d | ||
|
|
b2a6574426 | ||
|
|
c4080e866e | ||
|
|
7b592b5fc7 | ||
|
|
4690542db5 | ||
|
|
8c131a7699 | ||
|
|
dc4d5f1a9c | ||
|
|
50007216ed | ||
|
|
72e4599acc | ||
|
|
8fa03733e9 | ||
|
|
bf1ae3d013 | ||
|
|
8009b54113 | ||
|
|
0ea0587442 | ||
|
|
134e3f4b78 | ||
|
|
c2bfa3ca3f | ||
|
|
2a099da5b7 | ||
|
|
7691b23d67 | ||
|
|
da90466f98 | ||
|
|
8f723c1299 | ||
|
|
0586ef150a | ||
|
|
2e5d973f7d | ||
|
|
e2fd1a0476 | ||
|
|
4105c844c6 | ||
|
|
1f5c9c4b23 | ||
|
|
29e115464b | ||
|
|
1a54314c8b | ||
|
|
280f595b7f | ||
|
|
3428974730 | ||
|
|
b54842f764 | ||
|
|
9730164e6f | ||
|
|
0f5fdd3708 | ||
|
|
3f7fb7f22a | ||
|
|
bfa6c581c3 | ||
|
|
fd886e77eb |
19
.gitignore
vendored
19
.gitignore
vendored
@@ -27,6 +27,7 @@
|
||||
*.so
|
||||
*.so.*
|
||||
*.dylib
|
||||
.DS_Store
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
@@ -51,17 +52,9 @@ Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# TODO list
|
||||
todo.txt
|
||||
|
||||
# Build directories and executables
|
||||
#GCC-*/
|
||||
#ICC-*/
|
||||
#ICX-*/
|
||||
#CLANG-*/
|
||||
#NVCC-*/
|
||||
build-*/
|
||||
MDBench-*
|
||||
.vscode/
|
||||
GCC/
|
||||
ICC/
|
||||
MDBench-GCC*
|
||||
MDBench-ICC*
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +0,0 @@
|
||||
[submodule "gather-bench"]
|
||||
path = gather-bench
|
||||
url = https://github.com/RRZE-HPC/gather-bench
|
||||
109
Makefile
109
Makefile
@@ -1,11 +1,8 @@
|
||||
#CONFIGURE BUILD SYSTEM
|
||||
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
|
||||
TARGET = MDBench-$(IDENTIFIER)
|
||||
BUILD_DIR = ./build-$(IDENTIFIER)
|
||||
SRC_DIR = ./$(OPT_SCHEME)
|
||||
TARGET = MDBench-$(TAG)
|
||||
BUILD_DIR = ./$(TAG)
|
||||
SRC_DIR = ./src
|
||||
ASM_DIR = ./asm
|
||||
COMMON_DIR = ./common
|
||||
CUDA_DIR = ./$(SRC_DIR)/cuda
|
||||
MAKE_DIR = ./
|
||||
Q ?= @
|
||||
|
||||
@@ -13,28 +10,27 @@ Q ?= @
|
||||
include $(MAKE_DIR)/config.mk
|
||||
include $(MAKE_DIR)/include_$(TAG).mk
|
||||
include $(MAKE_DIR)/include_LIKWID.mk
|
||||
include $(MAKE_DIR)/include_ISA.mk
|
||||
include $(MAKE_DIR)/include_GROMACS.mk
|
||||
INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes
|
||||
INCLUDES += -I./src/includes
|
||||
|
||||
ifeq ($(strip $(OPT_SCHEME)),gromacs)
|
||||
DEFINES += -DGROMACS
|
||||
endif
|
||||
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
||||
DEFINES += -DAOS
|
||||
DEFINES += -DAOS
|
||||
endif
|
||||
ifeq ($(strip $(DATA_TYPE)),SP)
|
||||
DEFINES += -DPRECISION=1
|
||||
DEFINES += -DPRECISION=1
|
||||
else
|
||||
DEFINES += -DPRECISION=2
|
||||
DEFINES += -DPRECISION=2
|
||||
endif
|
||||
|
||||
ifneq ($(ASM_SYNTAX), ATT)
|
||||
ASFLAGS += -masm=intel
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(SORT_ATOMS)),true)
|
||||
DEFINES += -DSORT_ATOMS
|
||||
ifneq ($(ATOMS_LOOP_RUNS),)
|
||||
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
|
||||
endif
|
||||
|
||||
ifneq ($(NEIGHBORS_LOOP_RUNS),)
|
||||
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(EXPLICIT_TYPES)),true)
|
||||
@@ -53,67 +49,18 @@ ifeq ($(strip $(COMPUTE_STATS)),true)
|
||||
DEFINES += -DCOMPUTE_STATS
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(XTC_OUTPUT)),true)
|
||||
DEFINES += -DXTC_OUTPUT
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
|
||||
DEFINES += -DUSE_REFERENCE_VERSION
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
|
||||
DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(DEBUG)),true)
|
||||
DEFINES += -DDEBUG
|
||||
endif
|
||||
|
||||
ifneq ($(VECTOR_WIDTH),)
|
||||
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__SIMD_KERNEL__)),true)
|
||||
DEFINES += -D__SIMD_KERNEL__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__SSE__)),true)
|
||||
DEFINES += -D__ISA_SSE__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX__)),true)
|
||||
DEFINES += -D__ISA_AVX__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX_FMA__)),true)
|
||||
DEFINES += -D__ISA_AVX_FMA__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX2__)),true)
|
||||
DEFINES += -D__ISA_AVX2__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX512__)),true)
|
||||
DEFINES += -D__ISA_AVX512__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
|
||||
DEFINES += -DENABLE_OMP_SIMD
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(USE_SIMD_KERNEL)),true)
|
||||
DEFINES += -DUSE_SIMD_KERNEL
|
||||
endif
|
||||
|
||||
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
|
||||
VPATH = $(SRC_DIR) $(ASM_DIR)
|
||||
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
||||
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
|
||||
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
||||
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
|
||||
OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%-common.o,$(wildcard $(COMMON_DIR)/*.c))
|
||||
ifeq ($(strip $(TAG)),NVCC)
|
||||
OBJ += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu))
|
||||
endif
|
||||
OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
|
||||
|
||||
|
||||
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
|
||||
|
||||
# $(warning $(OBJ))
|
||||
@@ -136,16 +83,6 @@ $(BUILD_DIR)/%.o: %.c
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||
|
||||
$(BUILD_DIR)/%-common.o: $(COMMON_DIR)/%.c
|
||||
$(info ===> COMPILE $@)
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||
|
||||
$(BUILD_DIR)/%-cuda.o: %.cu
|
||||
$(info ===> COMPILE $@)
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||
|
||||
$(BUILD_DIR)/%.s: %.c
|
||||
$(info ===> GENERATE ASM $@)
|
||||
$(Q)$(CC) -S $(ASFLAGS) $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
@@ -154,18 +91,16 @@ $(BUILD_DIR)/%.o: %.s
|
||||
$(info ===> ASSEMBLE $@)
|
||||
$(Q)$(AS) $< -o $@
|
||||
|
||||
$(BUILD_DIR)/%.o: %.cu
|
||||
$(info ===> COMPILE $@)
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||
|
||||
.PHONY: clean distclean tags info asm
|
||||
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf $(BUILD_DIR)
|
||||
@rm -rf $(TARGET)*
|
||||
@rm -f tags
|
||||
|
||||
cleanall:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf build-*
|
||||
@rm -rf MDBench-*
|
||||
@rm -f tags
|
||||
|
||||
distclean: clean
|
||||
|
||||
102
README.md
102
README.md
@@ -1,103 +1,27 @@
|
||||
# MD-Bench
|
||||
|
||||

|
||||
A simple, sequential C implementation of the [Mantevo miniMD](https://github.com/Mantevo/miniMD) benchmark in less than 1000 LOC.
|
||||
|
||||
MD-Bench is a toolbox for the performance engineering of short-range force calculation kernels on molecular-dynamics applications.
|
||||
It aims at covering all available state-of-the-art algorithms from different community codes such as LAMMPS and GROMACS.
|
||||
## Build
|
||||
|
||||
Apart from that, many tools to study and evaluate the in-depth performance of such kernels on distinct hardware are offered, like gather-bench, a standalone benchmark that mimics the data movement from MD kernels and the stubbed force calculation cases that focus on isolating the impacts caused by memory latencies and control flow divergence contributions in the overall performance.
|
||||
1. Open `config.mk` and edit the `TAG` value according to the tool chain used. Currently supported is GCC, CLANG (LLVM), and ICC (Intel).
|
||||
2. Change `DATA_LAYOUT` and `DATA_TYPE` if desired in config.mk.
|
||||
3. Open and adapt the compiler flags in `<include_<TOOLCHAIN>.mk`, e.g. in `include_ICC.mk` for the Intel tool chain.
|
||||
4. Build the binary calling `make`.
|
||||
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Verlet Lists</th>
|
||||
<th>GROMACS MxN</th>
|
||||
<th>Stubbed cases</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><a target="_blank" rel="noopener noreferrer" href="figures/verlet_v2.png"><img src="figures/verlet_v2.png" alt="Image" title="Verlet Lists" style="width: 100%;"></a></td>
|
||||
<td><a target="_blank" rel="noopener noreferrer" href="figures/gromacs_mxn_v2.png"><img src="figures/gromacs_mxn_v2.png" alt="Image" title="GROMACS MxN" style="width: 90%;"></a></td>
|
||||
<td><a target="_blank" rel="noopener noreferrer" href="figures/stub_new_v3.png"><img src="figures/stub_new_v3.png" alt="Image" title="Stubbed cases" style="width: 100%;"></a></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<!--  -->
|
||||
|
||||
## Build instructions
|
||||
|
||||
Properly configure your building by changing `config.mk` file. The following options are available:
|
||||
|
||||
- **TAG:** Compiler tag (available options: GCC, CLANG, ICC, ONEAPI, NVCC).
|
||||
- **ISA:** Instruction set (available options: SSE, AVX, AVX\_FMA, AVX2, AVX512).
|
||||
- **MASK\_REGISTERS:** Use AVX512 mask registers (always true when ISA is set to AVX512).
|
||||
- **OPT\_SCHEME:** Optimization algorithm (available options: lammps, gromacs).
|
||||
- **ENABLE\_LIKWID:** Enable likwid to make use of HPM counters.
|
||||
- **DATA\_TYPE:** Floating-point precision (available options: SP, DP).
|
||||
- **DATA\_LAYOUT:** Data layout for atom vector properties (available options: AOS, SOA).
|
||||
- **ASM\_SYNTAX:** Assembly syntax to use when generating assembly files (available options: ATT, INTEL).
|
||||
- **DEBUG:** Toggle debug mode.
|
||||
- **EXPLICIT\_TYPES:** Explicitly store and load atom types.
|
||||
- **MEM\_TRACER:** Trace memory addresses for cache simulator.
|
||||
- **INDEX\_TRACER:** Trace indexes and distances for gather-md.
|
||||
- **COMPUTE\_STATS:** Compute statistics.
|
||||
|
||||
Configurations for LAMMPS Verlet Lists optimization scheme:
|
||||
- **ENABLE\_OMP\_SIMD:** Use omp simd pragma on half neighbor-lists kernels.
|
||||
- **USE\_SIMD\_KERNEL:** Compile kernel with explicit SIMD intrinsics.
|
||||
|
||||
Configurations for GROMACS MxN optimization scheme:
|
||||
- **USE\_REFERENCE\_VERSION:** Use reference version (only for correction purposes).
|
||||
- **XTC\_OUTPUT:** Enable XTC output.
|
||||
- **HALF\_NEIGHBOR\_LISTS\_CHECK\_CJ:** Check if j-clusters are local when decreasing the reaction force.
|
||||
|
||||
Configurations for CUDA:
|
||||
- **USE\_CUDA\_HOST\_MEMORY:** Use CUDA host memory to optimize host-device transfers.
|
||||
|
||||
When done, just use `make` to compile the code.
|
||||
You can clean intermediate build results with `make clean`, and all build results with `make distclean`.
|
||||
You have to call `make clean` before `make` if you changed the build settings.
|
||||
|
||||
## Usage
|
||||
## Configuration
|
||||
|
||||
Use the following command to run a simulation:
|
||||
Currently all settings apart from the options described below are hard-coded in `main.c`.
|
||||
|
||||
```bash
|
||||
./MD-Bench-<TAG>-<OPT_SCHEME> [OPTION]...
|
||||
## Run the benchmark
|
||||
|
||||
Without any options 200 steps with system size 32x32x32 is used.
|
||||
|
||||
The default can be changed using the following options:
|
||||
```
|
||||
|
||||
Where `TAG` and `OPT_SCHEME` correspond to the building options with the same name.
|
||||
Without any options, a Copper FCC lattice system with size 32x32x32 (131072 atoms) over 200 time-steps using the Lennard-Jones potential (sigma=1.0, epsilon=1.0) is simulated.
|
||||
|
||||
The default behavior and other options can be changed using the following parameters:
|
||||
```
|
||||
-p <string>: file to read parameters from (can be specified more than once)
|
||||
-f <string>: force field (lj or eam), default lj
|
||||
-i <string>: input file with atom positions (dump)
|
||||
-e <string>: input file for EAM
|
||||
-n / --nsteps <int>: set number of timesteps for simulation
|
||||
-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction
|
||||
-r / --radius <real>: set cutoff radius
|
||||
-s / --skin <real>: set skin (verlet buffer)
|
||||
--freq <real>: processor frequency (GHz)
|
||||
--vtk <string>: VTK file for visualization
|
||||
--xtc <string>: XTC file for visualization
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
TBD
|
||||
|
||||
## Citations
|
||||
|
||||
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th International Conference on Parallel Processing and Applied Mathematics, Gdansk, Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint: [arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
|
||||
|
||||
## Credits
|
||||
|
||||
MD-Bench is developed by the Erlangen National High Performance Computing Center ([NHR@FAU](https://hpc.fau.de/)) at the University of Erlangen-Nürnberg.
|
||||
|
||||
## License
|
||||
|
||||
[LGPL-3.0](https://github.com/RRZE-HPC/MD-Bench/blob/master/LICENSE)
|
||||
|
||||
0
asm/.gitkeep
Normal file
0
asm/.gitkeep
Normal file
626
asm/unused/force-mem-only-with-likwid.s
Normal file
626
asm/unused/force-mem-only-with-likwid.s
Normal file
@@ -0,0 +1,626 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
|
||||
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
|
||||
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
|
||||
# mark_description "ICC/force.s";
|
||||
.file "force.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
.L_2__routine_start_computeForce_0:
|
||||
# -- Begin computeForce
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
|
||||
computeForce:
|
||||
# parameter 1: %rdi
|
||||
# parameter 2: %rsi
|
||||
# parameter 3: %rdx
|
||||
# parameter 4: %ecx
|
||||
# parameter 5: %r8d
|
||||
# parameter 6: %r9d
|
||||
..B1.1: # Preds ..B1.0
|
||||
# Execution count [1.00e+00]
|
||||
.cfi_startproc
|
||||
..___tag_value_computeForce.1:
|
||||
..L2:
|
||||
#121.112
|
||||
pushq %rbp #121.112
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbp #121.112
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_offset 6, -16
|
||||
andq $-64, %rsp #121.112
|
||||
pushq %r12 #121.112
|
||||
pushq %r13 #121.112
|
||||
pushq %r14 #121.112
|
||||
pushq %r15 #121.112
|
||||
pushq %rbx #121.112
|
||||
subq $88, %rsp #121.112
|
||||
xorl %eax, %eax #124.16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
|
||||
movq %rdx, %r15 #121.112
|
||||
movq %rsi, %r12 #121.112
|
||||
movq %rdi, %rbx #121.112
|
||||
..___tag_value_computeForce.11:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #124.16
|
||||
..___tag_value_computeForce.12:
|
||||
# LOE rbx r12 r15 xmm0
|
||||
..B1.51: # Preds ..B1.1
|
||||
# Execution count [1.00e+00]
|
||||
vmovsd %xmm0, 24(%rsp) #124.16[spill]
|
||||
# LOE rbx r12 r15
|
||||
..B1.2: # Preds ..B1.51
|
||||
# Execution count [1.00e+00]
|
||||
movl 4(%r12), %r13d #125.18
|
||||
movq 64(%r12), %r9 #127.20
|
||||
movq 72(%r12), %r14 #127.45
|
||||
movq 80(%r12), %r8 #127.70
|
||||
vmovsd 72(%rbx), %xmm2 #129.27
|
||||
vmovsd 8(%rbx), %xmm1 #130.23
|
||||
vmovsd (%rbx), %xmm0 #131.24
|
||||
testl %r13d, %r13d #134.24
|
||||
jle ..B1.43 # Prob 50% #134.24
|
||||
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.3: # Preds ..B1.2
|
||||
# Execution count [1.00e+00]
|
||||
xorl %ebx, %ebx #134.5
|
||||
movl %r13d, %edx #134.5
|
||||
xorl %ecx, %ecx #134.5
|
||||
movl $1, %esi #134.5
|
||||
xorl %eax, %eax #135.17
|
||||
shrl $1, %edx #134.5
|
||||
je ..B1.7 # Prob 9% #134.5
|
||||
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.5: # Preds ..B1.3 ..B1.5
|
||||
# Execution count [2.50e+00]
|
||||
movq %rax, (%rcx,%r9) #135.9
|
||||
incq %rbx #134.5
|
||||
movq %rax, (%rcx,%r14) #136.9
|
||||
movq %rax, (%rcx,%r8) #137.9
|
||||
movq %rax, 8(%rcx,%r9) #135.9
|
||||
movq %rax, 8(%rcx,%r14) #136.9
|
||||
movq %rax, 8(%rcx,%r8) #137.9
|
||||
addq $16, %rcx #134.5
|
||||
cmpq %rdx, %rbx #134.5
|
||||
jb ..B1.5 # Prob 63% #134.5
|
||||
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.6: # Preds ..B1.5
|
||||
# Execution count [9.00e-01]
|
||||
lea 1(%rbx,%rbx), %esi #135.9
|
||||
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.7: # Preds ..B1.3 ..B1.6
|
||||
# Execution count [1.00e+00]
|
||||
lea -1(%rsi), %edx #134.5
|
||||
cmpl %r13d, %edx #134.5
|
||||
jae ..B1.9 # Prob 9% #134.5
|
||||
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.8: # Preds ..B1.7
|
||||
# Execution count [9.00e-01]
|
||||
movslq %esi, %rsi #134.5
|
||||
movq %rax, -8(%r9,%rsi,8) #135.9
|
||||
movq %rax, -8(%r14,%rsi,8) #136.9
|
||||
movq %rax, -8(%r8,%rsi,8) #137.9
|
||||
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.9: # Preds ..B1.7 ..B1.8
|
||||
# Execution count [5.00e-01]
|
||||
movl $.L_2__STRING.0, %edi #141.5
|
||||
movq %r8, 32(%rsp) #141.5[spill]
|
||||
movq %r9, 80(%rsp) #141.5[spill]
|
||||
vmovsd %xmm2, (%rsp) #141.5[spill]
|
||||
vmovsd %xmm1, 8(%rsp) #141.5[spill]
|
||||
vmovsd %xmm0, 16(%rsp) #141.5[spill]
|
||||
..___tag_value_computeForce.18:
|
||||
# likwid_markerStartRegion(const char *)
|
||||
call likwid_markerStartRegion #141.5
|
||||
..___tag_value_computeForce.19:
|
||||
# LOE r12 r14 r15 r13d
|
||||
..B1.10: # Preds ..B1.9
|
||||
# Execution count [9.00e-01]
|
||||
vmovsd 16(%rsp), %xmm0 #[spill]
|
||||
xorl %esi, %esi #143.15
|
||||
vmovsd (%rsp), %xmm2 #[spill]
|
||||
xorl %eax, %eax #143.5
|
||||
vmulsd %xmm2, %xmm2, %xmm13 #129.45
|
||||
xorl %edi, %edi #143.5
|
||||
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
|
||||
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
|
||||
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
|
||||
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
|
||||
vmovsd 8(%rsp), %xmm1 #[spill]
|
||||
vbroadcastsd %xmm13, %zmm14 #129.25
|
||||
vbroadcastsd %xmm1, %zmm13 #130.21
|
||||
vbroadcastsd %xmm0, %zmm9 #197.45
|
||||
movslq %r13d, %r13 #143.5
|
||||
movq 24(%r15), %r10 #145.25
|
||||
movslq 16(%r15), %rdx #144.43
|
||||
movq 8(%r15), %rcx #144.19
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq 16(%r12), %rbx #146.25
|
||||
shlq $2, %rdx #126.5
|
||||
movq %r13, 64(%rsp) #143.5[spill]
|
||||
movq %r10, 72(%rsp) #143.5[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.11: # Preds ..B1.41 ..B1.10
|
||||
# Execution count [5.00e+00]
|
||||
movq 72(%rsp), %r9 #145.25[spill]
|
||||
vxorpd %xmm24, %xmm24, %xmm24 #149.22
|
||||
vmovapd %xmm24, %xmm18 #150.22
|
||||
movl (%r9,%rax,4), %r10d #145.25
|
||||
vmovapd %xmm18, %xmm4 #151.22
|
||||
vmovsd (%rdi,%rbx), %xmm10 #146.25
|
||||
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
|
||||
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
|
||||
testl %r10d, %r10d #173.32
|
||||
jle ..B1.41 # Prob 50% #173.32
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.12: # Preds ..B1.11
|
||||
# Execution count [4.50e+00]
|
||||
vpxord %zmm8, %zmm8, %zmm8 #149.22
|
||||
vmovaps %zmm8, %zmm7 #150.22
|
||||
vmovaps %zmm7, %zmm11 #151.22
|
||||
cmpl $8, %r10d #173.13
|
||||
jl ..B1.48 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.13: # Preds ..B1.12
|
||||
# Execution count [4.50e+00]
|
||||
cmpl $1200, %r10d #173.13
|
||||
jl ..B1.47 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.14: # Preds ..B1.13
|
||||
# Execution count [4.50e+00]
|
||||
movq %rdx, %r15 #144.43
|
||||
imulq %rsi, %r15 #144.43
|
||||
addq %rcx, %r15 #126.5
|
||||
movq %r15, %r11 #173.13
|
||||
andq $63, %r11 #173.13
|
||||
testl $3, %r11d #173.13
|
||||
je ..B1.16 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.15: # Preds ..B1.14
|
||||
# Execution count [2.25e+00]
|
||||
xorl %r11d, %r11d #173.13
|
||||
jmp ..B1.18 # Prob 100% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.16: # Preds ..B1.14
|
||||
# Execution count [2.25e+00]
|
||||
testl %r11d, %r11d #173.13
|
||||
je ..B1.18 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.17: # Preds ..B1.16
|
||||
# Execution count [2.50e+01]
|
||||
negl %r11d #173.13
|
||||
addl $64, %r11d #173.13
|
||||
shrl $2, %r11d #173.13
|
||||
cmpl %r11d, %r10d #173.13
|
||||
cmovl %r10d, %r11d #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
|
||||
# Execution count [5.00e+00]
|
||||
movl %r10d, %r13d #173.13
|
||||
subl %r11d, %r13d #173.13
|
||||
andl $7, %r13d #173.13
|
||||
negl %r13d #173.13
|
||||
addl %r10d, %r13d #173.13
|
||||
cmpl $1, %r11d #173.13
|
||||
jb ..B1.26 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.19: # Preds ..B1.18
|
||||
# Execution count [4.50e+00]
|
||||
vmovdqa %ymm15, %ymm4 #173.13
|
||||
xorl %r12d, %r12d #173.13
|
||||
vpbroadcastd %r11d, %ymm3 #173.13
|
||||
vbroadcastsd %xmm10, %zmm2 #146.23
|
||||
vbroadcastsd %xmm6, %zmm1 #147.23
|
||||
vbroadcastsd %xmm12, %zmm0 #148.23
|
||||
movslq %r11d, %r9 #173.13
|
||||
movq %r8, 32(%rsp) #173.13[spill]
|
||||
movq %r14, (%rsp) #173.13[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.20: # Preds ..B1.24 ..B1.19
|
||||
# Execution count [2.50e+01]
|
||||
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
|
||||
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
|
||||
kmovw %k3, %r14d #173.13
|
||||
vpaddd %ymm17, %ymm17, %ymm18 #175.40
|
||||
vpaddd %ymm18, %ymm17, %ymm17 #175.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.23: # Preds ..B1.20
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #175.40
|
||||
kmovw %k3, %k2 #175.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #175.40
|
||||
vpxord %zmm19, %zmm19, %zmm19 #175.40
|
||||
vpxord %zmm20, %zmm20, %zmm20 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
|
||||
..B1.24: # Preds ..B1.23
|
||||
# Execution count [2.50e+01]
|
||||
addq $8, %r12 #173.13
|
||||
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
|
||||
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
|
||||
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
|
||||
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
|
||||
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
|
||||
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
|
||||
#vrcp14pd %zmm25, %zmm24 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm24, %k0 #195.42
|
||||
#kmovw %k2, %r8d #194.26
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmovaps %zmm25, %zmm17 #195.42
|
||||
#andl %r8d, %r14d #194.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
|
||||
#kmovw %r14d, %k3 #198.21
|
||||
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
|
||||
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
|
||||
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
|
||||
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
|
||||
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
|
||||
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
|
||||
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
|
||||
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
|
||||
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
|
||||
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
|
||||
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
|
||||
cmpq %r9, %r12 #173.13
|
||||
jb ..B1.20 # Prob 82% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.25: # Preds ..B1.24
|
||||
# Execution count [4.50e+00]
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq (%rsp), %r14 #[spill]
|
||||
cmpl %r11d, %r10d #173.13
|
||||
je ..B1.40 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
|
||||
# Execution count [2.50e+01]
|
||||
lea 8(%r11), %r9d #173.13
|
||||
cmpl %r9d, %r13d #173.13
|
||||
jl ..B1.34 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.27: # Preds ..B1.26
|
||||
# Execution count [4.50e+00]
|
||||
movq %rdx, %r12 #144.43
|
||||
imulq %rsi, %r12 #144.43
|
||||
vbroadcastsd %xmm10, %zmm1 #146.23
|
||||
vbroadcastsd %xmm6, %zmm0 #147.23
|
||||
vbroadcastsd %xmm12, %zmm2 #148.23
|
||||
movslq %r11d, %r9 #173.13
|
||||
addq %rcx, %r12 #126.5
|
||||
movq %rdi, 8(%rsp) #126.5[spill]
|
||||
movq %rdx, 16(%rsp) #126.5[spill]
|
||||
movq %rcx, 40(%rsp) #126.5[spill]
|
||||
movq %rax, 48(%rsp) #126.5[spill]
|
||||
movq %rsi, 56(%rsp) #126.5[spill]
|
||||
movq %r8, 32(%rsp) #126.5[spill]
|
||||
movq %r14, (%rsp) #126.5[spill]
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.28: # Preds ..B1.32 ..B1.27
|
||||
# Execution count [2.50e+01]
|
||||
vmovdqu (%r12,%r9,4), %ymm3 #174.25
|
||||
vpaddd %ymm3, %ymm3, %ymm4 #175.40
|
||||
vpaddd %ymm4, %ymm3, %ymm3 #175.40
|
||||
movl (%r12,%r9,4), %r14d #174.25
|
||||
movl 4(%r12,%r9,4), %r8d #174.25
|
||||
movl 8(%r12,%r9,4), %edi #174.25
|
||||
movl 12(%r12,%r9,4), %esi #174.25
|
||||
lea (%r14,%r14,2), %r14d #175.40
|
||||
movl 16(%r12,%r9,4), %ecx #174.25
|
||||
lea (%r8,%r8,2), %r8d #175.40
|
||||
movl 20(%r12,%r9,4), %edx #174.25
|
||||
lea (%rdi,%rdi,2), %edi #175.40
|
||||
movl 24(%r12,%r9,4), %eax #174.25
|
||||
lea (%rsi,%rsi,2), %esi #175.40
|
||||
movl 28(%r12,%r9,4), %r15d #174.25
|
||||
lea (%rcx,%rcx,2), %ecx #175.40
|
||||
lea (%rdx,%rdx,2), %edx #175.40
|
||||
lea (%rax,%rax,2), %eax #175.40
|
||||
lea (%r15,%r15,2), %r15d #175.40
|
||||
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.31: # Preds ..B1.28
|
||||
# Execution count [1.25e+01]
|
||||
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
|
||||
vpxord %zmm4, %zmm4, %zmm4 #175.40
|
||||
vpxord %zmm17, %zmm17, %zmm17 #175.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
|
||||
..B1.32: # Preds ..B1.31
|
||||
# Execution count [2.50e+01]
|
||||
addl $8, %r11d #173.13
|
||||
addq $8, %r9 #173.13
|
||||
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
|
||||
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
|
||||
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
|
||||
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
|
||||
#vrcp14pd %zmm3, %zmm22 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm22, %k0 #195.42
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
|
||||
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
|
||||
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
|
||||
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
|
||||
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
|
||||
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
|
||||
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
|
||||
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
|
||||
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
|
||||
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
|
||||
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
|
||||
cmpl %r13d, %r11d #173.13
|
||||
jb ..B1.28 # Prob 82% #173.13
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.33: # Preds ..B1.32
|
||||
# Execution count [4.50e+00]
|
||||
movq 8(%rsp), %rdi #[spill]
|
||||
movq 16(%rsp), %rdx #[spill]
|
||||
movq 40(%rsp), %rcx #[spill]
|
||||
movq 48(%rsp), %rax #[spill]
|
||||
movq 56(%rsp), %rsi #[spill]
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq (%rsp), %r14 #[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
|
||||
# Execution count [5.00e+00]
|
||||
lea 1(%r13), %r9d #173.13
|
||||
cmpl %r10d, %r9d #173.13
|
||||
ja ..B1.40 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.35: # Preds ..B1.34
|
||||
# Execution count [2.50e+01]
|
||||
imulq %rdx, %rsi #144.43
|
||||
vbroadcastsd %xmm10, %zmm4 #146.23
|
||||
subl %r13d, %r10d #173.13
|
||||
addq %rcx, %rsi #126.5
|
||||
vpbroadcastd %r10d, %ymm0 #173.13
|
||||
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
|
||||
movslq %r13d, %r13 #173.13
|
||||
kmovw %k3, %r9d #173.13
|
||||
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
|
||||
vpaddd %ymm1, %ymm1, %ymm2 #175.40
|
||||
vpaddd %ymm2, %ymm1, %ymm0 #175.40
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.38: # Preds ..B1.35
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #175.40
|
||||
kmovw %k3, %k2 #175.40
|
||||
vpxord %zmm1, %zmm1, %zmm1 #175.40
|
||||
vpxord %zmm2, %zmm2, %zmm2 #175.40
|
||||
vpxord %zmm3, %zmm3, %zmm3 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.39: # Preds ..B1.38
|
||||
# Execution count [2.50e+01]
|
||||
#vbroadcastsd %xmm6, %zmm6 #147.23
|
||||
#vbroadcastsd %xmm12, %zmm12 #148.23
|
||||
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
|
||||
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
|
||||
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
|
||||
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
|
||||
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
|
||||
#vrcp14pd %zmm19, %zmm18 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm18, %k0 #195.42
|
||||
#kmovw %k2, %esi #194.26
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmovaps %zmm19, %zmm0 #195.42
|
||||
#andl %esi, %r9d #194.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
|
||||
#kmovw %r9d, %k3 #198.21
|
||||
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
|
||||
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
|
||||
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
|
||||
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
|
||||
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
|
||||
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
|
||||
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
|
||||
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
|
||||
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
|
||||
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
|
||||
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
|
||||
# Execution count [4.50e+00]
|
||||
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
|
||||
vpermd %zmm11, %zmm19, %zmm0 #151.22
|
||||
vpermd %zmm7, %zmm19, %zmm6 #150.22
|
||||
vpermd %zmm8, %zmm19, %zmm20 #149.22
|
||||
vaddpd %zmm11, %zmm0, %zmm11 #151.22
|
||||
vaddpd %zmm7, %zmm6, %zmm7 #150.22
|
||||
vaddpd %zmm8, %zmm20, %zmm8 #149.22
|
||||
vpermpd $78, %zmm11, %zmm1 #151.22
|
||||
vpermpd $78, %zmm7, %zmm10 #150.22
|
||||
vpermpd $78, %zmm8, %zmm21 #149.22
|
||||
vaddpd %zmm1, %zmm11, %zmm2 #151.22
|
||||
vaddpd %zmm10, %zmm7, %zmm12 #150.22
|
||||
vaddpd %zmm21, %zmm8, %zmm22 #149.22
|
||||
vpermpd $177, %zmm2, %zmm3 #151.22
|
||||
vpermpd $177, %zmm12, %zmm17 #150.22
|
||||
vpermpd $177, %zmm22, %zmm23 #149.22
|
||||
vaddpd %zmm3, %zmm2, %zmm4 #151.22
|
||||
vaddpd %zmm17, %zmm12, %zmm18 #150.22
|
||||
vaddpd %zmm23, %zmm22, %zmm24 #149.22
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.41: # Preds ..B1.40 ..B1.11
|
||||
# Execution count [5.00e+00]
|
||||
movq 80(%rsp), %rsi #208.9[spill]
|
||||
addq $24, %rdi #143.5
|
||||
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
|
||||
vmovsd %xmm0, (%rsi,%rax,8) #208.9
|
||||
movslq %eax, %rsi #143.32
|
||||
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
|
||||
vmovsd %xmm1, (%r14,%rax,8) #209.9
|
||||
incq %rsi #143.32
|
||||
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
|
||||
vmovsd %xmm2, (%r8,%rax,8) #210.9
|
||||
incq %rax #143.5
|
||||
cmpq 64(%rsp), %rax #143.5[spill]
|
||||
jb ..B1.11 # Prob 82% #143.5
|
||||
jmp ..B1.44 # Prob 100% #143.5
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.43: # Preds ..B1.2
|
||||
# Execution count [5.00e-01]
|
||||
movl $.L_2__STRING.0, %edi #141.5
|
||||
..___tag_value_computeForce.48:
|
||||
# likwid_markerStartRegion(const char *)
|
||||
call likwid_markerStartRegion #141.5
|
||||
..___tag_value_computeForce.49:
|
||||
# LOE
|
||||
..B1.44: # Preds ..B1.41 ..B1.43
|
||||
# Execution count [1.00e+00]
|
||||
movl $.L_2__STRING.0, %edi #219.5
|
||||
vzeroupper #219.5
|
||||
..___tag_value_computeForce.50:
|
||||
# likwid_markerStopRegion(const char *)
|
||||
call likwid_markerStopRegion #219.5
|
||||
..___tag_value_computeForce.51:
|
||||
# LOE
|
||||
..B1.45: # Preds ..B1.44
|
||||
# Execution count [1.00e+00]
|
||||
xorl %eax, %eax #221.16
|
||||
..___tag_value_computeForce.52:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #221.16
|
||||
..___tag_value_computeForce.53:
|
||||
# LOE xmm0
|
||||
..B1.46: # Preds ..B1.45
|
||||
# Execution count [1.00e+00]
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
|
||||
addq $88, %rsp #224.14
|
||||
.cfi_restore 3
|
||||
popq %rbx #224.14
|
||||
.cfi_restore 15
|
||||
popq %r15 #224.14
|
||||
.cfi_restore 14
|
||||
popq %r14 #224.14
|
||||
.cfi_restore 13
|
||||
popq %r13 #224.14
|
||||
.cfi_restore 12
|
||||
popq %r12 #224.14
|
||||
movq %rbp, %rsp #224.14
|
||||
popq %rbp #224.14
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 6
|
||||
ret #224.14
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_offset 6, -16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE
|
||||
..B1.47: # Preds ..B1.13
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
movl %r10d, %r13d #173.13
|
||||
xorl %r11d, %r11d #173.13
|
||||
andl $-8, %r13d #173.13
|
||||
jmp ..B1.26 # Prob 100% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.48: # Preds ..B1.12
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
xorl %r13d, %r13d #173.13
|
||||
jmp ..B1.34 # Prob 100% #173.13
|
||||
.align 16,0x90
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
.cfi_endproc
|
||||
# mark_end;
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.7:
|
||||
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
|
||||
.type .L_2il0floatpacket.7,@object
|
||||
.size .L_2il0floatpacket.7,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.8:
|
||||
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
|
||||
.type .L_2il0floatpacket.8,@object
|
||||
.size .L_2il0floatpacket.8,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.10:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.10,@object
|
||||
.size .L_2il0floatpacket.10,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.9:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.9,@object
|
||||
.size .L_2il0floatpacket.9,8
|
||||
.section .rodata.str1.4, "aMS",@progbits,1
|
||||
.align 4
|
||||
.align 4
|
||||
.L_2__STRING.0:
|
||||
.long 1668444006
|
||||
.word 101
|
||||
.type .L_2__STRING.0,@object
|
||||
.size .L_2__STRING.0,6
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
||||
585
asm/unused/force-mem-only.s
Normal file
585
asm/unused/force-mem-only.s
Normal file
@@ -0,0 +1,585 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
|
||||
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
|
||||
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
|
||||
.file "force.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
.L_2__routine_start_computeForce_0:
|
||||
# -- Begin computeForce
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
|
||||
computeForce:
|
||||
# parameter 1: %rdi
|
||||
# parameter 2: %rsi
|
||||
# parameter 3: %rdx
|
||||
# parameter 4: %ecx
|
||||
..B1.1: # Preds ..B1.0
|
||||
# Execution count [1.00e+00]
|
||||
.cfi_startproc
|
||||
..___tag_value_computeForce.1:
|
||||
..L2:
|
||||
#103.87
|
||||
pushq %rbp #103.87
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbp #103.87
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_offset 6, -16
|
||||
andq $-64, %rsp #103.87
|
||||
pushq %r12 #103.87
|
||||
pushq %r13 #103.87
|
||||
pushq %r14 #103.87
|
||||
subq $104, %rsp #103.87
|
||||
xorl %eax, %eax #106.16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
movq %rdx, %r14 #103.87
|
||||
movq %rsi, %r13 #103.87
|
||||
movq %rdi, %r12 #103.87
|
||||
..___tag_value_computeForce.9:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #106.16
|
||||
..___tag_value_computeForce.10:
|
||||
# LOE rbx r12 r13 r14 r15 xmm0
|
||||
..B1.48: # Preds ..B1.1
|
||||
# Execution count [1.00e+00]
|
||||
vmovsd %xmm0, 16(%rsp) #106.16[spill]
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.2: # Preds ..B1.48
|
||||
# Execution count [1.00e+00]
|
||||
movl 4(%r13), %ecx #107.18
|
||||
movq 64(%r13), %r11 #109.20
|
||||
movq 72(%r13), %r10 #109.45
|
||||
movq 80(%r13), %r9 #109.70
|
||||
vmovsd 72(%r12), %xmm2 #111.27
|
||||
vmovsd 8(%r12), %xmm1 #112.23
|
||||
vmovsd (%r12), %xmm0 #113.24
|
||||
testl %ecx, %ecx #116.24
|
||||
jle ..B1.42 # Prob 50% #116.24
|
||||
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.3: # Preds ..B1.2
|
||||
# Execution count [1.00e+00]
|
||||
xorl %edi, %edi #116.5
|
||||
movl %ecx, %edx #116.5
|
||||
xorl %esi, %esi #116.5
|
||||
movl $1, %r8d #116.5
|
||||
xorl %eax, %eax #117.17
|
||||
shrl $1, %edx #116.5
|
||||
je ..B1.7 # Prob 9% #116.5
|
||||
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.5: # Preds ..B1.3 ..B1.5
|
||||
# Execution count [2.50e+00]
|
||||
movq %rax, (%rsi,%r11) #117.9
|
||||
incq %rdi #116.5
|
||||
movq %rax, (%rsi,%r10) #118.9
|
||||
movq %rax, (%rsi,%r9) #119.9
|
||||
movq %rax, 8(%rsi,%r11) #117.9
|
||||
movq %rax, 8(%rsi,%r10) #118.9
|
||||
movq %rax, 8(%rsi,%r9) #119.9
|
||||
addq $16, %rsi #116.5
|
||||
cmpq %rdx, %rdi #116.5
|
||||
jb ..B1.5 # Prob 63% #116.5
|
||||
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.6: # Preds ..B1.5
|
||||
# Execution count [9.00e-01]
|
||||
lea 1(%rdi,%rdi), %r8d #117.9
|
||||
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.7: # Preds ..B1.3 ..B1.6
|
||||
# Execution count [1.00e+00]
|
||||
lea -1(%r8), %edx #116.5
|
||||
cmpl %ecx, %edx #116.5
|
||||
jae ..B1.9 # Prob 9% #116.5
|
||||
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.8: # Preds ..B1.7
|
||||
# Execution count [9.00e-01]
|
||||
movslq %r8d, %r8 #116.5
|
||||
movq %rax, -8(%r11,%r8,8) #117.9
|
||||
movq %rax, -8(%r10,%r8,8) #118.9
|
||||
movq %rax, -8(%r9,%r8,8) #119.9
|
||||
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.9: # Preds ..B1.7 ..B1.8
|
||||
# Execution count [9.00e-01]
|
||||
vmulsd %xmm2, %xmm2, %xmm13 #111.45
|
||||
xorl %edi, %edi #124.15
|
||||
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
|
||||
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
|
||||
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
|
||||
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
|
||||
vbroadcastsd %xmm13, %zmm14 #111.25
|
||||
vbroadcastsd %xmm1, %zmm13 #112.21
|
||||
vbroadcastsd %xmm0, %zmm9 #177.45
|
||||
movq 16(%r13), %rdx #127.25
|
||||
xorl %r8d, %r8d #124.5
|
||||
movslq %ecx, %r12 #124.5
|
||||
xorl %eax, %eax #124.5
|
||||
movq 24(%r14), %r13 #126.25
|
||||
movslq 16(%r14), %rcx #125.43
|
||||
movq 8(%r14), %rsi #125.19
|
||||
shlq $2, %rcx #108.5
|
||||
movq %r12, 80(%rsp) #124.5[spill]
|
||||
movq %r13, 88(%rsp) #124.5[spill]
|
||||
movq %r11, 96(%rsp) #124.5[spill]
|
||||
movq %r15, 8(%rsp) #124.5[spill]
|
||||
movq %rbx, (%rsp) #124.5[spill]
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.10: # Preds ..B1.40 ..B1.9
|
||||
# Execution count [5.00e+00]
|
||||
movq 88(%rsp), %rbx #126.25[spill]
|
||||
vxorpd %xmm24, %xmm24, %xmm24 #130.22
|
||||
vmovapd %xmm24, %xmm18 #131.22
|
||||
movl (%rbx,%r8,4), %r11d #126.25
|
||||
vmovapd %xmm18, %xmm4 #132.22
|
||||
vmovsd (%rax,%rdx), %xmm10 #127.25
|
||||
vmovsd 8(%rax,%rdx), %xmm6 #128.25
|
||||
vmovsd 16(%rax,%rdx), %xmm12 #129.25
|
||||
testl %r11d, %r11d #153.32
|
||||
jle ..B1.40 # Prob 50% #153.32
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.11: # Preds ..B1.10
|
||||
# Execution count [4.50e+00]
|
||||
vpxord %zmm8, %zmm8, %zmm8 #130.22
|
||||
vmovaps %zmm8, %zmm7 #131.22
|
||||
vmovaps %zmm7, %zmm11 #132.22
|
||||
cmpl $8, %r11d #153.13
|
||||
jl ..B1.45 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.12: # Preds ..B1.11
|
||||
# Execution count [4.50e+00]
|
||||
cmpl $1200, %r11d #153.13
|
||||
jl ..B1.44 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.13: # Preds ..B1.12
|
||||
# Execution count [4.50e+00]
|
||||
movq %rcx, %r15 #125.43
|
||||
imulq %rdi, %r15 #125.43
|
||||
addq %rsi, %r15 #108.5
|
||||
movq %r15, %r12 #153.13
|
||||
andq $63, %r12 #153.13
|
||||
testl $3, %r12d #153.13
|
||||
je ..B1.15 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.14: # Preds ..B1.13
|
||||
# Execution count [2.25e+00]
|
||||
xorl %r12d, %r12d #153.13
|
||||
jmp ..B1.17 # Prob 100% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.15: # Preds ..B1.13
|
||||
# Execution count [2.25e+00]
|
||||
testl %r12d, %r12d #153.13
|
||||
je ..B1.17 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.16: # Preds ..B1.15
|
||||
# Execution count [2.50e+01]
|
||||
negl %r12d #153.13
|
||||
addl $64, %r12d #153.13
|
||||
shrl $2, %r12d #153.13
|
||||
cmpl %r12d, %r11d #153.13
|
||||
cmovl %r11d, %r12d #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
|
||||
# Execution count [5.00e+00]
|
||||
movl %r11d, %r14d #153.13
|
||||
subl %r12d, %r14d #153.13
|
||||
andl $7, %r14d #153.13
|
||||
negl %r14d #153.13
|
||||
addl %r11d, %r14d #153.13
|
||||
cmpl $1, %r12d #153.13
|
||||
jb ..B1.25 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.18: # Preds ..B1.17
|
||||
# Execution count [4.50e+00]
|
||||
vmovdqa %ymm15, %ymm4 #153.13
|
||||
xorl %r13d, %r13d #153.13
|
||||
vpbroadcastd %r12d, %ymm3 #153.13
|
||||
vbroadcastsd %xmm10, %zmm2 #127.23
|
||||
vbroadcastsd %xmm6, %zmm1 #128.23
|
||||
vbroadcastsd %xmm12, %zmm0 #129.23
|
||||
movslq %r12d, %rbx #153.13
|
||||
movq %r9, 24(%rsp) #153.13[spill]
|
||||
movq %r10, 32(%rsp) #153.13[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.19: # Preds ..B1.23 ..B1.18
|
||||
# Execution count [2.50e+01]
|
||||
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
|
||||
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
|
||||
kmovw %k3, %r10d #153.13
|
||||
vpaddd %ymm17, %ymm17, %ymm18 #155.40
|
||||
vpaddd %ymm18, %ymm17, %ymm17 #155.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.22: # Preds ..B1.19
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #155.40
|
||||
kmovw %k3, %k2 #155.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #155.40
|
||||
vpxord %zmm19, %zmm19, %zmm19 #155.40
|
||||
vpxord %zmm20, %zmm20, %zmm20 #155.40
|
||||
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
|
||||
..B1.23: # Preds ..B1.22
|
||||
# Execution count [2.50e+01]
|
||||
addq $8, %r13 #153.13
|
||||
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
|
||||
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
|
||||
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
|
||||
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
|
||||
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
|
||||
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
|
||||
#vrcp14pd %zmm25, %zmm24 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm24, %k0 #175.42
|
||||
#kmovw %k2, %r9d #174.26
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmovaps %zmm25, %zmm17 #175.42
|
||||
#andl %r9d, %r10d #174.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
|
||||
#kmovw %r10d, %k3 #178.21
|
||||
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
|
||||
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
|
||||
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
|
||||
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
|
||||
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
|
||||
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
|
||||
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
|
||||
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
|
||||
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
|
||||
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
|
||||
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
|
||||
cmpq %rbx, %r13 #153.13
|
||||
jb ..B1.19 # Prob 82% #153.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.24: # Preds ..B1.23
|
||||
# Execution count [4.50e+00]
|
||||
movq 24(%rsp), %r9 #[spill]
|
||||
movq 32(%rsp), %r10 #[spill]
|
||||
cmpl %r12d, %r11d #153.13
|
||||
je ..B1.39 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
|
||||
# Execution count [2.50e+01]
|
||||
lea 8(%r12), %ebx #153.13
|
||||
cmpl %ebx, %r14d #153.13
|
||||
jl ..B1.33 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.26: # Preds ..B1.25
|
||||
# Execution count [4.50e+00]
|
||||
movq %rcx, %r13 #125.43
|
||||
imulq %rdi, %r13 #125.43
|
||||
vbroadcastsd %xmm10, %zmm1 #127.23
|
||||
vbroadcastsd %xmm6, %zmm0 #128.23
|
||||
vbroadcastsd %xmm12, %zmm2 #129.23
|
||||
movslq %r12d, %rbx #153.13
|
||||
addq %rsi, %r13 #108.5
|
||||
movq %rax, 40(%rsp) #108.5[spill]
|
||||
movq %rcx, 48(%rsp) #108.5[spill]
|
||||
movq %rsi, 56(%rsp) #108.5[spill]
|
||||
movq %r8, 64(%rsp) #108.5[spill]
|
||||
movq %rdi, 72(%rsp) #108.5[spill]
|
||||
movq %r9, 24(%rsp) #108.5[spill]
|
||||
movq %r10, 32(%rsp) #108.5[spill]
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.27: # Preds ..B1.31 ..B1.26
|
||||
# Execution count [2.50e+01]
|
||||
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
|
||||
vpaddd %ymm3, %ymm3, %ymm4 #155.40
|
||||
vpaddd %ymm4, %ymm3, %ymm3 #155.40
|
||||
movl (%r13,%rbx,4), %r10d #154.25
|
||||
movl 4(%r13,%rbx,4), %r9d #154.25
|
||||
movl 8(%r13,%rbx,4), %r8d #154.25
|
||||
movl 12(%r13,%rbx,4), %edi #154.25
|
||||
lea (%r10,%r10,2), %r10d #155.40
|
||||
movl 16(%r13,%rbx,4), %esi #154.25
|
||||
lea (%r9,%r9,2), %r9d #155.40
|
||||
movl 20(%r13,%rbx,4), %ecx #154.25
|
||||
lea (%r8,%r8,2), %r8d #155.40
|
||||
movl 24(%r13,%rbx,4), %eax #154.25
|
||||
lea (%rdi,%rdi,2), %edi #155.40
|
||||
movl 28(%r13,%rbx,4), %r15d #154.25
|
||||
lea (%rsi,%rsi,2), %esi #155.40
|
||||
lea (%rcx,%rcx,2), %ecx #155.40
|
||||
lea (%rax,%rax,2), %eax #155.40
|
||||
lea (%r15,%r15,2), %r15d #155.40
|
||||
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.30: # Preds ..B1.27
|
||||
# Execution count [1.25e+01]
|
||||
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
|
||||
vpxord %zmm4, %zmm4, %zmm4 #155.40
|
||||
vpxord %zmm17, %zmm17, %zmm17 #155.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #155.40
|
||||
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
|
||||
..B1.31: # Preds ..B1.30
|
||||
# Execution count [2.50e+01]
|
||||
addl $8, %r12d #153.13
|
||||
addq $8, %rbx #153.13
|
||||
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
|
||||
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
|
||||
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
|
||||
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
|
||||
#vrcp14pd %zmm3, %zmm22 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm22, %k0 #175.42
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
|
||||
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
|
||||
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
|
||||
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
|
||||
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
|
||||
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
|
||||
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
|
||||
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
|
||||
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
|
||||
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
|
||||
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
|
||||
cmpl %r14d, %r12d #153.13
|
||||
jb ..B1.27 # Prob 82% #153.13
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.32: # Preds ..B1.31
|
||||
# Execution count [4.50e+00]
|
||||
movq 40(%rsp), %rax #[spill]
|
||||
movq 48(%rsp), %rcx #[spill]
|
||||
movq 56(%rsp), %rsi #[spill]
|
||||
movq 64(%rsp), %r8 #[spill]
|
||||
movq 72(%rsp), %rdi #[spill]
|
||||
movq 24(%rsp), %r9 #[spill]
|
||||
movq 32(%rsp), %r10 #[spill]
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
|
||||
# Execution count [5.00e+00]
|
||||
lea 1(%r14), %ebx #153.13
|
||||
cmpl %r11d, %ebx #153.13
|
||||
ja ..B1.39 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.34: # Preds ..B1.33
|
||||
# Execution count [2.50e+01]
|
||||
imulq %rcx, %rdi #125.43
|
||||
vbroadcastsd %xmm10, %zmm4 #127.23
|
||||
subl %r14d, %r11d #153.13
|
||||
addq %rsi, %rdi #108.5
|
||||
vpbroadcastd %r11d, %ymm0 #153.13
|
||||
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
|
||||
movslq %r14d, %r14 #153.13
|
||||
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
|
||||
kmovw %k3, %edi #153.13
|
||||
vpaddd %ymm1, %ymm1, %ymm2 #155.40
|
||||
vpaddd %ymm2, %ymm1, %ymm0 #155.40
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.37: # Preds ..B1.34
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #155.40
|
||||
kmovw %k3, %k2 #155.40
|
||||
vpxord %zmm1, %zmm1, %zmm1 #155.40
|
||||
vpxord %zmm2, %zmm2, %zmm2 #155.40
|
||||
vpxord %zmm3, %zmm3, %zmm3 #155.40
|
||||
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.38: # Preds ..B1.37
|
||||
# Execution count [2.50e+01]
|
||||
#vbroadcastsd %xmm6, %zmm6 #128.23
|
||||
#vbroadcastsd %xmm12, %zmm12 #129.23
|
||||
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
|
||||
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
|
||||
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
|
||||
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
|
||||
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
|
||||
#vrcp14pd %zmm19, %zmm18 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm18, %k0 #175.42
|
||||
#kmovw %k2, %ebx #174.26
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmovaps %zmm19, %zmm0 #175.42
|
||||
#andl %ebx, %edi #174.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
|
||||
#kmovw %edi, %k3 #178.21
|
||||
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
|
||||
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
|
||||
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
|
||||
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
|
||||
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
|
||||
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
|
||||
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
|
||||
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
|
||||
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
|
||||
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
|
||||
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
|
||||
# Execution count [4.50e+00]
|
||||
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
|
||||
vpermd %zmm11, %zmm19, %zmm0 #132.22
|
||||
vpermd %zmm7, %zmm19, %zmm6 #131.22
|
||||
vpermd %zmm8, %zmm19, %zmm20 #130.22
|
||||
vaddpd %zmm11, %zmm0, %zmm11 #132.22
|
||||
vaddpd %zmm7, %zmm6, %zmm7 #131.22
|
||||
vaddpd %zmm8, %zmm20, %zmm8 #130.22
|
||||
vpermpd $78, %zmm11, %zmm1 #132.22
|
||||
vpermpd $78, %zmm7, %zmm10 #131.22
|
||||
vpermpd $78, %zmm8, %zmm21 #130.22
|
||||
vaddpd %zmm1, %zmm11, %zmm2 #132.22
|
||||
vaddpd %zmm10, %zmm7, %zmm12 #131.22
|
||||
vaddpd %zmm21, %zmm8, %zmm22 #130.22
|
||||
vpermpd $177, %zmm2, %zmm3 #132.22
|
||||
vpermpd $177, %zmm12, %zmm17 #131.22
|
||||
vpermpd $177, %zmm22, %zmm23 #130.22
|
||||
vaddpd %zmm3, %zmm2, %zmm4 #132.22
|
||||
vaddpd %zmm17, %zmm12, %zmm18 #131.22
|
||||
vaddpd %zmm23, %zmm22, %zmm24 #130.22
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.40: # Preds ..B1.39 ..B1.10
|
||||
# Execution count [5.00e+00]
|
||||
movq 96(%rsp), %rbx #188.9[spill]
|
||||
addq $24, %rax #124.5
|
||||
movslq %r8d, %rdi #124.32
|
||||
incq %rdi #124.32
|
||||
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
|
||||
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
|
||||
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
|
||||
#vmovsd %xmm1, (%r10,%r8,8) #189.9
|
||||
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
|
||||
#vmovsd %xmm2, (%r9,%r8,8) #190.9
|
||||
incq %r8 #124.5
|
||||
cmpq 80(%rsp), %r8 #124.5[spill]
|
||||
jb ..B1.10 # Prob 82% #124.5
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.41: # Preds ..B1.40
|
||||
# Execution count [9.00e-01]
|
||||
movq 8(%rsp), %r15 #[spill]
|
||||
.cfi_restore 15
|
||||
movq (%rsp), %rbx #[spill]
|
||||
.cfi_restore 3
|
||||
# LOE rbx r15
|
||||
..B1.42: # Preds ..B1.2 ..B1.41
|
||||
# Execution count [1.00e+00]
|
||||
xorl %eax, %eax #201.16
|
||||
vzeroupper #201.16
|
||||
..___tag_value_computeForce.43:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #201.16
|
||||
..___tag_value_computeForce.44:
|
||||
# LOE rbx r15 xmm0
|
||||
..B1.43: # Preds ..B1.42
|
||||
# Execution count [1.00e+00]
|
||||
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
|
||||
addq $104, %rsp #204.14
|
||||
.cfi_restore 14
|
||||
popq %r14 #204.14
|
||||
.cfi_restore 13
|
||||
popq %r13 #204.14
|
||||
.cfi_restore 12
|
||||
popq %r12 #204.14
|
||||
movq %rbp, %rsp #204.14
|
||||
popq %rbp #204.14
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 6
|
||||
ret #204.14
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_offset 6, -16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE
|
||||
..B1.44: # Preds ..B1.12
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
movl %r11d, %r14d #153.13
|
||||
xorl %r12d, %r12d #153.13
|
||||
andl $-8, %r14d #153.13
|
||||
jmp ..B1.25 # Prob 100% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.45: # Preds ..B1.11
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
xorl %r14d, %r14d #153.13
|
||||
jmp ..B1.33 # Prob 100% #153.13
|
||||
.align 16,0x90
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
.cfi_endproc
|
||||
# mark_end;
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.7:
|
||||
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
|
||||
.type .L_2il0floatpacket.7,@object
|
||||
.size .L_2il0floatpacket.7,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.8:
|
||||
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
|
||||
.type .L_2il0floatpacket.8,@object
|
||||
.size .L_2il0floatpacket.8,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.10:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.10,@object
|
||||
.size .L_2il0floatpacket.10,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.9:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.9,@object
|
||||
.size .L_2il0floatpacket.9,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
||||
312
asm/unused/force.s
Normal file
312
asm/unused/force.s
Normal file
@@ -0,0 +1,312 @@
|
||||
.intel_syntax noprefix
|
||||
|
||||
.text
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
computeForce:
|
||||
# parameter 1: rdi Parameter*
|
||||
# parameter 2: rsi Atom*
|
||||
# parameter 3: rdx Neighbor*
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
|
||||
vmovsd xmm2, QWORD PTR [72+rdi] # xmm2 <- param->cutforce
|
||||
vmovsd xmm1, QWORD PTR [8+rdi] # xmm1 <- param->sigma6
|
||||
vmovsd xmm0, QWORD PTR [rdi] # xmm0 <- param->epsilon
|
||||
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
|
||||
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
|
||||
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
|
||||
test r9d, r9d # atom->Nlocal <= 0
|
||||
jle ..exit_func
|
||||
|
||||
..B1.2:
|
||||
xor r10d, r10d # r10d <- 0
|
||||
mov ecx, r9d # ecx <- atom->Nlocal
|
||||
xor r8d, r8d # r8d <- 0
|
||||
mov r11d, 1 # r11d <- 1
|
||||
xor eax, eax # eax <- 0
|
||||
shr ecx, 1 # ecx <- atom->Nlocal >> 1
|
||||
je ..B1.6 # ecx == 0
|
||||
|
||||
# Init forces to zero loop
|
||||
..B1.4:
|
||||
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
|
||||
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
|
||||
add r8, 16 # i++
|
||||
inc r10 # i++
|
||||
cmp r10, rcx # i < Nlocal
|
||||
jb ..B1.4
|
||||
|
||||
..B1.5:
|
||||
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
|
||||
..B1.6:
|
||||
lea ecx, DWORD PTR [-1+r11] # r11d <- i * 2
|
||||
cmp ecx, r9d # i < Nlocal
|
||||
jae ..B1.8
|
||||
|
||||
..B1.7:
|
||||
movsxd r11, r11d # r11 <- i * 2
|
||||
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
|
||||
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
|
||||
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
|
||||
|
||||
..B1.8:
|
||||
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
|
||||
xor r8d, r8d # r8d <- 0
|
||||
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
|
||||
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
|
||||
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
|
||||
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
|
||||
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
|
||||
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
|
||||
vbroadcastsd zmm14, xmm0 # zmm16 <- [48 * epsilon, ...]
|
||||
movsxd r9, r9d # r9 <- atom->Nlocal
|
||||
xor r10d, r10d # r10d <- 0 (i)
|
||||
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
|
||||
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
|
||||
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
|
||||
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
|
||||
|
||||
### AOS
|
||||
xor eax, eax
|
||||
### SOA
|
||||
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
|
||||
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
|
||||
###
|
||||
|
||||
shl r12, 2 # r12 <- neighbor->maxneighs * 4
|
||||
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
|
||||
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
|
||||
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
|
||||
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
|
||||
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
|
||||
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
|
||||
|
||||
# Loop over all atoms
|
||||
..B1.9:
|
||||
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
|
||||
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0
|
||||
vmovapd xmm20, xmm25 # xmm20 <- 0
|
||||
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
|
||||
vmovapd xmm4, xmm20 # xmm4 <- 0
|
||||
|
||||
### AOS
|
||||
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
|
||||
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
|
||||
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
|
||||
### SOA
|
||||
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
|
||||
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
|
||||
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
|
||||
###
|
||||
|
||||
test r13d, r13d # numneighs <= 0
|
||||
jle ..exit_func
|
||||
|
||||
..B1.10:
|
||||
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
|
||||
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
|
||||
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
|
||||
|
||||
mov r14d, r13d # r14d <- numneighs
|
||||
xor r11d, r11d # r11d <- 0
|
||||
and r14d, -8 # r14d <- numneighs & (-8)
|
||||
lea r9d, DWORD PTR [8+r11] # r9d <- 8 (why lea?)
|
||||
cmp r14d, r9d # r14d < r9d
|
||||
jl ..B1.33
|
||||
|
||||
# cmp r13d, 8 # numneighs < 8
|
||||
# jl ..B1.32
|
||||
#..B1.11:
|
||||
# cmp r13d, 1200 # numneighs < 1200
|
||||
# jl ..B1.31
|
||||
#..B1.12:
|
||||
# mov rcx, r12
|
||||
# imul rcx, r8
|
||||
# add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i (r8)]
|
||||
# mov r9, rcx # r9 <- neighs
|
||||
# and r9, 63 # r9 <- neighs & 63
|
||||
# test r9d, 3 # (r9d & 3) == 0 => r9d divisible by 8
|
||||
# je ..B1.14
|
||||
#..B1.13:
|
||||
# xor r9d, r9d # r9d <- 0
|
||||
# jmp ..B1.16
|
||||
#..B1.14:
|
||||
# test r9d, r9d # r9d == 0
|
||||
# je ..B1.16
|
||||
#..B1.15:
|
||||
# neg r9d
|
||||
# add r9d, 64
|
||||
# shr r9d, 2 # r9d <- (64 - r9d) / 4
|
||||
# cmp r13d, r9d # numneighs < r9d
|
||||
# cmovl r9d, r13d # r9d <- MIN(numneighs, r9d)
|
||||
#..B1.16:
|
||||
# mov ebx, r13d
|
||||
# sub ebx, r9d
|
||||
# and ebx, 7
|
||||
# neg ebx
|
||||
# add ebx, r13d # ebx <- -((numneighs - r9d) & 7) + numneighs
|
||||
# cmp r9d, 1 # r9d < 1
|
||||
# jb ..B1.20
|
||||
#..B1.20:
|
||||
# lea ecx, DWORD PTR [8+r9] # ecx <- r9d[1]
|
||||
# cmp ebx, ecx # -((numneighs - r9d) & 7) + numneighs < neighs
|
||||
# jl ..B1.24
|
||||
|
||||
..B1.21:
|
||||
mov rcx, r12
|
||||
imul rcx, r8
|
||||
vbroadcastsd zmm0, xmm8
|
||||
vbroadcastsd zmm1, xmm9
|
||||
vbroadcastsd zmm2, xmm10
|
||||
movsxd r14, r9d
|
||||
add rcx, r11
|
||||
|
||||
..B1.22:
|
||||
vpcmpeqb k2, xmm0, xmm0
|
||||
add r9d, 8
|
||||
vpcmpeqb k1, xmm0, xmm0
|
||||
vpcmpeqb k3, xmm0, xmm0
|
||||
vmovdqu ymm3, YMMWORD PTR [rcx+r14*4]
|
||||
add r14, 8
|
||||
vpxord zmm5, zmm5, zmm5
|
||||
vpxord zmm6, zmm6, zmm6
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
vpxord zmm4, zmm4, zmm4
|
||||
vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
|
||||
vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8]
|
||||
vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8]
|
||||
### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5
|
||||
vsubpd zmm28, zmm0, zmm4
|
||||
vsubpd zmm31, zmm2, zmm6
|
||||
vmulpd zmm20, zmm29, zmm29
|
||||
vfmadd231pd zmm20, zmm28, zmm28
|
||||
vfmadd231pd zmm20, zmm31, zmm31
|
||||
|
||||
# if condition cutoff radius
|
||||
vrcp14pd zmm27, zmm20 #-> sr2
|
||||
vcmppd k5, zmm20, zmm16, 1
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||
vfmadd231pd zmm13{k5}, zmm30, zmm28
|
||||
vfmadd231pd zmm12{k5}, zmm30, zmm29
|
||||
vfmadd231pd zmm11{k5}, zmm30, zmm31
|
||||
cmp r9d, ebx
|
||||
jb ..B1.22
|
||||
#end neighbor loop
|
||||
|
||||
..B1.26:
|
||||
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
|
||||
vpermd zmm0, zmm10, zmm11
|
||||
vpermd zmm5, zmm10, zmm12
|
||||
vpermd zmm21, zmm10, zmm13
|
||||
vaddpd zmm11, zmm0, zmm11
|
||||
vaddpd zmm12, zmm5, zmm12
|
||||
vaddpd zmm13, zmm21, zmm13
|
||||
vpermpd zmm1, zmm11, 78
|
||||
vpermpd zmm6, zmm12, 78
|
||||
vpermpd zmm22, zmm13, 78
|
||||
vaddpd zmm2, zmm11, zmm1
|
||||
vaddpd zmm8, zmm12, zmm6
|
||||
vaddpd zmm23, zmm13, zmm22
|
||||
vpermpd zmm3, zmm2, 177
|
||||
vpermpd zmm9, zmm8, 177
|
||||
vpermpd zmm24, zmm23, 177
|
||||
vaddpd zmm4, zmm2, zmm3
|
||||
vaddpd zmm20, zmm8, zmm9
|
||||
vaddpd zmm25, zmm23, zmm24
|
||||
|
||||
#exit function
|
||||
..exit_func:
|
||||
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
|
||||
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
|
||||
|
||||
### AOS
|
||||
add rax, 24
|
||||
###
|
||||
|
||||
movsxd r8, r10d #55.32
|
||||
inc r8 #55.32
|
||||
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
|
||||
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
|
||||
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
|
||||
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
|
||||
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
|
||||
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
|
||||
inc r10 #55.5
|
||||
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
|
||||
jb ..B1.9
|
||||
vzeroupper #93.12
|
||||
vxorpd xmm0, xmm0, xmm0 #93.12
|
||||
pop r14 #93.12
|
||||
pop r13 #93.12
|
||||
pop r12 #93.12
|
||||
ret #93.12
|
||||
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
|
||||
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
||||
@@ -1,44 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <util.h>
|
||||
|
||||
void *allocate(int alignment, size_t bytesize) {
|
||||
void *ptr;
|
||||
int errorCode;
|
||||
|
||||
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
||||
if(errorCode == EINVAL) {
|
||||
fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if(errorCode == ENOMEM) {
|
||||
fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if(ptr == NULL) {
|
||||
fprintf(stderr, "Error: posix_memalign failed!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void *reallocate(void* ptr, int alignment, size_t new_bytesize, size_t old_bytesize) {
|
||||
void *newarray = allocate(alignment, new_bytesize);
|
||||
if(ptr != NULL) {
|
||||
memcpy(newarray, ptr, old_bytesize);
|
||||
free(ptr);
|
||||
}
|
||||
|
||||
return newarray;
|
||||
}
|
||||
97
common/box.c
97
common/box.c
@@ -1,97 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
#include <box.h>
|
||||
#include <mpi.h>
|
||||
|
||||
int overlapBox(int dim, int dir, const Box* mybox, const Box* other, Box* cut, MD_FLOAT xprd, MD_FLOAT cutneigh)
|
||||
{
|
||||
int pbc = -100;
|
||||
MD_FLOAT min[3], max[3];
|
||||
int same = (mybox->id == other->id) ? 1 : 0;
|
||||
|
||||
//projections
|
||||
min[_x] = MAX(mybox->lo[_x], other->lo[_x]); max[_x] = MIN(mybox->hi[_x], other->hi[_x]);
|
||||
min[_y] = MAX(mybox->lo[_y], other->lo[_y]); max[_y] = MIN(mybox->hi[_y], other->hi[_y]);
|
||||
min[_z] = MAX(mybox->lo[_z], other->lo[_z]); max[_z] = MIN(mybox->hi[_z], other->hi[_z]);
|
||||
|
||||
//Intersection no periodic case
|
||||
if(!same){
|
||||
if (dir == 0) max[dim] = MIN(mybox->hi[dim], other->hi[dim]+ cutneigh);
|
||||
if (dir == 1) min[dim] = MAX(mybox->lo[dim], other->lo[dim]- cutneigh);
|
||||
if ((min[_x]<max[_x]) && (min[_y]<max[_y]) && (min[_z]<max[_z])) pbc = 0;
|
||||
}
|
||||
|
||||
//Intersection periodic case
|
||||
if(pbc < 0)
|
||||
{
|
||||
if(dir == 0){
|
||||
min[dim] = MAX(mybox->lo[dim] , other->lo[dim]- xprd);
|
||||
max[dim] = MIN(mybox->hi[dim] , other->hi[dim]- xprd + cutneigh);
|
||||
|
||||
} else {
|
||||
min[dim] = MAX(mybox->lo[dim], other->lo[dim]+ xprd - cutneigh);
|
||||
max[dim] = MIN(mybox->hi[dim], other->hi[dim]+ xprd);
|
||||
|
||||
}
|
||||
if((min[_x]<max[_x]) && (min[_y]<max[_y]) && (min[_z]<max[_z]))
|
||||
pbc = (dir == 0) ? 1:-1;
|
||||
}
|
||||
|
||||
//storing the cuts
|
||||
cut->lo[_x] = min[_x]; cut->hi[_x] = max[_x];
|
||||
cut->lo[_y] = min[_y]; cut->hi[_y] = max[_y];
|
||||
cut->lo[_z] = min[_z]; cut->hi[_z] = max[_z];
|
||||
|
||||
return pbc;
|
||||
}
|
||||
|
||||
int overlapFullBox(Parameter* param, MD_FLOAT *cutneigh ,const Box* mybox, const Box* other)
|
||||
{
|
||||
MD_FLOAT min[3], max[3];
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int k = -1; k < 2; k++)
|
||||
{
|
||||
for(int j = -1; j < 2; j++)
|
||||
{
|
||||
for(int i= -1; i < 2; i++)
|
||||
{
|
||||
min[_x] = MAX(mybox->lo[_x], other->lo[_x]-cutneigh[_x] + i*xprd);
|
||||
min[_y] = MAX(mybox->lo[_y], other->lo[_y]-cutneigh[_y] + j*yprd);
|
||||
min[_z] = MAX(mybox->lo[_z], other->lo[_z]-cutneigh[_z] + k*zprd);
|
||||
max[_x] = MIN(mybox->hi[_x], other->hi[_x]+cutneigh[_x] + i*xprd);
|
||||
max[_y] = MIN(mybox->hi[_y], other->hi[_y]+cutneigh[_y] + j*yprd);
|
||||
max[_z] = MIN(mybox->hi[_z], other->hi[_z]+cutneigh[_z] + k*zprd);
|
||||
if ((min[_x]<max[_x]) && (min[_y]<max[_y]) && (min[_z]<max[_z]))
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void expandBox(int iswap, const Box* me, const Box* other, Box* cut, MD_FLOAT cutneigh)
|
||||
{
|
||||
if(iswap==2 || iswap==3){
|
||||
if(me->lo[_x] <= other->lo[_x]) cut->lo[_x] -= cutneigh;
|
||||
if(me->hi[_x] >= other->hi[_x]) cut->hi[_x] += cutneigh;
|
||||
}
|
||||
|
||||
if(iswap==4 || iswap==5){
|
||||
if(me->lo[_x] <= other->lo[_x]) cut->lo[_x] -= cutneigh;
|
||||
if(me->hi[_x] >= other->hi[_x]) cut->hi[_x] += cutneigh;
|
||||
if(me->lo[_y] <= other->lo[_y]) cut->lo[_y] -= cutneigh;
|
||||
if(me->hi[_y] >= other->hi[_y]) cut->hi[_y] += cutneigh;
|
||||
}
|
||||
}
|
||||
|
||||
556
common/comm.c
556
common/comm.c
@@ -1,556 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <comm.h>
|
||||
#include <allocate.h>
|
||||
#include <mpi.h>
|
||||
#include <util.h>
|
||||
|
||||
#define NEIGHMIN 6
|
||||
#define BUFFACTOR 2
|
||||
#define BUFMIN 1000
|
||||
#define BUFEXTRA 100
|
||||
#define world MPI_COMM_WORLD
|
||||
|
||||
MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
|
||||
static inline void allocDynamicBuffers(Comm*);
|
||||
static inline void freeDynamicBuffers(Comm*);
|
||||
static inline void freeBuffers(Comm*);
|
||||
|
||||
void defineReverseList(Comm* comm){
|
||||
int dim = 0;
|
||||
int index = 0;
|
||||
int me = comm->myproc;
|
||||
|
||||
//Set the inverse list
|
||||
for(int iswap = 0; iswap<6; iswap++){
|
||||
int dim = comm->swapdim[iswap];
|
||||
int dir = comm->swapdir[iswap];
|
||||
int invswap = comm->swap[dim][(dir+1)%2];
|
||||
|
||||
for(int ineigh = comm->sendfrom[invswap]; ineigh< comm->sendtill[invswap]; ineigh++)
|
||||
comm->nrecv[index++] = comm->nsend[ineigh];
|
||||
|
||||
comm->recvfrom[iswap] = (iswap == 0) ? 0 : comm->recvtill[iswap-1];
|
||||
comm->recvtill[iswap] = index;
|
||||
}
|
||||
|
||||
//set if myproc is unique in the swap
|
||||
for(int iswap = 0; iswap<6; iswap++){
|
||||
int sizeswap = comm->sendtill[iswap]-comm->sendfrom[iswap];
|
||||
int index = comm->sendfrom[iswap];
|
||||
int myneigh = comm->nsend[index];
|
||||
comm->othersend[iswap] = (sizeswap != 1 || comm->myproc != myneigh) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
void addNeighToExchangeList(Comm* comm, int newneigh){
|
||||
|
||||
int numneigh = comm->numneighexch;
|
||||
|
||||
if(comm->numneighexch>=comm->maxneighexch){
|
||||
size_t oldByteSize = comm->maxneighexch*sizeof(int);
|
||||
comm->maxneighexch *=2;
|
||||
comm->nexch = (int*) reallocate(comm->nexch, ALIGNMENT, comm->maxneighexch * sizeof(int), oldByteSize);
|
||||
}
|
||||
|
||||
// Add the new element to the list
|
||||
comm->nexch[numneigh] = newneigh;
|
||||
comm->numneighexch++;
|
||||
}
|
||||
|
||||
//Exported functions
|
||||
void neighComm(Comm *comm, Parameter* param, Grid *grid)
|
||||
{
|
||||
int me = comm->myproc;
|
||||
int numproc = comm ->numproc;
|
||||
int PAD = 6; //number of elements for processor in the map
|
||||
int ineigh = 0;
|
||||
int sneigh = 0;
|
||||
MD_FLOAT *map = grid->map;
|
||||
MD_FLOAT cutneigh = param->cutneigh;
|
||||
MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
|
||||
Box mybox, other, cut;
|
||||
|
||||
//needed for rebalancing
|
||||
freeDynamicBuffers(comm);
|
||||
|
||||
//Local box
|
||||
mybox.id = me;
|
||||
mybox.lo[_x] = map[me*PAD+0]; mybox.hi[_x] = map[me*PAD+3];
|
||||
mybox.lo[_y] = map[me*PAD+1]; mybox.hi[_y] = map[me*PAD+4];
|
||||
mybox.lo[_z] = map[me*PAD+2]; mybox.hi[_z] = map[me*PAD+5];
|
||||
|
||||
//Check for all possible neighbours only for exchange atoms
|
||||
comm->numneighexch = 0;
|
||||
for(int proc = 0; proc <numproc; proc++){
|
||||
other.id = proc;
|
||||
other.lo[_x] = map[proc*PAD+0]; other.hi[_x] = map[proc*PAD+3];
|
||||
other.lo[_y] = map[proc*PAD+1]; other.hi[_y] = map[proc*PAD+4];
|
||||
other.lo[_z] = map[proc*PAD+2]; other.hi[_z] = map[proc*PAD+5];
|
||||
|
||||
if(proc != me){
|
||||
int intersection = overlapFullBox(param,grid->cutneigh,&mybox,&other);
|
||||
if(intersection) addNeighToExchangeList(comm,proc);
|
||||
}
|
||||
}
|
||||
|
||||
//MAP is stored as follows: xlo,ylo,zlo,xhi,yhi,zhi
|
||||
for(int iswap = 0; iswap <6; iswap++)
|
||||
{
|
||||
int dir = comm->swapdir[iswap];
|
||||
int dim = comm->swapdim[iswap];
|
||||
|
||||
for(int proc = 0; proc < numproc; proc++)
|
||||
{
|
||||
//Check for neighbours along dimmensions, for forwardComm, backwardComm and ghostComm
|
||||
other.id = proc;
|
||||
other.lo[_x] = map[proc*PAD+0]; other.hi[_x] = map[proc*PAD+3];
|
||||
other.lo[_y] = map[proc*PAD+1]; other.hi[_y] = map[proc*PAD+4];
|
||||
other.lo[_z] = map[proc*PAD+2]; other.hi[_z] = map[proc*PAD+5];
|
||||
|
||||
//return if two boxes intersect: -100 not intersection, 0, 1 and -1 intersection for each different pbc.
|
||||
int pbc = overlapBox(dim,dir,&mybox,&other,&cut,prd[dim],cutneigh);
|
||||
if(pbc == -100) continue;
|
||||
|
||||
expandBox(iswap, &mybox, &other, &cut, cutneigh);
|
||||
|
||||
if(ineigh >= comm->maxneigh) {
|
||||
size_t oldByteSize = comm->maxneigh*sizeof(int);
|
||||
size_t oldBoxSize = comm->maxneigh*sizeof(Box);
|
||||
comm->maxneigh = 2*ineigh;
|
||||
comm->nsend = (int*) reallocate(comm->nsend, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
|
||||
comm->nrecv = (int*) reallocate(comm->nrecv, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
|
||||
comm->pbc_x = (int*) reallocate(comm->pbc_x, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
|
||||
comm->pbc_y = (int*) reallocate(comm->pbc_y, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
|
||||
comm->pbc_z = (int*) reallocate(comm->pbc_z, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize);
|
||||
comm->boxes = (Box*) reallocate(comm->boxes, ALIGNMENT, comm->maxneigh * sizeof(Box), oldBoxSize);
|
||||
}
|
||||
|
||||
comm->boxes[ineigh] = cut;
|
||||
comm->nsend[ineigh] = proc;
|
||||
comm->pbc_x[ineigh] = (dim == _x) ? pbc : 0;
|
||||
comm->pbc_y[ineigh] = (dim == _y) ? pbc : 0;
|
||||
comm->pbc_z[ineigh] = (dim == _z) ? pbc : 0;
|
||||
ineigh++;
|
||||
}
|
||||
|
||||
comm->sendfrom[iswap] = (iswap == 0) ? 0:comm->sendtill[iswap-1];
|
||||
comm->sendtill[iswap] = ineigh;
|
||||
comm->numneigh = ineigh;
|
||||
}
|
||||
|
||||
allocDynamicBuffers(comm);
|
||||
defineReverseList(comm);
|
||||
}
|
||||
|
||||
void initComm(int* argc, char*** argv, Comm* comm)
|
||||
{
|
||||
//MPI Initialize
|
||||
MPI_Init(argc, argv);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &(comm->numproc));
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &(comm->myproc));
|
||||
comm->numneigh = 0;
|
||||
comm->numneighexch = 0;
|
||||
comm->nrecv=NULL;
|
||||
comm->nsend=NULL;
|
||||
comm->nexch=NULL;
|
||||
comm->pbc_x=NULL;
|
||||
comm->pbc_y=NULL;
|
||||
comm->pbc_z=NULL;
|
||||
comm->boxes=NULL;
|
||||
comm->atom_send=NULL;
|
||||
comm->atom_recv=NULL;
|
||||
comm->off_atom_send=NULL;
|
||||
comm->off_atom_recv=NULL;
|
||||
comm->maxsendlist=NULL;
|
||||
comm->sendlist=NULL;
|
||||
comm->buf_send=NULL;
|
||||
comm->buf_recv=NULL;
|
||||
}
|
||||
|
||||
void endComm(Comm* comm)
|
||||
{
|
||||
comm->maxneigh = 0;
|
||||
comm->maxneighexch =0;
|
||||
comm->maxsend = 0;
|
||||
comm->maxrecv = 0;
|
||||
freeBuffers(comm);
|
||||
MPI_Finalize();
|
||||
}
|
||||
|
||||
void setupComm(Comm* comm, Parameter* param, Grid* grid){
|
||||
|
||||
comm->swap[_x][0] = 0; comm->swap[_x][1] =1;
|
||||
comm->swap[_y][0] = 2; comm->swap[_y][1] =3;
|
||||
comm->swap[_z][0] = 4; comm->swap[_z][1] =5;
|
||||
|
||||
comm->swapdim[0] = comm->swapdim[1] = _x;
|
||||
comm->swapdim[2] = comm->swapdim[3] = _y;
|
||||
comm->swapdim[4] = comm->swapdim[5] = _z;
|
||||
|
||||
comm->swapdir[0] = comm->swapdir[2] = comm->swapdir[4] = 0;
|
||||
comm->swapdir[1] = comm->swapdir[3] = comm->swapdir[5] = 1;
|
||||
|
||||
for(int i = 0; i<6; i++){
|
||||
comm->sendfrom[i] = 0;
|
||||
comm->sendtill[i] = 0;
|
||||
comm->recvfrom[i] = 0;
|
||||
comm->recvtill[i] = 0;
|
||||
}
|
||||
|
||||
comm->forwardSize = FORWARD_SIZE; //send coordiantes x,y,z
|
||||
comm->reverseSize = REVERSE_SIZE; //return forces fx, fy, fz
|
||||
comm->ghostSize = GHOST_SIZE; //send x,y,z,type;
|
||||
comm->exchangeSize = EXCHANGE_SIZE; //send x,y,z,vx,vy,vz,type
|
||||
|
||||
//Allocate memory for recv buffer and recv buffer
|
||||
comm->maxsend = BUFMIN;
|
||||
comm->maxrecv = BUFMIN;
|
||||
comm->buf_send = (MD_FLOAT*) allocate(ALIGNMENT,(comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT));
|
||||
comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
|
||||
|
||||
comm->maxneighexch = NEIGHMIN;
|
||||
comm->nexch = (int*) allocate(ALIGNMENT, comm->maxneighexch * sizeof(int));
|
||||
|
||||
comm->maxneigh = NEIGHMIN;
|
||||
comm->nsend = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
|
||||
comm->nrecv = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
|
||||
comm->pbc_x = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
|
||||
comm->pbc_y = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
|
||||
comm->pbc_z = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
|
||||
comm->boxes = (Box*) allocate(ALIGNMENT, comm->maxneigh * sizeof(Box));
|
||||
|
||||
neighComm(comm, param, grid);
|
||||
}
|
||||
|
||||
void forwardComm(Comm* comm, Atom* atom, int iswap)
|
||||
{
|
||||
int nrqst=0, offset=0, nsend=0, nrecv=0;
|
||||
int pbc[3];
|
||||
int size = comm->forwardSize;
|
||||
int maxrqst = comm->numneigh;
|
||||
MD_FLOAT* buf;
|
||||
MPI_Request requests[maxrqst];
|
||||
|
||||
for(int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_send[ineigh];
|
||||
pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh]; pbc[_z]=comm->pbc_z[ineigh];
|
||||
packForward(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &comm->buf_send[offset*size],pbc);
|
||||
}
|
||||
|
||||
//Receives elements
|
||||
if(comm->othersend[iswap])
|
||||
for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_recv[ineigh]*size;
|
||||
nrecv = comm->atom_recv[ineigh]*size;
|
||||
MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
|
||||
}
|
||||
|
||||
//Send elements
|
||||
if(comm->othersend[iswap])
|
||||
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_send[ineigh]*size;
|
||||
nsend = comm->atom_send[ineigh]*size;
|
||||
MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world);
|
||||
}
|
||||
|
||||
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
|
||||
|
||||
if(comm->othersend[iswap]) buf = comm->buf_recv;
|
||||
else buf = comm->buf_send;
|
||||
|
||||
/* unpack buffer */
|
||||
for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_recv[ineigh];
|
||||
unpackForward(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &buf[offset*size]);
|
||||
}
|
||||
}
|
||||
|
||||
void reverseComm(Comm* comm, Atom* atom, int iswap)
|
||||
{
|
||||
int nrqst=0, offset=0, nsend=0, nrecv=0 ;
|
||||
int size = comm->reverseSize;
|
||||
int maxrqst = comm->numneigh;
|
||||
MD_FLOAT* buf;
|
||||
MPI_Request requests[maxrqst];
|
||||
|
||||
for(int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_recv[ineigh];
|
||||
packReverse(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &comm->buf_send[offset*size]);
|
||||
}
|
||||
//Receives elements
|
||||
if(comm->othersend[iswap])
|
||||
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_send[ineigh]*size;
|
||||
nrecv = comm->atom_send[ineigh]*size;
|
||||
MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nsend[ineigh],0,world,&requests[nrqst++]);
|
||||
}
|
||||
//Send elements
|
||||
if(comm->othersend[iswap])
|
||||
for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_recv[ineigh]*size;
|
||||
nsend = comm->atom_recv[ineigh]*size;
|
||||
MPI_Send(&comm->buf_send[offset],nsend,type,comm->nrecv[ineigh],0,world);
|
||||
}
|
||||
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
|
||||
if(comm->othersend[iswap]) buf = comm->buf_recv;
|
||||
else buf = comm->buf_send;
|
||||
|
||||
/* unpack buffer */
|
||||
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_send[ineigh];
|
||||
unpackReverse(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &buf[offset*size]);
|
||||
}
|
||||
}
|
||||
|
||||
void ghostComm(Comm* comm, Atom* atom,int iswap){
|
||||
|
||||
MD_FLOAT xlo=0, xhi=0, ylo=0, yhi=0, zlo=0, zhi=0;
|
||||
MD_FLOAT* buf;
|
||||
int nrqst=0, nsend=0, nrecv=0, offset=0, ineigh=0, pbc[3];
|
||||
int all_recv=0, all_send=0, currentSend=0;
|
||||
int size = comm->ghostSize;
|
||||
int maxrqrst = comm->numneigh;
|
||||
MPI_Request requests[maxrqrst];
|
||||
for(int i = 0; i<maxrqrst; i++)
|
||||
requests[maxrqrst]=MPI_REQUEST_NULL;
|
||||
if(iswap%2==0) comm->iterAtom = LOCAL+GHOST;
|
||||
int iter = 0;
|
||||
for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
|
||||
{
|
||||
Box* tile = &comm->boxes[ineigh];
|
||||
|
||||
xlo = tile->lo[_x]; ylo = tile->lo[_y]; zlo = tile->lo[_z];
|
||||
xhi = tile->hi[_x]; yhi = tile->hi[_y]; zhi = tile->hi[_z];
|
||||
pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh]; pbc[_z]=comm->pbc_z[ineigh];
|
||||
nsend = 0;
|
||||
|
||||
for(int i = 0; i < comm->iterAtom ; i++)
|
||||
{
|
||||
if(IsinRegionToSend(i)){
|
||||
if(nsend >= comm->maxsendlist[ineigh]) growList(comm,ineigh,nsend);
|
||||
if(currentSend + size >= comm->maxsend) growSend(comm,currentSend);
|
||||
comm->sendlist[ineigh][nsend++] = i;
|
||||
currentSend += packGhost(atom, i, &comm->buf_send[currentSend], pbc);
|
||||
}
|
||||
}
|
||||
comm->atom_send[ineigh] = nsend; //#atoms send per neigh
|
||||
comm->off_atom_send[ineigh] = all_send; //offset atom respect to neighbours in a swap
|
||||
all_send += nsend; //all atoms send
|
||||
}
|
||||
//Receives how many elements to be received.
|
||||
if(comm->othersend[iswap])
|
||||
for(nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++)
|
||||
MPI_Irecv(&comm->atom_recv[ineigh],1,MPI_INT,comm->nrecv[ineigh],0,world,&requests[nrqst++]);
|
||||
|
||||
if(!comm->othersend[iswap]) comm->atom_recv[comm->recvfrom[iswap]] = nsend;
|
||||
|
||||
//Communicate how many elements to be sent.
|
||||
if(comm->othersend[iswap])
|
||||
for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
|
||||
MPI_Send(&comm->atom_send[ineigh],1,MPI_INT,comm->nsend[ineigh],0,world);
|
||||
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
|
||||
|
||||
//Define offset to store in the recv_buff
|
||||
for(int ineigh = comm->recvfrom[iswap]; ineigh<comm->recvtill[iswap]; ineigh++){
|
||||
comm->off_atom_recv[ineigh] = all_recv;
|
||||
all_recv += comm->atom_recv[ineigh];
|
||||
}
|
||||
|
||||
if(all_recv*size>=comm->maxrecv) growRecv(comm,all_recv*size);
|
||||
|
||||
//Receives elements
|
||||
if(comm->othersend[iswap])
|
||||
for (nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_recv[ineigh]*size;
|
||||
nrecv = comm->atom_recv[ineigh]*size;
|
||||
MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
|
||||
}
|
||||
//Send elements
|
||||
if(comm->othersend[iswap])
|
||||
for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
|
||||
offset = comm->off_atom_send[ineigh]*size;
|
||||
nsend = comm->atom_send[ineigh]*size;
|
||||
MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world);
|
||||
}
|
||||
if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
|
||||
|
||||
if(comm->othersend[iswap]) buf = comm->buf_recv;
|
||||
else buf = comm->buf_send;
|
||||
//unpack elements
|
||||
comm->firstrecv[iswap] = LOCAL+GHOST;
|
||||
for(int i = 0; i < all_recv; i++)
|
||||
unpackGhost(atom, LOCAL+GHOST, &buf[i*size]);
|
||||
|
||||
//Increases the buffer if needed
|
||||
int max_size = MAX(comm->forwardSize,comm->reverseSize);
|
||||
int max_buf = max_size * MAX(all_recv, all_send);
|
||||
if(max_buf>=comm->maxrecv) growRecv(comm,max_buf);
|
||||
if(max_buf>=comm->maxsend) growSend(comm,max_buf);
|
||||
}
|
||||
|
||||
void exchangeComm(Comm* comm, Atom* atom){
|
||||
|
||||
MD_FLOAT x,y,z;
|
||||
MD_FLOAT *lo = atom->mybox.lo;
|
||||
MD_FLOAT *hi = atom->mybox.hi;
|
||||
int size = comm->exchangeSize;
|
||||
int numneigh = comm->numneighexch;
|
||||
int offset_recv[numneigh];
|
||||
int size_recv[numneigh];
|
||||
MPI_Request requests[numneigh];
|
||||
int i =0, nsend = 0, nrecv = 0;
|
||||
int nrqst = 0;
|
||||
int nlocal, offset,m;
|
||||
|
||||
/* enforce PBC */
|
||||
pbc(atom);
|
||||
|
||||
if(comm->numneigh == 0) return;
|
||||
|
||||
nlocal = atom->Nlocal;
|
||||
while(i < nlocal) {
|
||||
if(atom_x(i) < lo[_x] || atom_x(i) >= hi[_x] ||
|
||||
atom_y(i) < lo[_y] || atom_y(i) >= hi[_y] ||
|
||||
atom_z(i) < lo[_z] || atom_z(i) >= hi[_z]) {
|
||||
if(nsend+size >= comm->maxsend) growSend(comm, nsend);
|
||||
nsend += packExchange(atom, i, &comm->buf_send[nsend]);
|
||||
copy(atom, i, nlocal-1);
|
||||
nlocal--;
|
||||
} else i++;
|
||||
}
|
||||
atom->Nlocal = nlocal;
|
||||
|
||||
/* send/recv number of to share atoms with neighbouring procs*/
|
||||
for(int ineigh = 0; ineigh < numneigh; ineigh++)
|
||||
MPI_Irecv(&size_recv[ineigh],1,MPI_INT,comm->nexch[ineigh],0,world,&requests[nrqst++]);
|
||||
|
||||
for (int ineigh = 0; ineigh < numneigh; ineigh++)
|
||||
MPI_Send(&nsend,1,MPI_INT,comm->nexch[ineigh],0,world);
|
||||
MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
|
||||
|
||||
//Define offset to store in the recv_buff
|
||||
for(int ineigh = 0; ineigh<numneigh; ineigh++){
|
||||
offset_recv[ineigh] = nrecv;
|
||||
nrecv += size_recv[ineigh];
|
||||
}
|
||||
|
||||
if(nrecv >= comm->maxrecv) growRecv(comm,nrecv);
|
||||
|
||||
//Receives elements
|
||||
nrqst=0;
|
||||
for (int ineigh = 0; ineigh< numneigh; ineigh++){
|
||||
offset = offset_recv[ineigh];
|
||||
MPI_Irecv(&comm->buf_recv[offset], size_recv[ineigh], type, comm->nexch[ineigh],0,world,&requests[nrqst++]);
|
||||
}
|
||||
//Send elements
|
||||
for (int ineigh = 0; ineigh< numneigh; ineigh++)
|
||||
MPI_Send(comm->buf_send,nsend,type,comm->nexch[ineigh],0,world);
|
||||
MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE);
|
||||
|
||||
nlocal = atom->Nlocal;
|
||||
m = 0;
|
||||
while(m < nrecv) {
|
||||
x = comm->buf_recv[m + _x];
|
||||
y = comm->buf_recv[m + _y];
|
||||
z = comm->buf_recv[m + _z];
|
||||
|
||||
if(x >= lo[_x] && x < hi[_x] &&
|
||||
y >= lo[_y] && y < hi[_y] &&
|
||||
z >= lo[_z] && z < hi[_z]){
|
||||
m += unpackExchange(atom, nlocal++, &comm->buf_recv[m]);
|
||||
} else {
|
||||
m += size;
|
||||
}
|
||||
}
|
||||
atom->Nlocal = nlocal;
|
||||
|
||||
int all_atoms=0;
|
||||
MPI_Allreduce(&atom->Nlocal, &all_atoms, 1, MPI_INT, MPI_SUM, world);
|
||||
if(atom->Natoms!=all_atoms && comm->myproc ==0){
|
||||
printf("Losing atoms! current atoms:%d expected atoms:%d\n",all_atoms,atom->Natoms);
|
||||
}
|
||||
}
|
||||
|
||||
//Internal functions
|
||||
|
||||
inline void growRecv(Comm* comm, int n)
|
||||
{
|
||||
comm -> maxrecv = BUFFACTOR * n;
|
||||
if(comm->buf_recv) free(comm -> buf_recv);
|
||||
comm -> buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
|
||||
}
|
||||
|
||||
inline void growSend(Comm* comm, int n)
|
||||
{
|
||||
size_t oldByteSize = (comm->maxsend+BUFEXTRA)*sizeof(MD_FLOAT);
|
||||
comm -> maxsend = BUFFACTOR * n;
|
||||
comm -> buf_send = (MD_FLOAT*) reallocate(comm->buf_send, ALIGNMENT, (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT), oldByteSize);
|
||||
}
|
||||
|
||||
inline void growList(Comm* comm, int ineigh, int n)
|
||||
{
|
||||
size_t oldByteSize = comm->maxsendlist[ineigh]*sizeof(int);
|
||||
comm->maxsendlist[ineigh] = BUFFACTOR * n;
|
||||
comm->sendlist[ineigh] = (int*) reallocate(comm->sendlist[ineigh],ALIGNMENT, comm->maxsendlist[ineigh] * sizeof(int), oldByteSize);
|
||||
}
|
||||
|
||||
static inline void allocDynamicBuffers(Comm* comm)
|
||||
{
|
||||
//Buffers depending on the # of my neighs
|
||||
int numneigh = comm->numneigh;
|
||||
comm->atom_send = (int*) allocate(ALIGNMENT, numneigh * sizeof(int));
|
||||
comm->atom_recv = (int*) allocate(ALIGNMENT, numneigh * sizeof(int));
|
||||
comm->off_atom_send = (int*) allocate(ALIGNMENT,numneigh * sizeof(int));
|
||||
comm->off_atom_recv = (int*) allocate(ALIGNMENT,numneigh * sizeof(int));
|
||||
comm->maxsendlist = (int*) allocate(ALIGNMENT,numneigh * sizeof(int));
|
||||
|
||||
for(int i = 0; i < numneigh; i++)
|
||||
comm->maxsendlist[i] = BUFMIN;
|
||||
|
||||
comm->sendlist = (int**) allocate(ALIGNMENT, numneigh * sizeof(int*));
|
||||
for(int i = 0; i < numneigh; i++)
|
||||
comm->sendlist[i] = (int*) allocate(ALIGNMENT, comm->maxsendlist[i] * sizeof(int));
|
||||
}
|
||||
|
||||
static inline void freeDynamicBuffers(Comm* comm)
|
||||
{
|
||||
int numneigh =comm->numneigh;
|
||||
|
||||
if(comm->atom_send) free(comm->atom_send);
|
||||
if(comm->atom_recv) free(comm->atom_recv);
|
||||
if(comm->off_atom_send) free(comm->off_atom_send);
|
||||
if(comm->off_atom_recv) free(comm->off_atom_recv);
|
||||
if(comm->maxsendlist) free(comm->maxsendlist);
|
||||
if(comm->sendlist){
|
||||
for(int i = 0; i < numneigh; i++)
|
||||
if(comm->sendlist[i]) free(comm->sendlist[i]);
|
||||
}
|
||||
if(comm->sendlist) free(comm->sendlist);
|
||||
}
|
||||
|
||||
static inline void freeBuffers(Comm* comm)
|
||||
{
|
||||
if(comm->nrecv) free(comm->nrecv);
|
||||
if(comm->nsend) free(comm->nsend);
|
||||
if(comm->nexch) free(comm->nexch);
|
||||
if(comm->pbc_x) free(comm->pbc_x);
|
||||
if(comm->pbc_y) free(comm->pbc_y);
|
||||
if(comm->pbc_z) free(comm->pbc_z);
|
||||
if(comm->boxes) free(comm->boxes);
|
||||
if(comm->atom_send) free(comm->atom_send);
|
||||
if(comm->atom_recv) free(comm->atom_recv);
|
||||
if(comm->off_atom_send) free(comm->off_atom_send);
|
||||
if(comm->off_atom_recv) free(comm->off_atom_recv);
|
||||
if(comm->maxsendlist) free(comm->maxsendlist);
|
||||
|
||||
if(comm->sendlist){
|
||||
for(int i = 0; i < comm->numneigh; i++)
|
||||
if(comm->sendlist[i]) free(comm->sendlist[i]);
|
||||
}
|
||||
if(comm->sendlist) free(comm->sendlist);
|
||||
|
||||
if(comm->buf_send) free(comm->buf_send);
|
||||
if(comm->buf_recv) free(comm->buf_recv);
|
||||
}
|
||||
@@ -1,68 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <device.h>
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
void cuda_assert(const char *label, cudaError_t err) {
|
||||
if (err != cudaSuccess) {
|
||||
printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err));
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void *allocateGPU(size_t bytesize) {
|
||||
void *ptr;
|
||||
#ifdef CUDA_HOST_MEMORY
|
||||
cuda_assert("allocateGPU", cudaMallocHost((void **) &ptr, bytesize));
|
||||
#else
|
||||
cuda_assert("allocateGPU", cudaMalloc((void **) &ptr, bytesize));
|
||||
#endif
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Data is not preserved
|
||||
void *reallocateGPU(void *ptr, size_t new_bytesize) {
|
||||
if(ptr != NULL) {
|
||||
#ifdef CUDA_HOST_MEMORY
|
||||
cudaFreeHost(ptr);
|
||||
#else
|
||||
cudaFree(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
return allocateGPU(new_bytesize);
|
||||
}
|
||||
|
||||
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
|
||||
#ifndef CUDA_HOST_MEMORY
|
||||
cuda_assert("memcpyToGPU", cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice));
|
||||
#endif
|
||||
}
|
||||
|
||||
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
|
||||
#ifndef CUDA_HOST_MEMORY
|
||||
cuda_assert("memcpyFromGPU", cudaMemcpy(h_ptr, d_ptr, bytesize, cudaMemcpyDeviceToHost));
|
||||
#endif
|
||||
}
|
||||
|
||||
void memsetGPU(void *d_ptr, int value, size_t bytesize) {
|
||||
cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
|
||||
}
|
||||
|
||||
#else
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {}
|
||||
void *allocateGPU(size_t bytesize) { return NULL; }
|
||||
void *reallocateGPU(void *ptr, size_t new_bytesize) { return NULL; }
|
||||
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {}
|
||||
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {}
|
||||
void memsetGPU(void *d_ptr, int value, size_t bytesize) {}
|
||||
#endif
|
||||
490
common/grid.c
490
common/grid.c
@@ -1,490 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <grid.h>
|
||||
#include <mpi.h>
|
||||
#include <parameter.h>
|
||||
#include <allocate.h>
|
||||
#include <util.h>
|
||||
#include <math.h>
|
||||
|
||||
static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
|
||||
|
||||
//Grommacs Balancing
|
||||
MD_FLOAT f_normalization(MD_FLOAT* x,MD_FLOAT* fx, MD_FLOAT minx, int nprocs) {
|
||||
|
||||
MD_FLOAT sum=0;
|
||||
for(int n = 0; n<nprocs; n++){
|
||||
fx[n] = MAX(minx,x[n]);
|
||||
sum+=fx[n];
|
||||
}
|
||||
|
||||
for(int n = 0; n<nprocs; n++)
|
||||
fx[n] /= sum;
|
||||
}
|
||||
|
||||
void fixedPointIteration(MD_FLOAT* x0, int nprocs, MD_FLOAT minx)
|
||||
{
|
||||
MD_FLOAT tolerance = 1e-3;
|
||||
MD_FLOAT alpha = 0.5;
|
||||
MD_FLOAT *fx = (MD_FLOAT*) malloc(nprocs*sizeof(MD_FLOAT));
|
||||
int maxIterations = 100;
|
||||
|
||||
for (int i = 0; i < maxIterations; i++) {
|
||||
|
||||
int converged = 1;
|
||||
f_normalization(x0,fx,minx,nprocs);
|
||||
|
||||
for(int n=0; n<nprocs; n++)
|
||||
fx[n]= (1-alpha) * x0[n] + alpha * fx[n];
|
||||
|
||||
for (int n=0; n<nprocs; n++) {
|
||||
if (fabs(fx[n] - x0[n]) >= tolerance) {
|
||||
converged = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (int n=0; n<nprocs; n++)
|
||||
x0[n] = fx[n];
|
||||
|
||||
if(converged){
|
||||
for(int n = 0; n<nprocs; n++)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
void staggeredBalance(Grid* grid, Atom* atom, Parameter* param, double newTime)
|
||||
{
|
||||
int me;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
int *coord = grid->coord;
|
||||
int *nprocs = grid ->nprocs;
|
||||
//Elapsed time since the last rebalance
|
||||
double time = newTime - grid->Timer;
|
||||
grid->Timer = newTime;
|
||||
//store the older dimm to compare later for exchange
|
||||
MD_FLOAT lo[3], hi[3];
|
||||
for(int dim = 0; dim< 3; dim++){
|
||||
lo[dim] = atom->mybox.lo[dim];
|
||||
hi[dim] = atom->mybox.hi[dim];
|
||||
}
|
||||
|
||||
//Define parameters
|
||||
MPI_Comm subComm[3];
|
||||
int color[3] = {0,0,0};
|
||||
int id[3] = {0,0,0};
|
||||
MD_FLOAT ** load = (MD_FLOAT**) malloc(3*sizeof(MD_FLOAT*));
|
||||
for(int dim = 0; dim<3; dim++)
|
||||
load[dim] = (MD_FLOAT*) malloc(nprocs[dim]*sizeof(MD_FLOAT));
|
||||
|
||||
int maxprocs = MAX(MAX(nprocs[_x],nprocs[_y]),nprocs[_z]);
|
||||
MD_FLOAT* cellSize = (MD_FLOAT*) malloc(maxprocs*sizeof(MD_FLOAT));
|
||||
MD_FLOAT* limits = (MD_FLOAT*) malloc(2*maxprocs*sizeof(MD_FLOAT)); //limits: (x0, x1), (x1, x2)... Repeat values in between to perfom MPI_Scatter later
|
||||
MD_FLOAT t_sum[3] = {0,0,0};
|
||||
MD_FLOAT recv_buf[2] = {0,0}; //Each proc only receives 2 elments per dimension xlo and xhi
|
||||
MD_FLOAT balancedLoad[3] = {0,0,0}; //1/nprocs
|
||||
MD_FLOAT minLoad[3] = {0,0,0}; //beta*(1/nprocs)
|
||||
MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
|
||||
MD_FLOAT boundaries[6] ={0,0,0,0,0,0}; // xlo,xhi,ylo,yhi,zlo,zhi
|
||||
|
||||
//Create sub-communications along each dimension
|
||||
for(int dim = 0; dim<3; dim++){
|
||||
if(dim == _x){
|
||||
color[_x] = (coord[_y] == 0 && coord[_z] ==0) ? 1:MPI_UNDEFINED;
|
||||
id[_x] = me;
|
||||
} else if(dim == _y) {
|
||||
color[_y] = coord[_z] == 0 ? coord[_x]:MPI_UNDEFINED;
|
||||
id[_y] = (coord[_y] == 0 && coord[_z] == 0) ? 0:me;
|
||||
} else {
|
||||
color[_z]= coord[_y]*nprocs[_x]+coord[_x];
|
||||
id[_z] = coord[_z] == 0 ? 0 : me;
|
||||
}
|
||||
MPI_Comm_split(world, color[dim], id[dim], &subComm[dim]);
|
||||
}
|
||||
|
||||
//Set the minimum load and the balance load
|
||||
for(int dim = 0; dim<3; dim++){
|
||||
balancedLoad[dim] = 1./nprocs[dim];
|
||||
minLoad[dim] = 0.8*balancedLoad[dim];
|
||||
}
|
||||
//set and communicate the workload in reverse order
|
||||
for(int dim = _z; dim>= _x; dim--)
|
||||
{
|
||||
if(subComm[dim] != MPI_COMM_NULL){
|
||||
MPI_Gather(&time,1,type,load[dim],1,type,0,subComm[dim]);
|
||||
|
||||
if(id[dim] == 0)
|
||||
{
|
||||
for(int n=0; n<nprocs[dim]; n++)
|
||||
t_sum[dim] += load[dim][n];
|
||||
|
||||
for(int n=0; n<nprocs[dim]; n++)
|
||||
load[dim][n] /= t_sum[dim];
|
||||
}
|
||||
time =t_sum[dim];
|
||||
}
|
||||
MPI_Barrier(world);
|
||||
}
|
||||
|
||||
//Brodacast the new boundaries along dimensions
|
||||
for(int dim=0; dim<3; dim++){
|
||||
|
||||
if(subComm[dim] != MPI_COMM_NULL){
|
||||
|
||||
MPI_Bcast(boundaries,6,type,0,subComm[dim]);
|
||||
if(id[dim] == 0) {
|
||||
fixedPointIteration(load[dim], nprocs[dim], minLoad[dim]);
|
||||
MD_FLOAT inv_sum=0;
|
||||
for(int n=0; n<nprocs[dim];n++)
|
||||
inv_sum +=(1/load[dim][n]);
|
||||
|
||||
for(int n=0; n<nprocs[dim];n++)
|
||||
cellSize[n] = (prd[dim]/load[dim][n])*(1./inv_sum);
|
||||
|
||||
MD_FLOAT sum=0;
|
||||
for(int n=0; n<nprocs[dim]; n++){
|
||||
limits[2*n] = sum;
|
||||
limits[2*n+1] = sum+cellSize[n];
|
||||
sum+= cellSize[n];
|
||||
}
|
||||
limits[2*nprocs[dim]-1] = prd[dim];
|
||||
}
|
||||
MPI_Scatter(limits,2,type,recv_buf,2,type,0,subComm[dim]);
|
||||
boundaries[2*dim] = recv_buf[0];
|
||||
boundaries[2*dim+1] = recv_buf[1];
|
||||
}
|
||||
MPI_Barrier(world);
|
||||
}
|
||||
|
||||
atom->mybox.lo[_x]=boundaries[0]; atom->mybox.hi[_x]=boundaries[1];
|
||||
atom->mybox.lo[_y]=boundaries[2]; atom->mybox.hi[_y]=boundaries[3];
|
||||
atom->mybox.lo[_z]=boundaries[4]; atom->mybox.hi[_z]=boundaries[5];
|
||||
|
||||
MD_FLOAT domain[6] = {boundaries[0], boundaries[2], boundaries[4], boundaries[1], boundaries[3], boundaries[5]};
|
||||
MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
|
||||
|
||||
//because cells change dynamically, It is required to increase the neighbouring exchange region
|
||||
for(int dim =_x; dim<=_z; dim++){
|
||||
MD_FLOAT dr,dr_max;
|
||||
int n = grid->nprocs[dim];
|
||||
MD_FLOAT maxdelta = 0.2*prd[dim];
|
||||
dr = MAX(fabs(lo[dim] - atom->mybox.lo[dim]),fabs(hi[dim] - atom->mybox.hi[dim]));
|
||||
MPI_Allreduce(&dr, &dr_max, 1, type, MPI_MAX, world);
|
||||
grid->cutneigh[dim] = param->cutneigh+dr_max;
|
||||
}
|
||||
|
||||
for(int dim=0; dim<3; dim++) {
|
||||
if(subComm[dim] != MPI_COMM_NULL){
|
||||
MPI_Comm_free(&subComm[dim]);
|
||||
}
|
||||
free(load[dim]);
|
||||
}
|
||||
free(load);
|
||||
free(limits);
|
||||
}
|
||||
|
||||
//RCB Balancing
|
||||
MD_FLOAT meanTimeBisect(Atom *atom, MPI_Comm subComm, int dim, double time)
|
||||
{
|
||||
MD_FLOAT mean=0, sum=0, total_sum=0, weightAtoms= 0, total_weight=0;
|
||||
|
||||
for(int i=0; i<atom->Nlocal; i++){
|
||||
sum += atom_pos(i);
|
||||
}
|
||||
sum*=time;
|
||||
weightAtoms = atom->Nlocal*time;
|
||||
MPI_Allreduce(&sum, &total_sum, 1, type, MPI_SUM, subComm);
|
||||
MPI_Allreduce(&weightAtoms, &total_weight, 1, type, MPI_SUM, subComm);
|
||||
|
||||
mean = total_sum/total_weight;
|
||||
return mean;
|
||||
}
|
||||
|
||||
MD_FLOAT meanBisect(Atom* atom, MPI_Comm subComm, int dim, double time)
|
||||
{
|
||||
int Natoms = 0;
|
||||
MD_FLOAT sum=0, mean=0, total_sum=0;
|
||||
|
||||
for(int i=0; i<atom->Nlocal; i++){
|
||||
sum += atom_pos(i);
|
||||
}
|
||||
MPI_Allreduce(&sum, &total_sum, 1, type, MPI_SUM, subComm);
|
||||
MPI_Allreduce(&atom->Nlocal, &Natoms, 1, MPI_INT, MPI_SUM, subComm);
|
||||
mean = total_sum/Natoms;
|
||||
return mean;
|
||||
}
|
||||
|
||||
void nextBisectionLevel(Grid* grid, Atom* atom, RCB_Method method, MPI_Comm subComm, int dim ,int* color, int ilevel, double time)
|
||||
{
|
||||
int rank, size;
|
||||
int branch = 0, i = 0, m = 0;
|
||||
int nsend = 0, nrecv = 0, nrecv2 = 0;
|
||||
int values_per_atom = 7;
|
||||
MD_FLOAT bisection, pos;
|
||||
MPI_Request request[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
|
||||
MPI_Comm_rank(subComm,&rank);
|
||||
MPI_Comm_size(subComm,&size);
|
||||
|
||||
int odd = size%2;
|
||||
int extraProc = odd ? size-1:size;
|
||||
int half = (int) (0.5*size);
|
||||
int partner = (rank<half) ? rank+half:rank-half;
|
||||
if(odd && rank == extraProc) partner = 0;
|
||||
//Apply the bisection
|
||||
bisection = method(atom,subComm,dim,time);
|
||||
//Define the new boundaries
|
||||
if(rank<half){
|
||||
atom->mybox.hi[dim] = bisection;
|
||||
branch = 0;
|
||||
} else {
|
||||
atom->mybox.lo[dim] = bisection;
|
||||
branch = 1;
|
||||
}
|
||||
//Define new color for the further communicaton
|
||||
*color = (branch << ilevel) | *color;
|
||||
//Grow the send buffer
|
||||
if(atom->Nlocal>=grid->maxsend){
|
||||
if(grid->buf_send) free(grid->buf_send);
|
||||
grid->buf_send = (MD_FLOAT*) malloc(atom->Nlocal*values_per_atom* sizeof(MD_FLOAT));
|
||||
grid->maxsend = atom->Nlocal;
|
||||
}
|
||||
//buffer particles to send
|
||||
while(i < atom->Nlocal) {
|
||||
pos = atom_pos(i);
|
||||
if(pos < atom->mybox.lo[dim] || pos >= atom->mybox.hi[dim]) {
|
||||
nsend += packExchange(atom, i, &grid->buf_send[nsend]);
|
||||
copy(atom, i, atom->Nlocal-1);
|
||||
atom->Nlocal--;
|
||||
} else i++;
|
||||
}
|
||||
|
||||
//Communicate the number of elements to be sent
|
||||
if(rank < extraProc){
|
||||
MPI_Irecv(&nrecv,1,MPI_INT,partner,0,subComm,&request[0]);
|
||||
}
|
||||
if(odd && rank == 0){
|
||||
MPI_Irecv(&nrecv2,1,MPI_INT,extraProc,0,subComm,&request[1]);
|
||||
}
|
||||
MPI_Send(&nsend,1,MPI_INT,partner,0,subComm);
|
||||
MPI_Waitall(2,request,MPI_STATUS_IGNORE);
|
||||
|
||||
//Grow the recv buffer
|
||||
if(nrecv+nrecv2>=grid->maxrecv){
|
||||
if(grid->buf_recv) free(grid->buf_recv);
|
||||
grid->buf_recv = (MD_FLOAT*) malloc((nrecv+nrecv2)*values_per_atom*sizeof(MD_FLOAT));
|
||||
grid->maxrecv = nrecv+nrecv2;
|
||||
}
|
||||
|
||||
//communicate elements in the buffer
|
||||
request[0] = MPI_REQUEST_NULL;
|
||||
request[1] = MPI_REQUEST_NULL;
|
||||
|
||||
if(rank < extraProc){
|
||||
MPI_Irecv(grid->buf_recv,nrecv,type,partner,0,subComm,&request[0]);
|
||||
}
|
||||
if(odd && rank == 0){
|
||||
MPI_Irecv(&grid->buf_recv[nrecv],nrecv2,type,extraProc,0,subComm,&request[1]);
|
||||
}
|
||||
MPI_Send (grid->buf_send,nsend,type,partner,0,subComm);
|
||||
MPI_Waitall(2,request,MPI_STATUS_IGNORE);
|
||||
|
||||
//store atoms in atom list
|
||||
while(m < nrecv+nrecv2){
|
||||
m += unpackExchange(atom, atom->Nlocal++, &grid->buf_recv[m]);
|
||||
}
|
||||
}
|
||||
|
||||
void rcbBalance(Grid* grid, Atom* atom, Parameter* param, RCB_Method method, int ndim, double newTime)
|
||||
{
|
||||
int me, nprocs=0, ilevel=0, nboxes=1;
|
||||
int color = 0, size =0;
|
||||
int index, prd[3];
|
||||
MPI_Comm subComm;
|
||||
MPI_Comm_size(world, &nprocs);
|
||||
MPI_Comm_rank(world, &me);
|
||||
|
||||
//set the elapsed time since the last dynamic balance
|
||||
double time = newTime - grid->Timer;
|
||||
|
||||
prd[_x] = atom->mybox.xprd = param->xprd;
|
||||
prd[_y] = atom->mybox.yprd = param->yprd;
|
||||
prd[_z] = atom->mybox.zprd = param->zprd;
|
||||
|
||||
//Sort by larger dimension
|
||||
int largerDim[3] ={_x, _y, _z};
|
||||
|
||||
for(int i = 0; i< 2; i++){
|
||||
for(int j = i+1; j<3; j++)
|
||||
{
|
||||
if(prd[largerDim[j]]>prd[largerDim[i]]){
|
||||
MD_FLOAT tmp = largerDim[j];
|
||||
largerDim[j] = largerDim[i];
|
||||
largerDim[i] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
//Initial Partition
|
||||
atom->mybox.lo[_x] = 0; atom->mybox.hi[_x] = atom->mybox.xprd;
|
||||
atom->mybox.lo[_y] = 0; atom->mybox.hi[_y] = atom->mybox.yprd;
|
||||
atom->mybox.lo[_z] = 0; atom->mybox.hi[_z] = atom->mybox.zprd;
|
||||
|
||||
//Recursion tree
|
||||
while(nboxes<nprocs)
|
||||
{
|
||||
index = ilevel%ndim;
|
||||
MPI_Comm_split(world, color, me, &subComm);
|
||||
MPI_Comm_size(subComm,&size);
|
||||
if(size > 1){
|
||||
nextBisectionLevel(grid, atom, method, subComm, largerDim[index], &color, ilevel, time);
|
||||
}
|
||||
MPI_Comm_free(&subComm);
|
||||
nboxes = pow(2,++ilevel);
|
||||
}
|
||||
//Set the new timer grid
|
||||
grid->Timer = newTime;
|
||||
|
||||
//Creating the global map
|
||||
MD_FLOAT domain[6] = {atom->mybox.lo[_x], atom->mybox.lo[_y], atom->mybox.lo[_z], atom->mybox.hi[_x], atom->mybox.hi[_y], atom->mybox.hi[_z]};
|
||||
MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
|
||||
|
||||
//Define the same cutneighbour in all dimensions for the exchange communication
|
||||
for(int dim =_x; dim<=_z; dim++)
|
||||
grid->cutneigh[dim] = param->cutneigh;
|
||||
}
|
||||
|
||||
//Regular grid
|
||||
void cartisian3d(Grid* grid, Parameter* param, Box* box)
|
||||
{
|
||||
int me, nproc;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
|
||||
int numdim=3;
|
||||
int reorder=0;
|
||||
int periods[3]={1,1,1};
|
||||
int mycoord[3]={0,0,0};
|
||||
int griddim[3]={0,0,0};
|
||||
MD_FLOAT len[3];
|
||||
MPI_Comm cartesian;
|
||||
|
||||
box->xprd = param->xprd;
|
||||
box->yprd = param->yprd;
|
||||
box->zprd = param->zprd;
|
||||
|
||||
//Creates a cartesian 3d grid
|
||||
MPI_Dims_create(nproc, numdim, griddim);
|
||||
MPI_Cart_create(world,numdim,griddim,periods,reorder,&cartesian);
|
||||
grid->nprocs[_x] = griddim[_x];
|
||||
grid->nprocs[_y] = griddim[_y];
|
||||
grid->nprocs[_z] = griddim[_z];
|
||||
|
||||
//Coordinates position in the grid
|
||||
MPI_Cart_coords(cartesian,me,3,mycoord);
|
||||
grid->coord[_x] = mycoord[_x];
|
||||
grid->coord[_y] = mycoord[_y];
|
||||
grid->coord[_z] = mycoord[_z];
|
||||
|
||||
//boundaries of my local box, with origin in (0,0,0).
|
||||
len[_x] = param->xprd / griddim[_x];
|
||||
len[_y] = param->yprd / griddim[_y];
|
||||
len[_z] = param->zprd / griddim[_z];
|
||||
|
||||
box->lo[_x] = mycoord[_x] * len[_x];
|
||||
box->hi[_x] = (mycoord[_x] + 1) * len[_x];
|
||||
box->lo[_y] = mycoord[_y] * len[_y];
|
||||
box->hi[_y] = (mycoord[_y] + 1) * len[_y];
|
||||
box->lo[_z] = mycoord[_z] * len[_z];
|
||||
box->hi[_z] = (mycoord[_z] + 1) * len[_z];
|
||||
|
||||
MD_FLOAT domain[6] = {box->lo[_x], box->lo[_y], box->lo[_z], box->hi[_x], box->hi[_y], box->hi[_z]};
|
||||
MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
|
||||
MPI_Comm_free(&cartesian);
|
||||
|
||||
//Define the same cutneighbour in all dimensions for the exchange communication
|
||||
for(int dim =_x; dim<=_z; dim++)
|
||||
grid->cutneigh[dim] = param->cutneigh;
|
||||
}
|
||||
|
||||
//Other Functions from the grid
|
||||
void initGrid(Grid* grid)
|
||||
{ //start with regular grid
|
||||
int nprocs;
|
||||
MPI_Comm_size(world, &nprocs);
|
||||
grid->map_size = 6 * nprocs;
|
||||
grid->map = (MD_FLOAT*) allocate(ALIGNMENT, grid->map_size * sizeof(MD_FLOAT));
|
||||
//========rcb=======
|
||||
grid->maxsend = 0;
|
||||
grid->maxrecv = 0;
|
||||
grid->buf_send = NULL;
|
||||
grid->buf_recv = NULL;
|
||||
//====staggered=====
|
||||
grid->Timer = 0.;
|
||||
}
|
||||
|
||||
void setupGrid(Grid* grid, Atom* atom, Parameter* param)
|
||||
{
|
||||
int me;
|
||||
MD_FLOAT xlo, ylo, zlo, xhi, yhi, zhi;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
initGrid(grid);
|
||||
|
||||
//Set the origin at (0,0,0)
|
||||
if(param->input_file){
|
||||
for(int i=0; i<atom->Nlocal; i++){
|
||||
atom_x(i) = atom_x(i) - param->xlo;
|
||||
atom_y(i) = atom_y(i) - param->ylo;
|
||||
atom_z(i) = atom_z(i) - param->zlo;
|
||||
}
|
||||
}
|
||||
|
||||
cartisian3d(grid, param, &atom->mybox);
|
||||
|
||||
xlo = atom->mybox.lo[_x]; xhi = atom->mybox.hi[_x];
|
||||
ylo = atom->mybox.lo[_y]; yhi = atom->mybox.hi[_y];
|
||||
zlo = atom->mybox.lo[_z]; zhi = atom->mybox.hi[_z];
|
||||
|
||||
int i = 0;
|
||||
while(i < atom->Nlocal)
|
||||
{
|
||||
if(atom_x(i) >= xlo && atom_x(i)< xhi &&
|
||||
atom_y(i) >= ylo && atom_y(i)< yhi &&
|
||||
atom_z(i) >= zlo && atom_z(i)< zhi)
|
||||
{
|
||||
i++;
|
||||
} else {
|
||||
copy(atom, i, atom->Nlocal-1);
|
||||
atom->Nlocal--;
|
||||
}
|
||||
}
|
||||
|
||||
//printGrid(grid);
|
||||
if(!param->balance){
|
||||
MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
|
||||
printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms);
|
||||
MPI_Barrier(world);
|
||||
}
|
||||
}
|
||||
|
||||
void printGrid(Grid* grid)
|
||||
{
|
||||
int me, nprocs;
|
||||
MPI_Comm_size(world, &nprocs);
|
||||
MPI_Comm_rank(world, &me);
|
||||
MD_FLOAT* map = grid->map;
|
||||
if(me==0)
|
||||
{
|
||||
|
||||
printf("GRID:\n");
|
||||
printf("===================================================================================================\n");
|
||||
for(int i=0; i<nprocs; i++)
|
||||
printf("Box:%i\txlo:%.4f\txhi:%.4f\tylo:%.4f\tyhi:%.4f\tzlo:%.4f\tzhi:%.4f\n", i,map[6*i],map[6*i+3],map[6*i+1],map[6*i+4],map[6*i+2],map[6*i+5]);
|
||||
printf("\n\n");
|
||||
//printf("Box processor:%i\n xlo:%.4f\txhi:%.4f\n ylo:%.4f\tyhi:%.4f\n zlo:%.4f\tzhi:%.4f\n", i,map[6*i],map[6*i+3],map[6*i+1],map[6*i+4],map[6*i+2],map[6*i+5]);
|
||||
}
|
||||
MPI_Barrier(world);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifndef __ALLOCATE_H_
|
||||
#define __ALLOCATE_H_
|
||||
extern void* allocate (int alignment, size_t bytesize);
|
||||
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
|
||||
#endif
|
||||
@@ -1,22 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __BOX_H_
|
||||
#define __BOX_H_
|
||||
|
||||
typedef struct {
|
||||
int id;
|
||||
MD_FLOAT xprd, yprd, zprd; //Domain Dimension
|
||||
MD_FLOAT lo[3]; //smallest coordinate of my subdomain
|
||||
MD_FLOAT hi[3]; //Highest coordinate of my subdomain
|
||||
} Box;
|
||||
|
||||
int overlapBox(int, int , const Box*, const Box* , Box* , MD_FLOAT , MD_FLOAT);
|
||||
int overlapFullBox(Parameter*, MD_FLOAT*, const Box*, const Box*);
|
||||
void expandBox(int , const Box*, const Box* , Box* , MD_FLOAT);
|
||||
#endif
|
||||
@@ -1,104 +0,0 @@
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <box.h>
|
||||
#include <grid.h>
|
||||
|
||||
#ifndef COMM_H
|
||||
#define COMM_H
|
||||
|
||||
#ifdef GROMACS
|
||||
#define FORWARD_SIZE (3*CLUSTER_N)
|
||||
#define REVERSE_SIZE (3*CLUSTER_N)
|
||||
#define GHOST_SIZE (4*CLUSTER_N+10)
|
||||
#define EXCHANGE_SIZE 7
|
||||
|
||||
#define JFAC MAX(1, CLUSTER_N / CLUSTER_M)
|
||||
#define LOCAL atom->Nclusters_local / JFAC
|
||||
#define GHOST atom->Nclusters_ghost
|
||||
|
||||
#define IsinRegionToSend(cj) \
|
||||
((atom->jclusters[(cj)].bbminx >= xlo || atom->jclusters[(cj)].bbmaxx >= xlo) && \
|
||||
(atom->jclusters[(cj)].bbminx < xhi || atom->jclusters[(cj)].bbmaxx < xhi) && \
|
||||
(atom->jclusters[(cj)].bbminy >= ylo || atom->jclusters[(cj)].bbmaxy >= ylo) && \
|
||||
(atom->jclusters[(cj)].bbminy < yhi || atom->jclusters[(cj)].bbmaxy < yhi) && \
|
||||
(atom->jclusters[(cj)].bbminz >= zlo || atom->jclusters[(cj)].bbmaxz >= zlo) && \
|
||||
(atom->jclusters[(cj)].bbminz < zhi || atom->jclusters[(cj)].bbmaxz < zhi))
|
||||
|
||||
#else
|
||||
|
||||
#define FORWARD_SIZE 3
|
||||
#define REVERSE_SIZE 3
|
||||
#define GHOST_SIZE 4
|
||||
#define EXCHANGE_SIZE 7
|
||||
#define LOCAL atom->Nlocal
|
||||
#define GHOST atom->Nghost
|
||||
|
||||
#define IsinRegionToSend(i) \
|
||||
((atom_x((i)) >= xlo && atom_x((i)) < xhi) && \
|
||||
(atom_y((i)) >= ylo && atom_y((i)) < yhi) && \
|
||||
(atom_z((i)) >= zlo && atom_z((i)) < zhi))
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
int myproc; // my proc ID
|
||||
int numproc; // # of processors
|
||||
|
||||
int numneigh; // # of all my neighs along all swaps
|
||||
int maxneigh; // Buffer size for my neighs
|
||||
int sendfrom[6]; //return the lowest neigh index to send in each swap
|
||||
int sendtill[6]; //return the highest neigh index to send in each swao
|
||||
int recvfrom[6]; //return the lowest neigh index to recv in each swap
|
||||
int recvtill[6]; //return the highest neigh index to recv in each swap
|
||||
int* nsend; // neigh whose I want to send
|
||||
int* nrecv; // neigh whose I want to recv
|
||||
|
||||
int* pbc_x; // if pbc in x
|
||||
int* pbc_y; // if pbc in y
|
||||
int* pbc_z; // if pbc in z
|
||||
|
||||
int* atom_send, *atom_recv; // # of atoms to send/recv for each of my neighs
|
||||
int* off_atom_send; // atom offset to send, inside of a swap
|
||||
int* off_atom_recv; // atom offset to recv, inside of a swap
|
||||
|
||||
int* nexch; //procs to exchange
|
||||
int numneighexch; //# of neighbours to exchange
|
||||
int maxneighexch; //max buff size to store neighbours
|
||||
|
||||
int numswap; // # of swaps to perform, it is 6
|
||||
int swapdim[6]; // dimension of the swap (_x, _y or _z)
|
||||
int swapdir[6]; // direction of the swap 0 or 1
|
||||
int swap[3][2]; // given a dim and dir, knows the swap
|
||||
int othersend[6]; // Determine if a proc interact with more procs in a given swap
|
||||
|
||||
int firstrecv[6]; // where to put 1st recv atom in each swap
|
||||
int** sendlist; // list of atoms to send in each swap
|
||||
int* maxsendlist; // max # of atoms send in each list-swap
|
||||
|
||||
int maxsend; // max elements in buff sender
|
||||
int maxrecv; // max elements in buff receiver
|
||||
MD_FLOAT* buf_send; // sender buffer for all comm
|
||||
MD_FLOAT* buf_recv; // receicer buffer for all comm
|
||||
|
||||
int forwardSize; // # of paramaters per atom in forward comm.
|
||||
int reverseSize; // # of parameters per atom in reverse
|
||||
int exchangeSize; // # of parameters per atom in exchange
|
||||
int ghostSize; // # of parameters per atom in ghost list
|
||||
|
||||
int iterAtom; //last atom to iterate in each swap.
|
||||
Box* boxes; // Boundaries to be sent to other procs as ghost.
|
||||
} Comm;
|
||||
|
||||
|
||||
void initComm(int*, char***, Comm*); //Init MPI
|
||||
void endComm(Comm*); //End MPI
|
||||
void setupComm(Comm*,Parameter*,Grid*); //Creates a 3d grid or rcb grid
|
||||
void neighComm(Comm*,Parameter*,Grid*); //Find neighbours within cut-off and defines ghost regions
|
||||
void forwardComm(Comm*,Atom*,int); //Send info in one direction
|
||||
void reverseComm(Comm*,Atom*,int); //Return info after forward communication
|
||||
void exchangeComm(Comm*,Atom*); //Exchange info between procs
|
||||
void ghostComm(Comm*, Atom*,int); //Build the ghost neighbours to send during next forwards
|
||||
void growSend(Comm*,int); //Grows the size of the buffer sender
|
||||
void growRecv(Comm*,int); //Grows the size of the buffer receiver
|
||||
void growList(Comm*, int, int); //Grows the size of the list to send
|
||||
#endif
|
||||
@@ -1,26 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stddef.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <neighbor.h>
|
||||
|
||||
#ifndef __DEVICE_H_
|
||||
#define __DEVICE_H_
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
#include <cuda_runtime.h>
|
||||
extern void cuda_assert(const char *msg, cudaError_t err);
|
||||
#endif
|
||||
|
||||
extern void initDevice(Atom*, Neighbor*);
|
||||
extern void *allocateGPU(size_t bytesize);
|
||||
extern void *reallocateGPU(void *ptr, size_t new_bytesize);
|
||||
extern void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize);
|
||||
extern void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize);
|
||||
extern void memsetGPU(void *d_ptr, int value, size_t bytesize);
|
||||
#endif
|
||||
@@ -1,39 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __EAM_H_
|
||||
#define __EAM_H_
|
||||
typedef struct {
|
||||
int nrho, nr;
|
||||
MD_FLOAT drho, dr, cut, mass;
|
||||
MD_FLOAT *frho, *rhor, *zr;
|
||||
} Funcfl;
|
||||
|
||||
typedef struct {
|
||||
MD_FLOAT* fp;
|
||||
int nmax;
|
||||
int nrho, nr;
|
||||
int nrho_tot, nr_tot;
|
||||
MD_FLOAT dr, rdr, drho, rdrho;
|
||||
MD_FLOAT *frho, *rhor, *z2r;
|
||||
MD_FLOAT *rhor_spline, *frho_spline, *z2r_spline;
|
||||
Funcfl file;
|
||||
} Eam;
|
||||
|
||||
void initEam(Eam* eam, Parameter* param);
|
||||
void coeff(Eam* eam, Parameter* param);
|
||||
void init_style(Eam* eam, Parameter *param);
|
||||
void read_eam_file(Funcfl* file, const char* filename);
|
||||
void file2array(Eam* eam);
|
||||
void array2spline(Eam* eam, Parameter* param);
|
||||
void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline);
|
||||
void grab(FILE* fptr, int n, MD_FLOAT* list);
|
||||
#endif
|
||||
@@ -1,51 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
|
||||
|
||||
#include <parameter.h>
|
||||
#include <box.h>
|
||||
#include <atom.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#ifndef __MAP_H_
|
||||
#define __MAP_H_
|
||||
|
||||
#define world MPI_COMM_WORLD
|
||||
#define atom_pos(i) ((dim == _x) ? atom_x((i)) : (dim == _y) ? atom_y((i)) : atom_z((i)))
|
||||
|
||||
enum {RCB=1, meanTimeRCB, Staggered};
|
||||
|
||||
typedef struct {
|
||||
int balance_every;
|
||||
int map_size;
|
||||
MD_FLOAT* map;
|
||||
//===Param for Staggerd balance
|
||||
int nprocs[3];
|
||||
int coord[3];
|
||||
MD_FLOAT cutneigh[3];
|
||||
double Timer;
|
||||
//===Param for RCB balance
|
||||
MD_FLOAT* buf_send;
|
||||
MD_FLOAT* buf_recv;
|
||||
int maxsend;
|
||||
int maxrecv;
|
||||
} Grid;
|
||||
|
||||
|
||||
typedef MD_FLOAT(*RCB_Method)(Atom*,MPI_Comm,int,double);
|
||||
|
||||
void setupGrid(Grid*, Atom*, Parameter*);
|
||||
void cartisian3d(Grid*, Parameter*, Box*);
|
||||
void rcbBalance(Grid*, Atom*, Parameter* ,RCB_Method, int, double);
|
||||
void staggeredBalance(Grid*, Atom*, Parameter*, double);
|
||||
void printGrid(Grid*);
|
||||
//rcb methods
|
||||
MD_FLOAT meanBisect(Atom* , MPI_Comm, int, double);
|
||||
MD_FLOAT meanTimeBisect(Atom*, MPI_Comm, int, double);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __PARAMETER_H_
|
||||
#define __PARAMETER_H_
|
||||
|
||||
#if PRECISION == 1
|
||||
# define MD_FLOAT float
|
||||
# define MD_UINT unsigned int
|
||||
#else
|
||||
# define MD_FLOAT double
|
||||
# define MD_UINT unsigned long long int
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
int force_field;
|
||||
char* param_file;
|
||||
char* input_file;
|
||||
char* vtk_file;
|
||||
char* xtc_file;
|
||||
char* write_atom_file;
|
||||
MD_FLOAT epsilon;
|
||||
MD_FLOAT sigma;
|
||||
MD_FLOAT sigma6;
|
||||
MD_FLOAT temp;
|
||||
MD_FLOAT rho;
|
||||
MD_FLOAT mass;
|
||||
int ntypes;
|
||||
int ntimes;
|
||||
int nstat;
|
||||
int reneigh_every;
|
||||
int prune_every;
|
||||
int x_out_every;
|
||||
int v_out_every;
|
||||
int half_neigh;
|
||||
MD_FLOAT dt;
|
||||
MD_FLOAT dtforce;
|
||||
MD_FLOAT skin;
|
||||
MD_FLOAT cutforce;
|
||||
MD_FLOAT cutneigh;
|
||||
int nx, ny, nz;
|
||||
int pbc_x, pbc_y, pbc_z;
|
||||
MD_FLOAT lattice;
|
||||
MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi;
|
||||
MD_FLOAT xprd, yprd, zprd;
|
||||
double proc_freq;
|
||||
char* eam_file;
|
||||
// DEM
|
||||
MD_FLOAT k_s;
|
||||
MD_FLOAT k_dn;
|
||||
MD_FLOAT gx, gy, gz;
|
||||
MD_FLOAT reflect_x, reflect_y, reflect_z;
|
||||
//MPI implementation
|
||||
int balance;
|
||||
int method;
|
||||
int balance_every;
|
||||
} Parameter;
|
||||
|
||||
void initParameter(Parameter*);
|
||||
void readParameter(Parameter*, const char*);
|
||||
void printParameter(Parameter*);
|
||||
|
||||
#endif
|
||||
@@ -1,71 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#include <comm.h>
|
||||
#include <atom.h>
|
||||
#include <timing.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
//static void addDummyCluster(Atom*);
|
||||
|
||||
double forward(Comm* comm, Atom *atom, Parameter* param){
|
||||
double S, E;
|
||||
S = getTimeStamp();
|
||||
if(param->method == halfShell){
|
||||
for(int iswap = 0; iswap < 5; iswap++)
|
||||
forwardComm(comm, atom, iswap);
|
||||
} else if(param->method == eightShell){
|
||||
for(int iswap = 0; iswap < 6; iswap+=2)
|
||||
forwardComm(comm, atom, iswap);
|
||||
} else {
|
||||
for(int iswap = 0; iswap < 6; iswap++)
|
||||
forwardComm(comm, atom, iswap);
|
||||
}
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double reverse(Comm* comm, Atom *atom, Parameter* param){
|
||||
double S, E;
|
||||
S = getTimeStamp();
|
||||
if(param->method == halfShell){
|
||||
for(int iswap = 4; iswap >= 0; iswap--)
|
||||
reverseComm(comm, atom, iswap);
|
||||
} else if(param->method == eightShell){
|
||||
for(int iswap = 4; iswap >= 0; iswap-=2)
|
||||
reverseComm(comm, atom, iswap);
|
||||
} else if(param->method == halfStencil){
|
||||
for(int iswap = 5; iswap >= 0; iswap--)
|
||||
reverseComm(comm, atom, iswap);
|
||||
} else { } //Full Shell Reverse does nothing
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
void ghostNeighbor(Comm* comm, Atom* atom, Parameter* param)
|
||||
{
|
||||
#ifdef GROMACS
|
||||
atom->Nclusters_ghost = 0;
|
||||
#endif
|
||||
atom->Nghost = 0;
|
||||
if(param->method == halfShell){
|
||||
for(int iswap=0; iswap<5; iswap++)
|
||||
ghostComm(comm,atom,iswap);
|
||||
} else if(param->method == eightShell){
|
||||
for(int iswap = 0; iswap<6; iswap+=2)
|
||||
ghostComm(comm, atom,iswap);
|
||||
} else {
|
||||
for(int iswap=0; iswap<6; iswap++)
|
||||
ghostComm(comm,atom,iswap);
|
||||
}
|
||||
}
|
||||
@@ -1,68 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __SIMD_H__
|
||||
#define __SIMD_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifndef CLUSTER_M
|
||||
# define CLUSTER_M 1
|
||||
#endif
|
||||
|
||||
#ifndef CLUSTER_N
|
||||
# define CLUSTER_N 1
|
||||
#endif
|
||||
|
||||
#if defined(__ISA_AVX512__)
|
||||
# if PRECISION == 2
|
||||
# include "simd/avx512_double.h"
|
||||
# else
|
||||
# include "simd/avx512_float.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(__ISA_AVX2__)
|
||||
# if PRECISION == 2
|
||||
# include "simd/avx2_double.h"
|
||||
# else
|
||||
# include "simd/avx2_float.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(__ISA_AVX__)
|
||||
# if PRECISION == 2
|
||||
# include "simd/avx_double.h"
|
||||
# else
|
||||
# include "simd/avx_float.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
|
||||
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
|
||||
|
||||
static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
|
||||
double x[VECTOR_WIDTH];
|
||||
memcpy(x, &a, sizeof(x));
|
||||
|
||||
fprintf(stdout, "%s: ", ref);
|
||||
for(int i = 0; i < VECTOR_WIDTH; i++) {
|
||||
fprintf(stdout, "%f ", x[i]);
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
static inline void simd_print_mask(const char *ref, MD_SIMD_MASK a) { fprintf(stdout, "%s: %x\n", ref, simd_mask_to_u32(a)); }
|
||||
|
||||
#endif // __SIMD_H__
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256d
|
||||
#define MD_SIMD_INT __m128i
|
||||
#define MD_SIMD_MASK __m256d
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m256d t0, t1, t2;
|
||||
__m128d a0, a1;
|
||||
|
||||
t0 = _mm256_hadd_pd(v0, v1);
|
||||
t1 = _mm256_hadd_pd(v2, v3);
|
||||
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
|
||||
t0 = _mm256_add_pd(t0, t2);
|
||||
t1 = _mm256_add_pd(t1, t2);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0xC);
|
||||
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
|
||||
_mm256_store_pd(m, t1);
|
||||
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
|
||||
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(t0);
|
||||
a1 = _mm256_extractf128_pd(t0, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
|
||||
//static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cvtepi32_pd(_mm_cmplt_epi32(a, b)); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
|
||||
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
|
||||
const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
|
||||
const unsigned long long int none = 0x0;
|
||||
return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
|
||||
}
|
||||
// TODO: Implement this, althrough it is just required for debugging
|
||||
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128d a0, a1;
|
||||
// test with shuffle & add as an alternative to hadd later
|
||||
a = _mm256_hadd_pd(a, a);
|
||||
a0 = _mm256_castpd256_pd128(a);
|
||||
a1 = _mm256_extractf128_pd(a, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
fprintf(stderr, "simd_h_decr3(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s);
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
|
||||
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
|
||||
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mul_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return simd_int_load(m) & _mm256_cvtpd_epi32(k); }
|
||||
@@ -1,84 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
#include <zmmintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128 t0;
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m128 t0, t2;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
v2 = _mm256_hadd_ps(v2, v3);
|
||||
v0 = _mm256_hadd_ps(v0, v2);
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
|
||||
t2 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t2);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
return _mm256_broadcast_ps((const __m128 *)(m));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
__m128 t0, t1;
|
||||
t0 = _mm_broadcast_ss(m);
|
||||
t1 = _mm_broadcast_ss(m + 1);
|
||||
return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m128 t0, t1;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
t0 = _mm256_extractf128_ps(v0, 0x1);
|
||||
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
|
||||
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
t1 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t1);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
_mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
||||
@@ -1,114 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512d
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
|
||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_pd(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_pd(a); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_pd(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_pd(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_pd(_mm512_setzero_pd(), m, a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
|
||||
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
|
||||
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
|
||||
return *((MD_FLOAT *) &x);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m512d t0, t2;
|
||||
__m256d t3, t4;
|
||||
|
||||
t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55));
|
||||
t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55));
|
||||
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55));
|
||||
t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55));
|
||||
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e));
|
||||
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1));
|
||||
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee);
|
||||
t3 = _mm512_castpd512_pd256(t0);
|
||||
t4 = _mm256_load_pd(m);
|
||||
t4 = _mm256_add_pd(t4, t3);
|
||||
_mm256_store_pd(m, t4);
|
||||
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
|
||||
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m512d t0;
|
||||
__m256d t2, t3;
|
||||
|
||||
t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
|
||||
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
|
||||
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
|
||||
t2 = _mm512_castpd512_pd256(t0);
|
||||
t3 = _mm256_load_pd(m);
|
||||
t3 = _mm256_add_pd(t3, t2);
|
||||
_mm256_store_pd(m, t3);
|
||||
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
|
||||
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
|
||||
}
|
||||
|
||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m256d t;
|
||||
a = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
|
||||
t = _mm256_load_pd(m);
|
||||
t = _mm256_sub_pd(t, _mm512_castpd512_pd256(a));
|
||||
_mm256_store_pd(m, t);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
//static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm512_i32gather_pd(vidx, m, s); }
|
||||
#define simd_gather(vidx,m,s) (_mm512_i32gather_pd(vidx, m, s))
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
|
||||
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_si256((const MD_SIMD_INT *) m); }
|
||||
//static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
|
||||
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mul_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
|
||||
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_LT); }
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512
|
||||
#define MD_SIMD_MASK __mmask16
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
#define MD_SIMD_INT32 __m512i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT32
|
||||
|
||||
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
|
||||
return _mm512_load_si512(m);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
|
||||
return _mm512_set1_epi32(a);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
|
||||
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_ps(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), m, a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
|
||||
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
|
||||
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
|
||||
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
|
||||
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m512 t0, t1;
|
||||
__m128 t2, t3;
|
||||
|
||||
t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
|
||||
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
|
||||
t0 = _mm512_add_ps(t0, t1);
|
||||
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
|
||||
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
|
||||
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
|
||||
t3 = _mm512_castps512_ps128(t0);
|
||||
t2 = _mm_load_ps(m);
|
||||
t2 = _mm_add_ps(t2, t3);
|
||||
_mm_store_ps(m, t2);
|
||||
|
||||
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
|
||||
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
|
||||
return _mm_cvtss_f32(t3);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m256 t;
|
||||
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
||||
t = _mm256_load_ps(m);
|
||||
t = _mm256_sub_ps(t, _mm512_castps512_ps256(a));
|
||||
_mm256_store_ps(m, t);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256d
|
||||
#define MD_SIMD_INT __m128i
|
||||
#define MD_SIMD_MASK __m256d
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m256d t0, t1, t2;
|
||||
__m128d a0, a1;
|
||||
|
||||
t0 = _mm256_hadd_pd(v0, v1);
|
||||
t1 = _mm256_hadd_pd(v2, v3);
|
||||
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
|
||||
t0 = _mm256_add_pd(t0, t2);
|
||||
t1 = _mm256_add_pd(t1, t2);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
|
||||
_mm256_store_pd(m, t1);
|
||||
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(t0);
|
||||
a1 = _mm256_extractf128_pd(t0, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
|
||||
#ifdef __ISA_AVX_FMA__
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
|
||||
#else
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
|
||||
#endif
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cvtepi32_pd(_mm_cmplt_epi32(a, b)); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
|
||||
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
|
||||
const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
|
||||
const unsigned long long int none = 0x0;
|
||||
return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
|
||||
}
|
||||
// TODO: Implement this, althrough it is just required for debugging
|
||||
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128d a0, a1;
|
||||
a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(a);
|
||||
a1 = _mm256_extractf128_pd(a, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
fprintf(stderr, "simd_h_decr3(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
|
||||
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
|
||||
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mul_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return simd_int_load(m) & _mm256_cvtpd_epi32(k); }
|
||||
@@ -1,84 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
#include <zmmintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128 t0;
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m128 t0, t2;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
v2 = _mm256_hadd_ps(v2, v3);
|
||||
v0 = _mm256_hadd_ps(v0, v2);
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
|
||||
t2 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t2);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
return _mm256_broadcast_ps((const __m128 *)(m));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
__m128 t0, t1;
|
||||
t0 = _mm_broadcast_ss(m);
|
||||
t1 = _mm_broadcast_ss(m + 1);
|
||||
return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m128 t0, t1;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
t0 = _mm256_extractf128_ps(v0, 0x1);
|
||||
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
|
||||
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
t1 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t1);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
_mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
#ifndef __THERMO_H_
|
||||
#define __THERMO_H_
|
||||
extern void setupThermo(Parameter*, int);
|
||||
extern void computeThermo(int, Parameter*, Atom*);
|
||||
extern void adjustThermo(Parameter*, Atom*);
|
||||
#endif
|
||||
@@ -1,23 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __TIMERS_H_
|
||||
#define __TIMERS_H_
|
||||
|
||||
typedef enum {
|
||||
TOTAL = 0,
|
||||
FORCE,
|
||||
NEIGH,
|
||||
FORWARD,
|
||||
REVERSE,
|
||||
UPDATE,
|
||||
BALANCE,
|
||||
SETUP,
|
||||
REST,
|
||||
NUMTIMER
|
||||
} timerComm;
|
||||
|
||||
#endif
|
||||
@@ -1,14 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __TIMING_H_
|
||||
#define __TIMING_H_
|
||||
|
||||
extern double getTimeStamp(void);
|
||||
extern double getTimeResolution(void);
|
||||
extern double getTimeStamp_(void);
|
||||
|
||||
#endif
|
||||
@@ -1,55 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
|
||||
#ifndef __UTIL_H_
|
||||
#define __UTIL_H_
|
||||
|
||||
#ifndef MIN
|
||||
# define MIN(x,y) ((x)<(y)?(x):(y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
# define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#endif
|
||||
|
||||
#ifndef ABS
|
||||
# define ABS(a) ((a) >= 0 ? (a) : -(a))
|
||||
#endif
|
||||
|
||||
#define DEBUG_MESSAGE debug_printf
|
||||
|
||||
#ifndef MAXLINE
|
||||
# define MAXLINE 4096
|
||||
#endif
|
||||
|
||||
#define FF_LJ 0
|
||||
#define FF_EAM 1
|
||||
#define FF_DEM 2
|
||||
|
||||
#if PRECISION == 1
|
||||
# define PRECISION_STRING "single"
|
||||
#else
|
||||
# define PRECISION_STRING "double"
|
||||
#endif
|
||||
|
||||
#define BigOrEqual(a,b) (fabs((a)-(b))<1e-9 || (a)>(b))
|
||||
#define Equal(a,b) (fabs((a)-(b))<1e-9)
|
||||
|
||||
enum {_x=0, _y, _z};
|
||||
enum {fullShell=0, halfShell, eightShell, halfStencil};
|
||||
|
||||
|
||||
extern double myrandom(int*);
|
||||
extern void random_reset(int *seed, int ibase, double *coord);
|
||||
extern int str2ff(const char *string);
|
||||
extern const char* ff2str(int ff);
|
||||
extern void readline(char *line, FILE *fp);
|
||||
extern void debug_printf(const char *format, ...);
|
||||
extern int get_cuda_num_threads();
|
||||
|
||||
#endif
|
||||
@@ -1,211 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
void initParameter(Parameter *param) {
|
||||
param->input_file = NULL;
|
||||
param->vtk_file = NULL;
|
||||
param->xtc_file = NULL;
|
||||
param->eam_file = NULL;
|
||||
param->write_atom_file = NULL;
|
||||
param->force_field = FF_LJ;
|
||||
param->epsilon = 1.0;
|
||||
param->sigma = 1.0;
|
||||
param->sigma6 = 1.0;
|
||||
param->rho = 0.8442;
|
||||
param->ntypes = 4;
|
||||
param->ntimes = 200;
|
||||
param->dt = 0.005;
|
||||
param->nx = 32;
|
||||
param->ny = 32;
|
||||
param->nz = 32;
|
||||
param->pbc_x = 1;
|
||||
param->pbc_y = 1;
|
||||
param->pbc_z = 1;
|
||||
param->cutforce = 2.5;
|
||||
param->skin = 0.3;
|
||||
param->cutneigh = param->cutforce + param->skin;
|
||||
param->temp = 1.44;
|
||||
param->nstat = 100;
|
||||
param->mass = 1.0;
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
param->reneigh_every = 20;
|
||||
param->prune_every = 1000;
|
||||
param->x_out_every = 20;
|
||||
param->v_out_every = 5;
|
||||
param->half_neigh = 0;
|
||||
param->proc_freq = 2.4;
|
||||
// DEM
|
||||
param->k_s = 1.0;
|
||||
param->k_dn = 1.0;
|
||||
param->gx = 0.0;
|
||||
param->gy = 0.0;
|
||||
param->gz = 0.0;
|
||||
param->reflect_x = 0.0;
|
||||
param->reflect_y = 0.0;
|
||||
param->reflect_z = 0.0;
|
||||
//MPI
|
||||
param->balance = 0;
|
||||
param->method = 0;
|
||||
param->balance_every =param->reneigh_every;
|
||||
}
|
||||
|
||||
void readParameter(Parameter *param, const char *filename) {
|
||||
FILE *fp = fopen(filename, "r");
|
||||
char line[MAXLINE];
|
||||
int i;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open parameter file: %s\n", filename);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
while(!feof(fp)) {
|
||||
line[0] = '\0';
|
||||
readline(line, fp);
|
||||
for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
|
||||
line[i] = '\0';
|
||||
|
||||
char *tok = strtok(line, "\t ");
|
||||
char *val = strtok(NULL, "\t ");
|
||||
|
||||
#define PARSE_PARAM(p,f) if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
|
||||
#define PARSE_STRING(p) PARSE_PARAM(p, strdup)
|
||||
#define PARSE_INT(p) PARSE_PARAM(p, atoi)
|
||||
#define PARSE_REAL(p) PARSE_PARAM(p, atof)
|
||||
|
||||
if(tok != NULL && val != NULL) {
|
||||
PARSE_PARAM(force_field, str2ff);
|
||||
PARSE_STRING(input_file);
|
||||
PARSE_STRING(eam_file);
|
||||
PARSE_STRING(vtk_file);
|
||||
PARSE_STRING(xtc_file);
|
||||
PARSE_REAL(epsilon);
|
||||
PARSE_REAL(sigma);
|
||||
PARSE_REAL(k_s);
|
||||
PARSE_REAL(k_dn);
|
||||
PARSE_REAL(reflect_x);
|
||||
PARSE_REAL(reflect_y);
|
||||
PARSE_REAL(reflect_z);
|
||||
PARSE_REAL(gx);
|
||||
PARSE_REAL(gy);
|
||||
PARSE_REAL(gz);
|
||||
PARSE_REAL(rho);
|
||||
PARSE_REAL(dt);
|
||||
PARSE_REAL(cutforce);
|
||||
PARSE_REAL(skin);
|
||||
PARSE_REAL(temp);
|
||||
PARSE_REAL(mass);
|
||||
PARSE_REAL(proc_freq);
|
||||
PARSE_INT(ntypes);
|
||||
PARSE_INT(ntimes);
|
||||
PARSE_INT(nx);
|
||||
PARSE_INT(ny);
|
||||
PARSE_INT(nz);
|
||||
PARSE_INT(pbc_x);
|
||||
PARSE_INT(pbc_y);
|
||||
PARSE_INT(pbc_z);
|
||||
PARSE_INT(nstat);
|
||||
PARSE_INT(reneigh_every);
|
||||
PARSE_INT(prune_every);
|
||||
PARSE_INT(x_out_every);
|
||||
PARSE_INT(v_out_every);
|
||||
PARSE_INT(half_neigh);
|
||||
PARSE_INT(method);
|
||||
PARSE_INT(balance);
|
||||
PARSE_INT(balance_every);
|
||||
}
|
||||
}
|
||||
// Update dtforce
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
|
||||
// Update sigma6 parameter
|
||||
MD_FLOAT s2 = param->sigma * param->sigma;
|
||||
param->sigma6 = s2 * s2 * s2;
|
||||
|
||||
//Update balance parameter, 10 could be change
|
||||
param->balance_every *=param->reneigh_every;
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void printParameter(Parameter *param) {
|
||||
printf("Parameters:\n");
|
||||
if(param->input_file != NULL) {
|
||||
printf("\tInput file: %s\n", param->input_file);
|
||||
}
|
||||
|
||||
if(param->vtk_file != NULL) {
|
||||
printf("\tVTK file: %s\n", param->vtk_file);
|
||||
}
|
||||
|
||||
if(param->xtc_file != NULL) {
|
||||
printf("\tXTC file: %s\n", param->xtc_file);
|
||||
}
|
||||
|
||||
if(param->eam_file != NULL) {
|
||||
printf("\tEAM file: %s\n", param->eam_file);
|
||||
}
|
||||
|
||||
printf("\tForce field: %s\n", ff2str(param->force_field));
|
||||
#ifdef CLUSTER_M
|
||||
printf("\tKernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
|
||||
#else
|
||||
printf("\tKernel: %s\n", KERNEL_NAME);
|
||||
#endif
|
||||
printf("\tData layout: %s\n", POS_DATA_LAYOUT);
|
||||
printf("\tFloating-point precision: %s\n", PRECISION_STRING);
|
||||
printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz);
|
||||
printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd);
|
||||
printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z);
|
||||
printf("\tLattice size: %e\n", param->lattice);
|
||||
printf("\tEpsilon: %e\n", param->epsilon);
|
||||
printf("\tSigma: %e\n", param->sigma);
|
||||
printf("\tSpring constant: %e\n", param->k_s);
|
||||
printf("\tDamping constant: %e\n", param->k_dn);
|
||||
printf("\tTemperature: %e\n", param->temp);
|
||||
printf("\tRHO: %e\n", param->rho);
|
||||
printf("\tMass: %e\n", param->mass);
|
||||
printf("\tNumber of types: %d\n", param->ntypes);
|
||||
printf("\tNumber of timesteps: %d\n", param->ntimes);
|
||||
printf("\tReport stats every (timesteps): %d\n", param->nstat);
|
||||
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
|
||||
#ifdef SORT_ATOMS
|
||||
printf("\tSort atoms when reneighboring: yes\n");
|
||||
#else
|
||||
printf("\tSort atoms when reneighboring: no\n");
|
||||
#endif
|
||||
printf("\tPrune every (timesteps): %d\n", param->prune_every);
|
||||
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
|
||||
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
|
||||
printf("\tDelta time (dt): %e\n", param->dt);
|
||||
printf("\tCutoff radius: %e\n", param->cutforce);
|
||||
printf("\tSkin: %e\n", param->skin);
|
||||
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
|
||||
printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
|
||||
|
||||
// ================ New MPI features =============
|
||||
char str[20];
|
||||
strcpy(str, (param->method == 1) ? "Half Shell" :
|
||||
(param->method == 2) ? "Eight Shell" :
|
||||
(param->method == 3) ? "Half Stencil":
|
||||
"Full Shell");
|
||||
printf("\tMethod: %s\n", str);
|
||||
strcpy(str, (param->balance == 1) ? "mean RCB" :
|
||||
(param->balance == 2) ? "mean Time RCB" :
|
||||
(param->balance == 3) ? "Staggered" :
|
||||
"cartisian");
|
||||
printf("\tPartition: %s\n", str);
|
||||
if(param->balance)
|
||||
printf("\tRebalancing every (timesteps): %d\n",param->balance_every);
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
double getTimeStamp()
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
|
||||
}
|
||||
|
||||
double getTimeResolution()
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_getres(CLOCK_MONOTONIC, &ts);
|
||||
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
|
||||
}
|
||||
|
||||
double getTimeStamp_()
|
||||
{
|
||||
return getTimeStamp();
|
||||
}
|
||||
107
common/util.c
107
common/util.c
@@ -1,107 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <errno.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <util.h>
|
||||
#include <math.h>
|
||||
|
||||
/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
|
||||
#define IA 16807
|
||||
#define IM 2147483647
|
||||
#define AM (1.0/IM)
|
||||
#define IQ 127773
|
||||
#define IR 2836
|
||||
#define MASK 123459876
|
||||
|
||||
double myrandom(int* seed) {
|
||||
int k= (*seed) / IQ;
|
||||
double ans;
|
||||
|
||||
*seed = IA * (*seed - k * IQ) - IR * k;
|
||||
if(*seed < 0) *seed += IM;
|
||||
ans = AM * (*seed);
|
||||
return ans;
|
||||
}
|
||||
|
||||
void random_reset(int *seed, int ibase, double *coord) {
|
||||
int i;
|
||||
char *str = (char *) &ibase;
|
||||
int n = sizeof(int);
|
||||
unsigned int hash = 0;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
hash += str[i];
|
||||
hash += (hash << 10);
|
||||
hash ^= (hash >> 6);
|
||||
}
|
||||
|
||||
str = (char *) coord;
|
||||
n = 3 * sizeof(double);
|
||||
for (i = 0; i < n; i++) {
|
||||
hash += str[i];
|
||||
hash += (hash << 10);
|
||||
hash ^= (hash >> 6);
|
||||
}
|
||||
|
||||
hash += (hash << 3);
|
||||
hash ^= (hash >> 11);
|
||||
hash += (hash << 15);
|
||||
|
||||
// keep 31 bits of unsigned int as new seed
|
||||
// do not allow seed = 0, since will cause hang in gaussian()
|
||||
|
||||
*seed = hash & 0x7ffffff;
|
||||
if (!(*seed)) *seed = 1;
|
||||
|
||||
// warm up the RNG
|
||||
|
||||
for (i = 0; i < 5; i++) myrandom(seed);
|
||||
//save = 0;
|
||||
}
|
||||
|
||||
int str2ff(const char *string) {
|
||||
if(strncmp(string, "lj", 2) == 0) return FF_LJ;
|
||||
if(strncmp(string, "eam", 3) == 0) return FF_EAM;
|
||||
if(strncmp(string, "dem", 3) == 0) return FF_DEM;
|
||||
return -1;
|
||||
}
|
||||
|
||||
const char* ff2str(int ff) {
|
||||
if(ff == FF_LJ) { return "lj"; }
|
||||
if(ff == FF_EAM) { return "eam"; }
|
||||
if(ff == FF_DEM) { return "dem"; }
|
||||
return "invalid";
|
||||
}
|
||||
|
||||
int get_cuda_num_threads() {
|
||||
const char *num_threads_env = getenv("NUM_THREADS");
|
||||
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
|
||||
}
|
||||
|
||||
void readline(char *line, FILE *fp) {
|
||||
if(fgets(line, MAXLINE, fp) == NULL) {
|
||||
printf("error %i\n",errno);
|
||||
if(errno != 0) {
|
||||
perror("readline()");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void debug_printf(const char *format, ...) {
|
||||
#ifdef DEBUG
|
||||
va_list arg;
|
||||
int ret;
|
||||
|
||||
va_start(arg, format);
|
||||
if((vfprintf(stdout, format, arg)) < 0) { perror("debug_printf()"); }
|
||||
va_end(arg);
|
||||
#endif
|
||||
}
|
||||
38
config.mk
38
config.mk
@@ -1,49 +1,29 @@
|
||||
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
|
||||
TAG ?= MPIICC
|
||||
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
|
||||
ISA ?= AVX512
|
||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
||||
OPT_SCHEME ?= gromacs
|
||||
# Compiler tag (GCC/CLANG/ICC/NVCC)
|
||||
TAG ?= NVCC
|
||||
# Enable likwid (true or false)
|
||||
ENABLE_LIKWID ?= false
|
||||
# SP or DP
|
||||
DATA_TYPE ?= DP
|
||||
# AOS or SOA
|
||||
DATA_LAYOUT ?= SOA
|
||||
DATA_LAYOUT ?= AOS
|
||||
# Assembly syntax to generate (ATT/INTEL)
|
||||
ASM_SYNTAX ?= ATT
|
||||
# Debug
|
||||
DEBUG ?= false
|
||||
|
||||
# Sort atoms when reneighboring (true or false)
|
||||
SORT_ATOMS ?= true
|
||||
# Number of times to run the atoms loop on stubbed variant
|
||||
ATOMS_LOOP_RUNS ?= 1
|
||||
# Number of times to run the neighbors loop on stubbed variant
|
||||
NEIGHBORS_LOOP_RUNS ?= 1
|
||||
# Explicitly store and load atom types (true or false)
|
||||
EXPLICIT_TYPES ?= false
|
||||
# Trace memory addresses for cache simulator (true or false)
|
||||
MEM_TRACER ?= false
|
||||
# Trace indexes and distances for gather-md (true or false)
|
||||
INDEX_TRACER ?= false
|
||||
# Vector width (elements) for index and distance tracer
|
||||
VECTOR_WIDTH ?= 8
|
||||
# Compute statistics
|
||||
COMPUTE_STATS ?= false
|
||||
|
||||
# Configurations for lammps optimization scheme
|
||||
# Use omp simd pragma when running with half neighbor-lists
|
||||
ENABLE_OMP_SIMD ?= false
|
||||
# Use kernel with explicit SIMD intrinsics
|
||||
USE_SIMD_KERNEL ?= true
|
||||
|
||||
# Configurations for gromacs optimization scheme
|
||||
# Use reference version
|
||||
USE_REFERENCE_VERSION ?= false
|
||||
# Enable XTC output
|
||||
XTC_OUTPUT ?= false
|
||||
# Check if cj is local when decreasing reaction force
|
||||
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
|
||||
|
||||
# Configurations for CUDA
|
||||
# Use CUDA host memory to optimize transfers
|
||||
USE_CUDA_HOST_MEMORY ?= false
|
||||
|
||||
#Feature options
|
||||
OPTIONS = -DALIGNMENT=64
|
||||
#OPTIONS += More options
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,244 +0,0 @@
|
||||
;
|
||||
; Generated by:
|
||||
; Vitaly V. Chaban
|
||||
; School of Chemistry
|
||||
; University of Kharkiv
|
||||
; Ukraine, Kharkiv-61077, Svoboda sq., 4
|
||||
; email: chaban@univer.kharkov.ua, vvchaban@gmail.com
|
||||
; skype: vvchaban
|
||||
|
||||
; System: Liquid argon (1000 atoms) at 80 K. Equilibrated for 500ps.
|
||||
|
||||
; VARIOUS PREPROCESSING OPTIONS
|
||||
title = Yo
|
||||
cpp = /usr/bin/cpp
|
||||
include =
|
||||
define =
|
||||
|
||||
; RUN CONTROL PARAMETERS
|
||||
integrator = md
|
||||
; Start time and timestep in ps
|
||||
tinit = 0
|
||||
dt = 0.001
|
||||
nsteps = 250000
|
||||
; For exact run continuation or redoing part of a run
|
||||
init_step = 0
|
||||
; mode for center of mass motion removal
|
||||
comm-mode = Linear
|
||||
; number of steps for center of mass motion removal
|
||||
nstcomm = 1
|
||||
; group(s) for center of mass motion removal
|
||||
comm-grps =
|
||||
|
||||
; LANGEVIN DYNAMICS OPTIONS
|
||||
; Temperature, friction coefficient (amu/ps) and random seed
|
||||
bd-temp = 300
|
||||
bd-fric = 0
|
||||
ld-seed = 1993
|
||||
|
||||
; ENERGY MINIMIZATION OPTIONS
|
||||
; Force tolerance and initial step-size
|
||||
emtol = 100
|
||||
emstep = 0.01
|
||||
; Max number of iterations in relax_shells
|
||||
niter = 20
|
||||
; Step size (1/ps^2) for minimization of flexible constraints
|
||||
fcstep = 0
|
||||
; Frequency of steepest descents steps when doing CG
|
||||
nstcgsteep = 1000
|
||||
nbfgscorr = 10
|
||||
|
||||
; OUTPUT CONTROL OPTIONS
|
||||
; Output frequency for coords (x), velocities (v) and forces (f)
|
||||
nstxout = 500
|
||||
nstvout = 5
|
||||
nstfout = 0
|
||||
; Checkpointing helps you continue after crashes
|
||||
nstcheckpoint = 1000
|
||||
; Output frequency for energies to log file and energy file
|
||||
nstlog = 50
|
||||
nstenergy = 50
|
||||
; Output frequency and precision for xtc file
|
||||
nstxtcout = 5
|
||||
xtc-precision = 1000
|
||||
; This selects the subset of atoms for the xtc file. You can
|
||||
; select multiple groups. By default all atoms will be written.
|
||||
xtc-grps =
|
||||
; Selection of energy groups
|
||||
energygrps =
|
||||
|
||||
; NEIGHBORSEARCHING PARAMETERS
|
||||
; nblist update frequency
|
||||
nstlist = 5
|
||||
; ns algorithm (simple or grid)
|
||||
ns_type = grid
|
||||
; Periodic boundary conditions: xyz (default), no (vacuum)
|
||||
; or full (infinite systems only)
|
||||
pbc = xyz
|
||||
; nblist cut-off
|
||||
rlist = 0.9
|
||||
domain-decomposition = no
|
||||
|
||||
; OPTIONS FOR ELECTROSTATICS AND VDW
|
||||
; Method for doing electrostatics
|
||||
coulombtype = Cut-off
|
||||
rcoulomb-switch = 0
|
||||
rcoulomb = 0.9
|
||||
; Dielectric constant (DC) for cut-off or DC of reaction field
|
||||
epsilon-r = 1
|
||||
; Method for doing Van der Waals
|
||||
vdw-type = Cut-off
|
||||
; cut-off lengths
|
||||
rvdw-switch = 0
|
||||
rvdw = 0.9
|
||||
; Apply long range dispersion corrections for Energy and Pressure
|
||||
DispCorr = EnerPres
|
||||
; Extension of the potential lookup tables beyond the cut-off
|
||||
table-extension = 1
|
||||
; Spacing for the PME/PPPM FFT grid
|
||||
fourierspacing = 0.12
|
||||
; FFT grid size, when a value is 0 fourierspacing will be used
|
||||
fourier_nx = 0
|
||||
fourier_ny = 0
|
||||
fourier_nz = 0
|
||||
; EWALD/PME/PPPM parameters
|
||||
pme_order = 4
|
||||
ewald_rtol = 1e-05
|
||||
ewald_geometry = 3d
|
||||
epsilon_surface = 0
|
||||
optimize_fft = no
|
||||
|
||||
; GENERALIZED BORN ELECTROSTATICS
|
||||
; Algorithm for calculating Born radii
|
||||
gb_algorithm = Still
|
||||
; Frequency of calculating the Born radii inside rlist
|
||||
nstgbradii = 1
|
||||
; Cutoff for Born radii calculation; the contribution from atoms
|
||||
; between rlist and rgbradii is updated every nstlist steps
|
||||
rgbradii = 2
|
||||
; Salt concentration in M for Generalized Born models
|
||||
gb_saltconc = 0
|
||||
|
||||
; IMPLICIT SOLVENT (for use with Generalized Born electrostatics)
|
||||
implicit_solvent = No
|
||||
|
||||
; OPTIONS FOR WEAK COUPLING ALGORITHMS
|
||||
; Temperature coupling
|
||||
Tcoupl = berendsen
|
||||
; Groups to couple separately
|
||||
tc-grps = System
|
||||
; Time constant (ps) and reference temperature (K)
|
||||
tau_t = 0.1
|
||||
ref_t = 80
|
||||
; Pressure coupling
|
||||
Pcoupl = no
|
||||
Pcoupltype = isotropic
|
||||
; Time constant (ps), compressibility (1/bar) and reference P (bar)
|
||||
tau_p = 1.0
|
||||
compressibility = 4.5e-5
|
||||
ref_p = 1.0
|
||||
; Random seed for Andersen thermostat
|
||||
andersen_seed = 815131
|
||||
|
||||
; SIMULATED ANNEALING
|
||||
; Type of annealing for each temperature group (no/single/periodic)
|
||||
annealing = no
|
||||
; Number of time points to use for specifying annealing in each group
|
||||
annealing_npoints =
|
||||
; List of times at the annealing points for each group
|
||||
annealing_time =
|
||||
; Temp. at each annealing point, for each group.
|
||||
annealing_temp =
|
||||
|
||||
; GENERATE VELOCITIES FOR STARTUP RUN
|
||||
gen_vel = yes
|
||||
gen_temp = 80
|
||||
gen_seed = 1993
|
||||
|
||||
; OPTIONS FOR BONDS
|
||||
constraints = all-bonds
|
||||
; Type of constraint algorithm
|
||||
constraint-algorithm = Lincs
|
||||
; Do not constrain the start configuration
|
||||
unconstrained-start = no
|
||||
; Use successive overrelaxation to reduce the number of shake iterations
|
||||
Shake-SOR = no
|
||||
; Relative tolerance of shake
|
||||
shake-tol = 1e-04
|
||||
; Highest order in the expansion of the constraint coupling matrix
|
||||
lincs-order = 4
|
||||
; Number of iterations in the final step of LINCS. 1 is fine for
|
||||
; normal simulations, but use 2 to conserve energy in NVE runs.
|
||||
; For energy minimization with constraints it should be 4 to 8.
|
||||
lincs-iter = 1
|
||||
; Lincs will write a warning to the stderr if in one step a bond
|
||||
; rotates over more degrees than
|
||||
lincs-warnangle = 30
|
||||
; Convert harmonic bonds to morse potentials
|
||||
morse = no
|
||||
|
||||
; ENERGY GROUP EXCLUSIONS
|
||||
; Pairs of energy groups for which all non-bonded interactions are excluded
|
||||
energygrp_excl =
|
||||
|
||||
; NMR refinement stuff
|
||||
; Distance restraints type: No, Simple or Ensemble
|
||||
disre = No
|
||||
; Force weighting of pairs in one distance restraint: Conservative or Equal
|
||||
disre-weighting = Conservative
|
||||
; Use sqrt of the time averaged times the instantaneous violation
|
||||
disre-mixed = no
|
||||
disre-fc = 1000
|
||||
disre-tau = 0
|
||||
; Output frequency for pair distances to energy file
|
||||
nstdisreout = 100
|
||||
; Orientation restraints: No or Yes
|
||||
orire = no
|
||||
; Orientation restraints force constant and tau for time averaging
|
||||
orire-fc = 0
|
||||
orire-tau = 0
|
||||
orire-fitgrp =
|
||||
; Output frequency for trace(SD) to energy file
|
||||
nstorireout = 100
|
||||
; Dihedral angle restraints: No, Simple or Ensemble
|
||||
dihre = No
|
||||
dihre-fc = 1000
|
||||
dihre-tau = 0
|
||||
; Output frequency for dihedral values to energy file
|
||||
nstdihreout = 100
|
||||
|
||||
; Free energy control stuff
|
||||
free-energy = no
|
||||
init-lambda = 0
|
||||
delta-lambda = 0
|
||||
sc-alpha = 0
|
||||
sc-sigma = 0.3
|
||||
|
||||
; Non-equilibrium MD stuff
|
||||
acc-grps =
|
||||
accelerate =
|
||||
freezegrps =
|
||||
freezedim =
|
||||
cos-acceleration = 0
|
||||
|
||||
; Electric fields
|
||||
; Format is number of terms (int) and for all terms an amplitude (real)
|
||||
; and a phase angle (real)
|
||||
E-x =
|
||||
E-xt =
|
||||
E-y =
|
||||
E-yt =
|
||||
E-z =
|
||||
E-zt =
|
||||
|
||||
; User defined thingies
|
||||
user1-grps =
|
||||
user2-grps =
|
||||
userint1 = 0
|
||||
userint2 = 0
|
||||
userint3 = 0
|
||||
userint4 = 0
|
||||
userreal1 = 0
|
||||
userreal2 = 0
|
||||
userreal3 = 0
|
||||
userreal4 = 0
|
||||
@@ -1,12 +0,0 @@
|
||||
mass 39.94
|
||||
sigma 0.0062220
|
||||
epsilon 0.0000096960
|
||||
ntimes 250000
|
||||
dt 0.001
|
||||
temp 80
|
||||
x_out_freq 500
|
||||
v_out_freq 5
|
||||
cutforce 1.8
|
||||
skin 0.1
|
||||
reneigh_every 100
|
||||
nstat 125000
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,304 +0,0 @@
|
||||
DATE: 2007-06-11 UNITS: metal CONTRIBUTOR: Stephen Foiles, foiles@sandia.gov CITATION: Foiles et al, Phys Rev B, 33, 7983 (1986) COMMENT: Cu functions (universal 3), SM Foiles et al, PRB, 33, 7983 (1986)
|
||||
29 63.550 3.6150 FCC
|
||||
500 5.0100200400801306e-04 500 1.0000000000000009e-02 4.9499999999999886e+00
|
||||
0. -3.1561636903424350e-01 -5.2324876182494506e-01 -6.9740831416804383e-01 -8.5202525457518519e-01
|
||||
-9.9329216586042435e-01 -1.1246331970890324e+00 -1.2481882647347859e+00 -1.3654054700363645e+00 -1.4773214276236644e+00
|
||||
-1.5847099936904741e+00 -1.6865851873526410e+00 -1.7843534091637920e+00 -1.8790616476576076e+00 -1.9710188604521761e+00
|
||||
-2.0604838665854572e+00 -2.1476762477372944e+00 -2.2327843595560068e+00 -2.3159713409697673e+00 -2.3973797031286352e+00
|
||||
-2.4771348895887826e+00 -2.5553480773272810e+00 -2.6321184083774227e+00 -2.7075347880408458e+00 -2.7816773487592030e+00
|
||||
-2.8546186529652005e+00 -2.9264246898861899e+00 -2.9971557080624507e+00 -3.0668669157065978e+00 -3.1356090736776849e+00
|
||||
-3.2034290008357829e+00 -3.2703700069757247e+00 -3.3364722658277230e+00 -3.4017731379735778e+00 -3.4663074517059016e+00
|
||||
-3.5301077484029122e+00 -3.5932044977085980e+00 -3.6556262870729199e+00 -3.7173999892229403e+00 -3.7785509106421671e+00
|
||||
-3.8391029237823773e+00 -3.8990785849196925e+00 -3.9584992397079333e+00 -4.0173851179270912e+00 -4.0744518500210916e+00
|
||||
-4.1306733564032641e+00 -4.1864034067843932e+00 -4.2416582335814326e+00 -4.2964533268445280e+00 -4.3508034838872618e+00
|
||||
-4.4047228547107977e+00 -4.4582249835318351e+00 -4.5113228468570128e+00 -4.5640288884490872e+00 -4.6163550514904443e+00
|
||||
-4.6683128082199232e+00 -4.7199131872767452e+00 -4.7711667990036801e+00 -4.8220838587683374e+00 -4.8726742087289665e+00
|
||||
-4.9229473379113813e+00 -4.9729124009208192e+00 -5.0225782353423369e+00 -5.0719533779533492e+00 -5.1210460798461668e+00
|
||||
-5.1698643205481289e+00 -5.2184158212228908e+00 -5.2667080570261362e+00 -5.3147482686812282e+00 -5.3625434733324937e+00
|
||||
-5.4101004747367369e+00 -5.4574258728391953e+00 -5.5045260727784751e+00 -5.5514072933650311e+00 -5.5980755750691458e+00
|
||||
-5.6445367875538750e+00 -5.6907966367860183e+00 -5.7368606717507191e+00 -5.7827342908000219e+00 -5.8284227476608805e+00
|
||||
-5.8739311571204382e+00 -5.9192645004390272e+00 -5.9644276303605182e+00 -6.0094252761103064e+00 -6.0542620478988169e+00
|
||||
-6.0989424413057520e+00 -6.1434708414539330e+00 -6.1878515269578429e+00 -6.2320886736884802e+00 -6.2761863583589275e+00
|
||||
-6.3201485619430571e+00 -6.3639791729330000e+00 -6.4076819904493902e+00 -6.4512607272098990e+00 -6.4947190123648113e+00
|
||||
-6.5380603942065250e+00 -6.5812883427622069e+00 -6.6243939095620874e+00 -6.6670830925929181e+00 -6.7096660473058591e+00
|
||||
-6.7521459135001862e+00 -6.7945257643836499e+00 -6.8368086085521611e+00 -6.8789973918942735e+00 -6.9210949994162263e+00
|
||||
-6.9631042569970703e+00 -7.0050279330721992e+00 -7.0468687402560874e+00 -7.0886293368973554e+00 -7.1303123285804020e+00
|
||||
-7.1719202695651916e+00 -7.2134556641788095e+00 -7.2549209681507421e+00 -7.2963185899023415e+00 -7.3376508917899628e+00
|
||||
-7.3789201913012903e+00 -7.4201287622117036e+00 -7.4612788356982946e+00 -7.5023726014152032e+00 -7.5434122085331978e+00
|
||||
-7.5843997667427345e+00 -7.6253373472216595e+00 -7.6662269835740062e+00 -7.7070706727342895e+00 -7.7478703758424388e+00
|
||||
-7.7886280190928119e+00 -7.8293454945503811e+00 -7.8700246609474789e+00 -7.9106673444489104e+00 -7.9512753393968865e+00
|
||||
-7.9918504090315139e+00 -8.0323942861870705e+00 -8.0729086739704030e+00 -8.1133952464140293e+00 -8.1538556491162808e+00
|
||||
-8.1942914998523975e+00 -8.2347043891773524e+00 -8.2750958810033808e+00 -8.3154675131659701e+00 -8.3558207979692725e+00
|
||||
-8.3961572227176475e+00 -8.4364782502312892e+00 -8.4767853193496308e+00 -8.5170798454139458e+00 -8.5573632207473906e+00
|
||||
-8.5976368151087286e+00 -8.6379019761436666e+00 -8.6781600298199919e+00 -8.7184122808490656e+00 -8.7586600130993020e+00
|
||||
-8.7989044899963460e+00 -8.8391469549140993e+00 -8.8793886315543773e+00 -8.9196307243150841e+00 -8.9598744186541239e+00
|
||||
-9.0001208814363167e+00 -9.0403712612778122e+00 -9.0806266888772029e+00 -9.1208882773446476e+00 -9.1611571225108719e+00
|
||||
-9.2014343032440138e+00 -9.2417208817437881e+00 -9.2820179038447463e+00 -9.3223263992829857e+00 -9.3626473819958278e+00
|
||||
-9.4029818503831279e+00 -9.4433307875392529e+00 -9.4836951616705960e+00 -9.5237840547885071e+00 -9.5637918926951784e+00
|
||||
-9.6038142178817338e+00 -9.6438519061474608e+00 -9.6839058194810832e+00 -9.7239768064614509e+00 -9.7640657024289226e+00
|
||||
-9.8041733297054634e+00 -9.8443004978059889e+00 -9.8844480036373170e+00 -9.9246166317080906e+00 -9.9648071543198853e+00
|
||||
-1.0005020331762637e+01 -1.0045256912501884e+01 -1.0085517633366123e+01 -1.0125803219723423e+01 -1.0166114385662183e+01
|
||||
-1.0206451834160134e+01 -1.0246816257258331e+01 -1.0287208336224353e+01 -1.0327628741713852e+01 -1.0368078133934148e+01
|
||||
-1.0408557162795717e+01 -1.0449066468066974e+01 -1.0489606679525650e+01 -1.0530178417100558e+01 -1.0570782291022510e+01
|
||||
-1.0611418901960292e+01 -1.0652088841158786e+01 -1.0692792690577562e+01 -1.0733531023022920e+01 -1.0774304402276016e+01
|
||||
-1.0815113383222808e+01 -1.0855958511980305e+01 -1.0896840326017184e+01 -1.0937759354276295e+01 -1.0978716117290730e+01
|
||||
-1.1019711127305925e+01 -1.1060744888386239e+01 -1.1101817896531486e+01 -1.1142930639787664e+01 -1.1184083598352004e+01
|
||||
-1.1225277244679319e+01 -1.1266512043589387e+01 -1.1307788452364719e+01 -1.1349106920870327e+01 -1.1390467891550486e+01
|
||||
-1.1431871799781504e+01 -1.1473319073642074e+01 -1.1514810134213008e+01 -1.1556345395619132e+01 -1.1597925265115521e+01
|
||||
-1.1639550143177303e+01 -1.1681220423591583e+01 -1.1722936493536452e+01 -1.1764698733669888e+01 -1.1806507518187232e+01
|
||||
-1.1848363215029394e+01 -1.1890266185706139e+01 -1.1932216785634637e+01 -1.1974215364086319e+01 -1.2016262264291129e+01
|
||||
-1.2058357823507606e+01 -1.2100502373105996e+01 -1.2142696238631970e+01 -1.2184939739884385e+01 -1.2227233190982815e+01
|
||||
-1.2269576900438324e+01 -1.2311971171220080e+01 -1.2354416300827552e+01 -1.2396912581348374e+01 -1.2439460299532641e+01
|
||||
-1.2482059736851909e+01 -1.2524711169562636e+01 -1.2567414868772744e+01 -1.2610171100495961e+01 -1.2652980125719694e+01
|
||||
-1.2695842200459083e+01 -1.2738757575819193e+01 -1.2781726498053729e+01 -1.2824749208615117e+01 -1.2867825944219817e+01
|
||||
-1.2910956936899197e+01 -1.2954142414054047e+01 -1.2997382598508125e+01 -1.3040677708563408e+01 -1.3084027958052218e+01
|
||||
-1.3127433556386677e+01 -1.3170894708610035e+01 -1.3214411615448739e+01 -1.3257984473359954e+01 -1.3301613474583519e+01
|
||||
-1.3345298807190659e+01 -1.3389040655121903e+01 -1.3432839198243016e+01 -1.3476694612386723e+01 -1.3520607069407617e+01
|
||||
-1.3564576737214225e+01 -1.3608603779754390e+01 -1.3652688357330362e+01 -1.3696830626228689e+01 -1.3741030739041094e+01
|
||||
-1.3785288844633044e+01 -1.3829605088192579e+01 -1.3873979611263849e+01 -1.3918412551792358e+01 -1.3962904044165157e+01
|
||||
-1.4007454219246995e+01 -1.4052063204422609e+01 -1.4096731123636516e+01 -1.4141458097424390e+01 -1.4186244242962175e+01
|
||||
-1.4231089674089560e+01 -1.4275994501358696e+01 -1.4320958832063411e+01 -1.4365982770278379e+01 -1.4411066416893846e+01
|
||||
-1.4456209869649911e+01 -1.4501413223171539e+01 -1.4546676569005058e+01 -1.4591999995647598e+01 -1.4637383588581656e+01
|
||||
-1.4682827430315228e+01 -1.4728331600403862e+01 -1.4773896175488971e+01 -1.4819521229330235e+01 -1.4865206832833337e+01
|
||||
-1.4910953054084985e+01 -1.4956759958383259e+01 -1.5002627608264334e+01 -1.5048556063539081e+01 -1.5094545381317744e+01
|
||||
-1.5140595616041765e+01 -1.5186706819511983e+01 -1.5232879040916600e+01 -1.5279112326867676e+01 -1.5325406721414765e+01
|
||||
-1.5371762266086876e+01 -1.5418178999911675e+01 -1.5464656959446415e+01 -1.5511196178805903e+01 -1.5557796689685119e+01
|
||||
-1.5604458521389688e+01 -1.5651181700861002e+01 -1.5697966252703509e+01 -1.5744812199205967e+01 -1.5791719560374304e+01
|
||||
-1.5838688353945599e+01 -1.5885718595428898e+01 -1.5932810298111235e+01 -1.5979963473102316e+01 -1.6027178129340314e+01
|
||||
-1.6074454273625634e+01 -1.6121791910645470e+01 -1.6169191042992907e+01 -1.6216651671189425e+01 -1.6264173793714576e+01
|
||||
-1.6311757407021901e+01 -1.6359402505566209e+01 -1.6407109081822910e+01 -1.6454877126310635e+01 -1.6502706627614998e+01
|
||||
-1.6550597572407241e+01 -1.6598549945469813e+01 -1.6646563729715353e+01 -1.6694638906205682e+01 -1.6742775454176012e+01
|
||||
-1.6790973351056778e+01 -1.6839232572488413e+01 -1.6887553092348412e+01 -1.6935934882766333e+01 -1.6984377914146876e+01
|
||||
-1.7032882155186826e+01 -1.7081447572897673e+01 -1.7130074132623690e+01 -1.7178761798061373e+01 -1.7227510531275698e+01
|
||||
-1.7276320292724563e+01 -1.7325191041271864e+01 -1.7374122734215121e+01 -1.7423115327299456e+01 -1.7472168774711918e+01
|
||||
-1.7521283029136725e+01 -1.7570458041655343e+01 -1.7619693762170868e+01 -1.7668990138814479e+01 -1.7718347118374936e+01
|
||||
-1.7767764646209685e+01 -1.7817242666259403e+01 -1.7866781121071881e+01 -1.7916379951810882e+01 -1.7966039098283659e+01
|
||||
-1.8015758498943796e+01 -1.8065538090918608e+01 -1.8115377810021755e+01 -1.8165277590764617e+01 -1.8215237366381530e+01
|
||||
-1.8265257068836149e+01 -1.8315336628844307e+01 -1.8365475975885602e+01 -1.8415675038220570e+01 -1.8465933742903644e+01
|
||||
-1.8516252015799409e+01 -1.8566629781600568e+01 -1.8617066963838965e+01 -1.8667563484898778e+01 -1.8718119266039025e+01
|
||||
-1.8768734227397317e+01 -1.8819408288014415e+01 -1.8870141365839345e+01 -1.8920933377750998e+01 -1.8971784239569388e+01
|
||||
-1.9022693866067016e+01 -1.9073662170983084e+01 -1.9124689067045438e+01 -1.9175774465969539e+01 -1.9226918278483254e+01
|
||||
-1.9278120414338218e+01 -1.9329380782317116e+01 -1.9380699290257098e+01 -1.9432075845048644e+01 -1.9483510352663075e+01
|
||||
-1.9535002718153464e+01 -1.9586552845676124e+01 -1.9638160638497766e+01 -1.9689825999008235e+01 -1.9741548828738019e+01
|
||||
-1.9793329028359494e+01 -1.9845166497711489e+01 -1.9897061135804051e+01 -1.9949012840833348e+01 -2.0001021510188707e+01
|
||||
-2.0053087040468540e+01 -2.0105209327494322e+01 -2.0157388266314911e+01 -2.0209623751249865e+01 -2.0261915675825890e+01
|
||||
-2.0314263932714312e+01 -2.0366668414255741e+01 -2.0419129011700647e+01 -2.0471645615726288e+01 -2.0524218116314501e+01
|
||||
-2.0576846402769888e+01 -2.0629530363722893e+01 -2.0682269887147754e+01 -2.0735064860369221e+01 -2.0787915170073120e+01
|
||||
-2.0840820702317274e+01 -2.0893781342541502e+01 -2.0946796975575580e+01 -2.0999867485656864e+01 -2.1052992756428125e+01
|
||||
-2.1106172670961428e+01 -2.1159407111702421e+01 -2.1212695960751944e+01 -2.1266039099329419e+01 -2.1319436408360275e+01
|
||||
-2.1372887768154328e+01 -2.1426393058473991e+01 -2.1479952158748461e+01 -2.1533564947619766e+01 -2.1587231303431395e+01
|
||||
-2.1640951103995235e+01 -2.1694724226644553e+01 -2.1748550548245930e+01 -2.1802429945213817e+01 -2.1856362293508028e+01
|
||||
-2.1910347468648524e+01 -2.1964385345728829e+01 -2.2018475799410339e+01 -2.2072618703948137e+01 -2.2126813933181779e+01
|
||||
-2.2181061360561898e+01 -2.2235360859143157e+01 -2.2289712301596296e+01 -2.2344115560361388e+01 -2.2398570507087584e+01
|
||||
-2.2453077013515781e+01 -2.2507634950890292e+01 -2.2562244190064348e+01 -2.2616904601590250e+01 -2.2671616055687764e+01
|
||||
-2.2726378422261405e+01 -2.2781191570901910e+01 -2.2836055370890790e+01 -2.2890969691219198e+01 -2.2945934400583837e+01
|
||||
-2.3000949367399926e+01 -2.3056014459808921e+01 -2.3111129545678523e+01 -2.3166294492618363e+01 -2.3221509167983868e+01
|
||||
-2.3276773438880355e+01 -2.3332087172173260e+01 -2.3387450234495873e+01 -2.3442862492249787e+01 -2.3498323811618320e+01
|
||||
-2.3553834058571510e+01 -2.3609393098863848e+01 -2.3665000798062465e+01 -2.3720657021526677e+01 -2.3776361634436626e+01
|
||||
-2.3832114501780552e+01 -2.3887915488378439e+01 -2.3943764458878377e+01 -2.3999661277761106e+01 -2.4055605809352301e+01
|
||||
-2.4111597917826657e+01 -2.4167637467209488e+01 -2.4223724321393092e+01 -2.4279858344124932e+01 -2.4336039399030597e+01
|
||||
-2.4392267349614485e+01 -2.4448542059257761e+01 -2.4504863391234494e+01 -2.4561231208711206e+01 -2.4617645374753693e+01
|
||||
-2.4674105752332935e+01 -2.4730612204329191e+01 -2.4787164593538137e+01 -2.4843762782677913e+01 -2.4900406634392539e+01
|
||||
-2.4957096011252133e+01 -2.5013830775771112e+01 -2.5070610790396586e+01 -2.5127435917366029e+01 -2.5184306019355063e+01
|
||||
-2.5241220958503845e+01 -2.5298180597080318e+01 -2.5355184797285347e+01 -2.5412233421340488e+01 -2.5469326331427965e+01
|
||||
1.0000000000000000e+01 1.0801534951171448e+01 1.0617375158244670e+01 1.0436688151228793e+01 1.0259403283230313e+01
|
||||
1.0085451405601304e+01 9.9147648356938589e+00 9.7472773253084029e+00 9.5829240298195373e+00 9.4216414779654656e+00
|
||||
9.2633675422888473e+00 9.1080414102110012e+00 8.9556035557302494e+00 8.8059957117284853e+00 8.6591608428743143e+00
|
||||
8.5150431191084976e+00 8.3735878897014118e+00 8.2347416578681987e+00 8.0984520559319435e+00 7.9646678210201571e+00
|
||||
7.8333387712866624e+00 7.7044157826449009e+00 7.5778507660022569e+00 7.4535966449878401e+00 7.3316073341564731e+00
|
||||
7.2118377176659578e+00 7.0942436284134374e+00 6.9787818276207929e+00 6.8654099848621115e+00 6.7540866585212882e+00
|
||||
6.6447712766712357e+00 6.5374241183666584e+00 6.4320062953403578e+00 6.3284797340946000e+00 6.2268071583795574e+00
|
||||
6.1269520720505000e+00 6.0288787422946655e+00 5.9325521832211621e+00 5.8379381398054591e+00 5.7450030721804524e+00
|
||||
5.6537141402680220e+00 5.5640391887418730e+00 5.4759467323160322e+00 5.3894059413519244e+00 5.3043866277758980e+00
|
||||
5.2208592313018016e+00 5.1387948059520454e+00 5.0581650068698707e+00 4.9789420774166615e+00 4.9010988365496075e+00
|
||||
4.8246086664712777e+00 4.7494455005478358e+00 4.6755838114879396e+00 4.6029985997776066e+00 4.5316653823665547e+00
|
||||
4.4615601815980312e+00 4.3926595143797726e+00 4.3249403815888456e+00 4.2583802577058805e+00 4.1929570806747449e+00
|
||||
4.1286492419807814e+00 4.0654355769448500e+00 4.0032953552278059e+00 3.9422082715398403e+00 3.8821544365521561e+00
|
||||
3.8231143680053350e+00 3.7650689820101348e+00 3.7079995845373759e+00 3.6518878630917868e+00 3.5967158785670392e+00
|
||||
3.5424660572764992e+00 3.4891211831576925e+00 3.4366643901451397e+00 3.3850791547089756e+00 3.3343492885547761e+00
|
||||
3.2844589314827459e+00 3.2353925444006251e+00 3.1871349024889781e+00 3.1396710885139782e+00 3.0929864862859660e+00
|
||||
3.0470667742591075e+00 3.0018979192706325e+00 2.9574661704151453e+00 2.9137580530522627e+00 2.8707603629438552e+00
|
||||
2.8284601605189152e+00 2.7868447652620318e+00 2.7459017502243626e+00 2.7056189366531243e+00 2.6659843887374848e+00
|
||||
2.6269864084689516e+00 2.5886135306124487e+00 2.5508545177868598e+00 2.5136983556521244e+00 2.4771342482006986e+00
|
||||
2.4411516131510069e+00 2.4057400774406830e+00 2.3708894728175807e+00 2.3365898315265383e+00 2.3028313820887689e+00
|
||||
2.2696045451740474e+00 2.2368999295609058e+00 2.2047083281853901e+00 2.1730207142748128e+00 2.1418282375653348e+00
|
||||
2.1111222206016862e+00 2.0808941551166384e+00 2.0511356984892615e+00 2.0218386702793651e+00 1.9929950488372441e+00
|
||||
1.9645969679867363e+00 1.9366367137799969e+00 1.9091067213223525e+00 1.8819995716660998e+00 1.8553079887710169e+00
|
||||
1.8290248365311754e+00 1.8031431158652609e+00 1.7776559618705363e+00 1.7525566410377422e+00 1.7278385485262007e+00
|
||||
1.7034952054980579e+00 1.6795202565098251e+00 1.6559074669601728e+00 1.6326507205929630e+00 1.6097440170540054e+00
|
||||
1.5871814695006066e+00 1.5649573022624637e+00 1.5430658485530984e+00 1.5215015482308161e+00 1.5002589456071576e+00
|
||||
1.4793326873036463e+00 1.4587175201534635e+00 1.4384082891492156e+00 1.4183999354343300e+00 1.3986874943378140e+00
|
||||
1.3792660934511431e+00 1.3601309507466510e+00 1.3412773727360872e+00 1.3227007526689576e+00 1.3043965687692420e+00
|
||||
1.2863603825102174e+00 1.2685878369261090e+00 1.2510746549598935e+00 1.2338166378466084e+00 1.2168096635312082e+00
|
||||
1.2000496851203266e+00 1.1835327293670588e+00 1.1672548951882362e+00 1.1512123522134416e+00 1.1354013393647548e+00
|
||||
1.1198181634671940e+00 1.1044591978884952e+00 1.0893208812080033e+00 1.0743997159140335e+00 1.0596922671287743e+00
|
||||
1.0451951613605601e+00 1.0309050852825337e+00 1.0168187845373140e+00 1.0029330625671378e+00 9.8924477946872713e-01
|
||||
9.7575085087259694e-01 9.6244824684604424e-01 9.4933399081931213e-01 9.3640515853477169e-01 9.2365887701803118e-01
|
||||
9.1109232357100112e-01 8.9870272478628266e-01 8.8648735558209424e-01 8.7444353825798160e-01 8.6256864157006774e-01
|
||||
8.5086007982605949e-01 8.3931531199913678e-01 8.2793184086057892e-01 8.1670721213066955e-01 8.0563901364725510e-01
|
||||
7.9472487455206675e-01 7.8396246449372953e-01 7.7334949284779597e-01 7.6288370795296245e-01 7.5256289636327622e-01
|
||||
7.4238488211596021e-01 7.3234752601463171e-01 7.2244872492728618e-01 7.1268641109915265e-01 7.0305855147956464e-01
|
||||
6.9356314706317335e-01 6.8419823224459719e-01 6.7496187418651843e-01 6.6585217220099224e-01 6.5686725714346750e-01
|
||||
6.4800529081937697e-01 6.3926446540306614e-01 6.3064300286859520e-01 6.2213915443241774e-01 6.1375120000748140e-01
|
||||
6.0547744766850542e-01 5.9731623312840654e-01 5.8926591922531912e-01 5.8132489542033028e-01 5.7349157730523359e-01
|
||||
5.6576440612064971e-01 5.5814184828379609e-01 5.5062239492602316e-01 5.4320456143964790e-01 5.3588688703414888e-01
|
||||
5.2866793430138515e-01 5.2154628878946241e-01 5.1452055858552015e-01 5.0758937390678227e-01 5.0075138669987496e-01
|
||||
4.9400527024841523e-01 4.8734971878830358e-01 4.8078344713093557e-01 4.7430519029390972e-01 4.6791370313911962e-01
|
||||
4.6160776001828552e-01 4.5538615442535857e-01 4.4924769865602876e-01 4.4319122347399365e-01 4.3721557778390086e-01
|
||||
4.3131962831075654e-01 4.2550225928575891e-01 4.1976237213834899e-01 4.1409888519439697e-01 4.0851073338028954e-01
|
||||
4.0299686793291478e-01 3.9755625611540779e-01 3.9218788093843493e-01 3.8689074088692443e-01 3.8166384965228239e-01
|
||||
3.7650623586976018e-01 3.7141694286095728e-01 3.6639502838144544e-01 3.6143956437320846e-01 3.5654963672189943e-01
|
||||
3.5172434501901328e-01 3.4696280232829579e-01 3.4226413495707497e-01 3.3762748223177219e-01 3.3305199627774762e-01
|
||||
3.2853684180349596e-01 3.2408119588894380e-01 3.1968424777773841e-01 3.1534519867361155e-01 3.1106326154055530e-01
|
||||
3.0683766090688813e-01 3.0266763267296426e-01 2.9855242392259740e-01 2.9449129273803010e-01 2.9048350801842027e-01
|
||||
2.8652834930171167e-01 2.8262510658997009e-01 2.7877308017785829e-01 2.7497158048439907e-01 2.7121992788793392e-01
|
||||
2.6751745256412462e-01 2.6386349432690004e-01 2.6025740247248841e-01 2.5669853562631850e-01 2.5318626159266877e-01
|
||||
2.4971995720718354e-01 2.4629900819206618e-01 2.4292280901402563e-01 2.3959076274464408e-01 2.3630228092351846e-01
|
||||
2.3305678342376535e-01 2.2985369832002167e-01 2.2669246175884616e-01 2.2357251783148069e-01 2.2049331844890929e-01
|
||||
2.1745432321916880e-01 2.1445499932688783e-01 2.1149482141498144e-01 2.0857327146848004e-01 2.0568983870040114e-01
|
||||
2.0284401943976604e-01 2.0003531702142130e-01 1.9726324167804599e-01 1.9452731043391402e-01 1.9182704700056608e-01
|
||||
1.8916198167437770e-01 1.8653165123588344e-01 1.8393559885088084e-01 1.8137337397327791e-01 1.7884453224959973e-01
|
||||
1.7634863542523593e-01 1.7388525125224241e-01 1.7145395339876757e-01 1.6905432136008169e-01 1.6668594037109052e-01
|
||||
1.6434840132036665e-01 1.6204130066570688e-01 1.5976424035106618e-01 1.5751682772493769e-01 1.5529867546015819e-01
|
||||
1.5310940147503249e-01 1.5094862885580707e-01 1.4881598578045718e-01 1.4671110544379484e-01 1.4463362598375351e-01
|
||||
1.4258319040899092e-01 1.4055944652768915e-01 1.3856204687748974e-01 1.3659064865666881e-01 1.3464491365640630e-01
|
||||
1.3272450819420012e-01 1.3082910304837103e-01 1.2895837339364213e-01 1.2711199873781265e-01 1.2528966285941134e-01
|
||||
1.2349105374641756e-01 1.2171586353596986e-01 1.1996378845505173e-01 1.1823452876211782e-01 1.1652778868972380e-01
|
||||
1.1484327638801961e-01 1.1318070386919254e-01 1.1153978695277944e-01 1.0992024521187505e-01 1.0832180192018548e-01
|
||||
1.0674418399992769e-01 1.0518712197055757e-01 1.0365034989832456e-01 1.0213360534659532e-01 1.0063662932698936e-01
|
||||
9.9159166251264974e-02 9.7700963883974534e-02 9.6261773295835962e-02 9.4841348817873428e-02 9.3439447996227276e-02
|
||||
9.2055831547688260e-02 9.0690263315935660e-02 8.9342510228411331e-02 8.8012342253891429e-02 8.6699532360706044e-02
|
||||
8.5403856475584128e-02 8.4125093443141896e-02 8.2863024985984080e-02 8.1617435665412685e-02 8.0388112842733062e-02
|
||||
7.9174846641143493e-02 7.7977429908209661e-02 7.6795658178889781e-02 7.5629329639115728e-02 7.4478245089953710e-02
|
||||
7.3342207912248103e-02 7.2221024031827064e-02 7.1114501885225945e-02 7.0022452385910761e-02 6.8944688890991479e-02
|
||||
6.7881027168450458e-02 6.6831285364849169e-02 6.5795283973477225e-02 6.4772845803028556e-02 6.3763795946680801e-02
|
||||
6.2767961751651669e-02 6.1785172789201148e-02 6.0815260825057393e-02 5.9858059790287577e-02 5.8913405752569759e-02
|
||||
5.7981136887894191e-02 5.7061093452682510e-02 5.6153117756271964e-02 5.5257054133826422e-02 5.4372748919636837e-02
|
||||
5.3500050420772105e-02 5.2638808891131372e-02 5.1788876505864945e-02 5.0950107336147354e-02 5.0122357324306366e-02
|
||||
4.9305484259319243e-02 4.8499347752635869e-02 4.7703809214351578e-02 4.6918731829721727e-02 4.6143980535982010e-02
|
||||
4.5379421999521163e-02 4.4624924593352100e-02 4.3880358374905226e-02 4.3145595064128850e-02 4.2420508021892900e-02
|
||||
4.1704972228691739e-02 4.0998864263647405e-02 4.0302062283785300e-02 3.9614446003616965e-02 3.8935896674993531e-02
|
||||
3.8266297067221844e-02 3.7605531447481688e-02 3.6953485561492139e-02 3.6310046614435487e-02 3.5675103252157392e-02
|
||||
3.5048545542616605e-02 3.4430264957581835e-02 3.3820154354582632e-02 3.3218107959093635e-02 3.2624021346983278e-02
|
||||
3.2037791427166340e-02 3.1459316424514716e-02 3.0888495862994469e-02 3.0325230549015147e-02 2.9769422555015357e-02
|
||||
2.9220975203265720e-02 2.8679793049885216e-02 2.8145781869070463e-02 2.7618848637539717e-02 2.7098901519172047e-02
|
||||
2.6585849849867671e-02 2.6079604122596356e-02 2.5580075972643668e-02 2.5087178163056167e-02 2.4600824570288671e-02
|
||||
2.4120930170012267e-02 2.3647411023137499e-02 2.3180184262011627e-02 2.2719168076792418e-02 2.2264281702001121e-02
|
||||
2.1815445403263078e-02 2.1372580464206647e-02 2.0935609173537761e-02 2.0504454812290795e-02 2.0079041641240414e-02
|
||||
1.9659294888467183e-02 1.9245140737102040e-02 1.8836506313223755e-02 1.8433319673904158e-02 1.8035509795416238e-02
|
||||
1.7643006561603891e-02 1.7255740752380899e-02 1.6873644032391555e-02 1.6496648939823388e-02 1.6124688875347792e-02
|
||||
1.5757698091213634e-02 1.5395611680482646e-02 1.5038365566394485e-02 1.4685896491875350e-02 1.4338142009180710e-02
|
||||
1.3995040469664266e-02 1.3656531013687800e-02 1.3322553560652262e-02 1.2993048799157525e-02 1.2667958177290606e-02
|
||||
1.2347223893038994e-02 1.2030788884814458e-02 1.1718596822117511e-02 1.1410592096299910e-02 1.1106719811460941e-02
|
||||
1.0806925775450060e-02 1.0511156490982998e-02 1.0219359146882878e-02 9.9314816094114855e-03 9.6474724137328716e-03
|
||||
9.3672807554677773e-03 9.0908564823645177e-03 8.8181500860711193e-03 8.5491126940134832e-03 8.2836960613733579e-03
|
||||
8.0218525631707838e-03 7.7635351864465685e-03 7.5086975225370223e-03 7.2572937594544973e-03 7.0092786743605195e-03
|
||||
6.7646076261301813e-03 6.5232365480138998e-03 6.2851219403949887e-03 6.0502208636273869e-03 5.8184909309735300e-03
|
||||
5.5898903016277091e-03 5.3643776738254711e-03 5.1419122780385074e-03 4.9224538702609122e-03 4.7059627253757674e-03
|
||||
4.4923996305976099e-03 4.2817258790122659e-03 4.0739032631877392e-03 3.8688940688609841e-03 3.6666610687164924e-03
|
||||
3.4671675162341598e-03 3.2703771396105918e-03 3.0762541357672313e-03 2.8847631644254856e-03 2.6958693422570179e-03
|
||||
2.5095382371091990e-03 2.3257358623008373e-03 2.1444286709895732e-03 1.9655835506104946e-03 1.7891678173820869e-03
|
||||
1.6151492108847365e-03 1.4434958887007410e-03 1.2741764211267048e-03 1.1071597859496629e-03 9.4241536328815156e-04
|
||||
7.7991293049733956e-04 6.1962265713921827e-04 4.6151510001329887e-04 3.0556119825198014e-04 1.5173226847375876e-04
|
||||
0. 0. 0. 0. 0.
|
||||
0. 5.4383329664155645e-05 9.3944898415945083e-04 4.3251847212615047e-03 1.2334244035325348e-02
|
||||
2.7137722173468548e-02 5.0697119791449641e-02 8.4607638668976470e-02 1.3001641279549414e-01 1.8759487452762702e-01
|
||||
2.5754900895683441e-01 3.3965493779430744e-01 4.3331024634064264e-01 5.3759384878832961e-01 6.5132908316254046e-01
|
||||
7.7314622535699939e-01 9.0154178511424377e-01 1.0349328562818201e+00 1.1717054897399350e+00 1.3102565818166738e+00
|
||||
1.4490291582473986e+00 1.5865412121263560e+00 1.7214084470448441e+00 1.8523614026473965e+00 1.9782575145276269e+00
|
||||
2.0980886961566938e+00 2.2109850373516764e+00 2.3162151996095730e+00 2.4131840597491703e+00 2.5014281146549706e+00
|
||||
2.5806091153285706e+00 2.6505063508648590e+00 2.7110079545661563e+00 2.7621015568249447e+00 2.8038645637913220e+00
|
||||
2.8364542979766156e+00 2.8600981973448825e+00 2.8750842333755031e+00 2.8817516761559574e+00 2.8804823057701157e+00
|
||||
2.8716921439699092e+00 2.8558237581894161e+00 2.8333391711552594e+00 2.8047133934346959e+00 2.7704285829676252e+00
|
||||
2.7309688247181469e+00 2.6868155147671331e+00 2.6384433262347358e+00 2.5863167291097398e+00 2.5308870321738226e+00
|
||||
2.4725899125317596e+00 2.4118433966060167e+00 2.3490462556752334e+00 2.2845767789603002e+00 2.2187918877813502e+00
|
||||
2.1520265552815943e+00 2.0845934975626363e+00 2.0167831036919637e+00 1.9488635738636404e+00 1.8810812369508270e+00
|
||||
1.8136610207193371e+00 1.7468070500507196e+00 1.6807033505858371e+00 1.6155146372447149e+00 1.5513871690559142e+00
|
||||
1.4884496536383409e+00 1.4268141864958608e+00 1.3665772120042590e+00 1.3078204945836447e+00 1.2506120900523854e+00
|
||||
1.1950073085502879e+00 1.1410496616995687e+00 1.0887717878420631e+00 1.0381963502565981e+00 9.8933690422003551e-01
|
||||
9.4219872964247031e-01 8.9677962677415124e-01 8.5307067316958651e-01 8.1105694069385592e-01 7.7071817188505065e-01
|
||||
7.3202941544290212e-01 6.9496162100761794e-01 6.5948219372701189e-01 6.2555550939233484e-01 5.9314339115629977e-01
|
||||
5.6220554903693554e-01 5.3269998356387660e-01 5.0458335504023211e-01 4.7781131998032222e-01 4.5233883634534777e-01
|
||||
4.2812043923464138e-01 4.0511048870905242e-01 3.8326339142174781e-01 3.6253379771729577e-01 3.4287677583286325e-01
|
||||
3.2424796479760154e-01 3.0660370758054967e-01 2.8990116598452254e-01 2.7409841872609064e-01 2.5915454407883409e-01
|
||||
2.4502968839369110e-01 2.3168512174254197e-01 2.1908328186436687e-01 2.0718780752542632e-01 1.9596356233750800e-01
|
||||
1.8537665001230508e-01 1.7539442196444632e-01 1.6598547811304609e-01 1.5711966166996927e-01 1.4876804864444715e-01
|
||||
1.4090293273673637e-01 1.3349780623990259e-01 1.2652733751724909e-01 1.1996734557434463e-01 1.1379477219856060e-01
|
||||
1.0798765209582406e-01 1.0252508141368288e-01 9.7387185001678311e-02 9.2555082724584015e-02 8.8010855111109620e-02
|
||||
8.3737508589961873e-02 7.9718940536826377e-02 7.5939904329596963e-02 7.2385974585237101e-02 6.9043512729294765e-02
|
||||
6.5899633029043336e-02 6.2942169202580001e-02 6.0159641699440547e-02 5.7541225732930634e-02 5.5076720130546430e-02
|
||||
5.2756517056398833e-02 5.0571572648238083e-02 4.8513378601664936e-02 4.6573934725081756e-02 4.4745722480991068e-02
|
||||
4.3021679522073253e-02 4.1395175224364866e-02 3.9859987214311721e-02 3.8410278881708670e-02 3.7040577866510604e-02
|
||||
3.5745755503880039e-02 3.4521007208912380e-02 3.3361833779917971e-02 3.2264023597108116e-02 3.1223635691821294e-02
|
||||
3.0236983660070216e-02 2.9300620393215571e-02 2.8411323597772320e-02 2.7566082075896281e-02 2.6762082737777249e-02
|
||||
2.5996698317105604e-02 2.5267475760840985e-02 2.4572125264713973e-02 2.3908509926274246e-02 2.3274635987705516e-02
|
||||
2.2668643641204911e-02 2.2088798370316409e-02 2.1533482801290083e-02 2.1001189039288493e-02 2.0490511464994254e-02
|
||||
2.0000139967999431e-02 1.9528853594166895e-02 1.9075514584991349e-02 1.8639062787818239e-02 1.8218510416650235e-02
|
||||
1.7812937144080498e-02 1.7421485505751177e-02 1.7043356599549031e-02 1.6677806062561751e-02 1.6324140309613155e-02
|
||||
1.5981713017976018e-02 1.5649921843605585e-02 1.5328205354974755e-02 1.5016040171312250e-02 1.4712938292708366e-02
|
||||
1.4418444610242331e-02 1.4132134584901757e-02 1.3853612084676337e-02 1.3582507369821917e-02 1.3318475216818060e-02
|
||||
1.3061193172097418e-02 1.2810359927147186e-02 1.2565693807050415e-02 1.2326931365025051e-02 1.2093826075940506e-02
|
||||
1.1866147122233661e-02 1.1643678266026136e-02 1.1426216801644407e-02 1.1213572583084475e-02 1.1005567121320226e-02
|
||||
1.0802032746662471e-02 1.0602811831688208e-02 1.0407756070544782e-02 1.0216725810699157e-02 1.0029589433467268e-02
|
||||
9.8462227798860602e-03 9.6665086187306404e-03 9.4903361536790021e-03 9.3176005668363371e-03 9.1482025960089031e-03
|
||||
8.9820481433065535e-03 8.8190479128032462e-03 8.6591170751522117e-03 8.5021749571883021e-03 8.3481447546937537e-03
|
||||
8.1969532666261724e-03 8.0485306492223962e-03 7.9028101885199598e-03 7.7597280899136256e-03 7.6192232834934315e-03
|
||||
7.4812372439735375e-03 7.3457138241272979e-03 7.2125991007052359e-03 7.0818412319012813e-03 6.9533903254870300e-03
|
||||
6.8271983168139705e-03 6.7032188559211503e-03 6.5814072030662141e-03 6.4617201320263939e-03 6.3441158405819764e-03
|
||||
6.2285538676237207e-03 6.1149950163802147e-03 6.0034012832899109e-03 5.8937357920846312e-03 5.7859627326801166e-03
|
||||
5.6800473044990030e-03 5.5759556638887986e-03 5.4736548753111791e-03 5.3731128660109428e-03 5.2742983838981461e-03
|
||||
5.1771809583849582e-03 5.0817308639591330e-03 4.9879190862693046e-03 4.8957172905357560e-03 4.8050977921015592e-03
|
||||
4.7160335289582467e-03 4.6284980360953021e-03 4.5424654215287241e-03 4.4579103438822931e-03 4.3748079913988880e-03
|
||||
4.2931340622749670e-03 4.2128647462132407e-03 4.1339767071033873e-03 4.0564470667446839e-03 3.9802533895282599e-03
|
||||
3.9053736680121076e-03 3.8317863093158128e-03 3.7594701222811860e-03 3.6884043053326127e-03 3.6185684349951674e-03
|
||||
3.5499424550168301e-03 3.4825066660512660e-03 3.4162417158645347e-03 3.3511285900229004e-03 3.2871486030347646e-03
|
||||
3.2242833899080170e-03 3.1625148980992668e-03 3.1018253798278661e-03 3.0421973847258310e-03 2.9836137528083811e-03
|
||||
2.9260576077371064e-03 2.8695123503632708e-03 2.8139616525287708e-03 2.7593894511106498e-03 2.7057799422959966e-03
|
||||
2.6531175760685227e-03 2.6013870509009052e-03 2.5505733086344240e-03 2.5006615295404683e-03 2.4516371275501436e-03
|
||||
2.4034857456453340e-03 2.3561932514012535e-03 2.3097457326723414e-03 2.2641294934160616e-03 2.2193310496436136e-03
|
||||
2.1753371254977782e-03 2.1321346494441173e-03 2.0897107505768314e-03 2.0480527550303662e-03 2.0071481824917164e-03
|
||||
1.9669847428123305e-03 1.9275503327108034e-03 1.8888330325659355e-03 1.8508211032951805e-03 1.8135029833145980e-03
|
||||
1.7768672855772646e-03 1.7409027946878666e-03 1.7055984640891586e-03 1.6709434133182904e-03 1.6369269253308227e-03
|
||||
1.6035384438881917e-03 1.5707675710093030e-03 1.5386040644797400e-03 1.5070378354209296e-03 1.4760589459142243e-03
|
||||
1.4456576066784674e-03 1.4158241748004133e-03 1.3865491515145517e-03 1.3578231800324136e-03 1.3296370434173130e-03
|
||||
1.3019816625059188e-03 1.2748480938728074e-03 1.2482275278369870e-03 1.2221112865106742e-03 1.1964908218862064e-03
|
||||
1.1713577139624703e-03 1.1467036689077198e-03 1.1225205172586891e-03 1.0988002121543120e-03 1.0755348276031765e-03
|
||||
1.0527165567835728e-03 1.0303377103750150e-03 1.0083907149206553e-03 9.8686811121878604e-04 9.6576255274356815e-04
|
||||
9.4506680409354657e-04 9.2477373946662708e-04 9.0487634116191706e-04 8.8536769810608137e-04 8.6624100440530968e-04
|
||||
8.4748955791986991e-04 8.2910675886310736e-04 8.1108610842155551e-04 7.9342120739794852e-04 7.7610575487466887e-04
|
||||
7.5913354689786591e-04 7.4249847518158968e-04 7.2619452583109687e-04 7.1021577808524222e-04 6.9455640307671332e-04
|
||||
6.7921066261025093e-04 6.6417290795844214e-04 6.4943757867335500e-04 6.3499920141575628e-04 6.2085238879914031e-04
|
||||
6.0699183824991856e-04 5.9341233088238896e-04 5.8010873038847818e-04 5.6707598194186137e-04 5.5430911111587280e-04
|
||||
5.4180322281523891e-04 5.2955350022104025e-04 5.1755520374872563e-04 5.0580367001857793e-04 4.9429431083891986e-04
|
||||
4.8302261220136561e-04 4.7198413328763435e-04 4.6117450548847222e-04 4.5058943143359842e-04 4.4022468403297037e-04
|
||||
4.3007610552883886e-04 4.2013960655883260e-04 4.1041116522908330e-04 4.0088682619821882e-04 3.9156269977118005e-04
|
||||
3.8243496100300207e-04 3.7349984881274514e-04 3.6475366510662147e-04 3.5619277391102898e-04 3.4781360051482253e-04
|
||||
3.3961263062063513e-04 3.3158640950565685e-04 3.2373154119109092e-04 3.1604468762060252e-04 3.0852256784754707e-04
|
||||
3.0116195723081836e-04 2.9395968663908575e-04 2.8691264166377101e-04 2.8001776184017647e-04 2.7327203987681688e-04
|
||||
2.6667252089326854e-04 2.6021630166557681e-04 2.5390052988028163e-04 2.4772240339593181e-04 2.4167916951265550e-04
|
||||
2.3576812424967210e-04 2.2998661163024531e-04 2.2433202297460642e-04 2.1880179620031078e-04 2.1339341513026532e-04
|
||||
2.0810440880823181e-04 2.0293235082175821e-04 1.9787485863260665e-04 1.9292959291436311e-04 1.8809425689761319e-04
|
||||
1.8336659572205580e-04 1.7874439579616125e-04 1.7422548416372047e-04 1.6980772787763936e-04 1.6548903338088530e-04
|
||||
1.6126734589430591e-04 1.5714064881157744e-04 1.5310696310104604e-04 1.4916434671449329e-04 1.4531089400280153e-04
|
||||
1.4154473513841234e-04 1.3786403554466153e-04 1.3426699533172857e-04 1.3075184873951283e-04 1.2731686358694039e-04
|
||||
1.2396034072819674e-04 1.2068061351527565e-04 1.1747604726729168e-04 1.1434503874632306e-04 1.1128601563955686e-04
|
||||
1.0829743604811193e-04 1.0537778798212988e-04 1.0252558886227753e-04 9.9739385027582898e-05 9.7017751249615057e-05
|
||||
9.4359290252773662e-05 9.1762632240957511e-05 8.9226434430383569e-05 8.6749380588361721e-05 8.4330180578390864e-05
|
||||
8.1967569911181246e-05 7.9660309301724484e-05 7.7407184232279429e-05 7.5207004521348451e-05 7.3058603898526649e-05
|
||||
7.0960839585107720e-05 6.8912591880629977e-05 6.6912763755002085e-05 6.4960280446513426e-05 6.3054089065330086e-05
|
||||
6.1193158202771814e-05 5.9376477546041213e-05 5.7603057498502742e-05 5.5871928805544500e-05 5.4182142185708361e-05
|
||||
5.2532767967318744e-05 5.0922895730446966e-05 4.9351633954125953e-05 4.7818109668823321e-05 4.6321468114150300e-05
|
||||
4.4860872401664663e-05 4.3435503182825573e-05 4.2044558321957873e-05 4.0687252574273750e-05 3.9362817268785450e-05
|
||||
3.8070499996214428e-05 3.6809564301621984e-05 3.5579289382025496e-05 3.4378969788611451e-05 3.3207915133769052e-05
|
||||
3.2065449802711312e-05 3.0950912669766876e-05 2.9863656819185611e-05 2.8803049270468119e-05 2.7768470708167169e-05
|
||||
2.6759315216115260e-05 2.5774990015931323e-05 2.4814915209964844e-05 2.3878523528387922e-05 2.2965260080560611e-05
|
||||
2.2074582110528148e-05 2.1205958756658535e-05 2.0358870815317476e-05 1.9532810508535560e-05 1.8727281255713447e-05
|
||||
1.7941797449145505e-05 1.7175884233475961e-05 1.6429077288930018e-05 1.5700922618341645e-05 1.4990976337865471e-05
|
||||
1.4298804471386687e-05 1.3623982748522034e-05 1.2966096406226424e-05 1.2324739993882115e-05 1.1699517181902770e-05
|
||||
1.1090040573734860e-05 1.0495931521266495e-05 9.9168199435395021e-06 9.3523441487842465e-06 8.8021506596591475e-06
|
||||
8.2658940417265321e-06 7.7432367350197678e-06 7.2338488887770244e-06 6.7374081991923703e-06 6.2535997501888662e-06
|
||||
5.7821158571569505e-06 5.3226559136389283e-06 4.8749262408651290e-06 4.4386399401326240e-06 4.0135167480073166e-06
|
||||
3.5992828942305738e-06 3.1956709623667747e-06 2.8024197531120341e-06 2.4192741502208947e-06 2.0459849890155880e-06
|
||||
1.6823089274468580e-06 1.3280083196495871e-06 9.8285109196557868e-07 6.4661062138351467e-07 3.1906561636122974e-07
|
||||
0. 0. 0. 0. 0.
|
||||
|
||||
@@ -1,303 +0,0 @@
|
||||
Cu functions (universal 4) - JB Adams et al J. Mater. Res., 4(1), 102 (1989)
|
||||
29 63.550 3.6150 FCC
|
||||
500 5.0100200400801306e-04 500 1.0000000000000009e-02 4.9499999999999886e+00
|
||||
0. -3.1589719908208558e-01 -5.2405175291223927e-01 -6.9885553834123115e-01 -8.5420409172727574e-01
|
||||
-9.9627285782417374e-01 -1.1284756487892835e+00 -1.2529454148045645e+00 -1.3711252149943363e+00 -1.4840478277127076e+00
|
||||
-1.5924840805403662e+00 -1.6954285804552853e+00 -1.7942937845174001e+00 -1.8901318213864968e+00 -1.9832501645057476e+00
|
||||
-2.0739063371790252e+00 -2.1623187777983759e+00 -2.2486748239473968e+00 -2.3331367007241965e+00 -2.4158460932269890e+00
|
||||
-2.4969276936450484e+00 -2.5764919917168783e+00 -2.6546374977553171e+00 -2.7314525337618534e+00 -2.8070166913917660e+00
|
||||
-2.8814020298474361e+00 -2.9546740684873072e+00 -3.0268926157701515e+00 -3.0981124665488835e+00 -3.1683839923973949e+00
|
||||
-3.2377536446699082e+00 -3.3062643854300688e+00 -3.3739560586949437e+00 -3.4408657118035535e+00 -3.5070278749074930e+00
|
||||
-3.5724748051301987e+00 -3.6372367006631805e+00 -3.7013418893374563e+00 -3.7648169952241943e+00 -3.8276870863304424e+00
|
||||
-3.8899758061050136e+00 -3.9517054906525857e+00 -4.0128972737054482e+00 -4.0735711808432313e+00 -4.1324020819066334e+00
|
||||
-4.1903921077370399e+00 -4.2479051536742531e+00 -4.3049572584920952e+00 -4.3615637243846948e+00 -4.4177391662932166e+00
|
||||
-4.4734975570518145e+00 -4.5288522686579995e+00 -4.5838161101054880e+00 -4.6384013621277234e+00 -4.6926198091528875e+00
|
||||
-4.7464827687459206e+00 -4.8000011187718314e+00 -4.8531853225280486e+00 -4.9060454519053280e+00 -4.9585912090057604e+00
|
||||
-5.0108319460781274e+00 -5.0627766840981678e+00 -5.1144341300432075e+00 -5.1658126929883963e+00 -5.2169204991179470e+00
|
||||
-5.2677654057399081e+00 -5.3183550143895957e+00 -5.3686966830900360e+00 -5.4187975378393958e+00 -5.4686644833797118e+00
|
||||
-5.5183042133078573e+00 -5.5677232195759814e+00 -5.6169278014231168e+00 -5.6659240737851633e+00 -5.7147179752168427e+00
|
||||
-5.7633152753617765e+00 -5.8117215820040258e+00 -5.8599423477251662e+00 -5.9079828761981332e+00 -5.9558483281427925e+00
|
||||
-6.0035437269616239e+00 -6.0510739641062798e+00 -6.0984438040355258e+00 -6.1456578891786364e+00 -6.1927207443901580e+00
|
||||
-6.2396367812953599e+00 -6.2864103024253382e+00 -6.3330455051271599e+00 -6.3795464852936732e+00 -6.4259172409138614e+00
|
||||
-6.4721616754587501e+00 -6.5182836011053098e+00 -6.5642867418169999e+00 -6.6101747362826870e+00 -6.6559511407215268e+00
|
||||
-6.7016194315664848e+00 -6.7471830080289408e+00 -6.7926321787266488e+00 -6.8376565597087335e+00 -6.8825828851093718e+00
|
||||
-6.9274142262937062e+00 -6.9721535889762833e+00 -7.0168039151812138e+00 -7.0613680851242009e+00 -7.1058489190182854e+00
|
||||
-7.1502491788078260e+00 -7.1945715698338404e+00 -7.2388187424386103e+00 -7.2829932935017325e+00 -7.3270977679243288e+00
|
||||
-7.3711346600521210e+00 -7.4151064150498769e+00 -7.4590154302223652e+00 -7.5028640562861142e+00 -7.5466545985988489e+00
|
||||
-7.5903893183397599e+00 -7.6340704336532212e+00 -7.6777001207475166e+00 -7.7212805149568453e+00 -7.7648137117691078e+00
|
||||
-7.8083017678126225e+00 -7.8517467018166371e+00 -7.8951504955327323e+00 -7.9385150946301337e+00 -7.9818424095582259e+00
|
||||
-8.0251343163853335e+00 -8.0683926576019758e+00 -8.1116192429086027e+00 -8.1548158499697934e+00 -8.1979842251487867e+00
|
||||
-8.2411260842174556e+00 -8.2842431130446244e+00 -8.3273369682627845e+00 -8.3704092779143480e+00 -8.4134616420805628e+00
|
||||
-8.4564956334858721e+00 -8.4995127980902225e+00 -8.5425146556591471e+00 -8.5855027003218538e+00 -8.6284784011075430e+00
|
||||
-8.6714432024708685e+00 -8.7143985247980709e+00 -8.7573457649043576e+00 -8.8002862965079203e+00 -8.8432214707042931e+00
|
||||
-8.8861526164113798e+00 -8.9290810408153902e+00 -8.9720080297988716e+00 -9.0149348483554377e+00 -9.0578627409975070e+00
|
||||
-9.1007929321486927e+00 -9.1437266265307358e+00 -9.1866650095359432e+00 -9.2296092475897353e+00 -9.2725604885091570e+00
|
||||
-9.3155198618426311e+00 -9.3584884792123262e+00 -9.4014674346366292e+00 -9.4444578048540961e+00 -9.4874606496305205e+00
|
||||
-9.5304770120671378e+00 -9.5735079188909253e+00 -9.6165543807549625e+00 -9.6596173924971254e+00 -9.7026979334374914e+00
|
||||
-9.7457969676344760e+00 -9.7889154441094774e+00 -9.8320542972711564e+00 -9.8749024758928954e+00 -9.9176676449848173e+00
|
||||
-9.9604518241702067e+00 -1.0003255855809869e+01 -1.0046080568707225e+01 -1.0088926778514065e+01 -1.0131795287896693e+01
|
||||
-1.0174686886752681e+01 -1.0217602352416293e+01 -1.0260542449857894e+01 -1.0303507931886486e+01 -1.0346499539340471e+01
|
||||
-1.0389518001277565e+01 -1.0432564035159942e+01 -1.0475638347037830e+01 -1.0518741631724708e+01 -1.0561874572971817e+01
|
||||
-1.0605037843637035e+01 -1.0648232105854731e+01 -1.0691458011195323e+01 -1.0734716200826654e+01 -1.0778007305671338e+01
|
||||
-1.0821331946557279e+01 -1.0864690734371720e+01 -1.0908084270204256e+01 -1.0951513145493493e+01 -1.0994977942169044e+01
|
||||
-1.1038479232788461e+01 -1.1082017580672300e+01 -1.1125593540039802e+01 -1.1169207656138724e+01 -1.1212860465371193e+01
|
||||
-1.1256552495422056e+01 -1.1300284265381379e+01 -1.1344056285864497e+01 -1.1387869059131503e+01 -1.1431723079201220e+01
|
||||
-1.1475618831971758e+01 -1.1519556795323240e+01 -1.1563537439236882e+01 -1.1607561225897086e+01 -1.1651628609799957e+01
|
||||
-1.1695740037856638e+01 -1.1739895949496542e+01 -1.1784096776765693e+01 -1.1828342944444898e+01 -1.1872634870038326e+01
|
||||
-1.1916972964140655e+01 -1.1961357630185660e+01 -1.2005789264757027e+01 -1.2050268257623998e+01 -1.2094794991829531e+01
|
||||
-1.2139369843773920e+01 -1.2183993183309894e+01 -1.2228665373815033e+01 -1.2273386772283743e+01 -1.2318157729378811e+01
|
||||
-1.2362978589648264e+01 -1.2407849691329261e+01 -1.2452771366701370e+01 -1.2497743942026659e+01 -1.2542767737650308e+01
|
||||
-1.2587843068072686e+01 -1.2632970242029501e+01 -1.2678149562554552e+01 -1.2723381327055449e+01 -1.2768665827383359e+01
|
||||
-1.2814003349896723e+01 -1.2859394175532202e+01 -1.2904838579871580e+01 -1.2950336833201561e+01 -1.2995889200586475e+01
|
||||
-1.3041495941923358e+01 -1.3087157312007946e+01 -1.3132873560597147e+01 -1.3178644932465090e+01 -1.3224471667467185e+01
|
||||
-1.3270354000597138e+01 -1.3316292162042373e+01 -1.3362286377246335e+01 -1.3408336866957768e+01 -1.3454443847292225e+01
|
||||
-1.3500607529781746e+01 -1.3546828121432384e+01 -1.3593105824775080e+01 -1.3639440837916936e+01 -1.3685833354596070e+01
|
||||
-1.3732283564231011e+01 -1.3778791651965662e+01 -1.3825357798727850e+01 -1.3871982181271051e+01 -1.3918664972226225e+01
|
||||
-1.3965406340150423e+01 -1.4012206449566122e+01 -1.4059065461010562e+01 -1.4105983531084178e+01 -1.4152960812497383e+01
|
||||
-1.4199997454109450e+01 -1.4247093600900882e+01 -1.4294249394311976e+01 -1.4341464971842868e+01 -1.4388740467389482e+01
|
||||
-1.4436076011212833e+01 -1.4483471729977452e+01 -1.4530927746798284e+01 -1.4578444181270356e+01 -1.4626021149517328e+01
|
||||
-1.4673658764226502e+01 -1.4721357134688901e+01 -1.4769116366835874e+01 -1.4816936563275590e+01 -1.4864817823335784e+01
|
||||
-1.4912760243093658e+01 -1.4960763915414873e+01 -1.5008828929991182e+01 -1.5056955373373171e+01 -1.5105143329006637e+01
|
||||
-1.5153392877265333e+01 -1.5201704095489163e+01 -1.5250077058012721e+01 -1.5298511836202465e+01 -1.5347008498487980e+01
|
||||
-1.5395567110395177e+01 -1.5444187734579373e+01 -1.5492870430854509e+01 -1.5541615256228738e+01 -1.5590422264928975e+01
|
||||
-1.5639291508440465e+01 -1.5688223035530768e+01 -1.5737216892280117e+01 -1.5786273122116540e+01 -1.5835391765839859e+01
|
||||
-1.5884572861650554e+01 -1.5933816445184789e+01 -1.5983122549535665e+01 -1.6032491205288238e+01 -1.6081922440538847e+01
|
||||
-1.6131416280932740e+01 -1.6180972749683292e+01 -1.6230591867602584e+01 -1.6280273653129598e+01 -1.6330018122352271e+01
|
||||
-1.6379825289037512e+01 -1.6429695164657005e+01 -1.6479627758410516e+01 -1.6529623077253063e+01 -1.6579681125920047e+01
|
||||
-1.6629801906948160e+01 -1.6679985420709272e+01 -1.6730231665424185e+01 -1.6780540637196850e+01 -1.6830912330026536e+01
|
||||
-1.6881346735842499e+01 -1.6931843844523655e+01 -1.6982403643917451e+01 -1.7033026119869078e+01 -1.7083711256241486e+01
|
||||
-1.7134459034938232e+01 -1.7185269435924170e+01 -1.7236142437252170e+01 -1.7287078015076759e+01 -1.7338076143685498e+01
|
||||
-1.7389136795514560e+01 -1.7440259941169757e+01 -1.7491445549452351e+01 -1.7542693587371559e+01 -1.7594004020176158e+01
|
||||
-1.7645376811364258e+01 -1.7696811922712527e+01 -1.7748309314288690e+01 -1.7799868944476657e+01 -1.7851490769996190e+01
|
||||
-1.7903174745917113e+01 -1.7954920825684781e+01 -1.8006728961136105e+01 -1.8058599102520361e+01 -1.8110531198513968e+01
|
||||
-1.8162525196246406e+01 -1.8214581041310112e+01 -1.8266698677789350e+01 -1.8318878048276588e+01 -1.8371119093861239e+01
|
||||
-1.8423421754189917e+01 -1.8475785967356501e+01 -1.8528211670354381e+01 -1.8580698798442995e+01 -1.8633247285599964e+01
|
||||
-1.8685857064432412e+01 -1.8738528066186518e+01 -1.8791260220768891e+01 -1.8844053456762026e+01 -1.8896907701446821e+01
|
||||
-1.8949822880805868e+01 -1.9002798919552447e+01 -1.9055835741138708e+01 -1.9108933267775342e+01 -1.9162091420446473e+01
|
||||
-1.9215310118921138e+01 -1.9268589281777849e+01 -1.9321928826411295e+01 -1.9375328669053715e+01 -1.9428788724780020e+01
|
||||
-1.9482308907539277e+01 -1.9535889130155283e+01 -1.9589529304345319e+01 -1.9643229340734365e+01 -1.9696989148876355e+01
|
||||
-1.9750808637254977e+01 -1.9804687713312433e+01 -1.9858626283450008e+01 -1.9912624253055810e+01 -1.9966681526506250e+01
|
||||
-2.0020798007185476e+01 -2.0074973597498911e+01 -2.0129208198888136e+01 -2.0183501711838630e+01 -2.0237854035899204e+01
|
||||
-2.0292265069692348e+01 -2.0346734710925716e+01 -2.0401262856409858e+01 -2.0455849402064473e+01 -2.0510494242935920e+01
|
||||
-2.0565197273209719e+01 -2.0619958386219423e+01 -2.0674777474463212e+01 -2.0729654429611969e+01 -2.0784589142526556e+01
|
||||
-2.0839581503263844e+01 -2.0894631401093307e+01 -2.0949738724508393e+01 -2.1004903361235051e+01 -2.1060125198247988e+01
|
||||
-2.1115404121775100e+01 -2.1170740017318053e+01 -2.1226132769657397e+01 -2.1281582262894972e+01 -2.1337088380382852e+01
|
||||
-2.1392651004661843e+01 -2.1448270018015251e+01 -2.1503945301662043e+01 -2.1559676736299366e+01 -2.1615464201984196e+01
|
||||
-2.1671307578140954e+01 -2.1727206743568217e+01 -2.1783161576455427e+01 -2.1839171954393919e+01 -2.1895237754379082e+01
|
||||
-2.1951358852829799e+01 -2.2007535125591289e+01 -2.2063766447950343e+01 -2.2120052694642595e+01 -2.2176393739860259e+01
|
||||
-2.2232789457272474e+01 -2.2289239719961643e+01 -2.2345744400729018e+01 -2.2402303371517178e+01 -2.2458916504038370e+01
|
||||
-2.2515583669430725e+01 -2.2572304738325670e+01 -2.2629079581086330e+01 -2.2685908067306855e+01 -2.2742790066346515e+01
|
||||
-2.2799725447076526e+01 -2.2856714077931429e+01 -2.2913755826927172e+01 -2.2970850561669295e+01 -2.3027998149359064e+01
|
||||
-2.3085198456800299e+01 -2.3142451350411534e+01 -2.3199756696229770e+01 -2.3257114359926845e+01 -2.3314524206806482e+01
|
||||
-2.3371986101824632e+01 -2.3429499909581864e+01 -2.3487065494349963e+01 -2.3544682720213359e+01 -2.3602351450489550e+01
|
||||
-2.3660071548650194e+01 -2.3717842877714475e+01 -2.3775665300345281e+01 -2.3833538678952209e+01 -2.3891462875653588e+01
|
||||
-2.3949437752298763e+01 -2.4007463170460369e+01 -2.4065538991453877e+01 -2.4123665076338057e+01 -2.4181841285923610e+01
|
||||
-2.4240067480782272e+01 -2.4298343521253173e+01 -2.4356669267446705e+01 -2.4415044579252253e+01 -2.4473469316351384e+01
|
||||
-2.4531943338216706e+01 -2.4590466504118467e+01 -2.4649038673143195e+01 -2.4707659704179378e+01 -2.4766329455946106e+01
|
||||
-2.4825047786983760e+01 -2.4883814555664912e+01 -2.4942629620207981e+01 -2.5001492838670174e+01 -2.5060404068966136e+01
|
||||
-2.5119363168864538e+01 -2.5178369996001948e+01 -2.5237424407882145e+01 -2.5296526261886129e+01 -2.5355675415276437e+01
|
||||
-2.5414871725205558e+01 -2.5474115048716612e+01 -2.5533405242759045e+01 -2.5592742164177480e+01 -2.5652125669732186e+01
|
||||
-2.5711555616102487e+01 -2.5771031859886762e+01 -2.5830554257610174e+01 -2.5890122665731269e+01 -2.5949736940650155e+01
|
||||
-2.6009396938703958e+01 -2.6069102516181829e+01 -2.6128853529326307e+01 -2.6188649834339685e+01 -2.6248491287389243e+01
|
||||
-2.6308377744605878e+01 -2.6368309062100934e+01 -2.6428285095962565e+01 -2.6488305702088610e+01 -2.6548370736885317e+01
|
||||
-2.6608480056235521e+01 -2.6668633516188947e+01 -2.6728830972768947e+01 -2.6789072282060033e+01 -2.6849357300140582e+01
|
||||
1.0000000000000000e+01 1.0801250630455797e+01 1.0617301586939504e+01 1.0436833263885262e+01 1.0259774441010791e+01
|
||||
1.0086055417347552e+01 9.9156079783837754e+00 9.7483653639170598e+00 9.5842622365983630e+00 9.4232346511569745e+00
|
||||
9.2652200242883396e+00 9.1101571051919450e+00 8.9579859467472716e+00 8.8086478773091699e+00 8.6620854731154395e+00
|
||||
8.5182425312888768e+00 8.3770640434238430e+00 8.2384961697429731e+00 8.1024862138128810e+00 7.9689825978078090e+00
|
||||
7.8379348383064951e+00 7.7092935226140753e+00 7.5830102855955488e+00 7.4590377870116811e+00 7.3373296893445286e+00
|
||||
7.2178406361032330e+00 7.1005262306008490e+00 6.9853430151894997e+00 6.8722484509459889e+00 6.7612008977976643e+00
|
||||
6.6521595950793255e+00 6.5450846425111706e+00 6.4399369815891703e+00 6.3366783773799114e+00 6.2352714007088537e+00
|
||||
6.1356794107365431e+00 6.0378665379119241e+00 5.9417976672963562e+00 5.8474384222482740e+00 5.7547551484639143e+00
|
||||
5.6637148983634233e+00 5.5742854158160355e+00 5.4864351211977294e+00 5.4001330967728620e+00 5.3153490723937580e+00
|
||||
5.2320534115112594e+00 5.1502170974889339e+00 5.0698117202151138e+00 4.9908094630057178e+00 4.9131830897922271e+00
|
||||
4.8369059325884791e+00 4.7619518792290876e+00 4.6882953613761629e+00 4.6159113427851821e+00 4.5447753078280471e+00
|
||||
4.4748632502644341e+00 4.4061516622586225e+00 4.3386175236344116e+00 4.2722382913651700e+00 4.2069918892916007e+00
|
||||
4.1428566980642358e+00 4.0798115453044659e+00 4.0178356959802954e+00 3.9569088429914245e+00 3.8970110979596200e+00
|
||||
3.8381229822198435e+00 3.7802254180071628e+00 3.7232997198366320e+00 3.6673275860703995e+00 3.6122910906684780e+00
|
||||
3.5581726751199767e+00 3.5049551405497539e+00 3.4526216399971048e+00 3.4011556708634600e+00 3.3505410675235936e+00
|
||||
3.3007619940993322e+00 3.2518029373895416e+00 3.2036486999552665e+00 3.1562843933552358e+00 3.1096954315288059e+00
|
||||
3.0638675243237259e+00 3.0187866711643210e+00 2.9744391548584872e+00 2.9308115355394193e+00 2.8878906447394712e+00
|
||||
2.8456635795935767e+00 2.8041176971686212e+00 2.7632406089169876e+00 2.7230201752508236e+00 2.6834445002347138e+00
|
||||
2.6445019263941703e+00 2.6061810296373835e+00 2.5684706142875200e+00 2.5313597082236612e+00 2.4948375581276281e+00
|
||||
2.4588936248343458e+00 2.4235175787838017e+00 2.3886992955719677e+00 2.3544288515990814e+00 2.3206965198123726e+00
|
||||
2.2874927655419270e+00 2.2548082424273730e+00 2.2226337884330718e+00 2.1909604219504502e+00 2.1597793379851566e+00
|
||||
2.1290819044273803e+00 2.0988596584032564e+00 2.0691043027062364e+00 2.0398077023055379e+00 2.0109618809312195e+00
|
||||
1.9825590177335428e+00 1.9545914440145751e+00 1.9270516400317490e+00 1.8999322318704799e+00 1.8732259883849522e+00
|
||||
1.8469258182056052e+00 1.8210247668116040e+00 1.7955160136671395e+00 1.7703928694199718e+00 1.7456487731607453e+00
|
||||
1.7212772897420763e+00 1.6972721071559391e+00 1.6736270339679251e+00 1.6503359968072218e+00 1.6273930379113750e+00
|
||||
1.6047923127241077e+00 1.5825280875452847e+00 1.5605947372321296e+00 1.5389867429502644e+00 1.5176986899731588e+00
|
||||
1.4967252655299603e+00 1.4760612566992890e+00 1.4557015483494027e+00 1.4356411211222593e+00 1.4158750494618957e+00
|
||||
1.3963984996850130e+00 1.3772067280935900e+00 1.3582950791282968e+00 1.3396589835618542e+00 1.3212939567312461e+00
|
||||
1.3031955968085143e+00 1.2853595831085869e+00 1.2677816744339125e+00 1.2504577074545153e+00 1.2333835951233212e+00
|
||||
1.2165553251254906e+00 1.1999689583611897e+00 1.1836206274610817e+00 1.1675065353339207e+00 1.1516229537450258e+00
|
||||
1.1359662219258198e+00 1.1205327452130547e+00 1.1053189937170771e+00 1.0903215010191687e+00 1.0755368628964561e+00
|
||||
1.0609617360743258e+00 1.0465928370056616e+00 1.0324269406761744e+00 1.0184608794353309e+00 1.0046915418523596e+00
|
||||
9.9111587159659820e-01 9.7773086634202144e-01 9.6453357669496853e-01 9.5152110514480626e-01 9.3869060503712376e-01
|
||||
9.2603927956864140e-01 9.1356438080370239e-01 9.0126320871155485e-01 8.8913311022427521e-01 8.7717147831462938e-01
|
||||
8.6537575109353426e-01 8.5374341092675010e-01 8.4227198357020328e-01 8.3095903732382581e-01 8.1980218220310874e-01
|
||||
8.0879906912829824e-01 7.9794738913080465e-01 7.8724487257618136e-01 7.7668928840378371e-01 7.6627844338220541e-01
|
||||
7.5601018138057441e-01 7.4588238265517859e-01 7.3589296315095609e-01 7.2603987381787150e-01 7.1632109994155613e-01
|
||||
7.0673466048788924e-01 6.9727860746149162e-01 6.8795102527759866e-01 6.7875003014695068e-01 6.6967376947370028e-01
|
||||
6.6072042126578623e-01 6.5188819355765304e-01 6.4317532384495735e-01 6.3458007853101606e-01 6.2610075238490026e-01
|
||||
6.1773566801053903e-01 6.0948317532703555e-01 6.0134165105970538e-01 5.9330949824153834e-01 5.8538514572508404e-01
|
||||
5.7756704770434197e-01 5.6985368324655994e-01 5.6224355583360364e-01 5.5473519291279416e-01 5.4732714545698968e-01
|
||||
5.4001798753364838e-01 5.3280631588273764e-01 5.2569074950327632e-01 5.1866992924830413e-01 5.1174251742814292e-01
|
||||
5.0490719742172274e-01 4.9816267329578068e-01 4.9150766943189694e-01 4.8494093016099704e-01 4.7846121940521869e-01
|
||||
4.7206732032718257e-01 4.6575803498626733e-01 4.5953218400168616e-01 4.5338860622255339e-01 4.4732615840449164e-01
|
||||
4.4134371489266400e-01 4.3544016731128998e-01 4.2961442425926322e-01 4.2386541101190822e-01 4.1819206922867025e-01
|
||||
4.1259335666658892e-01 4.0706824689955035e-01 4.0161572904300691e-01 3.9623480748423034e-01 3.9092450161784775e-01
|
||||
3.8568384558664448e-01 3.8051188802739233e-01 3.7540769182181144e-01 3.7037033385227858e-01 3.6539890476236891e-01
|
||||
3.6049250872219041e-01 3.5565026319808801e-01 3.5087129872705347e-01 3.4615475869537526e-01 3.4149979912160333e-01
|
||||
3.3690558844382679e-01 3.3237130731094133e-01 3.2789614837797743e-01 3.2347931610542879e-01 3.1912002656234151e-01
|
||||
3.1481750723327195e-01 3.1057099682888989e-01 3.0637974510021770e-01 3.0224301265637266e-01 2.9816007078585649e-01
|
||||
2.9413020128113843e-01 2.9015269626666473e-01 2.8622685803004089e-01 2.8235199885637741e-01 2.7852744086590420e-01
|
||||
2.7475251585446614e-01 2.7102656513704559e-01 2.6734893939429405e-01 2.6371899852185088e-01 2.6013611148246873e-01
|
||||
2.5659965616091007e-01 2.5310901922152595e-01 2.4966359596849230e-01 2.4626279020852770e-01 2.4290601411627044e-01
|
||||
2.3959268810202516e-01 2.3632224068197960e-01 2.3309410835077227e-01 2.2990773545640408e-01 2.2676257407740064e-01
|
||||
2.2365808390219311e-01 2.2059373211072231e-01 2.1756899325815038e-01 2.1458334916067301e-01 2.1163628878335583e-01
|
||||
2.0872730813005891e-01 2.0585591013519533e-01 2.0302160455757168e-01 2.0022390787598532e-01 1.9746234318676770e-01
|
||||
1.9473644010305513e-01 1.9204573465593366e-01 1.8938976919722261e-01 1.8676809230404867e-01 1.8418025868501697e-01
|
||||
1.8162582908807767e-01 1.7910437020998504e-01 1.7661545460728956e-01 1.7415866060892160e-01 1.7173357223026997e-01
|
||||
1.6933977908869480e-01 1.6697687632057967e-01 1.6464446449973735e-01 1.6234214955720816e-01 1.6006954270248031e-01
|
||||
1.5782626034600167e-01 1.5561192402300605e-01 1.5342616031865575e-01 1.5126860079441595e-01 1.4913888191569047e-01
|
||||
1.4703664498065105e-01 1.4496153605028095e-01 1.4291320587954814e-01 1.4089130984976528e-01 1.3889550790202332e-01
|
||||
1.3692546447178433e-01 1.3498084842450275e-01 1.3306133299233025e-01 1.3116659571184286e-01 1.2929631836281086e-01
|
||||
1.2745018690792786e-01 1.2562789143357200e-01 1.2382912609148811e-01 1.2205358904140517e-01 1.2030098239464682e-01
|
||||
1.1857101215854371e-01 1.1686338818183373e-01 1.1517782410088362e-01 1.1351403728677267e-01 1.1187174879324324e-01
|
||||
1.1025068330544618e-01 1.0865056908951143e-01 1.0707113794291789e-01 1.0551212514563968e-01 1.0397326941204543e-01
|
||||
1.0245431284357487e-01 1.0095500088214582e-01 9.9475082264262937e-02 9.8014308975870268e-02 9.6572436207889467e-02
|
||||
9.5149222312433945e-02 9.3744428759716225e-02 9.2357820095600562e-02 9.0989163899807046e-02 8.9638230744780500e-02
|
||||
8.8304794155128263e-02 8.6988630567732095e-02 8.5689519292436067e-02 8.4407242473327759e-02 8.3141585050604316e-02
|
||||
8.1892334723027815e-02 8.0659281910912206e-02 7.9442219719687568e-02 7.8240943904000826e-02 7.7055252832346710e-02
|
||||
7.5884947452225848e-02 7.4729831255814450e-02 7.3589710246154016e-02 7.2464392903814900e-02 7.1353690154065674e-02
|
||||
7.0257415334517681e-02 6.9175384163253639e-02 6.8107414707390124e-02 6.7053327352138758e-02 6.6012944770277748e-02
|
||||
6.4986091892085707e-02 6.3972595875702698e-02 6.2972286077917161e-02 6.1984994025379159e-02 6.1010553386208866e-02
|
||||
6.0048799942037157e-02 5.9099571560412123e-02 5.8162708167624810e-02 5.7238051721903105e-02 5.6325446186999528e-02
|
||||
5.5424737506136967e-02 5.4535773576326108e-02 5.3658404223060785e-02 5.2792481175329975e-02 5.1937858041017471e-02
|
||||
5.1094390282629742e-02 5.0261935193351093e-02 4.9440351873451860e-02 4.8629501207008818e-02 4.7829245838954204e-02
|
||||
4.7039450152438045e-02 4.6259980246510013e-02 4.5490703914087494e-02 4.4731490620259606e-02 4.3982211480854794e-02
|
||||
4.3242739241325268e-02 4.2512948255901239e-02 4.1792714467042247e-02 4.1081915385161816e-02 4.0380430068628126e-02
|
||||
3.9688139104032016e-02 3.9004924586723888e-02 3.8330670101623943e-02 3.7665260704261794e-02 3.7008582902100740e-02
|
||||
3.6360524636098734e-02 3.5720975262514720e-02 3.5089825534961649e-02 3.4466967586696873e-02 3.3852294913154113e-02
|
||||
3.3245702354694373e-02 3.2647086079598653e-02 3.2056343567280043e-02 3.1473373591718756e-02 3.0898076205114089e-02
|
||||
3.0330352721755771e-02 2.9770105702100036e-02 2.9217238937061962e-02 2.8671657432515429e-02 2.8133267393987360e-02
|
||||
2.7601976211552692e-02 2.7077692444942292e-02 2.6560325808820506e-02 2.6049787158272331e-02 2.5545988474477754e-02
|
||||
2.5048842850559749e-02 2.4558264477628988e-02 2.4074168631003978e-02 2.3596471656606832e-02 2.3125090957536454e-02
|
||||
2.2659944980823798e-02 2.2200953204335683e-02 2.1748036123873327e-02 2.1301115240412782e-02 2.0860113047532769e-02
|
||||
2.0424953018977066e-02 1.9995559596398205e-02 1.9571858177249157e-02 1.9153775102828896e-02 1.8741237646474174e-02
|
||||
1.8334174001923720e-02 1.7932513271801676e-02 1.7536185456265119e-02 1.7145121441799471e-02 1.6759252990136364e-02
|
||||
1.6378512727332817e-02 1.6002834132978205e-02 1.5632151529535232e-02 1.5266400071822006e-02 1.4905515736628239e-02
|
||||
1.4549435312452008e-02 1.4198096389378967e-02 1.3851437349077567e-02 1.3509397354926733e-02 1.3171916342264223e-02
|
||||
1.2838935008766206e-02 1.2510394804928215e-02 1.2186237924689647e-02 1.1866407296154180e-02 1.1550846572444651e-02
|
||||
1.1239500122655843e-02 1.0932313022929629e-02 1.0629231047647014e-02 1.0330200660712663e-02 1.0035169006965661e-02
|
||||
9.7440839036889715e-03 9.4568938322237006e-03 9.1735479296908284e-03 8.8939959808188584e-03 8.6181884098633921e-03
|
||||
8.3460762726380588e-03 8.0776112486364848e-03 7.8127456332576228e-03 7.5514323301215658e-03 7.2936248434884998e-03
|
||||
7.0392772707632556e-03 6.7883442950981143e-03 6.5407811780883174e-03 6.2965437525517309e-03 6.0555884154033235e-03
|
||||
5.8178721206175732e-03 5.5833523722726985e-03 5.3519872176873706e-03 5.1237352406410253e-03 4.8985555546731119e-03
|
||||
4.6764077964680517e-03 4.4572521193242398e-03 4.2410491867018174e-03 4.0277601658472717e-03 3.8173467214993595e-03
|
||||
3.6097710096757996e-03 3.4049956715283547e-03 3.2029838272795708e-03 3.0036990702375643e-03 2.8071054608720947e-03
|
||||
2.6131675209760674e-03 2.4218502278956500e-03 2.2331190088270558e-03 2.0469397351849938e-03 1.8632787170468346e-03
|
||||
1.6821026976564513e-03 1.5033788479984489e-03 1.3270747614446132e-03 1.1531584484569257e-03 9.8159833136179930e-04
|
||||
8.1236323918953968e-04 6.4542240257081662e-04 4.8074544870146951e-04 3.1830239637042901e-04 1.5806365104042985e-04
|
||||
0. 0. 0. 0. 0.
|
||||
0. 5.4383329664155645e-05 9.3944898415945083e-04 4.3251847212615047e-03 1.2334244035325348e-02
|
||||
2.7137722173468548e-02 5.0697119791449641e-02 8.4607638668976470e-02 1.3001641279549414e-01 1.8759487452762702e-01
|
||||
2.5754900895683441e-01 3.3965493779430744e-01 4.3331024634064264e-01 5.3759384878832961e-01 6.5132908316254046e-01
|
||||
7.7314622535699939e-01 9.0154178511424377e-01 1.0349328562818201e+00 1.1717054897399350e+00 1.3102565818166738e+00
|
||||
1.4490291582473986e+00 1.5865412121263560e+00 1.7214084470448441e+00 1.8523614026473965e+00 1.9782575145276269e+00
|
||||
2.0980886961566938e+00 2.2109850373516764e+00 2.3162151996095730e+00 2.4131840597491703e+00 2.5014281146549706e+00
|
||||
2.5806091153285706e+00 2.6505063508648590e+00 2.7110079545661563e+00 2.7621015568249447e+00 2.8038645637913220e+00
|
||||
2.8364542979766156e+00 2.8600981973448825e+00 2.8750842333755031e+00 2.8817516761559574e+00 2.8804823057701157e+00
|
||||
2.8716921439699092e+00 2.8558237581894161e+00 2.8333391711552594e+00 2.8047133934346959e+00 2.7704285829676252e+00
|
||||
2.7309688247181469e+00 2.6868155147671331e+00 2.6384433262347358e+00 2.5863167291097398e+00 2.5308870321738226e+00
|
||||
2.4725899125317596e+00 2.4118433966060167e+00 2.3490462556752334e+00 2.2845767789603002e+00 2.2187918877813502e+00
|
||||
2.1520265552815943e+00 2.0845934975626363e+00 2.0167831036919637e+00 1.9488635738636404e+00 1.8810812369508270e+00
|
||||
1.8136610207193371e+00 1.7468070500507196e+00 1.6807033505858371e+00 1.6155146372447149e+00 1.5513871690559142e+00
|
||||
1.4884496536383409e+00 1.4268141864958608e+00 1.3665772120042590e+00 1.3078204945836447e+00 1.2506120900523854e+00
|
||||
1.1950073085502879e+00 1.1410496616995687e+00 1.0887717878420631e+00 1.0381963502565981e+00 9.8933690422003551e-01
|
||||
9.4219872964247031e-01 8.9677962677415124e-01 8.5307067316958651e-01 8.1105694069385592e-01 7.7071817188505065e-01
|
||||
7.3202941544290212e-01 6.9496162100761794e-01 6.5948219372701189e-01 6.2555550939233484e-01 5.9314339115629977e-01
|
||||
5.6220554903693554e-01 5.3269998356387660e-01 5.0458335504023211e-01 4.7781131998032222e-01 4.5233883634534777e-01
|
||||
4.2812043923464138e-01 4.0511048870905242e-01 3.8326339142174781e-01 3.6253379771729577e-01 3.4287677583286325e-01
|
||||
3.2424796479760154e-01 3.0660370758054967e-01 2.8990116598452254e-01 2.7409841872609064e-01 2.5915454407883409e-01
|
||||
2.4502968839369110e-01 2.3168512174254197e-01 2.1908328186436687e-01 2.0718780752542632e-01 1.9596356233750800e-01
|
||||
1.8537665001230508e-01 1.7539442196444632e-01 1.6598547811304609e-01 1.5711966166996927e-01 1.4876804864444715e-01
|
||||
1.4090293273673637e-01 1.3349780623990259e-01 1.2652733751724909e-01 1.1996734557434463e-01 1.1379477219856060e-01
|
||||
1.0798765209582406e-01 1.0252508141368288e-01 9.7387185001678311e-02 9.2555082724584015e-02 8.8010855111109620e-02
|
||||
8.3737508589961873e-02 7.9718940536826377e-02 7.5939904329596963e-02 7.2385974585237101e-02 6.9043512729294765e-02
|
||||
6.5899633029043336e-02 6.2942169202580001e-02 6.0159641699440547e-02 5.7541225732930634e-02 5.5076720130546430e-02
|
||||
5.2756517056398833e-02 5.0571572648238083e-02 4.8513378601664936e-02 4.6573934725081756e-02 4.4745722480991068e-02
|
||||
4.3021679522073253e-02 4.1395175224364866e-02 3.9859987214311721e-02 3.8410278881708670e-02 3.7040577866510604e-02
|
||||
3.5745755503880039e-02 3.4521007208912380e-02 3.3361833779917971e-02 3.2264023597108116e-02 3.1223635691821294e-02
|
||||
3.0236983660070216e-02 2.9300620393215571e-02 2.8411323597772320e-02 2.7566082075896281e-02 2.6762082737777249e-02
|
||||
2.5996698317105604e-02 2.5267475760840985e-02 2.4572125264713973e-02 2.3908509926274246e-02 2.3274635987705516e-02
|
||||
2.2668643641204911e-02 2.2088798370316409e-02 2.1533482801290083e-02 2.1001189039288493e-02 2.0490511464994254e-02
|
||||
2.0000139967999431e-02 1.9528853594166895e-02 1.9075514584991349e-02 1.8639062787818239e-02 1.8218510416650235e-02
|
||||
1.7812937144080498e-02 1.7421485505751177e-02 1.7043356599549031e-02 1.6677806062561751e-02 1.6324140309613155e-02
|
||||
1.5981713017976018e-02 1.5649921843605585e-02 1.5328205354974755e-02 1.5016040171312250e-02 1.4712938292708366e-02
|
||||
1.4418444610242331e-02 1.4132134584901757e-02 1.3853612084676337e-02 1.3582507369821917e-02 1.3318475216818060e-02
|
||||
1.3061193172097418e-02 1.2810359927147186e-02 1.2565693807050415e-02 1.2326931365025051e-02 1.2093826075940506e-02
|
||||
1.1866147122233661e-02 1.1643678266026136e-02 1.1426216801644407e-02 1.1213572583084475e-02 1.1005567121320226e-02
|
||||
1.0802032746662471e-02 1.0602811831688208e-02 1.0407756070544782e-02 1.0216725810699157e-02 1.0029589433467268e-02
|
||||
9.8462227798860602e-03 9.6665086187306404e-03 9.4903361536790021e-03 9.3176005668363371e-03 9.1482025960089031e-03
|
||||
8.9820481433065535e-03 8.8190479128032462e-03 8.6591170751522117e-03 8.5021749571883021e-03 8.3481447546937537e-03
|
||||
8.1969532666261724e-03 8.0485306492223962e-03 7.9028101885199598e-03 7.7597280899136256e-03 7.6192232834934315e-03
|
||||
7.4812372439735375e-03 7.3457138241272979e-03 7.2125991007052359e-03 7.0818412319012813e-03 6.9533903254870300e-03
|
||||
6.8271983168139705e-03 6.7032188559211503e-03 6.5814072030662141e-03 6.4617201320263939e-03 6.3441158405819764e-03
|
||||
6.2285538676237207e-03 6.1149950163802147e-03 6.0034012832899109e-03 5.8937357920846312e-03 5.7859627326801166e-03
|
||||
5.6800473044990030e-03 5.5759556638887986e-03 5.4736548753111791e-03 5.3731128660109428e-03 5.2742983838981461e-03
|
||||
5.1771809583849582e-03 5.0817308639591330e-03 4.9879190862693046e-03 4.8957172905357560e-03 4.8050977921015592e-03
|
||||
4.7160335289582467e-03 4.6284980360953021e-03 4.5424654215287241e-03 4.4579103438822931e-03 4.3748079913988880e-03
|
||||
4.2931340622749670e-03 4.2128647462132407e-03 4.1339767071033873e-03 4.0564470667446839e-03 3.9802533895282599e-03
|
||||
3.9053736680121076e-03 3.8317863093158128e-03 3.7594701222811860e-03 3.6884043053326127e-03 3.6185684349951674e-03
|
||||
3.5499424550168301e-03 3.4825066660512660e-03 3.4162417158645347e-03 3.3511285900229004e-03 3.2871486030347646e-03
|
||||
3.2242833899080170e-03 3.1625148980992668e-03 3.1018253798278661e-03 3.0421973847258310e-03 2.9836137528083811e-03
|
||||
2.9260576077371064e-03 2.8695123503632708e-03 2.8139616525287708e-03 2.7593894511106498e-03 2.7057799422959966e-03
|
||||
2.6531175760685227e-03 2.6013870509009052e-03 2.5505733086344240e-03 2.5006615295404683e-03 2.4516371275501436e-03
|
||||
2.4034857456453340e-03 2.3561932514012535e-03 2.3097457326723414e-03 2.2641294934160616e-03 2.2193310496436136e-03
|
||||
2.1753371254977782e-03 2.1321346494441173e-03 2.0897107505768314e-03 2.0480527550303662e-03 2.0071481824917164e-03
|
||||
1.9669847428123305e-03 1.9275503327108034e-03 1.8888330325659355e-03 1.8508211032951805e-03 1.8135029833145980e-03
|
||||
1.7768672855772646e-03 1.7409027946878666e-03 1.7055984640891586e-03 1.6709434133182904e-03 1.6369269253308227e-03
|
||||
1.6035384438881917e-03 1.5707675710093030e-03 1.5386040644797400e-03 1.5070378354209296e-03 1.4760589459142243e-03
|
||||
1.4456576066784674e-03 1.4158241748004133e-03 1.3865491515145517e-03 1.3578231800324136e-03 1.3296370434173130e-03
|
||||
1.3019816625059188e-03 1.2748480938728074e-03 1.2482275278369870e-03 1.2221112865106742e-03 1.1964908218862064e-03
|
||||
1.1713577139624703e-03 1.1467036689077198e-03 1.1225205172586891e-03 1.0988002121543120e-03 1.0755348276031765e-03
|
||||
1.0527165567835728e-03 1.0303377103750150e-03 1.0083907149206553e-03 9.8686811121878604e-04 9.6576255274356815e-04
|
||||
9.4506680409354657e-04 9.2477373946662708e-04 9.0487634116191706e-04 8.8536769810608137e-04 8.6624100440530968e-04
|
||||
8.4748955791986991e-04 8.2910675886310736e-04 8.1108610842155551e-04 7.9342120739794852e-04 7.7610575487466887e-04
|
||||
7.5913354689786591e-04 7.4249847518158968e-04 7.2619452583109687e-04 7.1021577808524222e-04 6.9455640307671332e-04
|
||||
6.7921066261025093e-04 6.6417290795844214e-04 6.4943757867335500e-04 6.3499920141575628e-04 6.2085238879914031e-04
|
||||
6.0699183824991856e-04 5.9341233088238896e-04 5.8010873038847818e-04 5.6707598194186137e-04 5.5430911111587280e-04
|
||||
5.4180322281523891e-04 5.2955350022104025e-04 5.1755520374872563e-04 5.0580367001857793e-04 4.9429431083891986e-04
|
||||
4.8302261220136561e-04 4.7198413328763435e-04 4.6117450548847222e-04 4.5058943143359842e-04 4.4022468403297037e-04
|
||||
4.3007610552883886e-04 4.2013960655883260e-04 4.1041116522908330e-04 4.0088682619821882e-04 3.9156269977118005e-04
|
||||
3.8243496100300207e-04 3.7349984881274514e-04 3.6475366510662147e-04 3.5619277391102898e-04 3.4781360051482253e-04
|
||||
3.3961263062063513e-04 3.3158640950565685e-04 3.2373154119109092e-04 3.1604468762060252e-04 3.0852256784754707e-04
|
||||
3.0116195723081836e-04 2.9395968663908575e-04 2.8691264166377101e-04 2.8001776184017647e-04 2.7327203987681688e-04
|
||||
2.6667252089326854e-04 2.6021630166557681e-04 2.5390052988028163e-04 2.4772240339593181e-04 2.4167916951265550e-04
|
||||
2.3576812424967210e-04 2.2998661163024531e-04 2.2433202297460642e-04 2.1880179620031078e-04 2.1339341513026532e-04
|
||||
2.0810440880823181e-04 2.0293235082175821e-04 1.9787485863260665e-04 1.9292959291436311e-04 1.8809425689761319e-04
|
||||
1.8336659572205580e-04 1.7874439579616125e-04 1.7422548416372047e-04 1.6980772787763936e-04 1.6548903338088530e-04
|
||||
1.6126734589430591e-04 1.5714064881157744e-04 1.5310696310104604e-04 1.4916434671449329e-04 1.4531089400280153e-04
|
||||
1.4154473513841234e-04 1.3786403554466153e-04 1.3426699533172857e-04 1.3075184873951283e-04 1.2731686358694039e-04
|
||||
1.2396034072819674e-04 1.2068061351527565e-04 1.1747604726729168e-04 1.1434503874632306e-04 1.1128601563955686e-04
|
||||
1.0829743604811193e-04 1.0537778798212988e-04 1.0252558886227753e-04 9.9739385027582898e-05 9.7017751249615057e-05
|
||||
9.4359290252773662e-05 9.1762632240957511e-05 8.9226434430383569e-05 8.6749380588361721e-05 8.4330180578390864e-05
|
||||
8.1967569911181246e-05 7.9660309301724484e-05 7.7407184232279429e-05 7.5207004521348451e-05 7.3058603898526649e-05
|
||||
7.0960839585107720e-05 6.8912591880629977e-05 6.6912763755002085e-05 6.4960280446513426e-05 6.3054089065330086e-05
|
||||
6.1193158202771814e-05 5.9376477546041213e-05 5.7603057498502742e-05 5.5871928805544500e-05 5.4182142185708361e-05
|
||||
5.2532767967318744e-05 5.0922895730446966e-05 4.9351633954125953e-05 4.7818109668823321e-05 4.6321468114150300e-05
|
||||
4.4860872401664663e-05 4.3435503182825573e-05 4.2044558321957873e-05 4.0687252574273750e-05 3.9362817268785450e-05
|
||||
3.8070499996214428e-05 3.6809564301621984e-05 3.5579289382025496e-05 3.4378969788611451e-05 3.3207915133769052e-05
|
||||
3.2065449802711312e-05 3.0950912669766876e-05 2.9863656819185611e-05 2.8803049270468119e-05 2.7768470708167169e-05
|
||||
2.6759315216115260e-05 2.5774990015931323e-05 2.4814915209964844e-05 2.3878523528387922e-05 2.2965260080560611e-05
|
||||
2.2074582110528148e-05 2.1205958756658535e-05 2.0358870815317476e-05 1.9532810508535560e-05 1.8727281255713447e-05
|
||||
1.7941797449145505e-05 1.7175884233475961e-05 1.6429077288930018e-05 1.5700922618341645e-05 1.4990976337865471e-05
|
||||
1.4298804471386687e-05 1.3623982748522034e-05 1.2966096406226424e-05 1.2324739993882115e-05 1.1699517181902770e-05
|
||||
1.1090040573734860e-05 1.0495931521266495e-05 9.9168199435395021e-06 9.3523441487842465e-06 8.8021506596591475e-06
|
||||
8.2658940417265321e-06 7.7432367350197678e-06 7.2338488887770244e-06 6.7374081991923703e-06 6.2535997501888662e-06
|
||||
5.7821158571569505e-06 5.3226559136389283e-06 4.8749262408651290e-06 4.4386399401326240e-06 4.0135167480073166e-06
|
||||
3.5992828942305738e-06 3.1956709623667747e-06 2.8024197531120341e-06 2.4192741502208947e-06 2.0459849890155880e-06
|
||||
1.6823089274468580e-06 1.3280083196495871e-06 9.8285109196557868e-07 6.4661062138351467e-07 3.1906561636122974e-07
|
||||
0. 0. 0. 0. 0.
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,3 +0,0 @@
|
||||
2 0 1 0 1 0 1
|
||||
1 0.1 0.166 0.55 0 1 0 0
|
||||
1 0.1 0.833 0.45 0 -1 0 0
|
||||
@@ -1,9 +0,0 @@
|
||||
pbc_x 0
|
||||
pbc_y 0
|
||||
pbc_z 0
|
||||
ntimes 200
|
||||
dt 0.01
|
||||
reneigh_every 1
|
||||
mass 0.1
|
||||
k_s 100
|
||||
k_dn 10
|
||||
10
evaluate_cpu_openmpi_metrics.sh
Normal file
10
evaluate_cpu_openmpi_metrics.sh
Normal file
@@ -0,0 +1,10 @@
|
||||
END=32
|
||||
for ((i=1;i<=END;i++)); do
|
||||
output=$(eval "likwid-mpirun -np 1 -t $i -m -g FLOPS_DP -omp gnu ./MDBench-GCC -n 50")
|
||||
echo -n "$i,"
|
||||
echo "$output" > "FLOPS_DP/thread_$i.txt"
|
||||
done
|
||||
|
||||
## likwid perf measurements on testfront1:
|
||||
# srun --nodes=1 --exclusive --nodelist=rome1 --time=00:30:00 --export=NONE -c 64 -C hwperf --pty /bin/bash -l
|
||||
# likwid-mpirun -np 1 -t 32 -m -g MEM -omp gnu -d ./MDBench-GCC
|
||||
6
evaluate_cpu_runtime.sh
Normal file
6
evaluate_cpu_runtime.sh
Normal file
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
for i in $(seq 1 32); do
|
||||
echo "$i"
|
||||
export "OMP_NUM_THREADS=$i"
|
||||
./MDBench-GCC -n 50 | grep "Performance"
|
||||
done
|
||||
5
evaluate_gpu_ncu_profiles_per_thread.sh
Normal file
5
evaluate_gpu_ncu_profiles_per_thread.sh
Normal file
@@ -0,0 +1,5 @@
|
||||
END=32
|
||||
for ((i=16;i<=END;i++)); do
|
||||
export NUM_THREADS=$i
|
||||
$(eval "ncu --set full -o /home/hpc/rzku/ptfs410h/MD-Bench/log/MG/presentation_2/Resources/GPU/Metrics/threads_$i ./MDBench-NVCC -n 50")
|
||||
done
|
||||
6
evaluate_gpu_perf_per_thread.sh
Normal file
6
evaluate_gpu_perf_per_thread.sh
Normal file
@@ -0,0 +1,6 @@
|
||||
END=64
|
||||
for ((i=1;i<=END;i*=2)); do
|
||||
output=$(eval "NUM_THREADS=$i ./MDBench-NVCC -n 2000")
|
||||
echo -n "$i,"
|
||||
echo "$output" | grep 'atom updates per second' | sed 's/[^0-9.,]//g' | awk '{print $1"e6"}'
|
||||
done
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 273 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 98 KiB |
@@ -1,523 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
width="297mm"
|
||||
height="210mm"
|
||||
viewBox="0 0 297 210"
|
||||
version="1.1"
|
||||
id="svg5"
|
||||
inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
|
||||
sodipodi:docname="gather_bench.svg"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:svg="http://www.w3.org/2000/svg">
|
||||
<sodipodi:namedview
|
||||
id="namedview7"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pagecheckerboard="0"
|
||||
inkscape:document-units="mm"
|
||||
showgrid="false"
|
||||
inkscape:zoom="0.73508842"
|
||||
inkscape:cx="551.63432"
|
||||
inkscape:cy="348.25743"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="1011"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="165"
|
||||
inkscape:window-maximized="1"
|
||||
inkscape:current-layer="layer1" />
|
||||
<defs
|
||||
id="defs2">
|
||||
<rect
|
||||
x="144.01516"
|
||||
y="304.36604"
|
||||
width="248.99777"
|
||||
height="100.91557"
|
||||
id="rect79475" />
|
||||
<rect
|
||||
x="309.01869"
|
||||
y="43.698615"
|
||||
width="552.19421"
|
||||
height="71.390348"
|
||||
id="rect65238" />
|
||||
<rect
|
||||
x="762.55856"
|
||||
y="341.3838"
|
||||
width="277.62756"
|
||||
height="105.0235"
|
||||
id="rect47632" />
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
id="linearGradient40704">
|
||||
<stop
|
||||
style="stop-color:#ccffaa;stop-opacity:1;"
|
||||
offset="0"
|
||||
id="stop40700" />
|
||||
<stop
|
||||
style="stop-color:#ccffaa;stop-opacity:0;"
|
||||
offset="1"
|
||||
id="stop40702" />
|
||||
</linearGradient>
|
||||
<marker
|
||||
style="overflow:visible;"
|
||||
id="Arrow2Mend"
|
||||
refX="0.0"
|
||||
refY="0.0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(0.6) rotate(180) translate(0,0)"
|
||||
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
|
||||
style="stroke:context-stroke;fill-rule:evenodd;fill:context-stroke;stroke-width:0.62500000;stroke-linejoin:round;"
|
||||
id="path39486" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible;"
|
||||
id="Arrow1Mend"
|
||||
refX="0.0"
|
||||
refY="0.0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(0.4) rotate(180) translate(10,0)"
|
||||
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
id="path39468" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible;"
|
||||
id="Arrow1Lend"
|
||||
refX="0.0"
|
||||
refY="0.0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow1Lend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(0.8) rotate(180) translate(12.5,0)"
|
||||
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
id="path39462" />
|
||||
</marker>
|
||||
<rect
|
||||
x="707.09731"
|
||||
y="616.36746"
|
||||
width="407.71288"
|
||||
height="417.08306"
|
||||
id="rect24254" />
|
||||
<rect
|
||||
x="47.404365"
|
||||
y="100.3268"
|
||||
width="398.49855"
|
||||
height="110.16514"
|
||||
id="rect5050" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5-6" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5-6-1" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0-6" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0-6-2" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0-6-2-8" />
|
||||
<marker
|
||||
style="overflow:visible"
|
||||
id="Arrow2Mend-2"
|
||||
refX="0"
|
||||
refY="0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(-0.6)"
|
||||
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
|
||||
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
|
||||
id="path39486-3" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible"
|
||||
id="Arrow2Mend-2-5"
|
||||
refX="0"
|
||||
refY="0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(-0.6)"
|
||||
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
|
||||
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
|
||||
id="path39486-3-9" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible"
|
||||
id="Arrow2Mend-2-5-2"
|
||||
refX="0"
|
||||
refY="0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(-0.6)"
|
||||
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
|
||||
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
|
||||
id="path39486-3-9-8" />
|
||||
</marker>
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient40704"
|
||||
id="linearGradient40706"
|
||||
x1="324.58157"
|
||||
y1="127.35331"
|
||||
x2="363.61096"
|
||||
y2="98.957848"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5-6-1-7" />
|
||||
<rect
|
||||
x="309.01868"
|
||||
y="43.698616"
|
||||
width="552.19421"
|
||||
height="71.39035"
|
||||
id="rect65238-1" />
|
||||
</defs>
|
||||
<g
|
||||
inkscape:label="Layer 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1">
|
||||
<rect
|
||||
style="fill:#d5d5ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
|
||||
id="rect55834"
|
||||
width="250.31726"
|
||||
height="74.676537"
|
||||
x="25.257824"
|
||||
y="97.277718" />
|
||||
<rect
|
||||
style="fill:#d5f6ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
|
||||
id="rect55832"
|
||||
width="250.35208"
|
||||
height="64.461151"
|
||||
x="25.256891"
|
||||
y="32.817505" />
|
||||
<rect
|
||||
style="fill:#ccffaa;stroke:#091600;stroke-width:1.31891"
|
||||
id="rect6462"
|
||||
width="82.385742"
|
||||
height="20.525751"
|
||||
x="28.355024"
|
||||
y="48.740646" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,17.244577,26.206534)"
|
||||
id="text5048"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82948"><tspan
|
||||
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
|
||||
id="tspan82946">gather-bench</tspan></tspan></text>
|
||||
<rect
|
||||
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9"
|
||||
width="18.764017"
|
||||
height="20.965076"
|
||||
x="39.518955"
|
||||
y="140.726" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.33667319,0,0,0.33667319,25.589293,109.42998)"
|
||||
id="text5048-3"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82950">L1</tspan></text>
|
||||
<rect
|
||||
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9-0"
|
||||
width="21.653919"
|
||||
height="24.193966"
|
||||
x="97.687294"
|
||||
y="138.51564" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.3885252,0,0,0.3885252,81.212654,102.39964)"
|
||||
id="text5048-3-6"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82952">L2</tspan></text>
|
||||
<rect
|
||||
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9-0-6"
|
||||
width="27.217058"
|
||||
height="30.409672"
|
||||
x="149.19933"
|
||||
y="134.83977" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.48834178,0,0,0.48834178,128.49215,89.445174)"
|
||||
id="text5048-3-6-1"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82954">L3</tspan></text>
|
||||
<rect
|
||||
style="fill:#eeaaff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9-0-6-7"
|
||||
width="61.032539"
|
||||
height="29.96501"
|
||||
x="204.01265"
|
||||
y="135.61238" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.48834178,0,0,0.48834178,182.37007,89.995434)"
|
||||
id="text5048-3-6-1-9"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2-8);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82956">DRAM</tspan></text>
|
||||
<rect
|
||||
style="fill:#ffccaa;stroke:#091600;stroke-width:1.10636"
|
||||
id="rect6462-6"
|
||||
width="74.980759"
|
||||
height="15.869514"
|
||||
x="126.09525"
|
||||
y="38.773243" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,115.65481,14.295323)"
|
||||
id="text5048-7"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82958">Single gather</tspan></text>
|
||||
<rect
|
||||
style="fill:#ffccaa;stroke:#091600;stroke-width:1.03971"
|
||||
id="rect6462-6-3"
|
||||
width="66.071701"
|
||||
height="15.904838"
|
||||
x="126.90776"
|
||||
y="63.642746" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,116.63325,39.114393)"
|
||||
id="text5048-7-5"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82960">MD gathers</tspan></text>
|
||||
<rect
|
||||
style="fill:#afe9dd;stroke:#091600;stroke-width:1.02848"
|
||||
id="rect6462-6-3-2"
|
||||
width="64.479698"
|
||||
height="15.947394"
|
||||
x="206.65364"
|
||||
y="52.98967" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,196.01512,28.482594)"
|
||||
id="text5048-7-5-9"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82962">Contiguous</tspan></text>
|
||||
<rect
|
||||
style="fill:#afe9dd;stroke:#091600;stroke-width:0.987323"
|
||||
id="rect6462-6-3-2-2"
|
||||
width="59.269382"
|
||||
height="15.988551"
|
||||
x="208.16559"
|
||||
y="76.856781" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,197.58604,52.220445)"
|
||||
id="text5048-7-5-9-7"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82964">"Random"</tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="scale(0.26458333)"
|
||||
id="text24252"
|
||||
style="fill:black;fill-opacity:1;stroke:none;font-family:sans-serif;font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect24254)" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="M 193.10512,71.273276 206.30683,61.033513"
|
||||
id="path39049"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="M 193.08841,71.196939 207.86207,84.43804"
|
||||
id="path39053"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.39816;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 58.548229,151.24436 38.298093,0.25023"
|
||||
id="path39219"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.24847;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 119.19252,150.09399 29.28333,0.26095"
|
||||
id="path39219-2"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 177.02022,150.44367 26.36623,0.26095"
|
||||
id="path39219-2-0"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend)"
|
||||
d="m 48.145458,92.71788 -0.644819,47.57709"
|
||||
id="path39377"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2)"
|
||||
d="M 48.121208,92.873762 106.60807,137.41946"
|
||||
id="path39377-7"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5)"
|
||||
d="M 48.073928,92.825143 158.88023,133.04546"
|
||||
id="path39377-7-2"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5-2)"
|
||||
d="M 48.051946,92.813593 233.0959,134.16596"
|
||||
id="path39377-7-2-9"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<rect
|
||||
style="fill:#e9afaf;stroke:#091600;stroke-width:1.34518"
|
||||
id="rect6462-6-3-2-2-3"
|
||||
width="65.880661"
|
||||
height="26.700579"
|
||||
x="38.104012"
|
||||
y="80.530182" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
|
||||
d="m 77.365612,69.678744 h 2e-6"
|
||||
id="path39808"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 111.64767,59.183009 6.84466,0.03069"
|
||||
id="path41004"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 119.03378,47.056357 -0.58704,25.198541"
|
||||
id="path41006"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.02423;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 118.07503,72.254897 7.94998,-0.05784"
|
||||
id="path41008"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.882836;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 118.26666,47.054814 7.69322,0.173925"
|
||||
id="path41112"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
d="M 68.213642,69.068864 67.910274,80.302728"
|
||||
id="path55728"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,-1.3782637,4.0412367)"
|
||||
id="text65236"
|
||||
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="309.01953"
|
||||
y="90.886691"
|
||||
id="tspan82968"><tspan
|
||||
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
|
||||
id="tspan82966">Application Level</tspan></tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,2.7015103,160.71919)"
|
||||
id="text65236-2"
|
||||
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="309.01953"
|
||||
y="90.886691"
|
||||
id="tspan82972"><tspan
|
||||
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
|
||||
id="tspan82970">Hardware Level</tspan></tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,2.3490396,0.57331532)"
|
||||
id="text79473"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect79475);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="144.01562"
|
||||
y="339.75586"
|
||||
id="tspan82974">vgather </tspan><tspan
|
||||
x="144.01562"
|
||||
y="389.75586"
|
||||
id="tspan82976">instructions</tspan></text>
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 21 KiB |
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 128 KiB |
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 52 KiB |
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 62 KiB |
798
gromacs/atom.c
798
gromacs/atom.c
@@ -1,798 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
void initAtom(Atom *atom) {
|
||||
atom->x = NULL; atom->y = NULL; atom->z = NULL;
|
||||
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
|
||||
atom->cl_x = NULL;
|
||||
atom->cl_v = NULL;
|
||||
atom->cl_f = NULL;
|
||||
atom->cl_type = NULL;
|
||||
atom->Natoms = 0;
|
||||
atom->Nlocal = 0;
|
||||
atom->Nghost = 0;
|
||||
atom->Nmax = 0;
|
||||
atom->Nclusters = 0;
|
||||
atom->Nclusters_local = 0;
|
||||
atom->Nclusters_ghost = 0;
|
||||
atom->NmaxGhost = 0; //Temporal
|
||||
atom->Nclusters_max = 0;
|
||||
atom->type = NULL;
|
||||
atom->ntypes = 0;
|
||||
atom->epsilon = NULL;
|
||||
atom->sigma6 = NULL;
|
||||
atom->cutforcesq = NULL;
|
||||
atom->cutneighsq = NULL;
|
||||
atom->iclusters = NULL;
|
||||
atom->jclusters = NULL;
|
||||
atom->icluster_bin = NULL;
|
||||
atom->PBCx = NULL;
|
||||
atom->PBCy = NULL;
|
||||
atom->PBCz = NULL;
|
||||
initMasks(atom);
|
||||
//MPI
|
||||
Box *mybox = &(atom->mybox);
|
||||
mybox->xprd = mybox->yprd = mybox->zprd = 0;
|
||||
mybox->lo[0] = mybox->lo[1] = mybox->lo[2] = 0;
|
||||
mybox->hi[0] = mybox->hi[1] = mybox->hi[2] = 0;
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
|
||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||
atom->Nlocal = 0;
|
||||
atom->ntypes = param->ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||
int ilo = (int) (xlo / (0.5 * alat) - 1);
|
||||
int ihi = (int) (xhi / (0.5 * alat) + 1);
|
||||
int jlo = (int) (ylo / (0.5 * alat) - 1);
|
||||
int jhi = (int) (yhi / (0.5 * alat) + 1);
|
||||
int klo = (int) (zlo / (0.5 * alat) - 1);
|
||||
int khi = (int) (zhi / (0.5 * alat) + 1);
|
||||
|
||||
ilo = MAX(ilo, 0);
|
||||
ihi = MIN(ihi, 2 * param->nx - 1);
|
||||
jlo = MAX(jlo, 0);
|
||||
jhi = MIN(jhi, 2 * param->ny - 1);
|
||||
klo = MAX(klo, 0);
|
||||
khi = MIN(khi, 2 * param->nz - 1);
|
||||
|
||||
MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
|
||||
int i, j, k, m, n;
|
||||
int sx = 0; int sy = 0; int sz = 0;
|
||||
int ox = 0; int oy = 0; int oz = 0;
|
||||
int subboxdim = 8;
|
||||
|
||||
while(oz * subboxdim <= khi) {
|
||||
k = oz * subboxdim + sz;
|
||||
j = oy * subboxdim + sy;
|
||||
i = ox * subboxdim + sx;
|
||||
|
||||
if(((i + j + k) % 2 == 0) && (i >= ilo) && (i <= ihi) && (j >= jlo) && (j <= jhi) && (k >= klo) && (k <= khi)) {
|
||||
xtmp = 0.5 * alat * i;
|
||||
ytmp = 0.5 * alat * j;
|
||||
ztmp = 0.5 * alat * k;
|
||||
|
||||
if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
|
||||
n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
|
||||
for(m = 0; m < 5; m++) { myrandom(&n); }
|
||||
vxtmp = myrandom(&n);
|
||||
for(m = 0; m < 5; m++){ myrandom(&n); }
|
||||
vytmp = myrandom(&n);
|
||||
for(m = 0; m < 5; m++) { myrandom(&n); }
|
||||
vztmp = myrandom(&n);
|
||||
|
||||
if(atom->Nlocal == atom->Nmax) { growAtom(atom); }
|
||||
atom_x(atom->Nlocal) = xtmp;
|
||||
atom_y(atom->Nlocal) = ytmp;
|
||||
atom_z(atom->Nlocal) = ztmp;
|
||||
atom->vx[atom->Nlocal] = vxtmp;
|
||||
atom->vy[atom->Nlocal] = vytmp;
|
||||
atom->vz[atom->Nlocal] = vztmp;
|
||||
atom->type[atom->Nlocal] = rand() % atom->ntypes;
|
||||
atom->Nlocal++;
|
||||
}
|
||||
}
|
||||
|
||||
sx++;
|
||||
if(sx == subboxdim) { sx = 0; sy++; }
|
||||
if(sy == subboxdim) { sy = 0; sz++; }
|
||||
if(sz == subboxdim) { sz = 0; ox++; }
|
||||
if(ox * subboxdim > ihi) { ox = 0; oy++; }
|
||||
if(oy * subboxdim > jhi) { oy = 0; oz++; }
|
||||
}
|
||||
}
|
||||
|
||||
int type_str2int(const char *type) {
|
||||
if(strncmp(type, "Ar", 2) == 0) { return 0; } // Argon
|
||||
fprintf(stderr, "Invalid atom type: %s\n", type);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
int len = strlen(param->input_file);
|
||||
if(strncmp(¶m->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
|
||||
if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
|
||||
if(!fp) {
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
char *item = strtok(line, " ");
|
||||
if(strncmp(item, "CRYST1", 6) == 0) {
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
// alpha, beta, gamma, sGroup, z
|
||||
} else if(strncmp(item, "ATOM", 4) == 0) {
|
||||
char *label;
|
||||
int atom_id, comp_id;
|
||||
MD_FLOAT occupancy, charge;
|
||||
atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
|
||||
label = strtok(NULL, "\t ");
|
||||
comp_id = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->vx[atom_id] = 0.0;
|
||||
atom->vy[atom_id] = 0.0;
|
||||
atom->vz[atom_id] = 0.0;
|
||||
occupancy = atof(strtok(NULL, "\t "));
|
||||
charge = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
} else if(strncmp(item, "HEADER", 6) == 0 ||
|
||||
strncmp(item, "REMARK", 6) == 0 ||
|
||||
strncmp(item, "MODEL", 5) == 0 ||
|
||||
strncmp(item, "TER", 3) == 0 ||
|
||||
strncmp(item, "ENDMDL", 6) == 0) {
|
||||
// Do nothing
|
||||
} else {
|
||||
if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(!read_atoms) {
|
||||
if(me==0) fprintf(stderr, "Input error: No atoms read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
char desc[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
int atoms_to_read = 0;
|
||||
int i = 0;
|
||||
|
||||
if(!fp) {
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
readline(desc, fp);
|
||||
for(i = 0; desc[i] != '\n'; i++);
|
||||
desc[i] = '\0';
|
||||
readline(line, fp);
|
||||
atoms_to_read = atoi(strtok(line, " "));
|
||||
if(me==0) fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
|
||||
while(!feof(fp) && read_atoms < atoms_to_read) {
|
||||
readline(line, fp);
|
||||
char *label = strtok(line, "\t ");
|
||||
int type = type_str2int(strtok(NULL, "\t "));
|
||||
int atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
atom_id = read_atoms;
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type;
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
}
|
||||
|
||||
if(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(line, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
}
|
||||
|
||||
if(read_atoms != atoms_to_read) {
|
||||
if(me==0) fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
int read_atoms = 0;
|
||||
int atom_id = -1;
|
||||
int ts = -1;
|
||||
|
||||
if(!fp) {
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp) && ts < 1 && !read_atoms) {
|
||||
readline(line, fp);
|
||||
if(strncmp(line, "ITEM: ", 6) == 0) {
|
||||
char *item = &line[6];
|
||||
|
||||
if(strncmp(item, "TIMESTEP", 8) == 0) {
|
||||
readline(line, fp);
|
||||
ts = atoi(line);
|
||||
} else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
|
||||
readline(line, fp);
|
||||
natoms = atoi(line);
|
||||
atom->Natoms = natoms;
|
||||
atom->Nlocal = natoms;
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
|
||||
readline(line, fp);
|
||||
param->xlo = atof(strtok(line, "\t "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
|
||||
readline(line, fp);
|
||||
param->ylo = atof(strtok(line, "\t "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
|
||||
readline(line, fp);
|
||||
param->zlo = atof(strtok(line, "\t "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
atom_id = atoi(strtok(line, "\t ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
read_atoms++;
|
||||
}
|
||||
} else {
|
||||
if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
if(me==0) fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(ts < 0 || !natoms || !read_atoms) {
|
||||
if(me==0) fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
fclose(fp);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
void initMasks(Atom *atom) {
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
|
||||
atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
|
||||
atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
|
||||
atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));
|
||||
//atom->masks_2xnn = allocate(ALIGNMENT, 8 * sizeof(unsigned int));
|
||||
|
||||
for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
|
||||
atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
|
||||
}
|
||||
|
||||
for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
|
||||
atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
|
||||
atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
|
||||
}
|
||||
|
||||
for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
|
||||
atom->exclusion_filter[i] = (1U << i);
|
||||
}
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
||||
atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x4 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0x8 * cond0);
|
||||
atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);
|
||||
|
||||
atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
|
||||
}
|
||||
#else
|
||||
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
|
||||
for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||
#else
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
||||
#endif
|
||||
|
||||
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
||||
#else
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
|
||||
#endif
|
||||
|
||||
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
||||
#else
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);
|
||||
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
int nold = atom->Nmax;
|
||||
atom->Nmax += DELTA;
|
||||
|
||||
#ifdef AOS
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||
}
|
||||
|
||||
void growClusters(Atom *atom) {
|
||||
int nold = atom->Nclusters_max;
|
||||
int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
|
||||
atom->Nclusters_max += DELTA;
|
||||
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
|
||||
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster));
|
||||
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
|
||||
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
|
||||
}
|
||||
|
||||
/* MPI added*/
|
||||
void growPbc(Atom* atom) {
|
||||
int nold = atom->NmaxGhost;
|
||||
atom->NmaxGhost += DELTA;
|
||||
|
||||
if (atom->PBCx || atom->PBCy || atom->PBCz){
|
||||
atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
} else {
|
||||
atom->PBCx = (int*) malloc(atom->NmaxGhost * sizeof(int));
|
||||
atom->PBCy = (int*) malloc(atom->NmaxGhost * sizeof(int));
|
||||
atom->PBCz = (int*) malloc(atom->NmaxGhost * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
void packForward(Atom* atom, int nc, int* list, MD_FLOAT* buf, int* pbc)
|
||||
{
|
||||
for(int i = 0; i < nc; i++) {
|
||||
int cj = list[i];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
int displ = i*CLUSTER_N;
|
||||
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
buf[3*(displ+cjj)+0] = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd;
|
||||
buf[3*(displ+cjj)+1] = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd;
|
||||
buf[3*(displ+cjj)+2] = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd;
|
||||
}
|
||||
|
||||
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
|
||||
buf[3*(displ+cjj)+0] = -1; //x
|
||||
buf[3*(displ+cjj)+1] = -1; //y
|
||||
buf[3*(displ+cjj)+2] = -1; //z
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void unpackForward(Atom* atom, int nc, int c0, MD_FLOAT* buf)
|
||||
{
|
||||
for(int i = 0; i < nc; i++) {
|
||||
int cj = c0+i;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
int displ = i*CLUSTER_N;
|
||||
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
if(cj_x[CL_X_OFFSET + cjj]<INFINITY) cj_x[CL_X_OFFSET + cjj] = buf[3*(displ+cjj)+0];
|
||||
if(cj_x[CL_Y_OFFSET + cjj]<INFINITY) cj_x[CL_Y_OFFSET + cjj] = buf[3*(displ+cjj)+1];
|
||||
if(cj_x[CL_Z_OFFSET + cjj]<INFINITY) cj_x[CL_Z_OFFSET + cjj] = buf[3*(displ+cjj)+2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int packGhost(Atom* atom, int cj, MD_FLOAT* buf, int* pbc)
|
||||
{
|
||||
//#of elements per cluster natoms,x0,y0,z0,type_0, . . ,xn,yn,zn,type_n,bbminx,bbmaxxy,bbminy,bbmaxy,bbminz,bbmaxz
|
||||
//count = 4*N_CLUSTER+7, if N_CLUSTER =4 => count = 23 value/cluster + trackpbc[x] + trackpbc[y] + trackpbc[z]
|
||||
int m = 0;
|
||||
if(atom->jclusters[cj].natoms > 0) {
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
|
||||
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
|
||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
||||
|
||||
buf[m++] = atom->jclusters[cj].natoms;
|
||||
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
|
||||
MD_FLOAT xtmp = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd;
|
||||
MD_FLOAT ytmp = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd;
|
||||
MD_FLOAT ztmp = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd;
|
||||
|
||||
buf[m++] = xtmp;
|
||||
buf[m++] = ytmp;
|
||||
buf[m++] = ztmp;
|
||||
buf[m++]= atom->cl_type[cj_sca_base + cjj];
|
||||
|
||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
||||
}
|
||||
|
||||
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
|
||||
buf[m++] = -1; //x
|
||||
buf[m++] = -1; //y
|
||||
buf[m++] = -1; //z
|
||||
buf[m++] = -1; //type
|
||||
}
|
||||
|
||||
buf[m++] = bbminx;
|
||||
buf[m++] = bbmaxx;
|
||||
buf[m++] = bbminy;
|
||||
buf[m++] = bbmaxy;
|
||||
buf[m++] = bbminz;
|
||||
buf[m++] = bbmaxz;
|
||||
//TODO: check atom->ncj
|
||||
int ghostId = cj-atom->ncj;
|
||||
//check for ghost particles
|
||||
buf[m++] = (cj-atom->ncj>=0) ? pbc[_x]+atom->PBCx[ghostId]:pbc[_x];
|
||||
buf[m++] = (cj-atom->ncj>=0) ? pbc[_y]+atom->PBCy[ghostId]:pbc[_y];
|
||||
buf[m++] = (cj-atom->ncj>=0) ? pbc[_z]+atom->PBCz[ghostId]:pbc[_z];
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
int unpackGhost(Atom* atom, int cj, MD_FLOAT* buf)
|
||||
{
|
||||
int m = 0;
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
if(cj*jfac>=atom->Nclusters_max) growClusters(atom);
|
||||
if(atom->Nclusters_ghost>=atom->NmaxGhost) growPbc(atom);
|
||||
|
||||
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
atom->jclusters[cj].natoms = buf[m++];
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
|
||||
cj_x[CL_X_OFFSET + cjj] = buf[m++];
|
||||
cj_x[CL_Y_OFFSET + cjj] = buf[m++];
|
||||
cj_x[CL_Z_OFFSET + cjj] = buf[m++];
|
||||
atom->cl_type[cj_sca_base + cjj] = buf[m++];
|
||||
atom->Nghost++;
|
||||
}
|
||||
|
||||
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
|
||||
atom->cl_type[cj_sca_base + cjj] = -1;
|
||||
m+=4;
|
||||
}
|
||||
|
||||
atom->jclusters[cj].bbminx = buf[m++];
|
||||
atom->jclusters[cj].bbmaxx = buf[m++];
|
||||
atom->jclusters[cj].bbminy = buf[m++];
|
||||
atom->jclusters[cj].bbmaxy = buf[m++];
|
||||
atom->jclusters[cj].bbminz = buf[m++];
|
||||
atom->jclusters[cj].bbmaxz = buf[m++];
|
||||
atom->PBCx[atom->Nclusters_ghost] = buf[m++];
|
||||
atom->PBCy[atom->Nclusters_ghost] = buf[m++];
|
||||
atom->PBCz[atom->Nclusters_ghost] = buf[m++];
|
||||
atom->Nclusters_ghost++;
|
||||
|
||||
}
|
||||
|
||||
void packReverse(Atom* atom, int nc, int c0, MD_FLOAT* buf)
|
||||
{
|
||||
for(int i = 0; i < nc; i++) {
|
||||
int cj = c0+i;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
int displ = i*CLUSTER_N;
|
||||
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
buf[3*(displ+cjj)+0] = cj_f[CL_X_OFFSET + cjj];
|
||||
buf[3*(displ+cjj)+1] = cj_f[CL_Y_OFFSET + cjj];
|
||||
buf[3*(displ+cjj)+2] = cj_f[CL_Z_OFFSET + cjj];
|
||||
}
|
||||
|
||||
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
|
||||
buf[3*(displ+cjj)+0] = -1; //x
|
||||
buf[3*(displ+cjj)+1] = -1; //y
|
||||
buf[3*(displ+cjj)+2] = -1; //z
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void unpackReverse(Atom* atom, int nc, int* list, MD_FLOAT* buf)
|
||||
{
|
||||
for(int i = 0; i < nc; i++) {
|
||||
int cj = list[i];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
int displ = i*CLUSTER_N;
|
||||
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
cj_f[CL_X_OFFSET + cjj] += buf[3*(displ+cjj)+0];
|
||||
cj_f[CL_Y_OFFSET + cjj] += buf[3*(displ+cjj)+1];
|
||||
cj_f[CL_Z_OFFSET + cjj] += buf[3*(displ+cjj)+2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int packExchange(Atom* atom, int i, MD_FLOAT* buf)
|
||||
{
|
||||
int m = 0;
|
||||
buf[m++] = atom_x(i);
|
||||
buf[m++] = atom_y(i);
|
||||
buf[m++] = atom_z(i);
|
||||
buf[m++] = atom_vx(i);
|
||||
buf[m++] = atom_vy(i);
|
||||
buf[m++] = atom_vz(i);
|
||||
buf[m++] = atom->type[i];
|
||||
return m;
|
||||
}
|
||||
|
||||
int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
|
||||
{
|
||||
while(i >= atom->Nmax) growAtom(atom);
|
||||
int m = 0;
|
||||
atom_x(i) = buf[m++];
|
||||
atom_y(i) = buf[m++];
|
||||
atom_z(i) = buf[m++];
|
||||
atom_vx(i) = buf[m++];
|
||||
atom_vy(i) = buf[m++];
|
||||
atom_vz(i) = buf[m++];
|
||||
atom->type[i] = buf[m++];
|
||||
return m;
|
||||
}
|
||||
|
||||
void pbc(Atom* atom)
|
||||
{
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
|
||||
MD_FLOAT xprd = atom->mybox.xprd;
|
||||
MD_FLOAT yprd = atom->mybox.yprd;
|
||||
MD_FLOAT zprd = atom->mybox.zprd;
|
||||
|
||||
if(atom_x(i) < 0.0) atom_x(i) += xprd;
|
||||
if(atom_y(i) < 0.0) atom_y(i) += yprd;
|
||||
if(atom_z(i) < 0.0) atom_z(i) +=zprd;
|
||||
if(atom_x(i) >= xprd) atom_x(i) -=xprd;
|
||||
if(atom_y(i) >= yprd) atom_y(i) -=yprd;
|
||||
if(atom_z(i) >= zprd) atom_z(i) -=zprd;
|
||||
}
|
||||
}
|
||||
|
||||
void copy(Atom* atom, int i, int j)
|
||||
{
|
||||
atom_x(i) = atom_x(j);
|
||||
atom_y(i) = atom_y(j);
|
||||
atom_z(i) = atom_z(j);
|
||||
atom_vx(i) = atom_vx(j);
|
||||
atom_vy(i) = atom_vy(j);
|
||||
atom_vz(i) = atom_vz(j);
|
||||
atom->type[i] = atom->type[j];
|
||||
}
|
||||
@@ -1,317 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
extern "C" {
|
||||
|
||||
#include <stdio.h>
|
||||
//---
|
||||
#include <cuda.h>
|
||||
#include <driver_types.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
MD_FLOAT *cuda_cl_x;
|
||||
MD_FLOAT *cuda_cl_v;
|
||||
MD_FLOAT *cuda_cl_f;
|
||||
int *cuda_neighbors;
|
||||
int *cuda_numneigh;
|
||||
int *cuda_natoms;
|
||||
int *natoms;
|
||||
int *ngatoms;
|
||||
int *cuda_border_map;
|
||||
int *cuda_jclusters_natoms;
|
||||
MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
|
||||
MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
|
||||
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
||||
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
||||
int isReneighboured;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
|
||||
cuda_assert("cudaDeviceSetup", cudaSetDevice(0));
|
||||
cuda_cl_x = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_cl_v = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_cl_f = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_jclusters_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_border_map = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCx = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCy = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCz = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_numneigh = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_neighbors = (int *) allocateGPU(atom->Nclusters_max * neighbor->maxneighs * sizeof(int));
|
||||
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||
isReneighboured = 1;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void copyDataToCUDADevice(Atom *atom) {
|
||||
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
natoms[ci] = atom->iclusters[ci].natoms;
|
||||
}
|
||||
|
||||
memcpyToGPU(cuda_natoms, natoms, atom->Nclusters_local * sizeof(int));
|
||||
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
|
||||
const int cj = ncj + cg;
|
||||
ngatoms[cg] = atom->jclusters[cj].natoms;
|
||||
}
|
||||
|
||||
memcpyToGPU(cuda_jclusters_natoms, ngatoms, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_border_map, atom->border_map, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void copyDataFromCUDADevice(Atom *atom) {
|
||||
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaDeviceFree() {
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_x));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_v));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_f));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_numneigh));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_neighbors));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_natoms));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_border_map));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_jclusters_natoms));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCx));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCy));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
|
||||
free(natoms);
|
||||
free(ngatoms);
|
||||
}
|
||||
|
||||
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_natoms,
|
||||
int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ci_pos >= Nclusters_local) return;
|
||||
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
|
||||
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
ci_x[CL_X_OFFSET + cii] += dt * ci_v[CL_X_OFFSET + cii];
|
||||
ci_x[CL_Y_OFFSET + cii] += dt * ci_v[CL_Y_OFFSET + cii];
|
||||
ci_x[CL_Z_OFFSET + cii] += dt * ci_v[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
|
||||
int *cuda_jclusters_natoms,
|
||||
int *cuda_PBCx,
|
||||
int *cuda_PBCy,
|
||||
int *cuda_PBCz,
|
||||
int Nclusters_local,
|
||||
int Nclusters_ghost,
|
||||
MD_FLOAT param_xprd,
|
||||
MD_FLOAT param_yprd,
|
||||
MD_FLOAT param_zprd) {
|
||||
unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (cg >= Nclusters_ghost) return;
|
||||
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = Nclusters_local / jfac;
|
||||
MD_FLOAT xprd = param_xprd;
|
||||
MD_FLOAT yprd = param_yprd;
|
||||
MD_FLOAT zprd = param_zprd;
|
||||
|
||||
const int cj = ncj + cg;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
|
||||
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
|
||||
MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
|
||||
|
||||
for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
|
||||
cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
|
||||
cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
|
||||
int Nclusters_local, int Nclusters_max,
|
||||
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
|
||||
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
|
||||
if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
|
||||
|
||||
int ci_cj0 = CJ0_FROM_CI(ci_pos);
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
int numneighs = cuda_numneigh[ci_pos];
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = (&cuda_neighs[ci_pos * maxneighs])[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
|
||||
|
||||
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
|
||||
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
|
||||
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
int cond;
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
|
||||
(ci_cj0 != cj || cii_pos != cjj_pos);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
|
||||
(ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
|
||||
#endif
|
||||
if(cond) {
|
||||
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
|
||||
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
|
||||
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
|
||||
if(half_neigh) {
|
||||
atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
|
||||
atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
|
||||
atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
|
||||
}
|
||||
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
|
||||
atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
|
||||
atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
|
||||
atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_natoms,
|
||||
int Nclusters_local, MD_FLOAT dtforce) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ci_pos >= Nclusters_local) return;
|
||||
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
|
||||
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
const int threads_num = 16;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
|
||||
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
|
||||
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
|
||||
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
extern "C"
|
||||
void cudaUpdatePbc(Atom *atom, Parameter *param) {
|
||||
const int threads_num = 512;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);;
|
||||
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);;
|
||||
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
|
||||
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
|
||||
atom->Nclusters_local, atom->Nclusters_ghost,
|
||||
param->xprd, param->yprd, param->zprd);
|
||||
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
|
||||
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
|
||||
memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
if (isReneighboured) {
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
|
||||
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
|
||||
}
|
||||
|
||||
isReneighboured = 0;
|
||||
}
|
||||
|
||||
const int threads_num = 1;
|
||||
dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
|
||||
atom->Nclusters_local, atom->Nclusters_max,
|
||||
cuda_numneigh, cuda_neighbors,
|
||||
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
|
||||
sigma6, epsilon);
|
||||
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
|
||||
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
const int threads_num = 16;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt);
|
||||
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
|
||||
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
|
||||
}
|
||||
@@ -1,201 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <likwid-marker.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <eam.h>
|
||||
#include <util.h>
|
||||
|
||||
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
/*
|
||||
if(eam->nmax < atom->Nmax) {
|
||||
eam->nmax = atom->Nmax;
|
||||
if(eam->fp != NULL) { free(eam->fp); }
|
||||
eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
|
||||
}
|
||||
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
|
||||
MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
|
||||
MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
|
||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||
*/
|
||||
double S = getTimeStamp();
|
||||
|
||||
LIKWID_MARKER_START("force_eam_fp");
|
||||
/*
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT rhoi = 0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
#pragma ivdep
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
#else
|
||||
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
#endif
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
|
||||
int m = (int)(p);
|
||||
m = m < nr - 1 ? m : nr - 1;
|
||||
p -= m;
|
||||
p = p < 1.0 ? p : 1.0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 6];
|
||||
#else
|
||||
rhoi += ((rhor_spline[m * 7 + 3] * p +
|
||||
rhor_spline[m * 7 + 4]) * p +
|
||||
rhor_spline[m * 7 + 5]) * p +
|
||||
rhor_spline[m * 7 + 6];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_ii = type_i * type_i;
|
||||
#endif
|
||||
MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
|
||||
int m = (int)(p);
|
||||
m = MAX(1, MIN(m, nrho - 1));
|
||||
p -= m;
|
||||
p = MIN(p, 1.0);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
|
||||
frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
|
||||
frho_spline[type_ii * nrho_tot + m * 7 + 2];
|
||||
#else
|
||||
fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
|
||||
#endif
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam_fp");
|
||||
|
||||
// We still need to update fp for PBC atoms
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||
}
|
||||
|
||||
LIKWID_MARKER_START("force_eam");
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
#pragma ivdep
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
#else
|
||||
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT r = sqrt(rsq);
|
||||
MD_FLOAT p = r * rdr + 1.0;
|
||||
int m = (int)(p);
|
||||
m = m < nr - 1 ? m : nr - 1;
|
||||
p -= m;
|
||||
p = p < 1.0 ? p : 1.0;
|
||||
|
||||
|
||||
// rhoip = derivative of (density at atom j due to atom i)
|
||||
// rhojp = derivative of (density at atom i due to atom j)
|
||||
// phi = pair potential energy
|
||||
// phip = phi'
|
||||
// z2 = phi * r
|
||||
// z2p = (phi * r)' = (phi' r) + phi
|
||||
// psip needs both fp[i] and fp[j] terms since r_ij appears in two
|
||||
// terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
|
||||
// hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 2];
|
||||
|
||||
MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 2];
|
||||
|
||||
MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 6];
|
||||
#else
|
||||
MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
|
||||
MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
|
||||
MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
|
||||
z2r_spline[m * 7 + 4]) * p +
|
||||
z2r_spline[m * 7 + 5]) * p +
|
||||
z2r_spline[m * 7 + 6];
|
||||
#endif
|
||||
|
||||
MD_FLOAT recip = 1.0 / r;
|
||||
MD_FLOAT phi = z2 * recip;
|
||||
MD_FLOAT phip = z2p * recip - phi * recip;
|
||||
MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
|
||||
MD_FLOAT fpair = -psip * recip;
|
||||
|
||||
fix += delx * fpair;
|
||||
fiy += dely * fpair;
|
||||
fiz += delz * fpair;
|
||||
//fpair *= 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
fx[i] = fix;
|
||||
fy[i] = fiy;
|
||||
fz[i] = fiz;
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
*/
|
||||
LIKWID_MARKER_STOP("force_eam");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
1216
gromacs/force_lj.c
1216
gromacs/force_lj.c
File diff suppressed because it is too large
Load Diff
@@ -1,188 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
#include <box.h>
|
||||
|
||||
#ifndef __ATOM_H_
|
||||
#define __ATOM_H_
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
// Nbnxn layouts (as of GROMACS):
|
||||
// Simd4xN: M=4, N=VECTOR_WIDTH
|
||||
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
|
||||
// Cuda: M=8, N=VECTOR_WIDTH
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
# undef VECTOR_WIDTH
|
||||
# define VECTOR_WIDTH 8
|
||||
# define KERNEL_NAME "CUDA"
|
||||
# define CLUSTER_M 8
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_cuda
|
||||
# define initialIntegrate cudaInitialIntegrate
|
||||
# define finalIntegrate cudaFinalIntegrate
|
||||
# define updatePbc cudaUpdatePbc
|
||||
#else
|
||||
# define CLUSTER_M 4
|
||||
// Simd2xNN (here used for single-precision)
|
||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# define KERNEL_NAME "Simd2xNN"
|
||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 2
|
||||
# define computeForceLJ computeForceLJ_2xnn
|
||||
// Simd4xN
|
||||
# else
|
||||
# define KERNEL_NAME "Simd4xN"
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
# endif
|
||||
# ifdef USE_REFERENCE_VERSION
|
||||
# undef KERNEL_NAME
|
||||
# undef computeForceLJ
|
||||
# define KERNEL_NAME "Reference"
|
||||
# define computeForceLJ computeForceLJ_ref
|
||||
# endif
|
||||
# define initialIntegrate cpuInitialIntegrate
|
||||
# define finalIntegrate cpuFinalIntegrate
|
||||
# define updatePbc cpuUpdatePbc
|
||||
#endif
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
# define CJ0_FROM_CI(a) (a)
|
||||
# define CJ1_FROM_CI(a) (a)
|
||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
|
||||
# define CJ0_FROM_CI(a) ((a) << 1)
|
||||
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
|
||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
|
||||
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
|
||||
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
|
||||
# define CJ0_FROM_CI(a) ((a) >> 1)
|
||||
# define CJ1_FROM_CI(a) ((a) >> 1)
|
||||
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
|
||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
#else
|
||||
# error "Invalid cluster configuration!"
|
||||
#endif
|
||||
|
||||
#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
|
||||
# error "Cluster N dimension can be only 2, 4 and 8"
|
||||
#endif
|
||||
|
||||
#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
|
||||
#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
|
||||
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
|
||||
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
|
||||
|
||||
#if CLUSTER_M >= CLUSTER_N
|
||||
# define CL_X_OFFSET (0 * CLUSTER_M)
|
||||
# define CL_Y_OFFSET (1 * CLUSTER_M)
|
||||
# define CL_Z_OFFSET (2 * CLUSTER_M)
|
||||
#else
|
||||
# define CL_X_OFFSET (0 * CLUSTER_N)
|
||||
# define CL_Y_OFFSET (1 * CLUSTER_N)
|
||||
# define CL_Z_OFFSET (2 * CLUSTER_N)
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
int natoms;
|
||||
MD_FLOAT bbminx, bbmaxx;
|
||||
MD_FLOAT bbminy, bbmaxy;
|
||||
MD_FLOAT bbminz, bbmaxz;
|
||||
} Cluster;
|
||||
|
||||
typedef struct {
|
||||
int Natoms, Nlocal, Nghost, Nmax;
|
||||
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max, NmaxGhost,ncj;
|
||||
MD_FLOAT *x, *y, *z;
|
||||
MD_FLOAT *vx, *vy, *vz;
|
||||
int *border_map;
|
||||
int *type;
|
||||
int ntypes;
|
||||
MD_FLOAT *epsilon;
|
||||
MD_FLOAT *sigma6;
|
||||
MD_FLOAT *cutforcesq;
|
||||
MD_FLOAT *cutneighsq;
|
||||
//track the movement of a particle along boundaries
|
||||
int *PBCx, *PBCy, *PBCz;
|
||||
// Data in cluster format
|
||||
MD_FLOAT *cl_x;
|
||||
MD_FLOAT *cl_v;
|
||||
MD_FLOAT *cl_f;
|
||||
int *cl_type;
|
||||
Cluster *iclusters, *jclusters;
|
||||
int *icluster_bin;
|
||||
int dummy_cj;
|
||||
MD_UINT *exclusion_filter;
|
||||
MD_FLOAT *diagonal_4xn_j_minus_i;
|
||||
MD_FLOAT *diagonal_2xnn_j_minus_i;
|
||||
unsigned int masks_2xnn_hn[8];
|
||||
unsigned int masks_2xnn_fn[8];
|
||||
unsigned int masks_4xn_hn[16];
|
||||
unsigned int masks_4xn_fn[16];
|
||||
|
||||
//Info Subdomain
|
||||
Box mybox;
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
extern void initMasks(Atom*);
|
||||
extern void createAtom(Atom*, Parameter*);
|
||||
extern int readAtom(Atom*, Parameter*);
|
||||
extern int readAtom_pdb(Atom*, Parameter*);
|
||||
extern int readAtom_gro(Atom*, Parameter*);
|
||||
extern int readAtom_dmp(Atom*, Parameter*);
|
||||
extern void growAtom(Atom*);
|
||||
extern void growClusters(Atom*);
|
||||
|
||||
int packGhost(Atom*, int, MD_FLOAT* , int*);
|
||||
int unpackGhost(Atom*, int, MD_FLOAT*);
|
||||
int packExchange(Atom*, int, MD_FLOAT*);
|
||||
int unpackExchange(Atom*, int, MD_FLOAT*);
|
||||
void packForward(Atom*, int, int*, MD_FLOAT*, int*);
|
||||
void unpackForward(Atom*, int, int, MD_FLOAT*);
|
||||
void packReverse(Atom* , int , int , MD_FLOAT*);
|
||||
void unpackReverse(Atom*, int, int*, MD_FLOAT*);
|
||||
void pbc(Atom*);
|
||||
void copy(Atom*, int, int);
|
||||
|
||||
|
||||
#ifdef AOS
|
||||
# define POS_DATA_LAYOUT "AoS"
|
||||
# define atom_x(i) atom->x[(i) * 3 + 0]
|
||||
# define atom_y(i) atom->x[(i) * 3 + 1]
|
||||
# define atom_z(i) atom->x[(i) * 3 + 2]
|
||||
/*
|
||||
# define atom_vx(i) atom->vx[(i) * 3 + 0]
|
||||
# define atom_vy(i) atom->vx[(i) * 3 + 1]
|
||||
# define atom_vz(i) atom->vx[(i) * 3 + 2]
|
||||
# define atom_fx(i) atom->fx[(i) * 3 + 0]
|
||||
# define atom_fy(i) atom->fx[(i) * 3 + 1]
|
||||
# define atom_fz(i) atom->fx[(i) * 3 + 2]
|
||||
*/
|
||||
#else
|
||||
# define POS_DATA_LAYOUT "SoA"
|
||||
# define atom_x(i) atom->x[i]
|
||||
# define atom_y(i) atom->y[i]
|
||||
# define atom_z(i) atom->z[i]
|
||||
#endif
|
||||
|
||||
// TODO: allow to switch velocites and forces to AoS
|
||||
# define atom_vx(i) atom->vx[i]
|
||||
# define atom_vy(i) atom->vy[i]
|
||||
# define atom_vz(i) atom->vz[i]
|
||||
# define atom_fx(i) atom->fx[i]
|
||||
# define atom_fy(i) atom->fy[i]
|
||||
# define atom_fz(i) atom->fz[i]
|
||||
|
||||
#endif
|
||||
@@ -1,112 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdbool.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
#include <timers.h>
|
||||
#include <timing.h>
|
||||
#include <simd.h>
|
||||
/*
|
||||
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
|
||||
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
|
||||
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
|
||||
}
|
||||
|
||||
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
|
||||
}
|
||||
*/
|
||||
|
||||
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci+=2) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce);
|
||||
MD_SIMD_FLOAT dt = simd_broadcast(param->dt);
|
||||
|
||||
MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET]));
|
||||
MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET]));
|
||||
MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET]));
|
||||
MD_SIMD_FLOAT x_vector = simd_fma(vx_vector, dt, simd_load(&ci_x[CL_X_OFFSET]));
|
||||
MD_SIMD_FLOAT y_vector = simd_fma(vy_vector, dt, simd_load(&ci_x[CL_Y_OFFSET]));
|
||||
MD_SIMD_FLOAT z_vector = simd_fma(vz_vector, dt, simd_load(&ci_x[CL_Z_OFFSET]));
|
||||
|
||||
simd_store(&ci_v[CL_X_OFFSET], vx_vector);
|
||||
simd_store(&ci_v[CL_Y_OFFSET], vy_vector);
|
||||
simd_store(&ci_v[CL_Z_OFFSET], vz_vector);
|
||||
simd_store(&ci_x[CL_X_OFFSET], x_vector);
|
||||
simd_store(&ci_x[CL_Y_OFFSET], y_vector);
|
||||
simd_store(&ci_x[CL_Z_OFFSET], z_vector);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
|
||||
}
|
||||
|
||||
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci+=2) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce);
|
||||
MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET]));
|
||||
MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET]));
|
||||
MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET]));
|
||||
simd_store(&ci_v[CL_X_OFFSET], vx_vector);
|
||||
simd_store(&ci_v[CL_Y_OFFSET], vy_vector);
|
||||
simd_store(&ci_v[CL_Z_OFFSET], vz_vector);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
void cudaInitialIntegrate(Parameter*, Atom*);
|
||||
void cudaFinalIntegrate(Parameter*, Atom*);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __NEIGHBOR_H_
|
||||
#define __NEIGHBOR_H_
|
||||
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
|
||||
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
|
||||
// interaction masks (1 = interaction, 0 = no interaction)
|
||||
// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
|
||||
// so read them from right to left (least significant to most significant bit)
|
||||
// All interaction mask is the same for all kernels
|
||||
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
|
||||
// 4x4 kernel diagonal mask
|
||||
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
|
||||
// 4x2 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
|
||||
// 4x8 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
|
||||
|
||||
typedef struct {
|
||||
int cluster;
|
||||
int atom;
|
||||
} Pair;
|
||||
|
||||
typedef struct {
|
||||
int every;
|
||||
int ncalls;
|
||||
int maxneighs;
|
||||
int* numneigh;
|
||||
int* numneigh_masked;
|
||||
int half_neigh;
|
||||
int* neighbors;
|
||||
unsigned int* neighbors_imask;
|
||||
//MPI
|
||||
/*
|
||||
int Nshell; //# of atoms in listShell(Cluster here cover all possible ghost interactions)
|
||||
int *numNeighShell; //# of neighs for each atom in listShell
|
||||
Pair *neighshell; //list of neighs for each atom in listShell
|
||||
Pair *listshell; //Atoms to compute the force
|
||||
*/
|
||||
int Nshell; //# of cluster in listShell(Cluster here cover all possible ghost interactions)
|
||||
int *numNeighShell; //# of neighs for each atom in listShell
|
||||
int *neighshell; //list of neighs for each atom in listShell
|
||||
int *listshell; //Atoms to compute the force
|
||||
} Neighbor;
|
||||
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*);
|
||||
extern void setupNeighbor(Parameter*, Atom*);
|
||||
extern void binatoms(Atom*);
|
||||
extern void buildNeighbor(Atom*, Neighbor*);
|
||||
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
|
||||
extern void sortAtom(Atom*);
|
||||
extern void buildClusters(Atom*);
|
||||
extern void defineJClusters(Atom*);
|
||||
extern void binClusters(Atom*);
|
||||
extern void updateSingleAtoms(Atom*);
|
||||
#endif
|
||||
@@ -1,20 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __PBC_H_
|
||||
#define __PBC_H_
|
||||
extern void initPbc();
|
||||
extern void cpuUpdatePbc(Atom*, Parameter*, int);
|
||||
extern void updateAtomsPbc(Atom*, Parameter*);
|
||||
extern void setupPbc(Atom*, Parameter*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern void cudaUpdatePbc(Atom*, Parameter*, int);
|
||||
#endif
|
||||
#endif
|
||||
@@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __STATS_H_
|
||||
#define __STATS_H_
|
||||
typedef struct {
|
||||
long long int calculated_forces;
|
||||
long long int num_neighs;
|
||||
long long int force_iters;
|
||||
long long int atoms_within_cutoff;
|
||||
long long int atoms_outside_cutoff;
|
||||
long long int clusters_within_cutoff;
|
||||
long long int clusters_outside_cutoff;
|
||||
} Stats;
|
||||
|
||||
void initStats(Stats *s);
|
||||
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
|
||||
|
||||
#ifdef COMPUTE_STATS
|
||||
# define addStat(stat, value) stat += value;
|
||||
# define beginStatTimer() double Si = getTimeStamp();
|
||||
# define endStatTimer(stat) stat += getTimeStamp() - Si;
|
||||
#else
|
||||
# define addStat(stat, value)
|
||||
# define beginStatTimer()
|
||||
# define endStatTimer(stat)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,102 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
#ifndef VECTOR_WIDTH
|
||||
# define VECTOR_WIDTH 8
|
||||
#endif
|
||||
|
||||
#ifndef TRACER_CONDITION
|
||||
# define TRACER_CONDITION (!(timestep % param->every))
|
||||
#endif
|
||||
|
||||
#ifdef MEM_TRACER
|
||||
# define MEM_TRACER_INIT FILE *mem_tracer_fp; \
|
||||
if(TRACER_CONDITION) { \
|
||||
char mem_tracer_fn[128]; \
|
||||
snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
|
||||
mem_tracer_fp = fopen(mem_tracer_fn, "w");
|
||||
}
|
||||
|
||||
# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
|
||||
# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
|
||||
#else
|
||||
# define MEM_TRACER_INIT
|
||||
# define MEM_TRACER_END
|
||||
# define MEM_TRACE(addr, op)
|
||||
#endif
|
||||
|
||||
#ifdef INDEX_TRACER
|
||||
# define INDEX_TRACER_INIT FILE *index_tracer_fp; \
|
||||
if(TRACER_CONDITION) { \
|
||||
char index_tracer_fn[128]; \
|
||||
snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
|
||||
index_tracer_fp = fopen(index_tracer_fn, "w"); \
|
||||
}
|
||||
|
||||
# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); }
|
||||
# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
|
||||
# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
|
||||
# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
fprintf(index_tracer_fp, "I: "); \
|
||||
for(int __j = 0; __j < __e; ++__j) { \
|
||||
fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
|
||||
} \
|
||||
fprintf(index_tracer_fp, "\n"); \
|
||||
} \
|
||||
}
|
||||
|
||||
# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
if(__e > 1) { \
|
||||
for(int __j = __i; __j < __i + __e - 1; ++__j) { \
|
||||
for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
|
||||
if(l[__k] > l[__k + 1]) { \
|
||||
int __t = l[__k]; \
|
||||
l[__k] = l[__k + 1]; \
|
||||
l[__k + 1] = __t; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
if(__e > 1) { \
|
||||
fprintf(index_tracer_fp, "D: "); \
|
||||
for(int __j = 0; __j < __e - 1; ++__j) { \
|
||||
int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
|
||||
fprintf(index_tracer_fp, "%d ", __dist); \
|
||||
} \
|
||||
fprintf(index_tracer_fp, "\n"); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
# define INDEX_TRACER_INIT
|
||||
# define INDEX_TRACER_END
|
||||
# define INDEX_TRACE_NATOMS(nl, ng, mn)
|
||||
# define INDEX_TRACE_ATOM(a)
|
||||
# define INDEX_TRACE(l, e)
|
||||
# define DIST_TRACE_SORT(l, e)
|
||||
# define DIST_TRACE(l, e)
|
||||
#endif
|
||||
|
||||
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);
|
||||
@@ -1,19 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <comm.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __VTK_H_
|
||||
#define __VTK_H_
|
||||
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
|
||||
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep);
|
||||
#endif
|
||||
@@ -1,21 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
|
||||
#ifndef __XTC_H_
|
||||
#define __XTC_H_
|
||||
|
||||
#ifdef XTC_OUTPUT
|
||||
void xtc_init(const char *, Atom*, int);
|
||||
void xtc_write(Atom*, int, int, int);
|
||||
void xtc_end();
|
||||
#else
|
||||
#define xtc_init(a,b,c)
|
||||
#define xtc_write(a,b,c,d)
|
||||
#define xtc_end()
|
||||
#endif
|
||||
#endif
|
||||
@@ -1,344 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
//---
|
||||
#include <timing.h>
|
||||
#include <allocate.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <thermo.h>
|
||||
#include <eam.h>
|
||||
#include <pbc.h>
|
||||
#include <timers.h>
|
||||
#include <util.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
// Patterns
|
||||
#define P_SEQ 0
|
||||
#define P_FIX 1
|
||||
#define P_RAND 2
|
||||
|
||||
void init(Parameter *param) {
|
||||
param->input_file = NULL;
|
||||
param->force_field = FF_LJ;
|
||||
param->epsilon = 1.0;
|
||||
param->sigma6 = 1.0;
|
||||
param->rho = 0.8442;
|
||||
param->ntypes = 4;
|
||||
param->ntimes = 200;
|
||||
param->nx = 1;
|
||||
param->ny = 1;
|
||||
param->nz = 1;
|
||||
param->lattice = 1.0;
|
||||
param->cutforce = 1000000.0;
|
||||
param->cutneigh = param->cutforce;
|
||||
param->mass = 1.0;
|
||||
param->half_neigh = 0;
|
||||
// Unused
|
||||
param->dt = 0.005;
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
param->nstat = 100;
|
||||
param->temp = 1.44;
|
||||
param->reneigh_every = 20;
|
||||
param->proc_freq = 2.4;
|
||||
param->eam_file = NULL;
|
||||
}
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
|
||||
const int maxneighs = nneighs * nreps;
|
||||
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
const int ncj = atom->Nclusters_local / jfac;
|
||||
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
|
||||
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
|
||||
|
||||
if(pattern == P_RAND && ncj <= nneighs) {
|
||||
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
|
||||
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
|
||||
int m = (pattern == P_SEQ) ? ncj : nneighs;
|
||||
int k = 0;
|
||||
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
if(pattern == P_RAND) {
|
||||
int found = 0;
|
||||
do {
|
||||
int cj = rand() % ncj;
|
||||
neighptr[k] = cj;
|
||||
neighptr_imask[k] = imask;
|
||||
found = 0;
|
||||
for(int l = 0; l < k; l++) {
|
||||
if(neighptr[l] == cj) {
|
||||
found = 1;
|
||||
}
|
||||
}
|
||||
} while(found == 1);
|
||||
} else {
|
||||
neighptr[k] = j;
|
||||
neighptr_imask[k] = imask;
|
||||
j = (j + 1) % m;
|
||||
}
|
||||
}
|
||||
|
||||
for(int r = 1; r < nreps; r++) {
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
neighptr[r * nneighs + k] = neighptr[k];
|
||||
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = nneighs * nreps;
|
||||
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
Eam eam;
|
||||
Atom atom_data;
|
||||
Atom *atom = (Atom *)(&atom_data);
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
char *pattern_str = NULL;
|
||||
int pattern = P_SEQ;
|
||||
int niclusters = 256; // Number of local i-clusters
|
||||
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
|
||||
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
|
||||
int masked = 0; // Use masked loop
|
||||
int nreps = 1;
|
||||
int csv = 0;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
DEBUG_MESSAGE("Initializing parameters...\n");
|
||||
init(¶m);
|
||||
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-f") == 0)) {
|
||||
if((param.force_field = str2ff(argv[++i])) < 0) {
|
||||
fprintf(stderr, "Invalid force field!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-p") == 0)) {
|
||||
pattern_str = strdup(argv[++i]);
|
||||
if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
|
||||
else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
|
||||
else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
|
||||
else {
|
||||
fprintf(stderr, "Invalid pattern!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-e") == 0)) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-m") == 0)) {
|
||||
masked = 1;
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-ni") == 0)) {
|
||||
niclusters = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-na") == 0)) {
|
||||
iclusters_natoms = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nn") == 0)) {
|
||||
nneighs = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nr") == 0)) {
|
||||
nreps = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--freq") == 0)) {
|
||||
param.proc_freq = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--csv") == 0)) {
|
||||
csv = 1;
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-f <string>: force field (lj or eam), default lj\n");
|
||||
printf("-p <string>: pattern for data accesses (seq, fix or rand)\n");
|
||||
printf("-n / --nsteps <int>: number of timesteps for simulation\n");
|
||||
printf("-ni <int>: number of i-clusters (default 256)\n");
|
||||
printf("-na <int>: number of atoms per i-cluster (default %d)\n", CLUSTER_M);
|
||||
printf("-nn <int>: number of j-cluster neighbors per i-cluster (default 9)\n");
|
||||
printf("-nr <int>: number of times neighbor lists should be replicated (default 1)\n");
|
||||
printf("--freq <real>: set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
|
||||
printf("--csv: set output as CSV style\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(pattern_str == NULL) {
|
||||
pattern_str = strdup("seq\0");
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
DEBUG_MESSAGE("Initializing EAM parameters...\n");
|
||||
initEam(&eam, ¶m);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Initializing atoms...\n");
|
||||
initAtom(atom);
|
||||
initStats(&stats);
|
||||
|
||||
atom->ntypes = param.ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param.epsilon;
|
||||
atom->sigma6[i] = param.sigma6;
|
||||
atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
|
||||
atom->cutforcesq[i] = param.cutforce * param.cutforce;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Creating atoms...\n");
|
||||
while(atom->Nmax < niclusters * iclusters_natoms) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
while(atom->Nclusters_max < niclusters) {
|
||||
growClusters(atom);
|
||||
}
|
||||
|
||||
for(int ci = 0; ci < niclusters; ++ci) {
|
||||
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
int *ci_type = &atom->cl_type[ci_sca_base];
|
||||
|
||||
for(int cii = 0; cii < iclusters_natoms; ++cii) {
|
||||
ci_x[CL_X_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
|
||||
ci_x[CL_Y_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
|
||||
ci_x[CL_Z_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
|
||||
ci_v[CL_X_OFFSET + cii] = 0.0;
|
||||
ci_v[CL_Y_OFFSET + cii] = 0.0;
|
||||
ci_v[CL_Z_OFFSET + cii] = 0.0;
|
||||
ci_type[cii] = rand() % atom->ntypes;
|
||||
atom->Nlocal++;
|
||||
}
|
||||
|
||||
for(int cii = iclusters_natoms; cii < CLUSTER_M; cii++) {
|
||||
ci_x[CL_X_OFFSET + cii] = INFINITY;
|
||||
ci_x[CL_Y_OFFSET + cii] = INFINITY;
|
||||
ci_x[CL_Z_OFFSET + cii] = INFINITY;
|
||||
}
|
||||
|
||||
atom->iclusters[ci].natoms = iclusters_natoms;
|
||||
atom->Nclusters_local++;
|
||||
}
|
||||
|
||||
const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
|
||||
const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
|
||||
const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
|
||||
|
||||
if(!csv) {
|
||||
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
|
||||
printf("Floating-point precision: %s\n", PRECISION_STRING);
|
||||
printf("Pattern: %s\n", pattern_str);
|
||||
printf("Number of timesteps: %d\n", param.ntimes);
|
||||
printf("Number of i-clusters: %d\n", niclusters);
|
||||
printf("Number of atoms per i-cluster: %d\n", iclusters_natoms);
|
||||
printf("Number of j-cluster neighbors per i-cluster: %d\n", nneighs);
|
||||
printf("Number of times to replicate neighbor lists: %d\n", nreps);
|
||||
printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
|
||||
printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
|
||||
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Defining j-clusters...\n");
|
||||
defineJClusters(atom);
|
||||
DEBUG_MESSAGE("Initializing neighbor lists...\n");
|
||||
initNeighbor(&neighbor, ¶m);
|
||||
DEBUG_MESSAGE("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
|
||||
DEBUG_MESSAGE("Computing forces...\n");
|
||||
|
||||
double T_accum = 0.0;
|
||||
for(int i = 0; i < param.ntimes; i++) {
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, atom, &neighbor, i + 1);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
T_accum += computeForceEam(&eam, ¶m, atom, &neighbor, &stats);
|
||||
} else {
|
||||
T_accum += computeForceLJ(¶m, atom, &neighbor, &stats);
|
||||
}
|
||||
}
|
||||
|
||||
double freq_hz = param.proc_freq * 1.e9;
|
||||
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
|
||||
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
|
||||
const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
|
||||
|
||||
if(!csv) {
|
||||
printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
|
||||
}
|
||||
} else {
|
||||
printf("steps,pattern,niclusters,iclusters_natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf(",cy/atom,cy/neigh");
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("%d,%s,%d,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
|
||||
param.ntimes, pattern_str, niclusters, iclusters_natoms, nneighs, nreps,
|
||||
estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
|
||||
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
double timer[NUMTIMER];
|
||||
timer[FORCE] = T_accum;
|
||||
displayStatistics(atom, ¶m, &stats, timer);
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
404
gromacs/main.c
404
gromacs/main.c
@@ -1,404 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
//--
|
||||
#include <likwid-marker.h>
|
||||
//--
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <eam.h>
|
||||
#include <integrate.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <pbc.h>
|
||||
#include <stats.h>
|
||||
#include <thermo.h>
|
||||
#include <timers.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
#include <vtk.h>
|
||||
#include <xtc.h>
|
||||
#include <comm.h>
|
||||
#include <grid.h>
|
||||
#include <shell_methods.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern int isReneighboured;
|
||||
extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
|
||||
extern void copyDataToCUDADevice(Atom *atom);
|
||||
extern void copyDataFromCUDADevice(Atom *atom);
|
||||
extern void cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time)
|
||||
{
|
||||
double S, E;
|
||||
int dims = 3; //TODO: Adjust to do in 3d and 2d
|
||||
S = getTimeStamp();
|
||||
if(param->balance == RCB) {
|
||||
rcbBalance(grid, atom, param, meanBisect,dims,0);
|
||||
neighComm(comm, param, grid);
|
||||
}else if(param->balance == meanTimeRCB){
|
||||
rcbBalance(grid, atom, param, meanTimeBisect,dims,time);
|
||||
neighComm(comm, param, grid);
|
||||
}else if(param->balance == Staggered) {
|
||||
staggeredBalance(grid, atom, param, time);
|
||||
neighComm(comm, param, grid);
|
||||
exchangeComm(comm,atom);
|
||||
}else { } //Do nothing
|
||||
//printGrid(grid);
|
||||
E = getTimeStamp();
|
||||
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid)
|
||||
{
|
||||
double E,S,time;
|
||||
int me;
|
||||
MPI_Comm_rank(world,&me);
|
||||
S = getTimeStamp();
|
||||
if(param->balance == meanTimeRCB || param->balance == RCB){
|
||||
rcbBalance(grid, atom, param, meanBisect,3,0);
|
||||
neighComm(comm, param, grid);
|
||||
}
|
||||
MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
|
||||
printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms);
|
||||
MPI_Barrier(world);
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
|
||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||
double S, E;
|
||||
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||
param->xprd = param->nx * param->lattice;
|
||||
param->yprd = param->ny * param->lattice;
|
||||
param->zprd = param->nz * param->lattice;
|
||||
S = getTimeStamp();
|
||||
initAtom(atom);
|
||||
//initPbc(atom);
|
||||
initStats(stats);
|
||||
initNeighbor(neighbor, param);
|
||||
if(param->input_file == NULL) {
|
||||
createAtom(atom, param);
|
||||
} else {
|
||||
readAtom(atom, param);
|
||||
}
|
||||
setupGrid(grid,atom,param);
|
||||
setupNeighbor(param, atom);
|
||||
setupComm(comm, param, grid);
|
||||
if(param->balance){
|
||||
initialBalance(param, eam, atom, neighbor, stats, comm, grid);
|
||||
}
|
||||
setupThermo(param, atom->Natoms);
|
||||
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
||||
buildClusters(atom);
|
||||
defineJClusters(atom);
|
||||
//setupPbc(atom, param);
|
||||
ghostNeighbor(comm, atom, param); //change
|
||||
binClusters(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
initDevice(atom, neighbor);
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
double S, E;
|
||||
S = getTimeStamp();
|
||||
LIKWID_MARKER_START("reneighbour");
|
||||
//updateAtomsPbc(atom, param);
|
||||
buildClusters(atom);
|
||||
defineJClusters(atom);
|
||||
//setupPbc(atom, param);
|
||||
ghostNeighbor(comm, atom, param);
|
||||
binClusters(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
LIKWID_MARKER_STOP("reneighbour");
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double updateAtoms(Comm* comm, Atom* atom){
|
||||
double S,E;
|
||||
S = getTimeStamp();
|
||||
updateSingleAtoms(atom);
|
||||
exchangeComm(comm, atom);
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
double timer[NUMTIMER];
|
||||
Eam eam;
|
||||
Atom atom;
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
Comm comm;
|
||||
Grid grid;
|
||||
LIKWID_MARKER_INIT;
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
//LIKWID_MARKER_REGISTER("reneighbour");
|
||||
//LIKWID_MARKER_REGISTER("pbc");
|
||||
}
|
||||
initComm(&argc, &argv, &comm); //change
|
||||
initParameter(¶m);
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
|
||||
readParameter(¶m, argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-f") == 0)) {
|
||||
if((param.force_field = str2ff(argv[++i])) < 0) {
|
||||
fprintf(stderr, "Invalid force field!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-i") == 0)) {
|
||||
param.input_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-e") == 0)) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nx") == 0)) {
|
||||
param.nx = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-ny") == 0)) {
|
||||
param.ny = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nz") == 0)) {
|
||||
param.nz = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-half") == 0)) {
|
||||
param.half_neigh = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-method") == 0)) {
|
||||
param.method = atoi(argv[++i]);
|
||||
if (param.method>2 || param.method< 0){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-bal") == 0)) {
|
||||
param.balance = atoi(argv[++i]);
|
||||
if (param.balance>3 || param.balance< 0){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Load balance does not exist!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
|
||||
param.mass = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
|
||||
param.cutforce = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
|
||||
param.skin = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--freq") == 0)) {
|
||||
param.proc_freq = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--vtk") == 0)) {
|
||||
param.vtk_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--xtc") == 0)) {
|
||||
#ifndef XTC_OUTPUT
|
||||
fprintf(stderr, "XTC not available, set XTC_OUTPUT option in config.mk file and recompile MD-Bench!");
|
||||
exit(-1);
|
||||
#else
|
||||
param.xtc_file = strdup(argv[++i]);
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
//TODO: add the shell and ac print options
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
|
||||
printf("-f <string>: force field (lj or eam), default lj\n");
|
||||
printf("-i <string>: input file with atom positions (dump)\n");
|
||||
printf("-e <string>: input file for EAM\n");
|
||||
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
|
||||
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
|
||||
printf("-r / --radius <real>: set cutoff radius\n");
|
||||
printf("-s / --skin <real>: set skin (verlet buffer)\n");
|
||||
printf("--freq <real>: processor frequency (GHz)\n");
|
||||
printf("--vtk <string>: VTK file for visualization\n");
|
||||
printf("--xtc <string>: XTC file for visualization\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(param.balance>0 && param.method == 1){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
param.cutneigh = param.cutforce + param.skin;
|
||||
timer[SETUP]=setup(¶m, &eam, &atom, &neighbor, &stats, &comm, &grid);
|
||||
if(comm.myproc == 0) printParameter(¶m);
|
||||
if(comm.myproc == 0) printf(HLINE);
|
||||
if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
#endif
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] = computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
timer[FORCE] = computeForceLJ(¶m, &atom, &neighbor, &stats);
|
||||
}
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[FORWARD] = 0.0;
|
||||
timer[UPDATE] = 0.0;
|
||||
timer[BALANCE] = 0.0;
|
||||
timer[REVERSE] = reverse(&comm, &atom, ¶m);
|
||||
MPI_Barrier(world);
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
if(param.vtk_file != NULL) {
|
||||
//write_data_to_vtk_file(param.vtk_file, &comm ,&atom, 0);
|
||||
printvtk(param.vtk_file, &comm, &atom, ¶m, 0);
|
||||
}
|
||||
//TODO: modify xct
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_init(param.xtc_file, &atom, 0);
|
||||
}
|
||||
double forceTime=0.0;
|
||||
double commTime=0.0;
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
initialIntegrate(¶m, &atom);
|
||||
if((n + 1) % param.reneigh_every) {
|
||||
timer[FORWARD]+=forward(&comm, &atom, ¶m);
|
||||
if(!((n + 1) % param.prune_every)){
|
||||
pruneNeighbor(¶m, &atom, &neighbor);
|
||||
}
|
||||
} else {
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
timer[UPDATE] +=updateAtoms(&comm,&atom);
|
||||
if(param.balance && !((n+1)%param.balance_every))
|
||||
timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , ¶m, timer[FORCE]);
|
||||
timer[NEIGH] += reneighbour(&comm, ¶m, &atom, &neighbor);
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
isReneighboured = 1;
|
||||
#endif
|
||||
}
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] += computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
timer[FORCE] += computeForceLJ(¶m, &atom, &neighbor, &stats);
|
||||
}
|
||||
timer[REVERSE] += reverse(&comm, &atom, ¶m);
|
||||
finalIntegrate(¶m, &atom);
|
||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||
computeThermo(n + 1, ¶m, &atom);
|
||||
}
|
||||
int write_pos = !((n + 1) % param.x_out_every);
|
||||
int write_vel = !((n + 1) % param.v_out_every);
|
||||
if(write_pos || write_vel) {
|
||||
if(param.vtk_file != NULL) {
|
||||
printvtk(param.vtk_file, &comm, &atom, ¶m, n+1);
|
||||
}
|
||||
//TODO: xtc file
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_write(&atom, n + 1, write_pos, write_vel);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
MPI_Barrier(world);
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
updateAtoms(&comm,&atom);
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
//TODO:
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_end();
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
cudaDeviceFree();
|
||||
#endif
|
||||
double mint[NUMTIMER];
|
||||
double maxt[NUMTIMER];
|
||||
double sumt[NUMTIMER];
|
||||
timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE];
|
||||
MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world);
|
||||
MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world);
|
||||
MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world);
|
||||
int Nghost;
|
||||
MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world);
|
||||
|
||||
if(comm.myproc == 0){
|
||||
int n = comm.numproc;
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs\n\n",timer[TOTAL]);
|
||||
printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP");
|
||||
printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n");
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n);
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]);
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]);
|
||||
printf(HLINE);
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
|
||||
#ifdef COMPUTE_STATS
|
||||
displayStatistics(&atom, ¶m, &stats, timer);
|
||||
#endif
|
||||
}
|
||||
endComm(&comm);
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
1114
gromacs/neighbor.c
1114
gromacs/neighbor.c
File diff suppressed because it is too large
Load Diff
231
gromacs/pbc.c
231
gromacs/pbc.c
@@ -1,231 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <pbc.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <neighbor.h>
|
||||
#include <util.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
static int NmaxGhost;
|
||||
|
||||
static void growPbc(Atom*);
|
||||
|
||||
/* exported subroutines */
|
||||
void initPbc(Atom* atom) {
|
||||
NmaxGhost = 0;
|
||||
atom->border_map = NULL;
|
||||
atom->PBCx = NULL; atom->PBCy = NULL; atom->PBCz = NULL;
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
||||
DEBUG_MESSAGE("updatePbc start\n");
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
|
||||
const int cj = ncj + cg;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
|
||||
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
|
||||
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
|
||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
||||
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
|
||||
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
|
||||
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
|
||||
|
||||
cj_x[CL_X_OFFSET + cjj] = xtmp;
|
||||
cj_x[CL_Y_OFFSET + cjj] = ytmp;
|
||||
cj_x[CL_Z_OFFSET + cjj] = ztmp;
|
||||
|
||||
if(firstUpdate) {
|
||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
||||
}
|
||||
}
|
||||
|
||||
if(firstUpdate) {
|
||||
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
|
||||
}
|
||||
|
||||
atom->jclusters[cj].bbminx = bbminx;
|
||||
atom->jclusters[cj].bbmaxx = bbmaxx;
|
||||
atom->jclusters[cj].bbminy = bbminy;
|
||||
atom->jclusters[cj].bbmaxy = bbmaxy;
|
||||
atom->jclusters[cj].bbminz = bbminz;
|
||||
atom->jclusters[cj].bbmaxz = bbmaxz;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("updatePbc end\n");
|
||||
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
* to periodic boundary conditions */
|
||||
void updateAtomsPbc(Atom *atom, Parameter *param) {
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
if(atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if(atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if(atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if(atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if(atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if(atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup periodic boundary conditions by
|
||||
* defining ghost atoms around domain
|
||||
* only creates mapping and coordinate corrections
|
||||
* that are then enforced in updatePbc */
|
||||
#define ADDGHOST(dx,dy,dz); \
|
||||
Nghost++; \
|
||||
const int cg = ncj + Nghost; \
|
||||
const int cj_natoms = atom->jclusters[cj].natoms; \
|
||||
atom->border_map[Nghost] = cj; \
|
||||
atom->PBCx[Nghost] = dx; \
|
||||
atom->PBCy[Nghost] = dy; \
|
||||
atom->PBCz[Nghost] = dz; \
|
||||
atom->jclusters[cg].natoms = cj_natoms; \
|
||||
Nghost_atoms += cj_natoms; \
|
||||
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \
|
||||
int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \
|
||||
for(int cjj = 0; cjj < cj_natoms; cjj++) { \
|
||||
atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
void growPbc(Atom* atom) {
|
||||
int nold = NmaxGhost;
|
||||
NmaxGhost += DELTA;
|
||||
|
||||
atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
}
|
||||
|
||||
void setupPbc(Atom *atom, Parameter *param) {
|
||||
DEBUG_MESSAGE("setupPbc start\n");
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
MD_FLOAT Cutneigh = param->cutneigh;
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
int Nghost = -1;
|
||||
int Nghost_atoms = 0;
|
||||
|
||||
for(int cj = 0; cj < ncj; cj++) {
|
||||
if(atom->jclusters[cj].natoms > 0) {
|
||||
if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
|
||||
growClusters(atom);
|
||||
}
|
||||
|
||||
if((Nghost + 7) * jfac >= NmaxGhost) {
|
||||
growPbc(atom);
|
||||
}
|
||||
|
||||
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
|
||||
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
|
||||
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
|
||||
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
|
||||
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
|
||||
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
|
||||
|
||||
/* Setup ghost atoms */
|
||||
/* 6 planes */
|
||||
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
|
||||
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
|
||||
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
|
||||
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
|
||||
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
|
||||
/* 8 corners */
|
||||
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
|
||||
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
|
||||
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
|
||||
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
|
||||
/* 12 edges */
|
||||
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
|
||||
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
|
||||
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
|
||||
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
|
||||
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
|
||||
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
|
||||
}
|
||||
}
|
||||
|
||||
if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
|
||||
growClusters(atom);
|
||||
}
|
||||
|
||||
// Add dummy cluster at the end
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
|
||||
}
|
||||
|
||||
// increase by one to make it the ghost atom count
|
||||
atom->dummy_cj = ncj + Nghost + 1;
|
||||
atom->Nghost = Nghost_atoms;
|
||||
atom->Nclusters_ghost = Nghost + 1;
|
||||
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
|
||||
|
||||
// Update created ghost clusters positions
|
||||
cpuUpdatePbc(atom, param, 1);
|
||||
DEBUG_MESSAGE("setupPbc end\n");
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timers.h>
|
||||
|
||||
void initStats(Stats *s) {
|
||||
s->calculated_forces = 0;
|
||||
s->num_neighs = 0;
|
||||
s->force_iters = 0;
|
||||
s->atoms_within_cutoff = 0;
|
||||
s->atoms_outside_cutoff = 0;
|
||||
s->clusters_within_cutoff = 0;
|
||||
s->clusters_outside_cutoff = 0;
|
||||
}
|
||||
|
||||
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
|
||||
#ifdef COMPUTE_STATS
|
||||
|
||||
const int MxN = CLUSTER_M * CLUSTER_N;
|
||||
double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
|
||||
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
|
||||
(double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
|
||||
double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1));
|
||||
double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
|
||||
double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1));
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
force_useful_volume += 1e-9 * (double)((atom->Nlocal * (param->ntimes + 1)) + stats->num_neighs) * sizeof(int);
|
||||
#endif
|
||||
|
||||
printf("Statistics:\n");
|
||||
printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
|
||||
printf("\tAverage atoms per cluster: %.4f\n", avg_atoms_cluster);
|
||||
printf("\tAverage neighbors per atom: %.4f\n", avg_neigh_atom);
|
||||
printf("\tAverage neighbors per cluster: %.4f\n", avg_neigh_cluster);
|
||||
printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
|
||||
printf("\tTotal number of computed pair interactions: %lld\n", stats->num_neighs * MxN);
|
||||
printf("\tTotal number of SIMD iterations: %lld\n", stats->force_iters);
|
||||
printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
|
||||
printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->force_iters);
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
const double atoms_eff = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
|
||||
printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->atoms_within_cutoff, stats->atoms_outside_cutoff, atoms_eff);
|
||||
const double clusters_eff = (double)stats->clusters_within_cutoff / (double)(stats->clusters_within_cutoff + stats->clusters_outside_cutoff) * 100.0;
|
||||
printf("\tClusters within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->clusters_within_cutoff, stats->clusters_outside_cutoff, clusters_eff);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
@@ -1,61 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <tracing.h>
|
||||
|
||||
void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
|
||||
MEM_TRACER_INIT;
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
int *neighs;
|
||||
unsigned int *neighs_imask;
|
||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MEM_TRACE(atom_x(i), 'R');
|
||||
MEM_TRACE(atom_y(i), 'R');
|
||||
MEM_TRACE(atom_z(i), 'R');
|
||||
INDEX_TRACE_ATOM(i);
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MEM_TRACE(atom->type[i], 'R');
|
||||
#endif
|
||||
|
||||
DIST_TRACE_SORT(neighs, numneighs);
|
||||
INDEX_TRACE(neighs, numneighs);
|
||||
DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MEM_TRACE(j, 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
MEM_TRACE(atom_z(j), 'R');
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MEM_TRACE(atom->type[j], 'R');
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
MEM_TRACE(fx[i], 'R');
|
||||
MEM_TRACE(fx[i], 'W');
|
||||
MEM_TRACE(fy[i], 'R');
|
||||
MEM_TRACE(fy[i], 'W');
|
||||
MEM_TRACE(fz[i], 'R');
|
||||
MEM_TRACE(fz[i], 'W');
|
||||
*/
|
||||
}
|
||||
|
||||
INDEX_TRACER_END;
|
||||
MEM_TRACER_END;
|
||||
}
|
||||
320
gromacs/vtk.c
320
gromacs/vtk.c
@@ -1,320 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <vtk.h>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
static MPI_File _fh;
|
||||
static inline void flushBuffer(char*);
|
||||
|
||||
void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
|
||||
write_local_atoms_to_vtk_file(filename, atom, timestep);
|
||||
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
|
||||
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||
}
|
||||
|
||||
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "1 %d\n", i);
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "1\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
|
||||
fprintf(fp, "SCALARS mass double\n");
|
||||
fprintf(fp, "LOOKUP_TABLE default\n");
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
fprintf(fp, "1.0\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nghost);
|
||||
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELLS %d %d\n", atom->Nghost, atom->Nghost * 2);
|
||||
for(int i = 0; i < atom->Nghost; ++i) {
|
||||
fprintf(fp, "1 %d\n", i);
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELL_TYPES %d\n", atom->Nghost);
|
||||
for(int i = 0; i < atom->Nghost; ++i) {
|
||||
fprintf(fp, "1\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "POINT_DATA %d\n", atom->Nghost);
|
||||
fprintf(fp, "SCALARS mass double\n");
|
||||
fprintf(fp, "LOOKUP_TABLE default\n");
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fprintf(fp, "1.0\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_edges_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
int N = atom->Nclusters_local;
|
||||
int tot_lines = 0;
|
||||
int i = 0;
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET POLYDATA\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
|
||||
for(int ci = 0; ci < N; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
|
||||
tot_lines += atom->iclusters[ci].natoms;
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "LINES %d %d\n", N, N + tot_lines);
|
||||
for(int ci = 0; ci < N; ++ci) {
|
||||
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%d ", i++);
|
||||
}
|
||||
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_edges_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
int N = atom->Nclusters_local + atom->Nclusters_ghost;
|
||||
int tot_lines = 0;
|
||||
int i = 0;
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET POLYDATA\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nghost);
|
||||
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
|
||||
tot_lines += atom->iclusters[ci].natoms;
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines);
|
||||
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
|
||||
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%d ", i++);
|
||||
}
|
||||
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep)
|
||||
{
|
||||
char msg[256];
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
|
||||
MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh);
|
||||
if(_fh == MPI_FILE_NULL) {
|
||||
if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (comm->myproc==0){
|
||||
sprintf(msg, "# vtk DataFile Version 2.0\n");
|
||||
sprintf(msg, "%sParticle data\n",msg);
|
||||
sprintf(msg, "%sASCII\n",msg);
|
||||
sprintf(msg, "%sDATASET UNSTRUCTURED_GRID\n",msg);
|
||||
sprintf(msg, "%sPOINTS %d double\n",msg, atom->Natoms);
|
||||
flushBuffer(msg);
|
||||
}
|
||||
}
|
||||
|
||||
int vtkVector(Comm* comm, Atom* atom, Parameter* param)
|
||||
{
|
||||
if (_fh == MPI_FILE_NULL) {
|
||||
if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
int sizeline= 25; //#initial guess of characters in "%.4f %.4f %.4f\n"
|
||||
int extrabuff = 100;
|
||||
int sizebuff = sizeline*atom->Nlocal+extrabuff;
|
||||
int mysize = 0;
|
||||
char* msg = (char*) malloc(sizebuff);
|
||||
sprintf(msg, "");
|
||||
for(int i = 0; i < atom->Nlocal; i++){
|
||||
if(mysize+extrabuff >= sizebuff){
|
||||
sizebuff*= 1.5;
|
||||
msg = (char*) realloc(msg, sizebuff);
|
||||
}
|
||||
//TODO: do not forget to add param->xlo, param->ylo, param->zlo
|
||||
sprintf(msg, "%s%.4f %.4f %.4f\n",msg, atom_x(i), atom_y(i), atom_z(i));
|
||||
mysize = strlen(msg);
|
||||
}
|
||||
int gatherSize[comm->numproc];
|
||||
|
||||
MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD);
|
||||
int offset=0;
|
||||
int globalSize = 0;
|
||||
|
||||
for(int i = 0; i < comm->myproc; i++)
|
||||
offset+= gatherSize[i];
|
||||
|
||||
for(int i = 0; i < comm->numproc; i++)
|
||||
globalSize+= gatherSize[i];
|
||||
|
||||
MPI_Offset displ;
|
||||
MPI_Datatype FileType;
|
||||
int GlobalSize[] = {globalSize};
|
||||
int LocalSize[] = {mysize};
|
||||
int Start[] = {offset};
|
||||
|
||||
if(LocalSize[0]>0){
|
||||
MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType);
|
||||
} else {
|
||||
MPI_Type_vector(0,0,0,MPI_CHAR,&FileType);
|
||||
}
|
||||
MPI_Type_commit(&FileType);
|
||||
MPI_File_get_size(_fh, &displ);
|
||||
MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL);
|
||||
MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL);
|
||||
|
||||
if (comm->myproc==0){
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sCELLS %d %d\n", msg, atom->Natoms, atom->Natoms * 2);
|
||||
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1 %d\n", msg, i);
|
||||
flushBuffer(msg);
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sCELL_TYPES %d\n",msg, atom->Natoms);
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1\n",msg);
|
||||
flushBuffer(msg);
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sPOINT_DATA %d\n",msg,atom->Natoms);
|
||||
sprintf(msg, "%sSCALARS mass double\n",msg);
|
||||
sprintf(msg, "%sLOOKUP_TABLE default\n",msg);
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1.0\n",msg);
|
||||
sprintf(msg, "%s\n\n",msg);
|
||||
flushBuffer(msg);
|
||||
}
|
||||
}
|
||||
|
||||
void vtkClose()
|
||||
{
|
||||
MPI_File_close(&_fh);
|
||||
_fh=MPI_FILE_NULL;
|
||||
}
|
||||
|
||||
//TODO: print ghost and cluster using MPI
|
||||
void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep)
|
||||
{
|
||||
if(comm->numproc == 1)
|
||||
{
|
||||
write_data_to_vtk_file(filename, atom, timestep);
|
||||
return;
|
||||
}
|
||||
|
||||
vtkOpen(filename, comm, atom, timestep);
|
||||
vtkVector(comm, atom, param);
|
||||
vtkClose();
|
||||
}
|
||||
|
||||
static inline void flushBuffer(char* msg){
|
||||
MPI_Offset displ;
|
||||
MPI_File_get_size(_fh, &displ);
|
||||
MPI_File_write_at(_fh, displ, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE);
|
||||
}
|
||||
@@ -1,56 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <xtc.h>
|
||||
|
||||
#ifdef XTC_OUTPUT
|
||||
#include <gromacs/fileio/xtcio.h>
|
||||
|
||||
static struct t_fileio *xtc_file = NULL;
|
||||
static rvec *x_buf = NULL;
|
||||
static rvec basis[3];
|
||||
|
||||
void xtc_init(const char *filename, Atom *atom, int timestep) {
|
||||
basis[0][XX] = 1.0;
|
||||
basis[0][YY] = 0.0;
|
||||
basis[0][ZZ] = 0.0;
|
||||
basis[1][XX] = 0.0;
|
||||
basis[1][YY] = 1.0;
|
||||
basis[1][ZZ] = 0.0;
|
||||
basis[2][XX] = 0.0;
|
||||
basis[2][YY] = 0.0;
|
||||
basis[2][ZZ] = 1.0;
|
||||
|
||||
xtc_file = open_xtc(filename, "w");
|
||||
x_buf = (rvec *) allocate(ALIGNMENT, sizeof(rvec) * (atom->Nlocal + 1));
|
||||
xtc_write(atom, timestep, 1, 1);
|
||||
}
|
||||
|
||||
void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
|
||||
int i = 0;
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
|
||||
x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
|
||||
x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
|
||||
x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
write_xtc(xtc_file, atom->Nlocal, timestep, 0.0, (const rvec *) basis, (const rvec *) x_buf, 1000);
|
||||
}
|
||||
|
||||
void xtc_end() {
|
||||
free(x_buf);
|
||||
close_xtc(xtc_file);
|
||||
}
|
||||
#endif
|
||||
@@ -1,4 +1,4 @@
|
||||
CC = clang
|
||||
CC = cc
|
||||
LINKER = $(CC)
|
||||
|
||||
ANSI_CFLAGS = -ansi
|
||||
@@ -6,10 +6,7 @@ ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
|
||||
CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
|
||||
ASFLAGS = -masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE
|
||||
|
||||
@@ -6,31 +6,10 @@ ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
CFLAGS = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||
CFLAGS = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
CFLAGS = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
CFLAGS = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
ASFLAGS = #-masm=intel
|
||||
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
|
||||
ASFLAGS = -masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
DEFINES = -D_GNU_SOURCE -DLIKWID_PERFMON
|
||||
INCLUDES = $(LIKWID_INC)
|
||||
LIBS = -lm
|
||||
LIBS = -lm $(LIKWID_LIB) -llikwid
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
GROMACS_PATH=/apps/Gromacs/2018.1-mkl
|
||||
GROMACS_INC ?= -I${GROMACS_PATH}/include
|
||||
GROMACS_DEFINES ?=
|
||||
GROMACS_LIB ?= -L${GROMACS_PATH}/lib64
|
||||
|
||||
ifeq ($(strip $(XTC_OUTPUT)),true)
|
||||
INCLUDES += ${GROMACS_INC}
|
||||
DEFINES += ${GROMACS_DEFINES}
|
||||
LIBS += -lgromacs
|
||||
LFLAGS += ${GROMACS_LIB}
|
||||
endif
|
||||
@@ -1,32 +1,17 @@
|
||||
CC = icc
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = -qopenmp
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -fast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -fast -xAVX $(PROFILE)
|
||||
#OPTS = -fast -xSSE4.2 $(PROFILE)
|
||||
#OPTS = -fast -no-vec $(PROFILE)
|
||||
#OPTS = -fast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE
|
||||
INCLUDES =
|
||||
LIBS = -lm
|
||||
DEFINES = -D_GNU_SOURCE #-DLIKWID_PERFMON
|
||||
INCLUDES = #$(LIKWID_INC)
|
||||
LIBS = -lm #$(LIKWID_LIB) -llikwid
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
CC = icx
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
INCLUDES =
|
||||
LIBS = -lm
|
||||
@@ -1,29 +0,0 @@
|
||||
ifeq ($(strip $(ISA)), SSE)
|
||||
__ISA_SSE__=true
|
||||
__SIMD_WIDTH_DBL__=2
|
||||
else ifeq ($(strip $(ISA)), AVX)
|
||||
__ISA_AVX__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX_FMA)
|
||||
__ISA_AVX__=true
|
||||
__ISA_AVX_FMA__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX2)
|
||||
#__SIMD_KERNEL__=true
|
||||
__ISA_AVX2__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX512)
|
||||
__ISA_AVX512__=true
|
||||
__SIMD_WIDTH_DBL__=8
|
||||
ifeq ($(strip $(DATA_TYPE)), DP)
|
||||
__SIMD_KERNEL__=true
|
||||
endif
|
||||
endif
|
||||
|
||||
# SIMD width is specified in double-precision, hence it may
|
||||
# need to be adjusted for single-precision
|
||||
ifeq ($(strip $(DATA_TYPE)), SP)
|
||||
VECTOR_WIDTH=$(shell echo $$(( $(__SIMD_WIDTH_DBL__) * 2 )))
|
||||
else
|
||||
VECTOR_WIDTH=$(__SIMD_WIDTH_DBL__)
|
||||
endif
|
||||
@@ -1,32 +0,0 @@
|
||||
CC = mpiicc
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) #-g -debug
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
INCLUDES =
|
||||
LIBS = -lm
|
||||
@@ -6,18 +6,11 @@ ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
#
|
||||
# A100 + Native
|
||||
CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||
# A40 + Native
|
||||
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||
# Cascade Lake
|
||||
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||
# For GROMACS kernels, we need at least sm_61 due to atomicAdd with doubles
|
||||
# TODO: Check if this is required for full neighbor-lists and just compile kernel for that case if not
|
||||
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
#CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
|
||||
CFLAGS = -O3 -g # -fopenmp
|
||||
ASFLAGS = -masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE -DCUDA_TARGET -DNO_ZMM_INTRIN #-DLIKWID_PERFMON
|
||||
DEFINES = -D_GNU_SOURCE #-DLIKWID_PERFMON
|
||||
INCLUDES = $(LIKWID_INC)
|
||||
LIBS = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
CC = icx
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = -qopenmp-simd
|
||||
PROFILE = #-g -pg
|
||||
#OPTS = -Ofast -no-vec
|
||||
#OPTS = -Ofast -xSSE4.2
|
||||
#OPTS = -Ofast -xAVX
|
||||
#OPTS = -Ofast -xCORE-AVX2
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high
|
||||
#OPTS = -Ofast -xHost
|
||||
CFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
ASFLAGS = -masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS)
|
||||
DEFINES = -D_GNU_SOURCE -DNOCHUNK
|
||||
INCLUDES =
|
||||
LIBS = -lm
|
||||
89
lammps/allocate.c
Normal file
89
lammps/allocate.c
Normal file
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
void checkCUDAError(const char *msg, cudaError_t err)
|
||||
{
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
//print a human readable error message
|
||||
printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void* allocate (int alignment, size_t bytesize)
|
||||
{
|
||||
int errorCode;
|
||||
void* ptr;
|
||||
|
||||
checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) );
|
||||
|
||||
return ptr;
|
||||
|
||||
/*
|
||||
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
||||
|
||||
if (errorCode) {
|
||||
if (errorCode == EINVAL) {
|
||||
fprintf(stderr,
|
||||
"Error: Alignment parameter is not a power of two\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (errorCode == ENOMEM) {
|
||||
fprintf(stderr,
|
||||
"Error: Insufficient memory to fulfill the request\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
if (ptr == NULL) {
|
||||
fprintf(stderr, "Error: posix_memalign failed!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return ptr;
|
||||
*/
|
||||
}
|
||||
|
||||
void* reallocate (
|
||||
void* ptr,
|
||||
int alignment,
|
||||
size_t newBytesize,
|
||||
size_t oldBytesize)
|
||||
{
|
||||
void* newarray = allocate(alignment, newBytesize);
|
||||
|
||||
if(ptr != NULL) {
|
||||
memcpy(newarray, ptr, oldBytesize);
|
||||
cudaFreeHost(ptr);
|
||||
}
|
||||
|
||||
return newarray;
|
||||
}
|
||||
613
lammps/atom.c
613
lammps/atom.c
@@ -1,32 +1,42 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
* =======================================================================================
|
||||
*
|
||||
* Authors: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Rafael Ravedutti (rr), rafaelravedutti@gmail.com
|
||||
*
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
#ifndef MAXLINE
|
||||
#define MAXLINE 4096
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
void initAtom(Atom *atom){
|
||||
void initAtom(Atom *atom)
|
||||
{
|
||||
atom->x = NULL; atom->y = NULL; atom->z = NULL;
|
||||
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
|
||||
atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
|
||||
@@ -40,41 +50,20 @@ void initAtom(Atom *atom){
|
||||
atom->sigma6 = NULL;
|
||||
atom->cutforcesq = NULL;
|
||||
atom->cutneighsq = NULL;
|
||||
atom->radius = NULL;
|
||||
atom->av = NULL;
|
||||
atom->r = NULL;
|
||||
atom->border_map = NULL;
|
||||
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL;
|
||||
d_atom->vx = NULL; d_atom->vy = NULL; d_atom->vz = NULL;
|
||||
d_atom->fx = NULL; d_atom->fy = NULL; d_atom->fz = NULL;
|
||||
d_atom->border_map = NULL;
|
||||
d_atom->type = NULL;
|
||||
d_atom->epsilon = NULL;
|
||||
d_atom->sigma6 = NULL;
|
||||
d_atom->cutforcesq = NULL;
|
||||
d_atom->cutneighsq = NULL;
|
||||
//MPI
|
||||
Box *mybox = &(atom->mybox);
|
||||
mybox->xprd = mybox->yprd = mybox->zprd = 0;
|
||||
mybox->lo[_x] = mybox->lo[_y] = mybox->lo[_z] = 0;
|
||||
mybox->hi[_x] = mybox->hi[_y] = mybox->hi[_z] = 0;
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
|
||||
MD_FLOAT xlo = 0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0; MD_FLOAT zhi = param->zprd;
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param)
|
||||
{
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
|
||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||
atom->Nlocal = 0;
|
||||
atom->ntypes = param->ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
@@ -117,15 +106,15 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
xtmp = 0.5 * alat * i;
|
||||
ytmp = 0.5 * alat * j;
|
||||
ztmp = 0.5 * alat * k;
|
||||
|
||||
|
||||
if( xtmp >= xlo && xtmp < xhi &&
|
||||
ytmp >= ylo && ytmp < yhi &&
|
||||
ztmp >= zlo && ztmp < zhi ) {
|
||||
|
||||
|
||||
n = k * (2 * param->ny) * (2 * param->nx) +
|
||||
j * (2 * param->nx) +
|
||||
i + 1;
|
||||
|
||||
|
||||
for(m = 0; m < 5; m++) {
|
||||
myrandom(&n);
|
||||
}
|
||||
@@ -141,7 +130,7 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
}
|
||||
vztmp = myrandom(&n);
|
||||
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
if(atom->Nlocal == atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
@@ -166,511 +155,31 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
}
|
||||
}
|
||||
|
||||
int type_str2int(const char *type) {
|
||||
if(strncmp(type, "Ar", 2) == 0) { return 0; } // Argon
|
||||
fprintf(stderr, "Invalid atom type: %s\n", type);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom(Atom *atom, Parameter *param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
int len = strlen(param->input_file);
|
||||
if(strncmp(¶m->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 3], ".in", 3) == 0) { return readAtom_in(atom, param); }
|
||||
if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
|
||||
if(!fp) {
|
||||
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
char *item = strtok(line, "\t ");
|
||||
if(strncmp(item, "CRYST1", 6) == 0) {
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
// alpha, beta, gamma, sGroup, z
|
||||
} else if(strncmp(item, "ATOM", 4) == 0) {
|
||||
char *label;
|
||||
int atom_id, comp_id;
|
||||
MD_FLOAT occupancy, charge;
|
||||
atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
|
||||
label = strtok(NULL, "\t ");
|
||||
comp_id = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = 0.0;
|
||||
atom_vy(atom_id) = 0.0;
|
||||
atom_vz(atom_id) = 0.0;
|
||||
occupancy = atof(strtok(NULL, "\t "));
|
||||
charge = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
} else if(strncmp(item, "HEADER", 6) == 0 ||
|
||||
strncmp(item, "REMARK", 6) == 0 ||
|
||||
strncmp(item, "MODEL", 5) == 0 ||
|
||||
strncmp(item, "TER", 3) == 0 ||
|
||||
strncmp(item, "ENDMDL", 6) == 0) {
|
||||
// Do nothing
|
||||
} else {
|
||||
if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(!read_atoms) {
|
||||
if(me==0)fprintf(stderr, "Input error: No atoms read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
char desc[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
int atoms_to_read = 0;
|
||||
int i = 0;
|
||||
|
||||
if(!fp) {
|
||||
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
readline(desc, fp);
|
||||
for(i = 0; desc[i] != '\n'; i++);
|
||||
desc[i] = '\0';
|
||||
readline(line, fp);
|
||||
atoms_to_read = atoi(strtok(line, "\t "));
|
||||
if(me==0)fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
|
||||
while(!feof(fp) && read_atoms < atoms_to_read) {
|
||||
readline(line, fp);
|
||||
char *label = strtok(line, "\t ");
|
||||
int type = type_str2int(strtok(NULL, "\t "));
|
||||
int atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
atom_id = read_atoms;
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type;
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
}
|
||||
|
||||
if(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(line, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
}
|
||||
|
||||
if(read_atoms != atoms_to_read) {
|
||||
if(me==0)fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
int read_atoms = 0;
|
||||
int atom_id = -1;
|
||||
int ts = -1;
|
||||
|
||||
if(!fp) {
|
||||
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp) && ts < 1 && !read_atoms) {
|
||||
readline(line, fp);
|
||||
if(strncmp(line, "ITEM: ", 6) == 0) {
|
||||
char *item = &line[6];
|
||||
|
||||
if(strncmp(item, "TIMESTEP", 8) == 0) {
|
||||
readline(line, fp);
|
||||
ts = atoi(line);
|
||||
} else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
|
||||
readline(line, fp);
|
||||
natoms = atoi(line);
|
||||
atom->Natoms = natoms;
|
||||
atom->Nlocal = natoms;
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
|
||||
readline(line, fp);
|
||||
param->xlo = atof(strtok(line, "\t "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
|
||||
readline(line, fp);
|
||||
param->ylo = atof(strtok(line, "\t "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
|
||||
readline(line, fp);
|
||||
param->zlo = atof(strtok(line, "\t "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
atom_id = atoi(strtok(line, "\t ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
read_atoms++;
|
||||
}
|
||||
} else {
|
||||
if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
if(me==0)fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(ts < 0 || !natoms || !read_atoms) {
|
||||
if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
int readAtom_in(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
int atom_id = 0;
|
||||
|
||||
if(!fp) {
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
readline(line, fp);
|
||||
natoms = atoi(strtok(line, "\t "));
|
||||
param->xlo = atof(strtok(NULL, "\t "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->ylo = atof(strtok(NULL, "\t "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = atof(strtok(NULL, "\t "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
atom->Natoms = natoms;
|
||||
atom->Nlocal = natoms;
|
||||
atom->ntypes = 1;
|
||||
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
|
||||
// TODO: store mass per atom
|
||||
char *s_mass = strtok(line, "\t ");
|
||||
if(strncmp(s_mass, "inf", 3) == 0) {
|
||||
// Set atom's mass to INFINITY
|
||||
} else {
|
||||
param->mass = atof(s_mass);
|
||||
}
|
||||
|
||||
atom->radius[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->type[atom_id] = 0;
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
atom_id++;
|
||||
}
|
||||
if(!natoms) {
|
||||
if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
void growAtom(Atom *atom)
|
||||
{
|
||||
int nold = atom->Nmax;
|
||||
atom->Nmax += DELTA;
|
||||
|
||||
#undef REALLOC
|
||||
#define REALLOC(p,t,ns,os); \
|
||||
atom->p = (t *) reallocate(atom->p, ALIGNMENT, ns, os); \
|
||||
atom->d_atom.p = (t *) reallocateGPU(atom->d_atom.p, ns);
|
||||
|
||||
#ifdef AOS
|
||||
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
|
||||
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
|
||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(y, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(z, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(vy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(vz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(fy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(fz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
|
||||
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
|
||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
|
||||
#endif
|
||||
REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||
|
||||
// DEM
|
||||
atom->radius = (MD_FLOAT*) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
atom->r = (MD_FLOAT*) reallocate(atom->r, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4);
|
||||
}
|
||||
|
||||
/* MPI added*/
|
||||
void packForward(Atom* atom, int n ,int* list, MD_FLOAT* buf, int* pbc)
|
||||
{
|
||||
int i, j;
|
||||
for(i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
buf_x(i) = atom_x(j) + pbc[0] * atom->mybox.xprd;
|
||||
buf_y(i) = atom_y(j) + pbc[1] * atom->mybox.yprd;
|
||||
buf_z(i) = atom_z(j) + pbc[2] * atom->mybox.zprd;
|
||||
}
|
||||
}
|
||||
|
||||
void unpackForward(Atom* atom, int n, int first, MD_FLOAT* buf)
|
||||
{
|
||||
for(int i = 0; i < n; i++) {
|
||||
atom_x((first + i)) = buf_x(i);
|
||||
atom_y((first + i)) = buf_y(i);
|
||||
atom_z((first + i)) = buf_z(i);
|
||||
}
|
||||
}
|
||||
|
||||
int packGhost(Atom* atom, int i, MD_FLOAT* buf, int* pbc)
|
||||
{
|
||||
int m = 0;
|
||||
buf[m++] = atom_x(i) + pbc[_x] * atom->mybox.xprd;
|
||||
buf[m++] = atom_y(i) + pbc[_y] * atom->mybox.yprd;
|
||||
buf[m++] = atom_z(i) + pbc[_z] * atom->mybox.zprd;
|
||||
buf[m++] = atom->type[i];
|
||||
return m;
|
||||
}
|
||||
|
||||
int unpackGhost(Atom* atom, int i, MD_FLOAT* buf)
|
||||
{
|
||||
while (i>=atom->Nmax) growAtom(atom);
|
||||
int m = 0;
|
||||
atom_x(i) = buf[m++];
|
||||
atom_y(i) = buf[m++];
|
||||
atom_z(i) = buf[m++];
|
||||
atom->type[i] = buf[m++];
|
||||
atom->Nghost++;
|
||||
return m;
|
||||
}
|
||||
|
||||
void packReverse(Atom* atom, int n, int first, MD_FLOAT* buf)
|
||||
{
|
||||
for(int i = 0; i < n; i++) {
|
||||
buf_x(i) = atom_fx(first + i);
|
||||
buf_y(i) = atom_fy(first + i);
|
||||
buf_z(i) = atom_fz(first + i);
|
||||
}
|
||||
}
|
||||
|
||||
void unpackReverse(Atom* atom, int n, int* list, MD_FLOAT* buf)
|
||||
{
|
||||
int i, j;
|
||||
for(i = 0; i < n; i++) {
|
||||
j = list[i];
|
||||
atom_fx(j) += buf_x(i);
|
||||
atom_fy(j) += buf_y(i);
|
||||
atom_fz(j) += buf_z(i);
|
||||
}
|
||||
}
|
||||
|
||||
int packExchange(Atom* atom, int i, MD_FLOAT* buf)
|
||||
{
|
||||
int m = 0;
|
||||
buf[m++] = atom_x(i);
|
||||
buf[m++] = atom_y(i);
|
||||
buf[m++] = atom_z(i);
|
||||
buf[m++] = atom_vx(i);
|
||||
buf[m++] = atom_vy(i);
|
||||
buf[m++] = atom_vz(i);
|
||||
buf[m++] = atom->type[i];
|
||||
return m;
|
||||
}
|
||||
|
||||
int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
|
||||
{
|
||||
while(i >= atom->Nmax) growAtom(atom);
|
||||
int m = 0;
|
||||
atom_x(i) = buf[m++];
|
||||
atom_y(i) = buf[m++];
|
||||
atom_z(i) = buf[m++];
|
||||
atom_vx(i) = buf[m++];
|
||||
atom_vy(i) = buf[m++];
|
||||
atom_vz(i) = buf[m++];
|
||||
atom->type[i] = buf[m++];
|
||||
return m;
|
||||
}
|
||||
|
||||
void pbc(Atom* atom)
|
||||
{
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
|
||||
MD_FLOAT xprd = atom->mybox.xprd;
|
||||
MD_FLOAT yprd = atom->mybox.yprd;
|
||||
MD_FLOAT zprd = atom->mybox.zprd;
|
||||
|
||||
if(atom_x(i) < 0.0) atom_x(i) += xprd;
|
||||
if(atom_y(i) < 0.0) atom_y(i) += yprd;
|
||||
if(atom_z(i) < 0.0) atom_z(i)+= zprd;
|
||||
if(atom_x(i) >= xprd) atom_x(i) -= xprd;
|
||||
if(atom_y(i) >= yprd) atom_y(i) -= yprd;
|
||||
if(atom_z(i) >= zprd) atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
|
||||
void copy(Atom* atom, int i, int j)
|
||||
{
|
||||
atom_x(i) = atom_x(j);
|
||||
atom_y(i) = atom_y(j);
|
||||
atom_z(i) = atom_z(j);
|
||||
atom_vx(i) = atom_vx(j);
|
||||
atom_vy(i) = atom_vy(j);
|
||||
atom_vz(i) = atom_vz(j);
|
||||
atom->type[i] = atom->type[j];
|
||||
atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||
}
|
||||
|
||||
@@ -1,293 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
//---
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <parameter.h>
|
||||
#include <neighbor.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern MD_FLOAT xprd, yprd, zprd;
|
||||
extern MD_FLOAT bininvx, bininvy, bininvz;
|
||||
extern int mbinxlo, mbinylo, mbinzlo;
|
||||
extern int nbinx, nbiny, nbinz;
|
||||
extern int mbinx, mbiny, mbinz; // n bins in x, y, z
|
||||
extern int mbins; //total number of bins
|
||||
extern int atoms_per_bin; // max atoms per bin
|
||||
extern MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
extern int nmax;
|
||||
extern int nstencil; // # of bins in stencil
|
||||
extern int* stencil; // stencil list of bin offsets
|
||||
static int* c_stencil = NULL;
|
||||
static int* c_resize_needed = NULL;
|
||||
static int* c_new_maxneighs = NULL;
|
||||
static Binning c_binning {
|
||||
.bincount = NULL,
|
||||
.bins = NULL,
|
||||
.mbins = 0,
|
||||
.atoms_per_bin = 0
|
||||
};
|
||||
|
||||
__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
|
||||
int ix, iy, iz;
|
||||
|
||||
if(xin >= np.xprd) {
|
||||
ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * np.bininvx) - np.mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= np.yprd) {
|
||||
iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * np.bininvy) - np.mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
|
||||
}
|
||||
|
||||
if(zin >= np.zprd) {
|
||||
iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
|
||||
} else if(zin >= 0.0) {
|
||||
iz = (int)(zin * np.bininvz) - np.mbinzlo;
|
||||
} else {
|
||||
iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
|
||||
}
|
||||
|
||||
return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
|
||||
}
|
||||
|
||||
/* sorts the contents of a bin to make it comparable to the CPU version */
|
||||
/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
|
||||
__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= mbins) {
|
||||
return;
|
||||
}
|
||||
|
||||
int atoms_in_bin = bincount[i];
|
||||
int *bin_ptr = &bins[i * atoms_per_bin];
|
||||
int sorted;
|
||||
do {
|
||||
sorted = 1;
|
||||
int tmp;
|
||||
for(int index = 0; index < atoms_in_bin - 1; index++){
|
||||
if (bin_ptr[index] > bin_ptr[index + 1]){
|
||||
tmp = bin_ptr[index];
|
||||
bin_ptr[index] = bin_ptr[index + 1];
|
||||
bin_ptr[index + 1] = tmp;
|
||||
sorted = 0;
|
||||
}
|
||||
}
|
||||
} while (!sorted);
|
||||
}
|
||||
|
||||
__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
|
||||
DeviceAtom* atom = &a;
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nall) {
|
||||
return;
|
||||
}
|
||||
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
int ibin = coord2bin_device(x, y, z, np);
|
||||
int ac = atomicAdd(&bincount[ibin], 1);
|
||||
|
||||
if(ac < atoms_per_bin){
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
} else {
|
||||
atomicMax(resize_needed, ac);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compute_neighborhood(
|
||||
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nlocal) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
DeviceNeighbor *neighbor = &neigh;
|
||||
|
||||
int* neighptr = &(neighbor->neighbors[i]);
|
||||
int n = 0;
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_i = atom->type[i];
|
||||
#endif
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
int j = loc_bin[m];
|
||||
|
||||
if ( j == i ){
|
||||
continue;
|
||||
}
|
||||
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
|
||||
if( rsq <= cutoff ) {
|
||||
int idx = nlocal * n;
|
||||
neighptr[idx] = j;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = n;
|
||||
if(n > maxneighs) {
|
||||
atomicMax(new_maxneighs, n);
|
||||
}
|
||||
}
|
||||
|
||||
void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
int resize = 1;
|
||||
const int num_blocks = ceil((float) nall / (float) threads_per_block);
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
|
||||
memsetGPU(c_resize_needed, 0, sizeof(int));
|
||||
|
||||
binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, atom->Nlocal + atom->Nghost, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
||||
cuda_assert("binatoms", cudaPeekAtLastError());
|
||||
cuda_assert("binatoms", cudaDeviceSynchronize());
|
||||
|
||||
memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
|
||||
if(resize) {
|
||||
c_binning->atoms_per_bin *= 2;
|
||||
c_binning->bins = (int *) reallocateGPU(c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
atoms_per_bin = c_binning->atoms_per_bin;
|
||||
const int sortBlocks = ceil((float) mbins / (float) threads_per_block);
|
||||
sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
|
||||
cuda_assert("sort_bin", cudaPeekAtLastError());
|
||||
cuda_assert("sort_bin", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
cudaProfilerStart();
|
||||
|
||||
// TODO move all of this initialization into its own method
|
||||
if(c_stencil == NULL) {
|
||||
c_stencil = (int *) allocateGPU(nstencil * sizeof(int));
|
||||
memcpyToGPU(c_stencil, stencil, nstencil * sizeof(int));
|
||||
}
|
||||
|
||||
if(c_binning.mbins == 0) {
|
||||
c_binning.mbins = mbins;
|
||||
c_binning.atoms_per_bin = atoms_per_bin;
|
||||
c_binning.bincount = (int *) allocateGPU(c_binning.mbins * sizeof(int));
|
||||
c_binning.bins = (int *) allocateGPU(c_binning.mbins * c_binning.atoms_per_bin * sizeof(int));
|
||||
}
|
||||
|
||||
Neighbor_params np {
|
||||
.xprd = xprd,
|
||||
.yprd = yprd,
|
||||
.zprd = zprd,
|
||||
.bininvx = bininvx,
|
||||
.bininvy = bininvy,
|
||||
.bininvz = bininvz,
|
||||
.mbinxlo = mbinxlo,
|
||||
.mbinylo = mbinylo,
|
||||
.mbinzlo = mbinzlo,
|
||||
.nbinx = nbinx,
|
||||
.nbiny = nbiny,
|
||||
.nbinz = nbinz,
|
||||
.mbinx = mbinx,
|
||||
.mbiny = mbiny,
|
||||
.mbinz = mbinz
|
||||
};
|
||||
|
||||
if(c_resize_needed == NULL) {
|
||||
c_resize_needed = (int *) allocateGPU(sizeof(int));
|
||||
}
|
||||
|
||||
/* bin local & ghost atoms */
|
||||
binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
||||
if(c_new_maxneighs == NULL) {
|
||||
c_new_maxneighs = (int *) allocateGPU(sizeof(int));
|
||||
}
|
||||
|
||||
int resize = 1;
|
||||
|
||||
if(nall > nmax) {
|
||||
nmax = nall;
|
||||
d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
|
||||
d_neighbor->numneigh = (int *) reallocateGPU(d_neighbor->numneigh, nmax * sizeof(int));
|
||||
}
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
resize = 0;
|
||||
memsetGPU(c_new_maxneighs, 0, sizeof(int));
|
||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
|
||||
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
|
||||
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
||||
c_new_maxneighs,
|
||||
cutneighsq, atom->ntypes);
|
||||
|
||||
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
|
||||
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
|
||||
|
||||
int new_maxneighs;
|
||||
memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
|
||||
if(new_maxneighs > neighbor->maxneighs){
|
||||
resize = 1;
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
printf("RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
printf("NEW SIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->neighbors = (int *) reallocateGPU(neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
cudaProfilerStop();
|
||||
}
|
||||
@@ -1,111 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
//---
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <pbc.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern int NmaxGhost;
|
||||
extern int *PBCx, *PBCy, *PBCz;
|
||||
static int c_NmaxGhost = 0;
|
||||
static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
|
||||
|
||||
__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
DeviceAtom *atom = &a;
|
||||
if(i >= nlocal) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if (atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if (atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if (atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if (atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if (atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nghost) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom* atom = &a;
|
||||
int *border_map = atom->border_map;
|
||||
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
|
||||
if(reneigh) {
|
||||
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(atom->d_atom.type, atom->type, sizeof(int) * atom->Nmax);
|
||||
|
||||
if(c_NmaxGhost < NmaxGhost) {
|
||||
c_NmaxGhost = NmaxGhost;
|
||||
c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
|
||||
c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
|
||||
c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
|
||||
atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
|
||||
}
|
||||
|
||||
memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
|
||||
memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
|
||||
memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
|
||||
memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
|
||||
cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
|
||||
cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
|
||||
computePbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
|
||||
cuda_assert("updatePbc", cudaPeekAtLastError());
|
||||
cuda_assert("updatePbc", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd);
|
||||
cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
|
||||
cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
|
||||
memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <device.h>
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
|
||||
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
||||
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
||||
|
||||
memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,8 +1,24 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@@ -27,7 +43,7 @@ void initEam(Eam* eam, Parameter* param) {
|
||||
}
|
||||
|
||||
void coeff(Eam* eam, Parameter* param) {
|
||||
read_eam_file(&eam->file, param->eam_file);
|
||||
read_file(&eam->file, param->input_file);
|
||||
param->mass = eam->file.mass;
|
||||
param->cutforce = eam->file.cut;
|
||||
param->cutneigh = param->cutforce + 1.0;
|
||||
@@ -43,7 +59,7 @@ void init_style(Eam* eam, Parameter* param) {
|
||||
array2spline(eam, param);
|
||||
}
|
||||
|
||||
void read_eam_file(Funcfl* file, const char* filename) {
|
||||
void read_file(Funcfl* file, const char* filename) {
|
||||
FILE* fptr;
|
||||
char line[MAXLINE];
|
||||
|
||||
@@ -54,10 +70,10 @@ void read_eam_file(Funcfl* file, const char* filename) {
|
||||
}
|
||||
|
||||
int tmp;
|
||||
readline(line, fptr);
|
||||
readline(line, fptr);
|
||||
fgets(line, MAXLINE, fptr);
|
||||
fgets(line, MAXLINE, fptr);
|
||||
sscanf(line, "%d %lg", &tmp, &(file->mass));
|
||||
readline(line, fptr);
|
||||
fgets(line, MAXLINE, fptr);
|
||||
sscanf(line, "%d %lg %d %lg %lg", &file->nrho, &file->drho, &file->nr, &file->dr, &file->cut);
|
||||
|
||||
//printf("Read: %lf %i %lf %i %lf %lf\n",file->mass,file->nrho,file->drho,file->nr,file->dr,file->cut);
|
||||
@@ -261,9 +277,9 @@ void grab(FILE* fptr, int n, MD_FLOAT* list) {
|
||||
int i = 0;
|
||||
|
||||
while(i < n) {
|
||||
readline(line, fptr);
|
||||
fgets(line, MAXLINE, fptr);
|
||||
ptr = strtok(line, " \t\n\r\f");
|
||||
list[i++] = atof(ptr);
|
||||
while((ptr = strtok(NULL, " \t\n\r\f"))) list[i++] = atof(ptr);
|
||||
while(ptr = strtok(NULL, " \t\n\r\f")) list[i++] = atof(ptr);
|
||||
}
|
||||
}
|
||||
@@ -1,12 +1,32 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
#include <stdio.h>
|
||||
@@ -99,4 +119,114 @@
|
||||
# define DIST_TRACE(l, e)
|
||||
#endif
|
||||
|
||||
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);
|
||||
double computeForceTracing(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
|
||||
MEM_TRACER_INIT;
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
fx[i] = 0.0;
|
||||
fy[i] = 0.0;
|
||||
fz[i] = 0.0;
|
||||
}
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) {
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
MEM_TRACE(atom_x(i), 'R');
|
||||
MEM_TRACE(atom_y(i), 'R');
|
||||
MEM_TRACE(atom_z(i), 'R');
|
||||
INDEX_TRACE_ATOM(i);
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
MEM_TRACE(atom->type(i), 'R');
|
||||
#endif
|
||||
|
||||
#if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1
|
||||
#define REPEAT_NEIGHBORS_LOOP
|
||||
int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS;
|
||||
for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) {
|
||||
#endif
|
||||
|
||||
//DIST_TRACE_SORT(neighs, numneighs);
|
||||
INDEX_TRACE(neighs, numneighs);
|
||||
//DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
MEM_TRACE(neighs[k], 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
MEM_TRACE(atom_z(j), 'R');
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
MEM_TRACE(atom->type(j), 'R');
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef REPEAT_NEIGHBORS_LOOP
|
||||
}
|
||||
#endif
|
||||
|
||||
fx[i] += fix;
|
||||
fy[i] += fiy;
|
||||
fz[i] += fiz;
|
||||
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
MEM_TRACE(fx[i], 'R');
|
||||
MEM_TRACE(fx[i], 'W');
|
||||
MEM_TRACE(fy[i], 'R');
|
||||
MEM_TRACE(fy[i], 'W');
|
||||
MEM_TRACE(fz[i], 'R');
|
||||
MEM_TRACE(fz[i], 'W');
|
||||
}
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
|
||||
INDEX_TRACER_END;
|
||||
MEM_TRACER_END;
|
||||
return E-S;
|
||||
}
|
||||
@@ -1,41 +1,56 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stddef.h>
|
||||
//---
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
|
||||
extern "C" {
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
}
|
||||
|
||||
// cuda kernel
|
||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
|
||||
__global__ void calc_force(
|
||||
Atom a,
|
||||
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon,
|
||||
int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= Nlocal) {
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
Atom *atom = &a;
|
||||
|
||||
const int numneighs = neigh_numneigh[i];
|
||||
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
@@ -46,12 +61,8 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neigh_neighbors[Nlocal * k + i];
|
||||
int j = neigh_neighbors[atom->Nlocal * k + i];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
@@ -59,7 +70,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
@@ -80,13 +91,14 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
|
||||
atom_fz(i) = fiz;
|
||||
}
|
||||
|
||||
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
|
||||
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
Atom *atom = &a;
|
||||
|
||||
atom_vx(i) += dtforce * atom_fx(i);
|
||||
atom_vy(i) += dtforce * atom_fy(i);
|
||||
@@ -96,13 +108,14 @@ __global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nloc
|
||||
atom_z(i) = atom_z(i) + dt * atom_vz(i);
|
||||
}
|
||||
|
||||
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
|
||||
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
Atom *atom = &a;
|
||||
|
||||
atom_vx(i) += dtforce * atom_fx(i);
|
||||
atom_vy(i) += dtforce * atom_fy(i);
|
||||
@@ -111,40 +124,54 @@ __global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom
|
||||
|
||||
extern "C" {
|
||||
|
||||
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
|
||||
|
||||
void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
|
||||
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
|
||||
cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
|
||||
cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
|
||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
|
||||
|
||||
if(reneigh) {
|
||||
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||
checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
|
||||
|
||||
if(doReneighbour) {
|
||||
checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||
}
|
||||
}
|
||||
|
||||
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
|
||||
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
|
||||
cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
|
||||
cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
|
||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
|
||||
|
||||
if(reneigh) {
|
||||
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||
checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
|
||||
|
||||
if(doReneighbour) {
|
||||
checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||
}
|
||||
}
|
||||
|
||||
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
double computeForce(
|
||||
bool reneighbourHappenend,
|
||||
Parameter *param,
|
||||
Atom *atom,
|
||||
Neighbor *neighbor,
|
||||
Atom *c_atom,
|
||||
Neighbor *c_neighbor,
|
||||
int num_threads_per_block
|
||||
)
|
||||
{
|
||||
int Nlocal = atom->Nlocal;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
|
||||
/*
|
||||
int nDevices;
|
||||
@@ -160,21 +187,26 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
|
||||
|
||||
|
||||
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
||||
// memsetGPU(atom->d_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3);
|
||||
|
||||
// checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
|
||||
|
||||
cudaProfilerStart();
|
||||
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
|
||||
cuda_assert("calc_force", cudaPeekAtLastError());
|
||||
cuda_assert("calc_force", cudaDeviceSynchronize());
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
|
||||
|
||||
checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
|
||||
|
||||
cudaProfilerStop();
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
|
||||
return E-S;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,123 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <likwid-marker.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
|
||||
double computeForceDemFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
MD_FLOAT k_s = param->k_s;
|
||||
MD_FLOAT k_dn = param->k_dn;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT irad = atom->radius[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT jrad = atom->radius[j];
|
||||
MD_FLOAT xj = atom_x(j);
|
||||
MD_FLOAT yj = atom_y(j);
|
||||
MD_FLOAT zj = atom_z(j);
|
||||
MD_FLOAT delx = xtmp - xj;
|
||||
MD_FLOAT dely = ytmp - yj;
|
||||
MD_FLOAT delz = ztmp - zj;
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT r = sqrt(rsq);
|
||||
MD_FLOAT p = irad + jrad - r;
|
||||
|
||||
if(p > 0) {
|
||||
MD_FLOAT delvx = atom_vx(i) - atom_vx(j);
|
||||
MD_FLOAT delvy = atom_vy(i) - atom_vy(j);
|
||||
MD_FLOAT delvz = atom_vz(i) - atom_vz(j);
|
||||
MD_FLOAT vr = sqrt(delvx * delvx + delvy * delvy + delvz * delvz);
|
||||
|
||||
// normal distance
|
||||
MD_FLOAT nx = delx / r;
|
||||
MD_FLOAT ny = dely / r;
|
||||
MD_FLOAT nz = delz / r;
|
||||
|
||||
// normal contact velocity
|
||||
MD_FLOAT nvx = delvx / vr;
|
||||
MD_FLOAT nvy = delvy / vr;
|
||||
MD_FLOAT nvz = delvz / vr;
|
||||
|
||||
// forces
|
||||
atom_fx(i) += k_s * p * nx - k_dn * nvx;
|
||||
atom_fy(i) += k_s * p * ny - k_dn * nvy;
|
||||
atom_fz(i) += k_s * p * nz - k_dn * nvz;
|
||||
atom_fx(j) += -k_s * p * nx - k_dn * nvx;
|
||||
atom_fy(j) += -k_s * p * ny - k_dn * nvy;
|
||||
atom_fz(j) += -k_s * p * nz - k_dn * nvz;
|
||||
|
||||
// contact position
|
||||
//MD_FLOAT cterm = jrad / (irad + jrad);
|
||||
//MD_FLOAT cx = xj + cterm * delx;
|
||||
//MD_FLOAT cy = yj + cterm * dely;
|
||||
//MD_FLOAT cz = zj + cterm * delz;
|
||||
|
||||
// delta contact and particle position
|
||||
//MD_FLOAT delcx = cx - xtmp;
|
||||
//MD_FLOAT delcy = cy - ytmp;
|
||||
//MD_FLOAT delcz = cz - ztmp;
|
||||
|
||||
// contact velocity
|
||||
//MD_FLOAT cvx = (atom_vx(i) + atom_avx(i) * delcx) - (atom_vx(j) + atom_avx(j) * (cx - xj));
|
||||
//MD_FLOAT cvy = (atom_vy(i) + atom_avy(i) * delcy) - (atom_vy(j) + atom_avy(j) * (cy - yj));
|
||||
//MD_FLOAT cvz = (atom_vz(i) + atom_avz(i) * delcz) - (atom_vz(j) + atom_avz(j) * (cz - zj));
|
||||
|
||||
// tangential force
|
||||
//fix += MIN(kdt * vtsq, kf * fnx) * tx;
|
||||
//fiy += MIN(kdt * vtsq, kf * fny) * ty;
|
||||
//fiz += MIN(kdt * vtsq, kf * fnz) * tz;
|
||||
// torque
|
||||
//MD_FLOAT taux = delcx * ftx;
|
||||
//MD_FLOAT tauy = delcy * fty;
|
||||
//MD_FLOAT tauz = delcz * ftz;
|
||||
}
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
addStat(stats->atoms_within_cutoff, 1);
|
||||
} else {
|
||||
addStat(stats->atoms_outside_cutoff, 1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
@@ -1,8 +1,24 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <likwid-marker.h>
|
||||
#include <math.h>
|
||||
@@ -16,7 +32,7 @@
|
||||
#include <eam.h>
|
||||
#include <util.h>
|
||||
|
||||
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
|
||||
if(eam->nmax < atom->Nmax) {
|
||||
eam->nmax = atom->Nmax;
|
||||
if(eam->fp != NULL) { free(eam->fp); }
|
||||
@@ -25,18 +41,14 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
|
||||
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
|
||||
MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
|
||||
MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
|
||||
int rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; int rdrho = eam->rdrho;
|
||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||
double S = getTimeStamp();
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam_fp");
|
||||
|
||||
#pragma omp for
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -99,19 +111,13 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam_fp");
|
||||
}
|
||||
|
||||
// We still need to update fp for PBC atoms
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||
}
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -194,16 +200,14 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
}
|
||||
|
||||
atom_fx(i) = fix;
|
||||
atom_fy(i) = fiy;
|
||||
atom_fz(i) = fiz;
|
||||
fx[i] = fix;
|
||||
fy[i] = fiy;
|
||||
fz[i] = fiz;
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user